In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
from mlxtend.frequent_patterns import fpgrowth, association_rules


In [11]:
# load dataset
dataset = pd.read_csv('Dataset.csv')

In [12]:
# print n of rows
print('Number of rows: {}'.format(len(dataset)))

# print number of nan rows
print('Number of nan rows: {}'.format(dataset.isnull().sum().sum()))

# print number of rows in StockCode that has a non number in its string
print('Number of rows in StockCode that has a non number in its string: {}'.format(len(dataset[dataset['StockCode'].str.contains('[^0-9]')])))


Number of rows: 541909
Number of nan rows: 136534
Number of rows in StockCode that has a non number in its string: 54873


In [13]:
#print the first row that has nan
print(dataset[dataset.isnull().any(axis=1)].head(1))

    InvoiceNo StockCode Description  Quantity          InvoiceDate  UnitPrice  \
622    536414     22139         NaN        56  2010-12-01 11:52:00        0.0   

     CustomerID         Country  
622         NaN  United Kingdom  


In [14]:
# need only columns of ['InvoiceNo', 'StockCode', 'Quantity', 'CustomerID']
df = dataset[['InvoiceNo', 'StockCode', 'Quantity', 'CustomerID']]
# print len
print(len(df))
# drop duplicates
df = df.drop_duplicates()
# print len
print(len(df))
# drop null values
df = df.dropna()
# print len
print(len(df))
# drop any row that have non numeric value, and convert to int
df = df[pd.to_numeric(df['CustomerID'], errors='coerce').notnull()]
df['CustomerID'] = df['CustomerID'].astype(int)
df = df[pd.to_numeric(df['InvoiceNo'], errors='coerce').notnull()]
df['InvoiceNo'] = df['InvoiceNo'].astype(int)
df = df[pd.to_numeric(df['StockCode'], errors='coerce').notnull()]
df['StockCode'] = df['StockCode'].astype(int)
df = df[pd.to_numeric(df['Quantity'], errors='coerce').notnull()]
df['Quantity'] = df['Quantity'].astype(int)

# print len
print(len(df))

# head
df.head()


541909
536478
401548
358302


Unnamed: 0,InvoiceNo,StockCode,Quantity,CustomerID
1,536365,71053,6,17850
5,536365,22752,2,17850
6,536365,21730,6,17850
7,536366,22633,6,17850
8,536366,22632,6,17850


In [15]:
def pre_process_df (dataset : pd.DataFrame) -> pd.DataFrame :
    dataset = deepcopy(dataset)
    df = dataset[['InvoiceNo', 'StockCode', 'Quantity', 'CustomerID']]
    df = df.drop_duplicates()
    
    # drop null values
    df = df.dropna()
    # drop any row that have non numeric value, and convert to int
    df = df[pd.to_numeric(df['CustomerID'], errors='coerce').notnull()]
    df['CustomerID'] = df['CustomerID'].astype(int)
    df = df[pd.to_numeric(df['InvoiceNo'], errors='coerce').notnull()]
    df['InvoiceNo'] = df['InvoiceNo'].astype(int)
    df = df[pd.to_numeric(df['StockCode'], errors='coerce').notnull()]
    df['StockCode'] = df['StockCode'].astype(int)
    df = df[pd.to_numeric(df['Quantity'], errors='coerce').notnull()]
    df['Quantity'] = df['Quantity'].astype(int)

    return df


In [20]:
df = pre_process_df(dataset= dataset)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,CustomerID
1,536365,71053,6,17850
5,536365,22752,2,17850
6,536365,21730,6,17850
7,536366,22633,6,17850
8,536366,22632,6,17850


In [21]:
min_support = 0.01
min_confidence = 0.5
# drop customer id column
# for quantitiy if(quantitiy > 0) then 1 else 0
df['Quantity'] = df['Quantity'].apply(lambda x: 1 if x >= 1 else 0)

transaction_data = df.pivot_table(index='InvoiceNo', columns='StockCode', values='Quantity', fill_value=0)
frequent_itemsets = fpgrowth(transaction_data, min_support=min_support, use_colnames=True)




In [22]:
def get_item_suggestions(customer_id, num_suggestions=5):
    user_transactions = df[(df['CustomerID'] == customer_id) & (df['Quantity'] > 0)]['StockCode']
    user_items = list(user_transactions.unique())
    
    suggestions = []
    for item in user_items:
        related_items = frequent_itemsets[frequent_itemsets['itemsets'].apply(lambda x: item in x)]
        suggestions.extend(related_items['itemsets'])
    
    suggestions = pd.Series(suggestions).value_counts().head(num_suggestions)
    return suggestions


In [23]:
# Get item suggestions for a specific CustomerID
customer_id = 17850  # Replace with a valid CustomerID
suggestions = get_item_suggestions(customer_id)
print("Item suggestions for CustomerID:", customer_id)
print(suggestions)

Item suggestions for CustomerID: 17850
(82482, 82483)    2
(82483, 82486)    2
(82482, 82486)    2
(82486)           1
(21928, 22411)    1
Name: count, dtype: int64


In [None]:
# InvoiceNo: A unique number that identifies the invoice1.
# StockCode: A code that represents the product or service sold2.
# Description: A brief description of the product or service sold3.
# Quantity: The number of units of the product or service sold3.
# InvoiceDate: The date when the invoice was issued1.
# UnitPrice: The price per unit of the product or service sold3.
# CustomerID: A unique number that identifies the customer who bought the product or service2.
# Country: The country where the customer is located2.