In [1]:
#https://github.com/siddiquiamir/PyCaret/blob/main/Clustering.ipynb
#https://www.youtube.com/watch?v=XYAGwts5qGw

In [None]:
'''Market Basket Analysis (MBA) is a data mining technique used to uncover purchase patterns in customer transactions.
For example, if a customer buys bread and butter, they are also likely to buy jam.'''

In [5]:
import pycaret
from pycaret.datasets import get_data

In [7]:
# Load the "germany" dataset through pycaret.datasets.get_data.
# Original author's note: listed for Association Rule Mining, with
# InvoiceNo and Description as the relevant columns.
dataset = get_data("germany")

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536527,22809,SET OF 6 T-LIGHTS SANTA,6,12/1/2010 13:04,2.95,12662,Germany
1,536527,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,6,12/1/2010 13:04,2.55,12662,Germany
2,536527,84945,MULTI COLOUR SILVER T-LIGHT HOLDER,12,12/1/2010 13:04,0.85,12662,Germany
3,536527,22242,5 HOOK HANGER MAGIC TOADSTOOL,12,12/1/2010 13:04,1.65,12662,Germany
4,536527,22244,3 HOOK HANGER MAGIC GARDEN,12,12/1/2010 13:04,1.95,12662,Germany


In [8]:
# Show the column labels of the loaded transactions frame.
dataset.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [10]:
# Preview the first two transaction rows.
dataset.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536527,22809,SET OF 6 T-LIGHTS SANTA,6,12/1/2010 13:04,2.95,12662,Germany
1,536527,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,6,12/1/2010 13:04,2.55,12662,Germany


In [11]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [14]:
# Work on an independent copy: a plain `df = dataset` would make both names
# refer to the same object, so any change made through `df` would also
# alter `dataset`.
df = dataset.copy()

In [16]:

# Step 1: Basic cleaning
# Every step returns a new frame instead of editing in place, so the object
# `df` was bound to before this cell is left untouched and the cell can be
# re-run safely. Order matters: strip first, then drop missing values, then
# cast InvoiceNo to str (casting earlier would turn NaN into the text 'nan').
df = (
    df.assign(Description=lambda frame: frame['Description'].str.strip())
    .dropna(subset=['InvoiceNo', 'Description'])
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype(str))
)
# Remove cancellations: drop invoices whose number contains a literal 'C'.
df = df[~df['InvoiceNo'].str.contains('C', regex=False)]

# Step 2: Filter only Germany
germany_df = df[df['Country'] == 'Germany']


In [18]:
# Sanity check: list any rows with a negative Quantity (recorded output is empty).
df[df["Quantity"]<0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [19]:
# Sanity check: list any rows with a negative UnitPrice (recorded output is empty).
df[df["UnitPrice"]<0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [None]:
# Create Basket Format (One-hot encoded)

In [21]:
# Pivot to an invoice x item matrix of summed quantities.
# Invoice/item pairs that never occur are filled with 0.
basket = (
    germany_df.groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack(fill_value=0)
)

# Encode presence: True if the invoice contains the item (Quantity > 0), else False.
# A vectorized comparison replaces the per-cell `applymap` lambda (deprecated in
# recent pandas) and yields the boolean matrix that mlxtend's apriori prefers.
# NOTE(review): the mined rules include 'POSTAGE', which looks like a shipping
# charge rather than a product; consider excluding that column. TODO confirm.
basket = basket > 0


In [None]:
'''CustomerID → not needed
Country → already filtered
InvoiceDate → not used in MBA
UnitPrice → not used for frequent pattern mining
unstack: converts item names (Description) into columns
'''

In [25]:
# Run Apriori on the basket matrix, keeping itemsets with support of at
# least 0.03 and labelling them by item name instead of column index.
frequent_itemsets = apriori(
    basket,
    min_support=0.03,
    use_colnames=True,
)

In [34]:
# Derive association rules (metric: lift, minimum threshold 1) and keep only
# the columns of interest.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
).loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]

# Display up to 17 rules, ordered from highest to lowest lift.
display(rules.sort_values('lift', ascending=False).head(17))


Unnamed: 0,antecedents,consequents,support,confidence,lift
296,(SPACEBOY CHILDRENS BOWL),(SPACEBOY CHILDRENS CUP),0.037199,0.894737,20.444737
297,(SPACEBOY CHILDRENS CUP),(SPACEBOY CHILDRENS BOWL),0.037199,0.85,20.444737
615,(SPACEBOY CHILDRENS CUP),"(POSTAGE, SPACEBOY CHILDRENS BOWL)",0.035011,0.8,20.311111
610,"(POSTAGE, SPACEBOY CHILDRENS BOWL)",(SPACEBOY CHILDRENS CUP),0.035011,0.888889,20.311111
611,"(POSTAGE, SPACEBOY CHILDRENS CUP)",(SPACEBOY CHILDRENS BOWL),0.035011,0.842105,20.254848
614,(SPACEBOY CHILDRENS BOWL),"(POSTAGE, SPACEBOY CHILDRENS CUP)",0.035011,0.842105,20.254848
358,(CHILDRENS CUTLERY SPACEBOY),"(POSTAGE, CHILDRENS CUTLERY DOLLY GIRL)",0.037199,0.772727,16.816017
355,"(POSTAGE, CHILDRENS CUTLERY DOLLY GIRL)",(CHILDRENS CUTLERY SPACEBOY),0.037199,0.809524,16.816017
41,(CHILDRENS CUTLERY DOLLY GIRL),(CHILDRENS CUTLERY SPACEBOY),0.039387,0.782609,16.256917
40,(CHILDRENS CUTLERY SPACEBOY),(CHILDRENS CUTLERY DOLLY GIRL),0.039387,0.818182,16.256917


In [None]:
'''If you want exact matches only, use:
lambda x: product in x

If you want partial (substring) matches (recommended), use:
lambda x: any(product in i for i in x)'''

In [61]:
import pickle

In [62]:
# Persist the rules table to disk as a pickle file.
with open('model_MBA.pkl', mode='wb') as rules_out:
    pickle.dump(rules, rules_out)

In [64]:
import pickle

# Read the rules table back from model_MBA.pkl.
# Note: unpickling can execute arbitrary code, so only load files you trust.
with open('model_MBA.pkl', mode='rb') as rules_in:
    rules = pickle.load(rules_in)
