In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

In [2]:
data = pd.read_csv('Raw Database.csv')
data.size

4335272

In [3]:
# Stripping extra spaces in the description
data['Description'] = data['Description'].str.strip()

# Dropping all transactions which were done on credit
data = data[~data['InvoiceNo'].str.contains('C')]
data.size

4260968

In [4]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/01/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/01/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/01/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/01/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/01/10 8:26,3.39,17850.0,United Kingdom


In [5]:
data['StockCode'] = data['StockCode'].str.upper()
data = data[
    data['Description'].str.upper() == data['Description']
]
data = data[~data.duplicated(subset=['StockCode'])]
data.size

30256

In [15]:
# Transactions done in France
basket_France = (data[data['Country'] =="France"]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('InvoiceNo'))

# Transactions done in the United Kingdom
basket_UK = (data[data['Country'] =="United Kingdom"]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('InvoiceNo'))

# Transactions done in Portugal
basket_Por = (data[data['Country'] =="Portugal"]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('InvoiceNo'))

basket_Sweden = (data[data['Country'] =="Sweden"]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('InvoiceNo'))

In [19]:
# Defining the hot encoding function to make the data suitable
# for the concerned libraries

# Encoding the datasets

basket_France = pd.get_dummies(basket_France).astype(bool)
basket_UK = pd.get_dummies(basket_UK).astype(bool)
basket_Por = pd.get_dummies(basket_Por).astype(bool)
basket_Sweden = pd.get_dummies(basket_Sweden).astype(bool)

In [7]:
# Building the model
frq_items_France = apriori(basket_France, min_support = 0.01, use_colnames = True)
rules_France = association_rules(frq_items_France, metric ="lift", min_threshold = 1)
rules_France = rules_France.sort_values(['confidence', 'lift'], ascending =[False, False])

In [8]:
frq_items_UK = apriori(basket_UK, min_support = 0.02, use_colnames = True)
rules_UK = association_rules(frq_items_UK, metric ="lift", min_threshold = 1)
rules_UK = rules_UK.sort_values(['confidence', 'lift'], ascending =[False, False])

In [9]:
frq_items_Por = apriori(basket_Por, min_support = 0.05, use_colnames = True)
rules_Por = association_rules(frq_items_Por, metric ="lift", min_threshold = 1)
rules_Por = rules_Por.sort_values(['confidence', 'lift'], ascending =[False, False])

In [20]:
frq_items_Sweden = apriori(basket_Sweden, min_support = 0.05, use_colnames = True)
rules_Sweden = association_rules(frq_items_Sweden, metric ="lift", min_threshold = 1)
rules_Sweden = rules_Sweden.sort_values(['confidence', 'lift'], ascending =[False, False])

In [11]:
def predict(antecedent, rules, max_results=6):
    
    # get the rules for this antecedent
    preds = rules[rules['antecedents'] == antecedent]
    
    # a way to convert a frozen set with one element to string
    preds = preds['consequents'].apply(iter).apply(next)
    
    return preds.iloc[:max_results]

In [9]:
rules_Sweden.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
1,(PACK OF 72 SKULL CAKE CASES),(12 PENCILS SMALL TUBE SKULL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
4,(36 DOILIES DOLLY GIRL),(ASSORTED BOTTLE TOP MAGNETS),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
5,(ASSORTED BOTTLE TOP MAGNETS),(36 DOILIES DOLLY GIRL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
180,(CHILDRENS CUTLERY CIRCUS PARADE),(CHILDRENS CUTLERY DOLLY GIRL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf


In [12]:
preds = predict({'CHILDRENS CUTLERY CIRCUS PARADE'}, rules=rules_Sweden)
preds

180        CHILDRENS CUTLERY DOLLY GIRL
2143       CHILDRENS CUTLERY DOLLY GIRL
2149                            POSTAGE
2155       CHILDRENS CUTLERY DOLLY GIRL
2167    CHILDRENS CUTLERY POLKADOT PINK
9372                            POSTAGE
Name: consequents, dtype: object

In [18]:
# Transactions done in France
basket_France = (data[data['Country'] =="France"]
        .groupby(['InvoiceNo', 'StockCode'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('InvoiceNo'))

# Transactions done in the United Kingdom
basket_UK = (data[data['Country'] =="United Kingdom"]
        .groupby(['InvoiceNo', 'StockCode'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('InvoiceNo'))

# Transactions done in Portugal
basket_Por = (data[data['Country'] =="Portugal"]
        .groupby(['InvoiceNo', 'StockCode'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('InvoiceNo'))

basket_Sweden = (data[data['Country'] =="Sweden"]
        .groupby(['InvoiceNo', 'StockCode'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('InvoiceNo'))