# Experiment 8: Apriori Algorithm

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import StandardScaler

## Import Dataset

Load the transaction data from the 'Online Retail.xlsx' file into a panda DataFrame.

In [2]:
df=pd.read_excel('Data\\Online Retail.xlsx')
df.head()

Unnamed: 0,InvoiceNo,StockCode,lower,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,white hanging heart t-light holder,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,white metal lantern,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,cream cupid hearts coat hanger,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,knitted union flag hot water bottle,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,red woolly hottie white heart.,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


## Data Cleaning 

Preprocess the data by removing extra spaces in the 'Description' column, dropping rows without invoice numbers, and filtering out credit transactions.
Create separate transaction baskets for each country of interest (France, United Kingdom, Portugal, and Sweden) by grouping the data based on 'Country', 'InvoiceNo', and 'Description' columns. Calculate the sum of 'Quantity' for each unique combination of 'InvoiceNo' and 'Description'. Reshape the resulting DataFrame to have 'InvoiceNo' as the index and each unique 'Description' as a column, representing the quantity of the corresponding item in the transaction.

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   lower        1816 non-null    object        
 3   Description  540455 non-null  object        
 4   Quantity     541909 non-null  int64         
 5   InvoiceDate  541909 non-null  datetime64[ns]
 6   UnitPrice    541909 non-null  float64       
 7   CustomerID   406829 non-null  float64       
 8   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 37.2+ MB


In [4]:
df.dropna(subset=["InvoiceNo"], axis=0,inplace=True)
df["Description"].str.replace("  "," ")

0          WHITE HANGING HEART T-LIGHT HOLDER
1                         WHITE METAL LANTERN
2              CREAM CUPID HEARTS COAT HANGER
3         KNITTED UNION FLAG HOT WATER BOTTLE
4              RED WOOLLY HOTTIE WHITE HEART.
                         ...                 
541904            PACK OF 20 SPACEBOY NAPKINS
541905           CHILDREN'S APRON DOLLY GIRL 
541906          CHILDRENS CUTLERY DOLLY GIRL 
541907        CHILDRENS CUTLERY CIRCUS PARADE
541908          BAKING SET 9 PIECE RETROSPOT 
Name: Description, Length: 541909, dtype: object

In [5]:
df["InvoiceNo"]=df["InvoiceNo"].astype(str)
df=df[~ df["InvoiceNo"].str.contains('C')]
len(df)

532621

In [6]:
def makeBasket(country):
           return (df[df['Country'] =="France"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))

In [7]:
FranceBasket=makeBasket("France")
UkBasket=makeBasket("United Kingdom")
PortugalBasket=makeBasket("Portugal")
SwedenBasket=makeBasket("Sweden")

## Applying Apriori Alogrithm

In [8]:
def hot_encode(x):
    if(x<= 0):
        return 0
    if(x>= 1):
        return 1

FranceBasket=FranceBasket.applymap(hot_encode)
UkBasket=UkBasket.applymap(hot_encode)
PortugalBasket=PortugalBasket.applymap(hot_encode)
SwedenBasket=SwedenBasket.applymap(hot_encode)

In [9]:
def getSortedRules(basket):
    frequency_items=apriori(FranceBasket,min_support=0.05, use_colnames=True)
    rules= association_rules(frequency_items, metric="lift",min_threshold=1)
    return rules.sort_values(['confidence','lift'],ascending=[False,False])

In [10]:
france_rules=getSortedRules(FranceBasket)
uk_rules=getSortedRules(UkBasket)
portugal_rules=getSortedRules(PortugalBasket)
sweden_rules=getSortedRules(SwedenBasket)



## Displaying Rules

In [11]:
france_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf,0.254144
258,"(PLASTERS IN TIN CIRCUS PARADE , RED TOADSTOOL...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf,0.247312
270,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf,0.247978
301,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959,0.967949
302,"(SET/20 RED RETROSPOT PAPER NAPKINS , SET/6 RE...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796,0.956294


In [12]:
uk_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf,0.254144
258,"(PLASTERS IN TIN CIRCUS PARADE , RED TOADSTOOL...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf,0.247312
270,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf,0.247978
301,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959,0.967949
302,"(SET/20 RED RETROSPOT PAPER NAPKINS , SET/6 RE...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796,0.956294


In [13]:
portugal_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf,0.254144
258,"(PLASTERS IN TIN CIRCUS PARADE , RED TOADSTOOL...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf,0.247312
270,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf,0.247978
301,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959,0.967949
302,"(SET/20 RED RETROSPOT PAPER NAPKINS , SET/6 RE...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796,0.956294


In [14]:
sweden_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf,0.254144
258,"(PLASTERS IN TIN CIRCUS PARADE , RED TOADSTOOL...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf,0.247312
270,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf,0.247978
301,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959,0.967949
302,"(SET/20 RED RETROSPOT PAPER NAPKINS , SET/6 RE...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796,0.956294


## Interpretation

On analyzing the above rules, it is found that boys’ and girls’ cutlery are paired together. This makes practical sense because when a parent goes shopping for cutlery for his/her children, he/she would want the product to be a little customized according to the kid’s wishes.