# Apriori Algorithm

In [9]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules 

In [10]:
# Load the raw transaction log. read_csv already returns a DataFrame,
# so wrapping the result in pd.DataFrame(...) is unnecessary.
df = pd.read_csv("Shopping_Mall.csv")

In [11]:
# Preview the first rows to check the columns and how the values are formatted.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6.0,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8.0,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [12]:
# (number of rows, number of columns) of the loaded table.
df.shape

(285174, 8)

In [13]:
# Column labels available for the analysis.
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [14]:
# Is there at least one missing value anywhere in the table?
df.isna().to_numpy().any()

True

In [15]:
# Count the missing values in each column.
df.isna().sum()

InvoiceNo          0
StockCode          0
Description     1079
Quantity           1
InvoiceDate        1
UnitPrice          1
CustomerID     81031
Country            1
dtype: int64

#### Data Preprocessing 

In [16]:
# Trim leading/trailing whitespace from the item descriptions, discard
# rows that have no invoice number, and store invoice numbers as strings
# so they can be pattern-matched below.
df = (
    df.assign(Description=df['Description'].str.strip())
      .dropna(axis=0, subset=['InvoiceNo'])
      .astype({'InvoiceNo': 'str'})
)

# Keep only invoices whose number does not contain 'C'
# (the transactions which were done on credit are dropped).
df = df.loc[~df['InvoiceNo'].str.contains('C')]

In [17]:
# List the distinct countries that appear in the transactions

df['Country'].unique()


array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Finland',
       'Austria', 'Bahrain', 'Israel', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', nan], dtype=object)

In [18]:
# Transactions done in France, reshaped into an invoice x item matrix:
# one row per InvoiceNo, one column per Description, each cell holding the
# summed Quantity (0 where the item does not appear on the invoice).
france_rows = df[df['Country'] == "France"]
quantity_per_item = france_rows.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
basket_France = quantity_per_item.unstack().fillna(0)

In [19]:
# Defining the hot encoding function to make the data suitable
# for frequent-itemset mining (each cell becomes a 0/1 indicator)

def hot_encode(x):
    """Map a basket quantity to a 0/1 purchase indicator.

    Parameters
    ----------
    x : number
        Summed quantity of one item on one invoice.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.

    Notes
    -----
    Every input yields 0 or 1. Values strictly between 0 and 1, and NaN
    (for which every comparison is False), fall through to 0 instead of
    implicitly returning ``None``.
    """
    return 1 if x >= 1 else 0

Suppose we want to analyze market trends in France.

In [20]:
# Applying one hot encoding

# Vectorised threshold instead of DataFrame.applymap (deprecated since
# pandas 2.1): a cell becomes 1 when the summed quantity is >= 1 and 0
# otherwise. For whole-number quantities this gives the same 0/1 values
# as applying hot_encode cell by cell, without a Python call per cell.
basket_encoded = (basket_France >= 1).astype('int64')
basket_France = basket_encoded

In [21]:
basket_France.head()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,18PC WOODEN CUTLERY SET DISPOSABLE,...,WRAP POPPIES DESIGN,WRAP RED APPLES,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Building the model

In [22]:
# Building the model: mine itemsets whose support in basket_France is at
# least 0.1, labelling them with the item names rather than column indices.
frq_items = apriori(basket_France, use_colnames=True, min_support=0.1)

# Collecting the inferred rules (lift threshold 1) in a dataframe, ranked
# by confidence first and lift second, both descending.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)


In [23]:
# Show the top-ranked rules (rules is already sorted by confidence, then lift).
# NOTE(review): print() gives a plain-text table that truncates long itemsets;
# ending the cell with a bare `rules.head()` would use the notebook's rich display.
print(rules.head()) 

                                          antecedents  \
4                   (PACK OF 72 RETROSPOT CAKE CASES)   
35                          (TEA PARTY BIRTHDAY CARD)   
20                    (RED TOADSTOOL LED NIGHT LIGHT)   
24               (ROUND SNACK BOXES SET OF4 WOODLAND)   
62  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...   

                        consequents  antecedent support  consequent support  \
4                         (POSTAGE)            0.118812            0.792079   
35                        (POSTAGE)            0.113861            0.792079   
20                        (POSTAGE)            0.168317            0.792079   
24                        (POSTAGE)            0.128713            0.792079   
62  (SET/6 RED SPOTTY PAPER PLATES)            0.113861            0.163366   

     support  confidence      lift  leverage  conviction  
4   0.118812    1.000000  1.262500  0.024703         inf  
35  0.113861    1.000000  1.262500  0.023674         inf  
20  0

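From the above output, it can be seen that paper cups and plates are bought together in France. Note that the other top-ranked rules shown all have POSTAGE as their consequent; POSTAGE appears on about 79% of the French invoices (consequent support 0.79), so those rules presumably reflect a shipping charge rather than a genuine product association.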