In [2]:
# IMPORTING libraries
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori, association_rules
"""This is a Horizontal format association rule and used BFS(breadth-first search).
It is used in frequent itemset and association rule. We can clearly understand the association of one item to other"""

'This is a Horizontal format association rule and used BFS(breadth-first search)'

In [3]:
# Reading DATA
# One relative path, defined once and actually used for the read, so the
# notebook does not depend on a single machine's home-directory layout.
file = '../DATA/Online_Retail.xlsx'
# NOTE(review): this read is slow; presumably that is due to the size of the
# workbook rather than the directory it lives in -- confirm.
data = pd.read_excel(file)



In [4]:
# Exploring the DATA

# Preview of the first rows, followed by the column labels and the distinct
# values found in the Country column.
print(data.head())
for heading, values in (
    ('\n\nCOlumns:', data.columns),
    ('\n\nCountries:', data['Country'].unique()),
):
    print(heading, values)

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


COlumns: Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')


Countries: ['United Kingdom' 'Fra

In [25]:
# DATA CLEANING

# The cleaned frame is built as one chain that returns a new object instead of
# mutating `data` in place. With in-place edits, re-running this cell operates
# on the filtered slice left by the previous run, which can raise pandas'
# SettingWithCopyWarning. The trailing .copy() makes the result an independent
# frame.
data = (
    data
    # Dropping the rows without any invoice number
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(
        # Stripping extra spaces in the description
        Description=lambda df: df['Description'].str.strip(),
        # Invoice numbers as strings, so they can be searched for the 'C' marker
        InvoiceNo=lambda df: df['InvoiceNo'].astype('str'),
    )
    # Dropping all transactions whose invoice number contains 'C'
    # (described in this notebook as transactions done on credit).
    # NOTE(review): check the dataset documentation -- a 'C' prefix may mark
    # cancellations rather than credit purchases.
    .loc[lambda df: ~df['InvoiceNo'].str.contains('C')]
    .copy()
)

In [26]:
# Splitting the data according to the region of transaction

def build_country_basket(transactions, country):
    """Return the invoice-by-item quantity table for a single country.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Transaction lines; the 'Country', 'InvoiceNo', 'Description' and
        'Quantity' columns are read.
    country : str
        Value of the 'Country' column whose rows are kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'InvoiceNo', one column per 'Description', holding the
        summed 'Quantity'. Invoice/item pairs that never occur are 0.
    """
    return (
        transactions[transactions['Country'] == country]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum()
        # unstack() already leaves 'InvoiceNo' as the index, so no
        # reset_index()/set_index() round trip is needed afterwards.
        .unstack()
        .fillna(0)
    )


# Transactions done in France
basket_France = build_country_basket(data, "France")

# Transactions done in the United Kingdom
basket_UK = build_country_basket(data, "United Kingdom")

# Transactions done in Portugal
basket_Por = build_country_basket(data, "Portugal")

# Transactions done in Sweden
basket_Sweden = build_country_basket(data, "Sweden")



In [35]:
# Hot encoding the Data

# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Map a basket quantity to a 0/1 purchase flag.

    Parameters
    ----------
    x : number
        Summed quantity of one item on one invoice.

    Returns
    -------
    int
        1 when ``x >= 1`` (the item is on the invoice), otherwise 0.

    Every input maps to 0 or 1. Values strictly between 0 and 1, and NaN,
    previously matched neither branch and came back as None; they now
    encode as 0.
    """
    # A single comparison covers every case: anything that is not >= 1
    # (zero, negatives, fractions below 1, NaN) is treated as "not bought".
    return 1 if x >= 1 else 0

# Encoding the datasets
# Every basket is replaced by its element-wise 0/1 encoding.
basket_France = basket_France.applymap(hot_encode)
basket_UK = basket_UK.applymap(hot_encode)
basket_Por = basket_Por.applymap(hot_encode)
basket_Sweden = basket_Sweden.applymap(hot_encode)

# `basket_encoded` stays bound to the last encoded frame (Sweden).
basket_encoded = basket_Sweden


In [38]:
# Building the models and analyzing the results FOR FRANCE

# Frequent itemsets of the encoded French baskets (support of at least 0.05).
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Rules with lift of at least 3, ordered by confidence and then lift,
# both descending.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=3)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())


                                           antecedents  \
130  (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...   
132  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...   
160  (SET/6 RED SPOTTY PAPER CUPS, POSTAGE, SET/20 ...   
162  (POSTAGE, SET/20 RED RETROSPOT PAPER NAPKINS, ...   
49                     (SET/6 RED SPOTTY PAPER PLATES)   

                         consequents  antecedent support  consequent support  \
130  (SET/6 RED SPOTTY PAPER PLATES)            0.102041            0.127551   
132    (SET/6 RED SPOTTY PAPER CUPS)            0.102041            0.137755   
160  (SET/6 RED SPOTTY PAPER PLATES)            0.084184            0.127551   
162    (SET/6 RED SPOTTY PAPER CUPS)            0.084184            0.137755   
49     (SET/6 RED SPOTTY PAPER CUPS)            0.127551            0.137755   

      support  confidence      lift  leverage  conviction  
130  0.099490    0.975000  7.644000  0.086474   34.897959  
132  0.099490    0.975000  7.077778  0.085433   34



In [7]:
# (rows, columns) of `data`.
# NOTE(review): this cell carries execution count 7 although it sits after the
# cleaning cell (count 25), so the recorded output presumably describes the
# data before cleaning -- re-run the notebook top to bottom to confirm.
data.shape

(541909, 8)