In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules


# New Section

In [5]:
# Load the online-retail transactions workbook into a DataFrame
data1 = pd.read_excel('Online_Retail.xlsx')
# Peek at the first rows and the column labels
data1.head()
data1.columns
# Distinct country values; as the cell's last expression this is what gets shown
data1['Country'].unique()


array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [6]:
# Trim leading/trailing whitespace from the product descriptions
data1['Description'] = data1['Description'].str.strip()

# Discard rows that have no invoice number, then treat invoice numbers as text
data1 = data1.dropna(axis=0, subset=['InvoiceNo'])
data1 = data1.assign(InvoiceNo=data1['InvoiceNo'].astype(str))

# Keep only invoices whose number does not contain 'C' (credit transactions)
data1 = data1.loc[~data1['InvoiceNo'].str.contains('C')]


In [7]:
# USA basket: one row per invoice, one column per product description,
# each cell holding the summed quantity (0 where the pair never occurs).
basket_USA = (
    data1.loc[data1['Country'].eq("USA")]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [8]:
# One-hot encoding helper: turns a per-cell quantity into a 0/1 presence flag
# so the basket matrix is suitable for the frequent-pattern libraries.
def hot_encode(x):
    """Map a quantity to a binary presence flag.

    Parameters
    ----------
    x : number
        Quantity found in one cell of the basket matrix.

    Returns
    -------
    int
        1 when ``x`` is strictly positive, otherwise 0.

    Notes
    -----
    A single ``x > 0`` test is used so that every input yields 0 or 1.
    Checking ``x <= 0`` and ``x >= 1`` separately leaves values strictly
    between 0 and 1 (and NaN) unmatched, which makes the function return
    ``None``. NaN compares false against everything and therefore maps to 0.
    """
    return 1 if x > 0 else 0

In [10]:
# Encoding the datasets
# DataFrame.applymap is deprecated since pandas 2.1 in favour of the
# element-wise DataFrame.map; the attribute is looked up on the class (not the
# instance) so a column that happens to be called "map" cannot be mistaken
# for the method. Older pandas versions fall back to applymap.
if hasattr(pd.DataFrame, "map"):
    basket_encoded = basket_USA.map(hot_encode)
else:
    basket_encoded = basket_USA.applymap(hot_encode)
basket_USA = basket_encoded

In [None]:
# Frequent itemsets in the encoded USA basket (minimum support 0.01),
# labelled with the product names rather than column positions
frq_items = apriori(basket_USA, use_colnames=True, min_support=0.01)

# Association rules scored on lift (threshold 1), ordered by confidence
# and then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())