<h2>Association Rules: Market Basket Analysis</h2>

In [None]:
import pandas as pd
# Load the transactions; header=None means the first line of the file is data,
# so pandas assigns integer column labels
csv_path = 'Market_Basket.csv'
data = pd.read_csv(csv_path, header=None)
# Preview the first 20 rows
data.head(n=20)

In [None]:
# Report the dataset dimensions as (number of rows, number of columns)
n_rows, n_cols = data.shape
print((n_rows, n_cols))

In [None]:
# Print a concise summary of the frame: index range, columns,
# non-null count and dtype of each column, memory usage
data.info()
# Note: columns reported with dtype 'object' hold generic Python objects
# (e.g. strings or timestamps)

In [None]:
# Descriptive statistics of the dataset. How to read the rows of the summary:
#   count  -> number of non-NaN values in the column
#   unique -> number of different values
#   top    -> most common value
#   freq   -> how many times the most common value occurs
summary_stats = data.describe()
summary_stats

<b>Exploratory data analysis</b>

In [None]:
# Get the most demanded (purchased) items in dataset (looking at the top 30)

# Flatten the transaction table into one list holding every cell, in row-major
# order (all the columns of row 0, then row 1, ...). Missing cells are kept in the
# list as NaN, exactly as they appear in the frame.
all_items = data.values.ravel().tolist()
# Show only the size and a short preview: printing the whole list would flood
# the notebook output with one entry per cell of the table
print(len(all_items), "cells in total; first 20:", all_items[:20])

In [None]:
# Compute the number of distinct items in the set of transactions
from collections import Counter
# Missing cells (NaN) are skipped, otherwise they would be counted as an item
item_counter = Counter(item for item in all_items if pd.notna(item))
print("No of distinct items:", len(item_counter))

In [None]:
# Frequency of each item, most frequent first (missing values are not counted)
item_counts = pd.Series(all_items).value_counts()
print(item_counts)
# To include the missing values in the tally, pass dropna=False:
#print(pd.Series(all_items).value_counts(dropna=False))

In [None]:
# Record the result into a dataframe (columns "Item" and "Count") for visualization purposes.
# The column names are set explicitly through rename_axis / reset_index(name=...) rather than
# by renaming the default labels, because those defaults differ between pandas versions
# ("index"/0 before 2.0, "index"/"count" from 2.0 on).
df_all_items_frequency = (
    pd.Series(all_items)
    .value_counts()
    .rename_axis("Item")
    .reset_index(name="Count")
)
df_all_items_frequency.head(20)

In [None]:
# Create nicer visualization (1): the 30 most frequent items, shaded with a blue gradient
top30_table = df_all_items_frequency.head(30)
top30_table.style.background_gradient(cmap='Blues')

In [None]:
# Create nicer visualization (2): treemap of the 30 most frequent items
import plotly.express as px
# A constant "Category" value puts all the items under the same parent in the hierarchy
df_all_items_frequency["Category"] = "All"
top30_items = df_all_items_frequency.head(30)
fig = px.treemap(
    top30_items,
    path=['Category', 'Item'],
    values='Count',
    color=top30_items["Count"],
    hover_data=['Item'],
    color_continuous_scale='Blues',
)
fig.show()

In [None]:
# Load a second table to show how a treemap can be used for data organized into hierarchies
category_csv_path = 'Items_Category.csv'
data_temp = pd.read_csv(category_csv_path)
data_temp.head()

In [None]:
# Treemap of the first 30 rows, nested by Category and then Item, sized and colored by Count
first30_rows = data_temp.head(30)
fig = px.treemap(
    first30_rows,
    path=['Category', 'Item'],
    values='Count',
    color=first30_rows["Count"],
    hover_data=['Item'],
    color_continuous_scale='Blues',
)
fig.show()

<b>Association rules extraction</b>

In [None]:
# The Apriori algorithm receives in input a 0–1 matrix, where rows are transactions and columns are items
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,association_rules

# Convert the original dataset into an array of lists: one list of strings per
# transaction (row), one entry per column; missing cells become the string 'nan'
list_item = [[str(cell) for cell in row] for row in data.values]
arr_list_item = np.array(list_item)
arr_list_item


#df_all_items = pd.DataFrame(all_items, columns=['Item'])
#df_all_items.to_csv("all_items.csv")


In [None]:
# Encode the transactions as a present/absent table:
# one row per transaction, one boolean column per distinct item
te = TransactionEncoder()
te.fit(arr_list_item)
te_ary = te.transform(arr_list_item)
df_transaction = pd.DataFrame(data=te_ary, columns=te.columns_)
df_transaction.head()

In [None]:
# Remove the pseudo-item 'nan' (the string form of the missing cells of the original table).
# errors='ignore' keeps this cell re-runnable: on a second execution the column is
# already gone and a plain drop would raise KeyError.
df_transaction = df_transaction.drop(columns='nan', errors='ignore')
df_transaction.head()

In [None]:
# Transform the dataframe of True-False values into binary values (False -> 0, True -> 1).
# An explicit integer cast gives a well-defined dtype and builds a new frame instead of
# mutating in place, so the cell can be re-run safely.
df_transaction = df_transaction.astype(int)
df_transaction.head()

In [None]:
# Computations might be expensive, so the analysis is restricted to the
# 50 most frequent items; show them here
df_all_items_frequency.iloc[:50]

In [None]:
# Names of the 50 most frequent items
list_top50_items = df_all_items_frequency['Item'].head(50).tolist()
# Keep only the transaction columns that correspond to one of those items
top50_columns = df_transaction.columns.intersection(list_top50_items)
df_transaction_reduced = df_transaction[top50_columns]
df_transaction_reduced

In [None]:
# Extract the frequent itemsets (minimum support 0.01), labelled with item names.
# The frame is cast to bool before the call only to suppress a warning.
bool_transactions = df_transaction_reduced.astype('bool')
frequent_itemsets = apriori(bool_transactions, min_support=0.01, use_colnames=True)
frequent_itemsets

In [None]:
# Derive the association rules from the frequent itemsets, using lift as the
# selection metric with threshold 1.2, and show them from highest to lowest lift
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules.sort_values(by="lift", ascending=False)

In [None]:
# Change of perspective: rank the same rules from highest to lowest confidence
rules.sort_values(by="confidence", ascending=False)

In [None]:
#...or refine the analysis
# The most demanded item, i.e. mineral water, may attract too much "attention"...
# Let's remove all the rules containing it and see what happens.
# Antecedents/consequents are tested with explicit set membership instead of the
# string accessor (.str), which is meant for text values rather than item sets.
excluded_item = "mineral water"
in_antecedents = rules["antecedents"].apply(lambda items: excluded_item in items).astype(bool)
in_consequents = rules["consequents"].apply(lambda items: excluded_item in items).astype(bool)
rules[~in_consequents & ~in_antecedents].sort_values("confidence", ascending=False)

<b>Useful exercise: Repeat the analysis by introducing item hierarchies (e.g. at the category level)</b>