# 2020: Week 15

This week is all about [market basket analysis](https://preppindata.blogspot.com/2020/04/2020-week-15.html), something I haven't done before. 

Task: Calculate three key metrics - support, confidence, and lift. These metrics are calculated for individual items and for association rules between items.

In [2]:
## Setup
import pandas as pd

## Load, clean and calculate

In [3]:
# Load data
df = pd.read_excel(".\\inputs\\Transactions.xlsx")
print("raw data")
print(df)
print("")

#get unique list of items
items = ', '.join(map(str, df["Items"])).split(", ")
items = list (set (items))
print("unique items")
print(items)
print("")

#create individual rows
individual = df.copy()
individual["Items"] = individual["Items"].str.split(", ")
tags = individual.apply(lambda x: pd.Series(x['Items']),axis=1).rename(columns={0:"item_1", 1:"item_2", 2:"item_3"})
individual = pd.merge(individual, tags, left_index = True, right_index= True).drop("Items", axis=1)
individual = individual.melt(id_vars=["TransactionID"],value_name="Item").drop("variable", axis=1).dropna()
individual = individual.sort_values("TransactionID").reset_index(drop=True)

#total appearances and support
support = pd.DataFrame(individual["Item"].value_counts())
support = support.reset_index().rename(columns={"index":"Items","Item":"Appearance"})
support["Support"] = support["Appearance"] / df["TransactionID"].nunique()
print("support and appearance of each item")
print(support)
print("")

#get combinations of items
s=[]
for i in range(len(items)):
    for b in range(len(items)):
        x = items[i] + " > " + items[b]
        s.append(x)
results = pd.DataFrame(s, columns=["AssociationRule"])
results = results.loc[results["AssociationRule"].str.split(" > ").str[0] != results["AssociationRule"].str.split(" > ").str[1]]
results = results.reset_index(drop=True)
results["LHSItem"] = results["AssociationRule"].str.split(" > ").str[0]
results["RHSItem"] = results["AssociationRule"].str.split(" > ").str[1]

#merge with support data
results = pd.merge(results, support, left_on="LHSItem", right_on="Items", how="left").drop(["Items"], axis=1).rename(columns={"Support":"LHS Support", "Appearance":"LHS Appearance"})
results = pd.merge(results, support, left_on="RHSItem", right_on="Items", how="left").drop(["Items", "Appearance"], axis=1).rename(columns={"Support":"RHS Support"})

#calculate set appearance 
results["Set Appearance"] = [df.loc[(df["Items"].str.contains(results["LHSItem"][i])) & 
           (df["Items"].str.contains(results["RHSItem"][i])) ,
            "TransactionID" ].count() for i in range(len(results))]

#remove sets with no appearance
results = results.loc[results["Set Appearance"] > 0].reset_index(drop=True)

#calculate confidence
results["Confidence"] = results["Set Appearance"] / results["LHS Appearance"]

#calculate lift
results["lift"] = (results["Set Appearance"] / df["TransactionID"].nunique()) / (results["LHS Support"] * results["RHS Support"])

print(results)

raw data
  TransactionID                                 Items
0          x001                  razors, shaving soap
1          x002                moisturiser, hand soap
2          x003     bath bomb, hand soap, moisturiser
3          x004  hand soap, moisturiser, shaving soap
4          x005                   razors, moisturiser

unique items
['razors', 'hand soap', 'moisturiser', 'bath bomb', 'shaving soap']

support and appearance of each item
          Items  Appearance  Support
0   moisturiser           4      0.8
1     hand soap           3      0.6
2  shaving soap           2      0.4
3        razors           2      0.4
4     bath bomb           1      0.2

               AssociationRule       LHSItem       RHSItem  LHS Appearance  \
0         razors > moisturiser        razors   moisturiser               2   
1        razors > shaving soap        razors  shaving soap               2   
2      hand soap > moisturiser     hand soap   moisturiser               3   
3        hand

## Export

In [4]:
results.drop(["LHS Appearance", "Set Appearance"], axis=1).to_csv(".\\outputs\\2020-15_data-preppin-output.csv", index=False)