In [1]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict

In [2]:
dataset_location = r'C:\Users\USER\Desktop\Assignment\DM\Project Assignment I'
dataset_filename = r'JapanMenuItems.xlsx'
source = os.path.join(dataset_location,dataset_filename)

df = pd.read_excel(source)

# Part 1 - Load & Explore

In [3]:
df.head()

Unnamed: 0,California Roll,Salmon Nigiri,Tonkotsu Ramen,Chicken Teriyaki Bento,Edamame,Gyoza (Dumplings),Tempura (Shrimp),Green Tea Ice Cream,Mochi Ice Cream,Matcha Latte
0,1,0,1,0,1,0,1,0,0,0
1,0,0,1,0,0,0,1,0,1,0
2,0,1,0,1,0,1,0,0,1,0
3,1,1,0,1,1,1,0,0,1,0
4,1,1,1,1,1,1,0,1,1,0


In [23]:
df['Edamame']

0      1
1      0
2      0
3      1
4      1
      ..
194    1
195    0
196    1
197    0
198    1
Name: Edamame, Length: 199, dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   California Roll         199 non-null    int64
 1   Salmon Nigiri           199 non-null    int64
 2   Tonkotsu Ramen          199 non-null    int64
 3   Chicken Teriyaki Bento  199 non-null    int64
 4   Edamame                 199 non-null    int64
 5   Gyoza (Dumplings)       199 non-null    int64
 6   Tempura (Shrimp)        199 non-null    int64
 7   Green Tea Ice Cream     199 non-null    int64
 8   Mochi Ice Cream         199 non-null    int64
 9   Matcha Latte            199 non-null    int64
dtypes: int64(10)
memory usage: 15.7 KB


In [5]:
n_samples, n_features = df.shape
print("This dataset has {0} samples and {1} features".format(n_samples, n_features))

This dataset has 199 samples and 10 features


In [6]:
df.isnull().sum()

California Roll           0
Salmon Nigiri             0
Tonkotsu Ramen            0
Chicken Teriyaki Bento    0
Edamame                   0
Gyoza (Dumplings)         0
Tempura (Shrimp)          0
Green Tea Ice Cream       0
Mochi Ice Cream           0
Matcha Latte              0
dtype: int64

In [7]:
features = list(df.columns.values)
print("The Features :", features)

The Features : ['California Roll', 'Salmon Nigiri', 'Tonkotsu Ramen', 'Chicken Teriyaki Bento', 'Edamame', 'Gyoza (Dumplings)', 'Tempura (Shrimp)', 'Green Tea Ice Cream', 'Mochi Ice Cream', 'Matcha Latte']


In [8]:
print(len(features))

10


# Part 1 - Examine frequency

In [9]:
dfnp = df.to_numpy()

In [10]:
print(dfnp[0:5])

[[1 0 1 0 1 0 1 0 0 0]
 [0 0 1 0 0 0 1 0 1 0]
 [0 1 0 1 0 1 0 0 1 0]
 [1 1 0 1 1 1 0 0 1 0]
 [1 1 1 1 1 1 0 1 1 0]]


## Popular dishes

In [11]:
def calSales(dsnp,ds,col):
    sales = 0
    name = ds.iloc[:, col].name
    
    for i in dsnp:
        if i[col] == 1:
            sales += 1

    return name, sales

In [12]:
popular_dishes = {}
for i in range(n_features):
    name, sales = calSales(dfnp,df,i)
    popular_dishes[name] = sales

In [13]:
sorted_dishes = dict(sorted(popular_dishes.items(), key=lambda item: item[1], reverse=True))

print("Top sales list:")
for name, sales in sorted_dishes.items():
    print("The total sales of {0} is {1}.".format(name, sales))

Top sales list:
The total sales of Mochi Ice Cream is 114.
The total sales of Green Tea Ice Cream is 107.
The total sales of Tonkotsu Ramen is 105.
The total sales of Matcha Latte is 105.
The total sales of Edamame is 101.
The total sales of Chicken Teriyaki Bento is 96.
The total sales of Tempura (Shrimp) is 95.
The total sales of California Roll is 94.
The total sales of Salmon Nigiri is 87.
The total sales of Gyoza (Dumplings) is 86.


# Part 2 - Affinity analysis(Basket analysis) Using Support, confidence and lift

In [15]:
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in dfnp:
    for premise in range(len(features)):
        if sample[premise] == 0: 
            continue
        # Record that the premise was bought in another transaction
        num_occurences[premise] += 1
        for conclusion in range(len(features)):
            if premise == conclusion:  # It makes little sense to measure if X -> X.
                continue
            if sample[conclusion] == 1:
                # This person also bought the conclusion item
                valid_rules[(premise, conclusion)] += 1
            else:
                # This person bought the premise, but not the conclusion
                invalid_rules[(premise, conclusion)] += 1

support = defaultdict(float)
confidence = defaultdict(float)
lift = defaultdict(float)

for premise, conclusion in valid_rules.keys():
    support[(premise, conclusion)] =  valid_rules[(premise, conclusion)] / 199
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]
    lift[(premise, conclusion)] = (valid_rules[(premise, conclusion)] / len(features)) / ((num_occurences[premise] / len(features)) * (num_occurences[conclusion] / len(features)))


In [16]:
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person order {0} they will also order {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print(" - Lift: {0:.3f}".format(lift[(premise,conclusion)]))
    print("")

Rule: If a person order California Roll they will also order Tonkotsu Ramen
 - Confidence: 0.436
 - Support: 0.20603015075376885
 - Lift: 0.042

Rule: If a person order California Roll they will also order Edamame
 - Confidence: 0.532
 - Support: 0.25125628140703515
 - Lift: 0.053

Rule: If a person order California Roll they will also order Tempura (Shrimp)
 - Confidence: 0.511
 - Support: 0.24120603015075376
 - Lift: 0.054

Rule: If a person order Tonkotsu Ramen they will also order California Roll
 - Confidence: 0.390
 - Support: 0.20603015075376885
 - Lift: 0.042

Rule: If a person order Tonkotsu Ramen they will also order Edamame
 - Confidence: 0.448
 - Support: 0.23618090452261306
 - Lift: 0.044

Rule: If a person order Tonkotsu Ramen they will also order Tempura (Shrimp)
 - Confidence: 0.505
 - Support: 0.2663316582914573
 - Lift: 0.053

Rule: If a person order Edamame they will also order California Roll
 - Confidence: 0.495
 - Support: 0.25125628140703515
 - Lift: 0.053

Rule:

In [17]:
def print_rule(premise, conclusion, support, confidence, lift, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person order {0} they will also order {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print(" - Lift: {0:.3f}".format(lift[(premise,conclusion)]))
    print("")

In [18]:
premise = 8
conclusion = 2
print_rule(premise, conclusion, support, confidence, lift, features)

Rule: If a person order Mochi Ice Cream they will also order Tonkotsu Ramen
 - Confidence: 0.544
 - Support: 0.31155778894472363
 - Lift: 0.052



In [20]:
# Initialize a dictionary to store the top 3 confidence, support, and lift values for each food
top3_values = defaultdict(list)


# Iterate through the confidence dictionary to find the top 3 values based on confidence for each food
for premise, conclusion in confidence.keys():
    premise_name = features[premise]
    confidence_value = confidence[(premise, conclusion)]
    support_value = support[(premise, conclusion)]
    lift_value = lift[(premise, conclusion)]
    top3_values[premise_name].append((premise, conclusion, confidence_value, support_value, lift_value))

# Sort the values for each food based on confidence and keep only the top 3
for food, values_list in top3_values.items():
    values_list.sort(key=lambda x: x[2], reverse=True)  # Sort by confidence value
    top3_values[food] = values_list[:3]

# Convert the dictionary to a list for easy iteration
top3_values_list = [(food, data) for food, data in top3_values.items()]

# Print the list of top 3 confidence, support, and lift values for each food
for food, values_data in top3_values_list:
    print("Food:", food)
    for premise_idx, conclusion_idx, highest_confidence, support, lift in values_data:
        print(f" - Premise: {premise_idx}, Conclusion: {conclusion_idx}, Confidence: {highest_confidence:.2f}, " \
      f"Support: {support:.2f}, Lift: {lift:.2f}")


Food: California Roll
 - Premise: 0, Conclusion: 8, Confidence: 0.59, Support: 0.28, Lift: 0.05
 - Premise: 0, Conclusion: 7, Confidence: 0.56, Support: 0.27, Lift: 0.05
 - Premise: 0, Conclusion: 9, Confidence: 0.55, Support: 0.26, Lift: 0.05
Food: Tonkotsu Ramen
 - Premise: 2, Conclusion: 8, Confidence: 0.59, Support: 0.31, Lift: 0.05
 - Premise: 2, Conclusion: 7, Confidence: 0.52, Support: 0.28, Lift: 0.05
 - Premise: 2, Conclusion: 6, Confidence: 0.50, Support: 0.27, Lift: 0.05
Food: Edamame
 - Premise: 4, Conclusion: 8, Confidence: 0.57, Support: 0.29, Lift: 0.05
 - Premise: 4, Conclusion: 9, Confidence: 0.57, Support: 0.29, Lift: 0.05
 - Premise: 4, Conclusion: 7, Confidence: 0.55, Support: 0.28, Lift: 0.05
Food: Tempura (Shrimp)
 - Premise: 6, Conclusion: 2, Confidence: 0.56, Support: 0.27, Lift: 0.05
 - Premise: 6, Conclusion: 8, Confidence: 0.56, Support: 0.27, Lift: 0.05
 - Premise: 6, Conclusion: 9, Confidence: 0.53, Support: 0.25, Lift: 0.05
Food: Mochi Ice Cream
 - Premise