In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("../Data/supermarket.csv")

In [3]:
data

Unnamed: 0,Customer Id,Date,Product
0,JKML5ZWXP0XQ,2020-07-20,tropical fruit
1,YOGWQIUPDAJ7,2020-04-30,whole milk
2,Q3NL22281AF6,2020-09-18,pip fruit
3,C59GRTLMZQAU,2020-12-11,other vegetables
4,WUSF8LTEQZ5L,2020-01-02,whole milk
...,...,...,...
32705,PM1BQU9H9S2Q,2019-06-05,oil
32706,6AWIAP3ZV551,2019-08-10,sliced cheese
32707,LRWVFLSC8X5B,2019-02-23,candy
32708,8KYRMELYKNIH,2019-04-16,cake bar


## Q1

In [4]:
number_of_unique_products = len(data.Product.unique())
number_of_unique_products

137

## Q2

In [5]:
average_sales_per_day = round(data.Date.value_counts().mean(), 2)
average_sales_per_day

44.93

## Q3

In [6]:
lessـcommonـproducts = data.Product.value_counts().tail(4)
lessـcommonـproducts

Product
bags                     4
baby cosmetics           3
kitchen utensil          1
preservation products    1
Name: count, dtype: int64

## Q4

In [7]:
data["year"] = data.Date.apply(lambda item: item.split("-")[0])
customers_with_most_baskets = data[data.year=="2020"].groupby("Customer Id").apply(lambda group: len(group.groupby("Date"))).sort_values(ascending=False).head(5)
customers_with_most_baskets

Customer Id
M722MPITBY07    7
E7FD9BU4OK0F    7
0ZRF32AJ06BC    7
M2DT2Z00NQCH    7
QBUS9NZY39BE    7
dtype: int64

## Q5

In [8]:
best_selling_day_ofـweek = data.Date.apply(lambda date: pd.to_datetime(date).strftime("%A")).value_counts().idxmax()
best_selling_day_ofـweek

'Wednesday'

## Q6

In [9]:
table = pd.pivot_table(data, index=["Customer Id", "Date"], columns="Product", values="Product", aggfunc="size")

In [10]:
number_of_baskets_containing_product = table.apply(lambda column: column.sum())

In [11]:
mostـsupportedـproducts = number_of_baskets_containing_product.sort_values(ascending=False).head()
mostـsupportedـproducts

Product
whole milk          2363.0
other vegetables    1827.0
rolls/buns          1646.0
soda                1453.0
yogurt              1285.0
dtype: float64

## Q7

In [12]:
number_of_all_baskets = len(data.groupby(["Customer Id", "Date"]))

In [13]:
products_support = number_of_baskets_containing_product / number_of_all_baskets

In [14]:
def calculate_support(row):
    row["support"] =  len(table[(table[list(row["products"])] == 1).all(axis=1)]) / number_of_all_baskets
    if row["support"] >= 0.01:
        row["antecedent"] =  row["products"][0]
        row["consequent"] =  row["products"][1]
        row["antecedent support"] =  products_support[row["antecedent"]]
        row["confidence"] =  row["support"] / row["antecedent support"]

    return row

In [15]:
from itertools import permutations


grouped_products_support = pd.DataFrame(columns = ["products", "antecedent", "consequent", "antecedent support", "support", "confidence"])
grouped_products_support["products"] = list(permutations(products_support[products_support >= 0.01].index, 2))
grouped_products_support = grouped_products_support.apply(calculate_support, axis=1).dropna()
patterns_with_most_confidence = grouped_products_support.sort_values("confidence", ascending=False, ignore_index=True).head(2)
patterns_with_most_confidence

Unnamed: 0,products,antecedent,consequent,antecedent support,support,confidence
0,"(yogurt, whole milk)",yogurt,whole milk,0.087101,0.01132,0.129961
1,"(rolls/buns, whole milk)",rolls/buns,whole milk,0.111571,0.014167,0.126974


## Write the answers in a text file

In [16]:
with open("shopping_cart_analysis_outputs.txt", "w") as answers_file:
    answers_file.write(f"{number_of_unique_products}\n")
    answers_file.write(f"{average_sales_per_day}\n")
    
    answers_file.write(",".join(lessـcommonـproducts.index))
    answers_file.write("\n")
    
    answers_file.write(",".join(customers_with_most_baskets.index))
    answers_file.write("\n")
    
    answers_file.write(f"{best_selling_day_ofـweek}\n")

    answers_file.write(",".join(mostـsupportedـproducts.index))
    answers_file.write("\n")
    
    answers_file.write(f"(\"{patterns_with_most_confidence.antecedent[0]}\")->(\"{patterns_with_most_confidence.consequent[0]}\")|")
    answers_file.write(f"(\"{patterns_with_most_confidence.antecedent[1]}\")->(\"{patterns_with_most_confidence.consequent[1]}\")")