# Cafe Sales Analytics Projects

## Objectives
Fill the missing values in the cleaned Cafe Sales dataset.

### SETUP for Filling values

In [24]:
import pandas as pd
import numpy as np

In [25]:
# Load the cleaned (not yet filled) cafe sales CSV into `df`.
data_path = "../data/processed/cleaned_cafe_sales_without_filling.csv"
try:
    df = pd.read_csv(data_path)
except FileNotFoundError as err:
    # Fail loudly: every later cell needs `df`, so merely printing would only
    # defer the failure to a confusing NameError (or silently reuse a stale
    # `df` still living in the kernel).
    raise FileNotFoundError(f"ERROR : File Not Found: {data_path}") from err

In [26]:
# Work on a copy so the loaded frame `df` keeps its values as read from disk.
fill_df = df.copy()
# Use the full option name: pandas resolves abbreviated option names by
# pattern matching, which can break if a future version adds another option
# matching the same pattern. None removes the row limit when displaying frames.
pd.set_option("display.max_rows", None)

In [27]:
# Display the column labels of the working copy.
fill_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

### Fill missing values in 'Item' Using Price per Unit
Missing item names were inferred from the 'Price Per Unit' column alone, and only when a price per unit belongs to exactly one item.
Items that share a price with another item were left out to avoid assumptions.

The number of missing values in the 'Item' column dropped from 969 to 480, i.e. a total of 489 item names were inferred. They were inferred first from the 'Price Per Unit' column (969 → 501 missing), and then from a price per unit computed by dividing 'Total Spent' by 'Quantity' (501 → 480 missing).

In [28]:
# Number of missing 'Item' values in the loaded frame, before any filling.
print(df["Item"].isna().sum())

969


In [29]:
# Keep only the rows whose item name is present; they are the reference
# data used to learn which price belongs to which item.
item_known = fill_df.dropna(subset=["Item"])

In [30]:
# For each distinct price per unit, count how many different item names
# are sold at that price.
price_count = (
    item_known
    .groupby("Price Per Unit")["Item"]
    .agg("nunique")
)
print(price_count)

Price Per Unit
1.0    1
1.5    1
2.0    1
3.0    2
4.0    2
5.0    1
Name: Item, dtype: int64


In [31]:
# A price is "safe" when exactly one item name is sold at it (count == 1),
# so the item can be inferred from the price without ambiguity.
safe_price = price_count.loc[price_count.eq(1)].index
safe_price

Index([1.0, 1.5, 2.0, 5.0], dtype='float64', name='Price Per Unit')

In [32]:
# Build the price -> item name lookup, restricted to the safe prices.
# One row per price is enough, since each safe price has a single item.
price_to_item = (
    item_known.loc[item_known["Price Per Unit"].isin(safe_price)]
    .drop_duplicates(subset="Price Per Unit")
    .set_index("Price Per Unit")["Item"]
)
print(price_to_item)

Price Per Unit
2.0    Coffee
1.0    Cookie
5.0     Salad
1.5       Tea
Name: Item, dtype: object


In [33]:
# Rows that still lack an item name.
mask = fill_df["Item"].isnull()
# Translate the price of each such row into an item name via the lookup;
# prices absent from the lookup map to NaN, so those rows stay missing.
fill_df.loc[mask, "Item"] = (
    fill_df.loc[mask, "Price Per Unit"]
    .map(price_to_item)
)

In [34]:
print(fill_df["Item"].isnull().sum())

501


### Fill missing values in 'Item' Using Total Spent and Quantity

Missing item names were inferred by dividing the 'Total Spent' column by the 'Quantity' column to recover the price per unit, again only when that price belongs to exactly one item.

In [35]:
# Temporary helper column: unit price recomputed as total spent / quantity,
# rounded to 2 decimals.
fill_df["Computed_price"] = fill_df["Total Spent"].div(fill_df["Quantity"]).round(2)
fill_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date', 'Computed_price'],
      dtype='object')

In [36]:
# Rows whose recomputed price is one of the safe prices and whose item name
# is still missing.
mask = fill_df["Computed_price"].isin(safe_price) & fill_df["Item"].isna()
# Fill those rows by looking the recomputed price up in the price -> item table.
fill_df.loc[mask, "Item"] = fill_df.loc[mask, "Computed_price"].map(price_to_item)

In [37]:
# Remove the temporary helper column. Rebinding the name instead of using
# inplace=True, together with errors="ignore", keeps this cell safe to re-run:
# a second run no longer raises KeyError for the already-dropped column.
fill_df = fill_df.drop(columns="Computed_price", errors="ignore")
fill_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

In [38]:
# Number of 'Item' values still missing after both inference passes.
print(fill_df["Item"].isna().sum())

480


In [39]:
# Preview only the first rows: with display.max_rows set to None a bare frame
# would render every row into the notebook output.
fill_df[["Item", "Price Per Unit"]].head(10)

Unnamed: 0,Item,Price Per Unit
0,Coffee,2.0
1,Cake,3.0
2,Cookie,1.0
3,Salad,5.0
4,Coffee,2.0
5,Smoothie,4.0
6,,3.0
7,Sandwich,4.0
8,,3.0
9,Sandwich,4.0
