# Cafe Sales Analytics Projects

## Objectives
Fill the missing values in the cleaned Cafe Sales dataset.

### SETUP for Filling values

In [142]:
import pandas as pd
import numpy as np

In [143]:
# Load the cleaned (not yet filled) cafe sales CSV into `df`.
data_path = "../data/processed/cleaned_cafe_sales_without_filling.csv"
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print("ERROR : File Not Found")
    # Re-raise so the notebook stops here: swallowing the error would leave
    # `df` undefined and the next cell would fail with a confusing NameError.
    raise

In [144]:
# Work on a copy so the loaded frame `df` stays untouched while values are filled.
fill_df = df.copy()
# Use the full option key: abbreviated keys are matched as patterns and can
# become ambiguous if pandas adds similarly named options.
# None removes the row limit when a frame or series is displayed.
pd.set_option("display.max_rows", None)

In [145]:
# show the column names of the working copy
fill_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

## Fill missing values in 'Item' 
Missing Item names were inferred using only the Price Per Unit column, and only when a price per unit maps to a single item.
Items that share a price with another item were left out to avoid assumptions.

In [146]:
# number of rows in the loaded data that have no Item name
print(df["Item"].isna().sum())

969


In [147]:
# keep only the rows whose Item name is present
item_known = fill_df.dropna(subset=["Item"])

In [148]:
# for every price, count how many distinct item names are sold at it
items_by_price = item_known.groupby("Price Per Unit")
price_count = items_by_price["Item"].nunique()
print(price_count)

Price Per Unit
1.0    1
1.5    1
2.0    1
3.0    2
4.0    2
5.0    1
Name: Item, dtype: int64


In [149]:
# prices that belong to exactly one item name (count == 1), so the name
# can be inferred from the price without guessing
is_unambiguous = price_count.eq(1)
safe_price = price_count.loc[is_unambiguous].index
safe_price

Index([1.0, 1.5, 2.0, 5.0], dtype='float64', name='Price Per Unit')

In [150]:
# build the price -> item lookup from the unambiguous prices only
unambiguous_rows = item_known.loc[item_known["Price Per Unit"].isin(safe_price)]
# one representative row per price is enough, since each price has one item
one_row_per_price = unambiguous_rows.drop_duplicates(subset="Price Per Unit")
price_to_item = one_row_per_price.set_index("Price Per Unit")["Item"]
print(price_to_item)

Price Per Unit
2.0    Coffee
1.0    Cookie
5.0     Salad
1.5       Tea
Name: Item, dtype: object


In [151]:
# rows that still have no Item name
mask = fill_df["Item"].isna()
# look the name up from the row's price; prices without a lookup entry stay missing
inferred_items = fill_df.loc[mask, "Price Per Unit"].map(price_to_item)
fill_df.loc[mask, "Item"] = inferred_items

In [152]:
# Item names still missing after the price-based lookup
print(fill_df["Item"].isna().sum())

501


### Fill missing values in 'Item' Using Total Spent and Quantity

Missing Item names were inferred by dividing the Total Spent column by the Quantity column to recover the price per unit.

In [153]:
# add a helper column holding the price implied by Total Spent / Quantity
implied_price = fill_df["Total Spent"].div(fill_df["Quantity"])
fill_df["Computed_price"] = implied_price.round(2)
fill_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date', 'Computed_price'],
      dtype='object')

In [154]:
# rows with no Item name whose computed price identifies a single item
item_missing = fill_df["Item"].isna()
price_is_safe = fill_df["Computed_price"].isin(safe_price)
mask = item_missing & price_is_safe
# look the name up from the computed price
fill_df.loc[mask, "Item"] = fill_df.loc[mask, "Computed_price"].map(price_to_item)

In [155]:
# Item names still missing after the computed-price lookup
print(fill_df["Item"].isna().sum())

480


The number of missing values in the Item column dropped from 969 to 480. A total of 489 item names were inferred: first from the Price Per Unit column, and then by dividing Total Spent by Quantity to recover the price per unit and, from it, the missing item name.

## Fill missing values in 'Price per unit' 

Missing values in Price Per Unit were inferred using the Item column.

In [156]:
# Price Per Unit values missing before any price filling
print(fill_df["Price Per Unit"].isna().sum())

533


In [157]:
# rows where both Item and Price Per Unit are present
valid = fill_df.dropna(subset=["Item", "Price Per Unit"])

In [158]:
# for every item, count how many distinct prices it is sold at
prices_by_item = valid.groupby("Item")
price_count = prices_by_item["Price Per Unit"].nunique()
price_count

Item
Cake        1
Coffee      1
Cookie      1
Juice       1
Salad       1
Sandwich    1
Smoothie    1
Tea         1
Name: Price Per Unit, dtype: int64

In [159]:
# items that are sold at exactly one price, so the price can be inferred safely
has_single_price = price_count.eq(1)
safe_item = price_count.loc[has_single_price].index
safe_item

Index(['Cake', 'Coffee', 'Cookie', 'Juice', 'Salad', 'Sandwich', 'Smoothie',
       'Tea'],
      dtype='object', name='Item')

In [160]:
# build the item -> price lookup from the single-price items only
single_price_rows = valid.loc[valid["Item"].isin(safe_item)]
# one representative row per item is enough, since each item has one price
one_row_per_item = single_price_rows.drop_duplicates(subset="Item")
item_to_price = one_row_per_item.set_index("Item")["Price Per Unit"]
print(item_to_price)

Item
Coffee      2.0
Cake        3.0
Cookie      1.0
Salad       5.0
Smoothie    4.0
Sandwich    4.0
Tea         1.5
Juice       3.0
Name: Price Per Unit, dtype: float64


In [161]:
# rows with no price whose Item name has an entry in the lookup
price_missing = fill_df["Price Per Unit"].isna()
item_is_known = fill_df["Item"].isin(item_to_price.index)
mask = price_missing & item_is_known
# look the price up from the item name
fill_df.loc[mask, "Price Per Unit"] = fill_df.loc[mask, "Item"].map(item_to_price)

In [162]:
# remaining gaps in both columns after the item-based price fill
price_nulls = fill_df['Price Per Unit'].isna().sum()
item_nulls = fill_df['Item'].isna().sum()
print(f"Null values in Price Per Unit: {price_nulls},\nNull values in Item : {item_nulls}")

Null values in Price Per Unit: 33,
Null values in Item : 480


### Fill in missing values using total spent/quantity

Remaining missing values were inferred by dividing Total Spent by Quantity, where both values were present and valid.



In [None]:
# rows with no price where Total Spent and a positive Quantity are both
# present, so Total Spent / Quantity is a usable price
price_missing = fill_df["Price Per Unit"].isna()
total_present = fill_df["Total Spent"].notna()
quantity_usable = fill_df["Quantity"].notna() & fill_df["Quantity"].gt(0)
mask = price_missing & total_present & quantity_usable

In [164]:
# take the price from the Computed_price column for the masked rows
computed_fill = fill_df.loc[mask, "Computed_price"].round(2)
fill_df.loc[mask, "Price Per Unit"] = computed_fill

In [166]:
# Price Per Unit values still missing after both fill steps
print(fill_df['Price Per Unit'].isna().sum())

6


The number of missing values in the Price Per Unit column dropped from 533 to 6. These values were inferred first from the Item column, and then by dividing Total Spent by Quantity to fill the remaining values where both were present and valid.

## Fill missing values in 'Quantity' 