# Cafe Sales Analytics Projects

## Objectives
Create useful columns to make analyse of the cafe sales data easier to answer business related question easier.

### SETUP for Feature Engineering

In [1]:
import pandas as pd
import numpy as np

In [2]:
# import CSV
data_path = "../data/processed/infered_cafe_sales_completed.csv"
try:
    df = pd.read_csv(data_path,
                     dtype= {
                        "Transaction ID" : "object",
                        "Item" : "string",
                        "Quantity": "Int32",
                        "Price Per Unit": "float64",
                        "Total Spent": "float64",
                        "Payment Method": "string", 
                        "Location": "string", 
                     } 
                     ,parse_dates=["Transaction Date"]
                     )
except FileNotFoundError:
    print("ERROR : File Not Found")

In [3]:
#make a copy just in case
feature_df = df.copy()
pd.set_option("display.max_row", None)

In [4]:
feature_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

In [5]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction ID    10000 non-null  object        
 1   Item              9520 non-null   string        
 2   Quantity          9977 non-null   Int32         
 3   Price Per Unit    9994 non-null   float64       
 4   Total Spent       9977 non-null   float64       
 5   Payment Method    6822 non-null   string        
 6   Location          6039 non-null   string        
 7   Transaction Date  9540 non-null   datetime64[ns]
dtypes: Int32(1), datetime64[ns](1), float64(2), object(1), string(3)
memory usage: 595.8+ KB


## Time based features

In [6]:
#month features
feature_df["Month"] = feature_df["Transaction Date"].dt.month.astype("Int64")

In [7]:
# year freatures
feature_df["Week"] = feature_df["Transaction Date"].dt.isocalendar().week.astype("Int64")

In [8]:
#day features
feature_df["Day Name"] = feature_df["Transaction Date"].dt.day_name()
feature_df["Day Of Week"] =feature_df["Transaction Date"].dt.dayofweek

In [9]:
# check weekend
feature_df["Is Weekend"] = feature_df["Day Of Week"].isin([5,6]).astype(bool)

In [10]:
# check changes
feature_df.drop(columns="Day Of Week",inplace = True)


In [11]:
feature_df.sample()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date,Month,Week,Day Name,Is Weekend
9885,TXN_4659954,,3,4.0,12.0,Credit Card,In-store,NaT,,,,False


## Transaction features

In [12]:
# check if math is correct
check = (feature_df["Quantity"]*feature_df["Price Per Unit"]).round(2)
feature_df["price_check"] = (check != feature_df["Total Spent"].round(2)).astype("Int64")

In [13]:
feature_df["price_check"].value_counts()

price_check
0    9974
Name: count, dtype: Int64

In [14]:
feature_df.drop(columns= "price_check",inplace = True)

In [15]:
feature_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date', 'Month', 'Week',
       'Day Name', 'Is Weekend'],
      dtype='object')

### Transaction Size

In [16]:
feature_df["Transaction Size"] = pd.cut(
    feature_df["Total Spent"],
    bins = [0,10,20,float("inf")],
    labels=["Small","Medium","Large"],
    include_lowest=True
)

In [17]:
feature_df[["Total Spent","Transaction Size"]].sample(10)

Unnamed: 0,Total Spent,Transaction Size
2443,10.0,Small
8998,2.0,Small
5552,20.0,Medium
9049,6.0,Small
7488,5.0,Small
1779,10.0,Small
8521,3.0,Small
3555,4.5,Small
9060,4.5,Small
9915,5.0,Small


In [18]:
feature_df["Transaction Size"].value_counts(dropna=False)

Transaction Size
Small     6691
Medium    3017
Large      269
NaN         23
Name: count, dtype: int64

### Bulk Orders

In [19]:
feature_df["Bulk Order"] = pd.NA
feature_df.loc[feature_df["Quantity"] > 1, "Bulk Order"] = 1 
feature_df.loc[feature_df["Quantity"] == 1, "Bulk Order"] = 0 
feature_df["Bulk Order"] = feature_df["Bulk Order"].astype("Int64")

In [20]:
feature_df[["Quantity","Bulk Order"]].sample(10)

Unnamed: 0,Quantity,Bulk Order
9644,3,1
4454,2,1
3677,3,1
5508,2,1
3941,5,1
2963,4,1
9438,1,0
4788,2,1
9051,1,0
6318,5,1


In [21]:
feature_df["Bulk Order"].value_counts(dropna=False)

Bulk Order
1       8051
0       1926
<NA>      23
Name: count, dtype: Int64

In [22]:
feature_df["Bulk Order"].dtype

Int64Dtype()

In [23]:
feature_df.sample(5)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date,Month,Week,Day Name,Is Weekend,Transaction Size,Bulk Order
8255,TXN_6874195,Juice,3,3.0,9.0,Cash,Takeaway,2023-04-22,4,16,Saturday,True,Small,1
984,TXN_1646009,Salad,4,5.0,20.0,Credit Card,,2023-10-11,10,41,Wednesday,False,Medium,1
6554,TXN_1569248,Tea,1,1.5,1.5,Credit Card,,2023-11-14,11,46,Tuesday,False,Small,0
4866,TXN_8668590,Cookie,4,1.0,4.0,Digital Wallet,,2023-04-18,4,16,Tuesday,False,Small,1
665,TXN_9323513,Cake,3,3.0,9.0,,Takeaway,2023-05-04,5,18,Thursday,False,Small,1


## Save Final Dataset to CSV

In [24]:
feature_df.to_csv("../data/processed/final_cafe_sales.csv",index = False)