# Cafe Sales Analytics Projects

## Objectives
Create useful columns to make analyse of the cafe sales data easier to answer business related question easier.

### SETUP for Feature Engineering

In [1]:
import pandas as pd
import numpy as np

In [2]:
# import CSV
data_path = "../data/processed/infered_cafe_sales_completed.csv"
try:
    df = pd.read_csv(data_path,
                     dtype= {
                        "Transaction ID" : "object",
                        "Item" : "string",
                        "Quantity": "Int32",
                        "Price Per Unit": "float64",
                        "Total Spent": "float64",
                        "Payment Method": "string", 
                        "Location": "string", 
                     } 
                     ,parse_dates=["Transaction Date"]
                     )
except FileNotFoundError:
    print("ERROR : File Not Found")

In [3]:
#make a copy just in case
feature_df = df.copy()
pd.set_option("display.max_row", None)

In [4]:
feature_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

In [5]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction ID    10000 non-null  object        
 1   Item              9520 non-null   string        
 2   Quantity          9977 non-null   Int32         
 3   Price Per Unit    9994 non-null   float64       
 4   Total Spent       9977 non-null   float64       
 5   Payment Method    6822 non-null   string        
 6   Location          6039 non-null   string        
 7   Transaction Date  9540 non-null   datetime64[ns]
dtypes: Int32(1), datetime64[ns](1), float64(2), object(1), string(3)
memory usage: 595.8+ KB


## Time based features

In [6]:
#month features
feature_df["Month"] = feature_df["Transaction Date"].dt.month.astype("Int64")

In [7]:
# year freatures
feature_df["Week"] = feature_df["Transaction Date"].dt.isocalendar().week.astype("Int64")

In [8]:
#day features
feature_df["Day Name"] = feature_df["Transaction Date"].dt.day_name()
feature_df["Day Of Week"] =feature_df["Transaction Date"].dt.dayofweek

In [9]:
# check weekend
feature_df["Is Weekend"] = feature_df["Day Of Week"].isin([5,6]).astype(bool)

In [10]:
# check changes
feature_df.drop(columns="Day Of Week",inplace = True)


In [11]:
feature_df.sample()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date,Month,Week,Day Name,Is Weekend
7286,TXN_3673322,Salad,2,5.0,10.0,Credit Card,Takeaway,2023-04-24,4,17,Monday,False


## Transaction features

In [12]:
# check if math is correct
check = (feature_df["Quantity"]*feature_df["Price Per Unit"]).round(2)
feature_df["price_check"] = (check != feature_df["Total Spent"].round(2)).astype("Int64")

In [13]:
feature_df["price_check"].value_counts()

price_check
0    9974
Name: count, dtype: Int64

In [14]:
feature_df.drop(columns= "price_check",inplace = True)

In [15]:
feature_df.columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date', 'Month', 'Week',
       'Day Name', 'Is Weekend'],
      dtype='object')

### Transaction Size

In [16]:
feature_df["Transaction Size"] = pd.cut(
    feature_df["Total Spent"],
    bins = [0,10,20,float("inf")],
    labels=["Small","Medium","Large"],
    include_lowest=True
)

In [17]:
feature_df[["Total Spent","Transaction Size"]].sample(10)

Unnamed: 0,Total Spent,Transaction Size
6487,12.0,Medium
8330,4.0,Small
8673,10.0,Small
5712,6.0,Small
7188,20.0,Medium
4622,8.0,Small
2619,2.0,Small
4571,12.0,Medium
9291,25.0,Large
2656,12.0,Medium


In [18]:
feature_df["Transaction Size"].value_counts(dropna=False)

Transaction Size
Small     6691
Medium    3017
Large      269
NaN         23
Name: count, dtype: int64

### Bulk Orders

In [19]:
feature_df["Bulk Order"] = pd.NA
feature_df.loc[feature_df["Quantity"] > 1, "Bulk Order"] = 1 
feature_df.loc[feature_df["Quantity"] == 1, "Bulk Order"] = 0 
feature_df["Bulk Order"] = feature_df["Bulk Order"].astype("Int64")

In [20]:
feature_df[["Quantity","Bulk Order"]].sample(10)

Unnamed: 0,Quantity,Bulk Order
1120,2,1
6761,3,1
3887,3,1
8142,3,1
4459,5,1
7445,5,1
6926,5,1
6448,4,1
5607,3,1
2672,4,1


In [21]:
feature_df["Bulk Order"].value_counts(dropna=False)

Bulk Order
1       8051
0       1926
<NA>      23
Name: count, dtype: Int64

In [22]:
feature_df["Bulk Order"].dtype

Int64Dtype()

In [23]:
feature_df.sample(5)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date,Month,Week,Day Name,Is Weekend,Transaction Size,Bulk Order
1218,TXN_5322809,Cake,2,3.0,6.0,,,2023-04-30,4.0,17.0,Sunday,True,Small,1
456,TXN_8805984,Juice,2,3.0,6.0,Credit Card,Takeaway,NaT,,,,False,Small,1
1547,TXN_7644786,Cookie,3,1.0,3.0,Credit Card,Takeaway,2023-01-20,1.0,3.0,Friday,False,Small,1
4852,TXN_1846455,Sandwich,4,4.0,16.0,,,2023-03-16,3.0,11.0,Thursday,False,Medium,1
7997,TXN_8966922,Juice,4,3.0,12.0,,Takeaway,2023-11-10,11.0,45.0,Friday,False,Medium,1


## Save Final Dataset to CSV

In [26]:
feature_df.to_csv("../data/processed/final_cafe_sales.csv",na_rep="\\N",index = False)

In [25]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction ID    10000 non-null  object        
 1   Item              9520 non-null   string        
 2   Quantity          9977 non-null   Int32         
 3   Price Per Unit    9994 non-null   float64       
 4   Total Spent       9977 non-null   float64       
 5   Payment Method    6822 non-null   string        
 6   Location          6039 non-null   string        
 7   Transaction Date  9540 non-null   datetime64[ns]
 8   Month             9540 non-null   Int64         
 9   Week              9540 non-null   Int64         
 10  Day Name          9540 non-null   object        
 11  Is Weekend        10000 non-null  bool          
 12  Transaction Size  9977 non-null   category      
 13  Bulk Order        9977 non-null   Int64         
dtypes: Int32(1), Int64(3), 