In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import logging
data = Path("../data/processed/Sales.csv")


In [None]:
df = pd.read_csv(data)
df.head()

In [None]:
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors = "coerce")
df['Time'] = df['Time'].astype(str)

In [None]:
df['Target_Total']= df['Total'] #the regression target
df['HighSpender'] = (df['Total'] > 500).astype(int) #Classification target

In [None]:
df["Year"]      = df["Date"].dt.year
df["Month"]     = df["Date"].dt.month
df["Day"]       = df["Date"].dt.day
df["Weekday"]   = df["Date"].dt.day_name()
df["IsWeekend"] = df["Weekday"].isin(["Saturday","Sunday"])
df['Hour'] = pd.to_datetime(df['Time'], format= "%H:%M").dt.hour
df['PartOfTheDay'] = pd.cut(df['Hour'], bins = [-1, 11, 16, 20, 24], labels=['Morning', 'Afternoon', 'Evening', 'Night'])

In [20]:
df_prediction = df_model.copy()


In [26]:
branch_features = df_model.groupby(["Branch", "City"]).agg({
    'Total': ['sum', 'mean'],
    'Quantity': 'mean',
    'Rating': 'mean',
    'Payment': lambda x: x.mode()[0]
}).reset_index()
branch_features.columns = [
    'Branch', 'City', 'Total_Sales', 'Avg_Sales',
    'Avg_Quantity', 'Avg_Rating', 'Most_Common_Payment'
]
branch_features


Unnamed: 0,Branch,City,Total_Sales,Avg_Sales,Avg_Quantity,Avg_Rating,Most_Common_Payment
0,A,Yangon,105651.399,311.656044,5.463127,7.020944,Ewallet
1,B,Mandalay,106197.672,319.872506,5.481928,6.818072,Ewallet
2,C,Naypyitaw,110568.7065,337.099715,5.582317,7.072866,Cash


In [27]:
customer_features = df_model.groupby('Customer_type').agg({
    'Total': ['mean', 'count'],
    'gross income': 'mean',
    'Quantity': 'mean',
    'Payment': lambda x: x.mode()[0]
}).reset_index()
customer_features.columns = [
    'Customer_type', 'Avg_Spend', 'Num_Transactions',
    'Avg_Income', 'Avg_Quantity', 'Most_Common_Payment'
]
customer_features

Unnamed: 0,Customer_type,Avg_Spend,Num_Transactions,Avg_Income,Avg_Quantity,Most_Common_Payment
0,Member,327.348945,500,15.588045,5.556,Credit card
1,Normal,318.122856,499,15.148707,5.460922,Ewallet


In [28]:
# Product-level features
product_features = df_model.groupby('Product line').agg({
    'Total': 'sum',
    'Quantity': 'mean',
    'Rating': 'mean'
}).reset_index()
product_features.columns = [
    'Product_Line', 'Total_Revenue', 'Avg_Quantity', 'Avg_Rating'
]
product_features

Unnamed: 0,Product_Line,Total_Revenue,Avg_Quantity,Avg_Rating
0,Electronic accessories,54337.5315,5.711765,6.924706
1,Fashion accessories,54305.895,5.067416,7.029213
2,Food and beverages,56144.844,5.471264,7.113218
3,Health and beauty,48644.7675,5.609272,6.989404
4,Home and lifestyle,53861.913,5.69375,6.8375
5,Sports and travel,55122.8265,5.542169,6.916265


In [30]:
branch_avg = df_model.groupby("Branch")['Total'].mean().rename("Branch_Avg_Sales")
df_model = df_model.merge(branch_avg, on="Branch")
df_model.head()

Unnamed: 0,Branch,City,Customer_type,Gender,Product line,Unit price,Quantity,Total,Payment,gross income,...,Year,Month,Day,Weekday,IsWeekend,Hour,PartOfTheDay,Average_price_Item,Branch_Avg_Sales_x,Branch_Avg_Sales_y
0,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,80.22,Cash,3.82,...,2021.0,8.0,3.0,Tuesday,False,10,Morning,16.044,337.099715,337.099715
1,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,340.5255,Credit card,16.2155,...,2021.0,3.0,3.0,Wednesday,False,13,Afternoon,48.6465,311.656044,311.656044
2,A,Yangon,Member,Male,Health and beauty,58.22,8,489.048,Ewallet,23.288,...,,,,,False,20,Evening,61.131,311.656044,311.656044
3,A,Yangon,Normal,Male,Sports and travel,86.31,7,634.3785,Ewallet,30.2085,...,2021.0,8.0,2.0,Monday,False,10,Morning,90.6255,311.656044,311.656044
4,C,Naypyitaw,Normal,Male,Electronic accessories,85.39,7,627.6165,Ewallet,29.8865,...,,,,,False,18,Evening,89.6595,337.099715,337.099715


In [31]:
def simple_encode(df):
    cat_cols = df.select_dtypes(include=['object','category']).columns
    df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    return df_encoded

df_model_encoded = simple_encode(df_model)

In [33]:
df_model_encoded.to_csv('Sales_encoded.csv')