# Preprocessing workflow - All decisions undertaken are statistically supported by our EDA.
- `Handling missing data` - Drop rows where the 'Customer ID' column has null values. Analysis suggests it is the catalyst for other null values in the dataset.
- `Handling outliers` - Drop rows where a value is present but equals zero (0) in columns like 'Price' or 'Revenue'.
- `Handling duplicates` - There are no duplicates in the dataset
- `Datetime columns` - Extract features like year, month, day, is_night, is_weekend, hour, minutes, seconds, Diff_signup_last_login, quarter_year
- `Log transform` - Revenue, Price, Cost
- `One-hot encoding` - Gender, Customer_Segment, Marketing_channel, Category, Payment_Method
- `Frequency-encoding` - Country, Subcategory
- `Scaling` - Robust scaling

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import sys

In [None]:
# Location of the raw CSV; the relative path is resolved against the
# notebook's current working directory.
FILE_PATH = '../data/raw/customer_churn.csv'
# Load the whole file into memory; the cells below transform this `df`.
df = pd.read_csv(FILE_PATH)

In [None]:
# Handling missing values
# Rows without a 'Customer ID' are removed. The guard checks the same column
# that dropna() uses, so a dataset lacking it fails with a clear message
# instead of a KeyError.
print(f'Observations before dropping rows : {df.shape}')
if 'Customer ID' in df.columns:
    df = df.dropna(subset=['Customer ID'])
    print(f'Observations after dropping rows : {df.shape}')
else:
    print("The column, 'Customer ID' not found in the dataset")
    # Exit non-zero so the aborted run is not reported as a success.
    sys.exit(1)

Observations before dropping rows : (743486, 24)
Observations after dropping rows : (573570, 24)


In [None]:
# datetime features
def extract_datetime_features(df, col):
    """Parse ``df[col]`` as datetimes and append calendar/clock features.

    The column is overwritten with ``pd.to_datetime(..., errors='coerce')``,
    so entries that cannot be parsed become ``NaT``. Nine columns are then
    added, each named ``<Feature>_{col}``: Year, Month, Day, Quarter, Hour,
    Minute, Seconds, Is_weekend and Is_night.

    Args:
        df: DataFrame holding ``col``; it is modified in place.
        col: Name of the column to parse and expand.

    Returns:
        The same DataFrame object, with the parsed column and new features.
    """
    df[col] = pd.to_datetime(df[col], errors='coerce')
    stamp = df[col].dt

    # Insertion order of this mapping fixes the order of the new columns.
    derived = {
        'Year': stamp.year,
        'Month': stamp.month,
        'Day': stamp.day,
        'Quarter': stamp.quarter,
        'Hour': stamp.hour,
        'Minute': stamp.minute,
        'Seconds': stamp.second,
        # dayofweek counts Monday as 0, so values above 4 are Saturday/Sunday.
        'Is_weekend': (stamp.dayofweek > 4).astype('int64'),
        # "Night" here means 18:00 through 23:59 only.
        'Is_night': (stamp.hour > 17).astype('int64'),
    }
    for label, values in derived.items():
        df[f'{label}_{col}'] = values

    return df

# Columns to expand into datetime features.
# NOTE(review): the workflow notes also list a Diff_signup_last_login feature;
# it is not derived in this cell - confirm it is computed elsewhere.
datetimes = ['InvoiceDate', 'Signup_Date', 'Last_Login_Date']
# The loop variable is not called `datetime`, so it cannot rebind the
# standard-library module of that name if the notebook imports it.
for datetime_col in datetimes:
    df = extract_datetime_features(df, datetime_col)

In [None]:
# log transformations
def log_transform(df, col):
    """Add a ``{col}_log`` column holding ``np.log1p`` of ``df[col]``.

    Args:
        df: DataFrame to extend; it is modified in place and also returned.
        col: Name of the numeric column to transform.

    Returns:
        The same DataFrame with the extra ``{col}_log`` column.

    Raises:
        SystemExit: With status 1 if ``col`` is not a column of ``df``.
    """
    if col not in df.columns:
        # Name the missing column and exit non-zero, so the failure is
        # actionable and is not reported as a successful run.
        print(f"Column '{col}' not present in dataset")
        sys.exit(1)

    # log1p(x) = log(1 + x), which keeps zero values finite.
    # NOTE(review): inputs below -1 yield NaN (and -1 yields -inf); warnings are
    # silenced notebook-wide, so confirm these columns are non-negative.
    df[f'{col}_log'] = np.log1p(df[col])

    return df
    
# Monetary columns that receive a log1p-transformed companion column.
log_columns = ['Revenue', 'Price', 'Cost']

# Each pass adds one `<column>_log` column to the frame.
for log_col in log_columns:
    df = log_transform(df, log_col)

In [None]:
def encode_features(df_input, col):
    """One-hot encode ``col`` and return a new frame without the original column.

    Dummy columns are named ``{col}_<category>`` and hold integers; the first
    category is dropped (``drop_first=True``) to avoid a redundant column. The
    dummies are appended after the remaining columns of ``df_input``.

    Args:
        df_input: Source DataFrame; it is not modified.
        col: Name of the categorical column to encode.

    Returns:
        A new DataFrame with ``col`` replaced by its dummy columns, or
        ``df_input`` itself (unchanged) when ``col`` is not present.
    """
    if col not in df_input.columns:
        # Report the skip instead of returning silently: a misspelled or
        # differently-cased column name would otherwise go unnoticed.
        print(f"Column '{col}' not found in the dataset - skipping one-hot encoding")
        return df_input

    dummies = pd.get_dummies(df_input[col], prefix=col, drop_first=True, dtype=int)

    # Dropping the source column before the concat yields the same column order
    # as concatenating first and dropping afterwards.
    return pd.concat([df_input.drop(columns=[col]), dummies], axis=1)

# Nominal columns to one-hot encode.
# NOTE(review): the displayed frame further down ends with Category_* columns
# and shows no Customer_segment_* / Marketing_channel_* columns, and the
# workflow notes spell the first one 'Customer_Segment' - verify these two
# names against the dataset's actual column headers.
encode_columns = [
    'Gender',
    'Payment_Method',
    'Category',
    'Customer_segment',
    'Marketing_channel',
]

for encode_cols in encode_columns:
    df = encode_features(df, encode_cols)

In [None]:
# Display the transformed frame as the cell's output to inspect the new columns.
df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Customer_Age,Signup_Date,...,Cost_log,Gender_Male,Payment_Method_Cash,Payment_Method_Credit Card,Payment_Method_PayPal,Category_Electronics,Category_Home Decor,Category_Kitchen,Category_Stationery,Category_Toys
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,26.0,2009-07-22 07:45:00,...,1.699230,False,False,True,False,False,True,False,False,False
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,29.0,2008-08-23 07:45:00,...,1.454450,False,False,False,True,False,False,False,False,True
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,33.0,2009-02-04 07:45:00,...,1.725014,False,True,False,False,False,False,True,False,False
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom,79.0,2009-07-18 07:45:00,...,0.751593,False,False,False,False,False,False,False,False,False
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,43.0,2009-08-22 07:45:00,...,0.614490,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743480,555949,21672,WHITE SPOT RED CERAMIC DRAWER KNOB,12,2011-06-08 10:50:00,1.25,16626.0,United Kingdom,72.0,2009-11-29 10:50:00,...,0.483533,True,True,False,False,False,False,False,False,True
743481,555949,21670,BLUE SPOT CERAMIC DRAWER KNOB,12,2011-06-08 10:50:00,1.25,16626.0,United Kingdom,65.0,2009-09-30 10:50:00,...,0.606774,True,False,True,False,False,False,True,False,False
743482,555949,21213,PACK OF 72 SKULL CAKE CASES,24,2011-06-08 10:50:00,0.55,16626.0,United Kingdom,74.0,2010-04-01 10:50:00,...,0.346147,True,False,True,False,False,False,False,False,True
743483,555949,21754,HOME BUILDING BLOCK WORD,3,2011-06-08 10:50:00,5.95,16626.0,United Kingdom,48.0,2010-05-07 10:50:00,...,1.541248,False,False,True,False,False,True,False,False,False
