# Preprocessing workflow - All decisions undertaken are statistically supported by our EDA.
- `Handling missing data` - Drop rows where the 'Customer ID' column has null values. Analysis suggests it is the catalyst for the other null values in the dataset.
- `Handling outliers` - Drop rows where 'Quantity' has a non-zero value but 'Price' or 'Revenue' has the value zero (0).
- `Handling duplicates` - There are no duplicates in the dataset
- `Datetime columns` - Extract features like year, month, day, is_night, is_weekend, hour, minutes, seconds, quarter_year
- `Log transform` - Revenue, Price, Cost
- `One-hot encoding` - Gender, Customer_Segment, Marketing_Channel, Category, Payment_Method
- `Frequency-encoding` - Country, Subcategory
- `Scaling` - Robust scaling

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import sys

In [2]:
# Location of the raw customer-churn extract (relative to the working directory).
FILE_PATH = '../data/raw/customer_churn.csv'
# Load the whole CSV into memory as the working DataFrame.
df = pd.read_csv(Path(FILE_PATH))

In [3]:
# Handling missing values: rows without a 'Customer ID' are dropped.
print(f'Observations before dropping rows : {df.shape}')
if 'Customer ID' not in df.columns:
    # Abort with a non-zero status and a message that names the column
    # actually being checked.
    sys.exit("The column, 'Customer ID' not found in the dataset")
df = df.dropna(subset=['Customer ID'])
print(f'Observations after dropping rows : {df.shape}')

Observations before dropping rows : (743486, 24)
Observations after dropping rows : (573570, 24)


In [4]:
# Handling outliers: a row is an outlier when it has a non-zero quantity
# but a zero price or a zero revenue.
before = len(df)
has_quantity = df['Quantity'] != 0
zero_price_or_revenue = (df['Price'] == 0) | (df['Revenue'] == 0)
df = df[~(has_quantity & zero_price_or_revenue)]
after = len(df)
print(f'Dropped {before - after} outlier rows')

Dropped 44 outlier rows


In [5]:
# datetime features
def extract_datetime_features(df, col):
    """Return a copy of ``df`` with calendar and clock features derived from ``col``.

    ``col`` is parsed with ``pd.to_datetime(..., errors='coerce')``, so values
    that cannot be parsed become ``NaT``. In the returned frame the parsed
    column replaces the original one and the following columns are added,
    each suffixed with ``col``: ``Year_``, ``Month_``, ``Day_``, ``Quarter_``,
    ``Hour_``, ``Minute_``, ``Seconds_``, ``Is_weekend_`` and ``Is_night_``.

    Args:
        df: Source DataFrame. It is left unmodified.
        col: Name of the column holding the date/time values.

    Returns:
        A new DataFrame with the parsed column and the derived features.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    # Work on a copy so the caller's frame (often a filtered slice of another
    # frame) is never mutated as a side effect.
    df = df.copy()
    df[col] = pd.to_datetime(df[col], errors='coerce')

    # Build the datetime accessor once and reuse it for every feature.
    stamp = df[col].dt

    df[f'Year_{col}'] = stamp.year
    df[f'Month_{col}'] = stamp.month
    df[f'Day_{col}'] = stamp.day
    df[f'Quarter_{col}'] = stamp.quarter
    df[f'Hour_{col}'] = stamp.hour
    df[f'Minute_{col}'] = stamp.minute
    df[f'Seconds_{col}'] = stamp.second
    # dayofweek is 0 for Monday, so values above 4 are Saturday and Sunday.
    # Rows whose date is NaT compare as False and therefore get 0.
    df[f'Is_weekend_{col}'] = (stamp.dayofweek > 4).astype('int64')
    # "Night" here means hour 18 through 23; early-morning hours count as 0.
    df[f'Is_night_{col}'] = (stamp.hour > 17).astype('int64')

    return df

# Derive calendar/clock features for every date column in turn.
datetimes = ['InvoiceDate', 'Signup_Date', 'Last_Login_Date']
# The loop variable is deliberately not called ``datetime`` so it cannot
# shadow the standard-library module of that name.
for datetime_col in datetimes:
    df = extract_datetime_features(df, datetime_col)


In [6]:
# log transformations
def log_transform(df, col):
    """Return a copy of ``df`` with a ``{col}_log`` column holding ``log1p(col)``.

    Args:
        df: Source DataFrame. It is left unmodified.
        col: Name of the numeric column to transform.

    Returns:
        A new DataFrame with the extra ``{col}_log`` column.

    Raises:
        SystemExit: If ``col`` is not a column of ``df``. The exit message
            names the missing column, and passing a message makes the exit
            status non-zero.
    """
    if col not in df.columns:
        sys.exit(f"Column '{col}' not present in dataset")

    # NOTE(review): np.log1p gives -inf at -1 and NaN below -1; confirm the
    # transformed columns cannot contain such values (e.g. refunds).
    return df.assign(**{f'{col}_log': np.log1p(df[col])})
    
# Monetary columns that receive a log1p-transformed companion column.
log_columns = ['Revenue', 'Price', 'Cost']
for log_col in log_columns:
    df = log_transform(df, log_col)

In [7]:
def encode_features(df_input, col):
    """One-hot encode ``col`` and return the resulting DataFrame.

    The first category level is dropped (``drop_first=True``) and the
    indicator columns are integers named ``{col}_{level}``. They are appended
    after the remaining columns, and ``col`` itself is removed.

    Args:
        df_input: Source DataFrame. It is left unmodified.
        col: Name of the categorical column to encode.

    Returns:
        A new DataFrame with ``col`` replaced by its indicator columns, or
        ``df_input`` itself, unchanged, when ``col`` is not present.
    """
    if col in df_input.columns:
        indicator_cols = pd.get_dummies(
            df_input[col], prefix=col, drop_first=True, dtype=int
        )
        remaining = df_input.drop(columns=[col])
        return pd.concat([remaining, indicator_cols], axis=1)

    # Missing columns are skipped silently rather than treated as an error.
    return df_input

# Nominal features to one-hot encode, one column per pass.
encode_columns = ['Gender', 'Payment_Method', 'Category', 'Customer_Segment', 'Marketing_Channel']
for encode_cols in encode_columns:
    df = encode_features(df, encode_cols)

In [8]:
# Per-customer profit aggregates, broadcast back onto every row of that customer.
profit_by_customer = df.groupby('Customer ID')['Profit']
df['sum_customer_profit'] = profit_by_customer.transform('sum')   # total profit contributed by customer
df['avg_customer_profit'] = profit_by_customer.transform('mean')  # avg profit contributed by customer

In [9]:
# Per-item revenue aggregates, broadcast back onto every row of that item.
revenue_by_description = df.groupby('Description')['Revenue']
df['sum_description_revenue'] = revenue_by_description.transform('sum')   # total revenue per item
df['avg_description_revenue'] = revenue_by_description.transform('mean')  # avg revenue per item

In [15]:
# Per-country revenue and profit aggregates, broadcast back onto every row.
# Both grouped selections are taken before any new column is added.
revenue_by_country = df.groupby('Country')['Revenue']
profit_by_country = df.groupby('Country')['Profit']
df['sum_country_revenue'] = revenue_by_country.transform('sum')   # total revenue per country
df['avg_country_revenue'] = revenue_by_country.transform('mean')  # avg revenue per country
df['sum_country_profit'] = profit_by_country.transform('sum')     # total profit per country
df['avg_country_profit'] = profit_by_country.transform('mean')    # avg profit per country

In [17]:
# Drop identifier, free-text and raw date columns now that features have
# been derived from them.
cols_to_drop = ['Invoice','StockCode','Description','InvoiceDate','Customer ID','Signup_Date','Last_Login_Date','Country']
# One drop call removes all columns at once instead of building a new frame
# per column. A missing name still raises KeyError.
df = df.drop(columns=cols_to_drop)