In [57]:
# Load the watermark IPython extension and print a snapshot of the execution
# environment (timestamp, Python/IPython versions, OS and CPU details).
%load_ext watermark
%watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Last updated: 2021-04-24T12:13:14.891633+01:00

Python implementation: CPython
Python version       : 3.8.5
IPython version      : 7.19.0

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : AMD64 Family 23 Model 32 Stepping 1, AuthenticAMD
CPU cores   : 2
Architecture: 64bit



## Data Processing

The aim of this notebook is to prepare the dataset to be used to train ML models. To do so, some outliers will be removed, scaling will be applied to numerical values and categorical values will be suitably encoded.

In addition, the dataset will be split into training/validation data and testing data (80% and 20%, respectively).

In [61]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as pl

# Read the training CSV (path relative to the working directory) and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='./train.csv')
df.head(n=5)

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [62]:
# Column names grouped by feature type.
# Note: 'Product_importance' and 'Customer_rating' are ordinal variables,
# i.e. their categories have an order of priority.
categorical_columns = [
    'Gender',
    'Warehouse_block',
    'Mode_of_Shipment',
    'Product_importance',
    'Customer_rating',
]
numerical_columns = [
    'Customer_care_calls',
    'Cost_of_the_Product',
    'Prior_purchases',
    'Discount_offered',
    'Weight_in_gms',
]

## Categorical Encoding

In [63]:
# Categorical encoding.
# NOTE: this cell overwrites columns of `df` and then rebinds `df` to the one-hot
# encoded frame, so it is meant to run once on the freshly loaded data. A second
# run would see already-encoded values and fail the importance check below.

# Encode gender as integer codes (LabelEncoder numbers the classes in sorted order)
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

# Encode Importance with an explicit mapping so the order low < medium < high is kept
importance_cat = {
    "low": 0,
    "medium": 1,
    "high": 2
}
# Series.map silently yields NaN for values missing from the mapping, so check
# first and raise KeyError (as a plain dict lookup would) on anything unexpected.
unknown_importance = set(df['Product_importance'].unique()) - set(importance_cat)
if unknown_importance:
    raise KeyError(
        f"Unexpected Product_importance values: {sorted(unknown_importance, key=str)}"
    )
df['Product_importance'] = df['Product_importance'].map(importance_cat)

# Encode Customer rating.
# Despite the name `oe`, this is a LabelEncoder: codes follow the sorted rating
# values, so the ordinal order of the ratings is preserved.
oe = LabelEncoder()
df['Customer_rating'] = oe.fit_transform(df['Customer_rating'])

# One-hot encode Nominal categories.
# dtype is pinned so the indicator columns stay integer 0/1; pandas >= 2.0
# would otherwise emit booleans.
nominal = ['Warehouse_block', 'Mode_of_Shipment']
df = pd.get_dummies(df, columns=nominal, dtype='uint8')

df.head(15)

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship
0,1,4,1,177,3,0,0,44,1233,1,0,0,0,1,0,1,0,0
1,2,4,4,216,2,0,1,59,3088,1,0,0,0,0,1,1,0,0
2,3,2,1,183,4,0,1,48,3374,1,1,0,0,0,0,1,0,0
3,4,3,2,176,4,1,1,10,1177,1,0,1,0,0,0,1,0,0
4,5,2,1,184,3,1,0,46,2484,1,0,0,1,0,0,1,0,0
5,6,3,0,162,3,1,0,12,1417,1,0,0,0,0,1,1,0,0
6,7,3,3,250,3,0,0,3,2371,1,0,0,0,1,0,1,0,0
7,8,4,0,233,2,0,0,48,2804,1,0,0,0,0,1,1,0,0
8,9,3,3,150,3,0,0,11,1861,1,1,0,0,0,0,1,0,0
9,10,3,1,164,3,1,0,29,1187,1,0,1,0,0,0,1,0,0
