In [1]:
#Importing libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.model_selection import train_test_split
from category_encoders.target_encoder import TargetEncoder

  from pandas.core import (


In [2]:
# Loading data
data = pd.read_csv('cancelations_data_clean.csv',parse_dates=['Order Date'])
data.drop(columns=['Unnamed: 0'],inplace=True)
data.head()

Unnamed: 0,Order ID,Order Date,Fulfilment By,Sales Channel,Shipment Type,Style,SKU,Category,Size,Shipment City,...,Status Category,Price,Shipment State Matched,Free Financing,Free Shipping,Coupon,Other Promotions,No Promotions,Weekend,Stock
0,405-8078784-5731545,2022-04-30,Merchant,Amazon.in,Standard,SET389,SET389-KR-NP-S,Set,S,mumbai,...,Cancelled,680.0,Maharashtra,False,False,False,False,True,True,32.0
1,171-9198151-1101146,2022-04-30,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,kurta,3XL,bengaluru,...,Shipped,406.0,Karnataka,True,False,False,False,False,True,96.0
2,404-0687676-7273146,2022-04-30,Amazon,Amazon.in,Expedited,JNE3371,JNE3371-KR-XL,kurta,XL,navi mumbai,...,Shipped,329.0,Maharashtra,False,True,False,False,False,True,4.0
3,403-9615377-8133951,2022-04-30,Merchant,Amazon.in,Standard,J0341,J0341-DR-L,Western Dress,L,puducherry,...,Cancelled,791.0,Puducherry,False,False,False,False,True,True,193.0
4,407-1069790-7240320,2022-04-30,Amazon,Amazon.in,Expedited,JNE3671,JNE3671-TU-XXXL,Top,3XL,chennai,...,Shipped,574.0,Tamil Nadu,False,False,False,False,True,True,6.0


In [3]:
# Adding features
data['Month'] = data['Order Date'].dt.month
data['DayofMonth'] = data['Order Date'].dt.day
data['begin_of_month'] = (data['DayofMonth'] < 10).astype('uint8')
data['midddle_of_month'] = ((data['DayofMonth'] >= 10)&(data['DayofMonth'] < 20)).astype('uint8')
data['end_of_month'] = (data['DayofMonth'] >= 20).astype('uint8')
data.head()

Unnamed: 0,Order ID,Order Date,Fulfilment By,Sales Channel,Shipment Type,Style,SKU,Category,Size,Shipment City,...,Coupon,Other Promotions,No Promotions,Weekend,Stock,Month,DayofMonth,begin_of_month,midddle_of_month,end_of_month
0,405-8078784-5731545,2022-04-30,Merchant,Amazon.in,Standard,SET389,SET389-KR-NP-S,Set,S,mumbai,...,False,False,True,True,32.0,4,30,0,0,1
1,171-9198151-1101146,2022-04-30,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,kurta,3XL,bengaluru,...,False,False,False,True,96.0,4,30,0,0,1
2,404-0687676-7273146,2022-04-30,Amazon,Amazon.in,Expedited,JNE3371,JNE3371-KR-XL,kurta,XL,navi mumbai,...,False,False,False,True,4.0,4,30,0,0,1
3,403-9615377-8133951,2022-04-30,Merchant,Amazon.in,Standard,J0341,J0341-DR-L,Western Dress,L,puducherry,...,False,False,True,True,193.0,4,30,0,0,1
4,407-1069790-7240320,2022-04-30,Amazon,Amazon.in,Expedited,JNE3671,JNE3671-TU-XXXL,Top,3XL,chennai,...,False,False,True,True,6.0,4,30,0,0,1


In [4]:
#Transforming target feature to numerical
data['Status Category'] = data['Status Category'].map({'Cancelled': 1, 'Shipped': 0}).values
y = data['Status Category']

In [5]:
# One-hot encoding some categorical fetaures. I discovered in my EDA that Category, Size, SKU and State may have 
# an impact on cancelation rate. I will use target encoding for SKU to avoid feature exploding.

categorical_vars = ['Category','Size','Shipment State Matched','Shipment Type']
data_encoded = pd.get_dummies(data, columns = categorical_vars, drop_first=True)

print('The data have ', data_encoded.shape[0], ' rows and ', data_encoded.shape[1], ' columns\n')

The data have  120280  rows and  78  columns



In [6]:
# Target encoding for SKU

data_encoded = TargetEncoder(cols=['SKU'], 
                             min_samples_leaf=20, 
                             smoothing=10).fit_transform(data_encoded.drop(columns=['Status Category']), 
                                                         data_encoded['Status Category'])


In [7]:
# Price and stock level don't seem to be correlated with cancellations, so I wouldn't use them in my model, 
# but for the purpose of this project I will scale them as if I'd use them.

numeric_cols = ['Price','Stock','DayofMonth','Month']  # Do I also need to scale SKU which I previously target-encoded?

scaler = RobustScaler() # using RobustScaler to deal with outliers

numeric_data_scaled = scaler.fit_transform(data_encoded[numeric_cols])
numeric_data_scaled = pd.DataFrame(numeric_data_scaled,columns=numeric_cols)

data_scaled = pd.concat([numeric_data_scaled,data_encoded.drop(columns=numeric_cols)],axis=1)

In [8]:
data_scaled.head()

Unnamed: 0,Price,Stock,DayofMonth,Month,Order ID,Order Date,Fulfilment By,Sales Channel,Style,SKU,...,Shipment State Matched_Rajasthan,Shipment State Matched_Sikkim,Shipment State Matched_Tamil Nadu,Shipment State Matched_Telangana,Shipment State Matched_Tripura,Shipment State Matched_Uttar Pradesh,Shipment State Matched_Uttarakhand,Shipment State Matched_West Bengal,Shipment State Matched_apo,Shipment Type_Standard
0,0.167173,0.285714,1.0,-0.5,405-8078784-5731545,2022-04-30,Merchant,Amazon.in,SET389,0.208974,...,False,False,False,False,False,False,False,False,False,True
1,-0.665653,1.301587,1.0,-0.5,171-9198151-1101146,2022-04-30,Merchant,Amazon.in,JNE3781,0.119269,...,False,False,False,False,False,False,False,False,False,True
2,-0.899696,-0.15873,1.0,-0.5,404-0687676-7273146,2022-04-30,Amazon,Amazon.in,JNE3371,0.14318,...,False,False,False,False,False,False,False,False,False,False
3,0.504559,2.84127,1.0,-0.5,403-9615377-8133951,2022-04-30,Merchant,Amazon.in,J0341,0.164384,...,False,False,False,False,False,False,False,False,False,True
4,-0.155015,-0.126984,1.0,-0.5,407-1069790-7240320,2022-04-30,Amazon,Amazon.in,JNE3671,0.107383,...,False,False,True,False,False,False,False,False,False,False


In [9]:
# Dropping columns I won't use in my model
columns_to_drop=['Order Date','Sales Channel','Fulfilment By','Style','Shipment City','Shipment Postal Code']

In [10]:
# Splitting data into train & test datasets
X_train, X_test, y_train, y_test = train_test_split(data_scaled.drop(columns=columns_to_drop), y, 
                                                    test_size=0.3, 
                                                    random_state=47)

In [11]:
# Saving Order_ids before dropping the column
orders_list = ['Order ID']

orders_train = X_train[orders_list]
orders_test = X_test[orders_list]

X_train.drop(columns=orders_list, inplace=True)
X_test.drop(columns=orders_list, inplace=True)

X_train.shape, X_test.shape

((84196, 70), (36084, 70))

In [12]:
X_train.head(10)

Unnamed: 0,Price,Stock,DayofMonth,Month,SKU,B2B,Easy Ship,Free Financing,Free Shipping,Coupon,...,Shipment State Matched_Rajasthan,Shipment State Matched_Sikkim,Shipment State Matched_Tamil Nadu,Shipment State Matched_Telangana,Shipment State Matched_Tripura,Shipment State Matched_Uttar Pradesh,Shipment State Matched_Uttarakhand,Shipment State Matched_West Bengal,Shipment State Matched_apo,Shipment Type_Standard
105892,0.528875,0.047619,-0.266667,0.5,0.131697,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
19992,-0.085106,-0.126984,0.133333,-0.5,0.132231,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,True
70368,2.200608,0.269841,-0.266667,0.0,0.166667,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
14475,0.495441,5.873016,0.4,-0.5,0.10221,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
101303,-0.316109,-0.079365,-0.066667,0.5,0.13441,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
88459,0.118541,-0.015873,0.733333,0.5,0.227092,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
23148,-1.112462,-0.174603,0.066667,-0.5,0.139573,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
111140,0.334347,-0.095238,-0.533333,0.5,0.175436,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
47668,-0.155015,0.603175,1.0,0.0,0.12782,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
51662,0.504559,-0.174603,0.8,0.0,0.104322,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# Checking for null values
X_train.isnull().sum()

Price                                   0
Stock                                   0
DayofMonth                              0
Month                                   0
SKU                                     0
                                       ..
Shipment State Matched_Uttar Pradesh    0
Shipment State Matched_Uttarakhand      0
Shipment State Matched_West Bengal      0
Shipment State Matched_apo              0
Shipment Type_Standard                  0
Length: 70, dtype: int64

In [14]:
# Saving pre-processed fetures in csv
X_train.to_csv('X_train.csv',index=False)
X_test.to_csv('X_test.csv',index=False)

In [15]:
# Saving target feature in csv
y_train.to_csv('y_train.csv',index=False)
y_test.to_csv('y_test.csv',index=False)

In [18]:
# Saving unscaled data for Decision trees
X_train2, X_test2, y_train2, y_test2 = train_test_split(data_encoded.drop(columns=columns_to_drop), y, 
                                                    test_size=0.3, 
                                                    random_state=47)

orders_train2 = X_train2[orders_list]
orders_test2 = X_test2[orders_list]

X_train2.drop(columns=orders_list, inplace=True)
X_test2.drop(columns=orders_list, inplace=True)

X_train2.to_csv('X_train_unscaled.csv',index=False)
X_test2.to_csv('X_test_unscaled.csv',index=False)
y_train2.to_csv('y_train_unscaled.csv',index=False)
y_test2.to_csv('y_test_unscaled.csv',index=False)