# This Notebook Contains Only the Data Cleaning

In [2]:
# Importing the basic libraries we will require for the project

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


# Code to ignore warnings from function usage
import warnings;
import numpy as np
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer

Load and View the Data Set

In [41]:
# Load the raw training travel data; the relative path means the CSV must sit in the working directory
shin_exp = pd.read_csv('Traveldata_train.csv')

In [42]:
# Preview the first 5 rows of the raw data
shin_exp.head(5)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0
1,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0
2,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0
3,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0
4,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0


Check the Shape and Information (about possible Missing Values in the Data Set)

In [43]:
shin_exp.shape  # (number of rows, number of columns)

(94379, 9)

In [44]:
# Column names, non-null counts and dtypes
shin_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Gender                   94302 non-null  object 
 2   Customer_Type            85428 non-null  object 
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object 
 5   Travel_Class             94379 non-null  object 
 6   Travel_Distance          94379 non-null  int64  
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.5+ MB


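#### All the Columns except for ID, Travel_Class and Travel_Distance HAVE Missing Values (the Target, Overall_Experience, is not part of this file)
#### The 4 categorical Columns (Gender, Customer_Type, Type_Travel, Travel_Class) are string objects that must be converted to numeric ones, e.g. with one-hot encoding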

In [45]:
# Count the rows that are exact duplicates of an earlier row
shin_exp.duplicated().sum()

0

In [46]:
# Percentage of missing values per column, rounded to 2 decimals
shin_exp.isna().mean().mul(100).round(2)

ID                         0.00
Gender                     0.08
Customer_Type              9.48
Age                        0.03
Type_Travel                9.78
Travel_Class               0.00
Travel_Distance            0.00
Departure_Delay_in_Mins    0.06
Arrival_Delay_in_Mins      0.38
dtype: float64

In [47]:
# Absolute number of missing values per column
shin_exp.isna().sum()


ID                            0
Gender                       77
Customer_Type              8951
Age                          33
Type_Travel                9226
Travel_Class                  0
Travel_Distance               0
Departure_Delay_in_Mins      57
Arrival_Delay_in_Mins       357
dtype: int64

Not every feature has missing values: ID, Travel_Class and Travel_Distance are complete. The features with the most missing values are:

Customer_Type and Type_Travel, with almost 10% of data missing

All the others have less than 0.5% of missing values --> Hence they can be treated last

In [48]:
# Print how often each sub-category occurs in every categorical (object) column

categorical_cols = shin_exp.select_dtypes(include=["object"]).columns
for col_name in categorical_cols:
    print("Unique values in", col_name, "are :")
    print(shin_exp[col_name].value_counts())
    print("*" * 50)

Unique values in Gender are :
Female    47815
Male      46487
Name: Gender, dtype: int64
**************************************************
Unique values in Customer_Type are :
Loyal Customer       69823
Disloyal Customer    15605
Name: Customer_Type, dtype: int64
**************************************************
Unique values in Type_Travel are :
Business Travel    58617
Personal Travel    26536
Name: Type_Travel, dtype: int64
**************************************************
Unique values in Travel_Class are :
Eco         49342
Business    45037
Name: Travel_Class, dtype: int64
**************************************************


## Exploratory Data Analysis --> Later On

# Data Preprocessing


### Separating the independent variables (X) and the dependent variable (Y)


In [49]:
# Dimensions of the data set: (rows, columns)
shin_exp.shape

(94379, 9)

### All 4 Categories (Gender, Customer_Type, Type_Travel, Travel_Class) have only 2 types, so I can freely use the OneHotEncoder
### 1 for the ID

In [50]:
# Column dtypes and non-null counts
shin_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Gender                   94302 non-null  object 
 2   Customer_Type            85428 non-null  object 
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object 
 5   Travel_Class             94379 non-null  object 
 6   Travel_Distance          94379 non-null  int64  
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.5+ MB


In [51]:
#X_v2 = shin_exp.copy()
#X_v2.head()

In [52]:
#X_v2 = pd.get_dummies(X_v2, drop_first=True)
#X_v2


In [53]:
# Show the frequency of every sub-category for each categorical (object) column

for col_name, col_values in shin_exp.select_dtypes(include=["object"]).items():
    print("Unique values in", col_name, "are :")
    print(col_values.value_counts())
    print("*" * 50)

Unique values in Gender are :
Female    47815
Male      46487
Name: Gender, dtype: int64
**************************************************
Unique values in Customer_Type are :
Loyal Customer       69823
Disloyal Customer    15605
Name: Customer_Type, dtype: int64
**************************************************
Unique values in Type_Travel are :
Business Travel    58617
Personal Travel    26536
Name: Type_Travel, dtype: int64
**************************************************
Unique values in Travel_Class are :
Eco         49342
Business    45037
Name: Travel_Class, dtype: int64
**************************************************


In [54]:
# Category label -> integer code for the four binary categorical columns.
# Built once at definition time instead of on every call.
_ORDINAL_CODES = {'Female': 0, 'Male': 1,
                  'Business Travel': 0, 'Personal Travel': 1,
                  'Disloyal Customer': 0, 'Loyal Customer': 1,
                  'Business': 0, 'Eco': 1}


def Ordinal_encoding(element_i):
    """Translate one category label into its integer code.

    Parameters
    ----------
    element_i : object
        A single cell value, e.g. ``'Female'`` or ``'Eco'``.

    Returns
    -------
    int or float
        The integer code when ``element_i`` is a known label; ``np.nan`` for
        anything else (missing values, unknown labels, unhashable objects).
    """
    try:
        return _ORDINAL_CODES[element_i]
    except (KeyError, TypeError):
        # KeyError: label not in the mapping (this includes NaN cells).
        # TypeError: value is unhashable and cannot be looked up at all.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
        return np.nan

In [55]:
# Work on a copy so the raw frame shin_exp is left unchanged
X_v3 = shin_exp.copy()

In [56]:
# Names of the categorical (string) columns
almost_all_columns = ['Gender', 'Customer_Type', 'Type_Travel', 'Travel_Class']

In [57]:
# Replace every label with the value returned by Ordinal_encoding, one column at a time
for categorical_col in almost_all_columns:
    X_v3[categorical_col] = X_v3[categorical_col].map(Ordinal_encoding)

X_v3.head(1)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
0,98800001,0.0,1.0,52.0,,0,272,0.0,5.0


In [58]:
# First row of the raw frame, for comparison with X_v3
shin_exp.head(1)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0


In [59]:
# Per-column check that X_v3 and shin_exp contain the same number of missing values
X_v3.isna().sum() == shin_exp.isna().sum()


ID                         True
Gender                     True
Customer_Type              True
Age                        True
Type_Travel                True
Travel_Class               True
Travel_Distance            True
Departure_Delay_in_Mins    True
Arrival_Delay_in_Mins      True
dtype: bool

# HERE STARTS THE IMPUTATION

In [60]:
# Number of missing values per column in X_v3
X_v3.isna().sum()

ID                            0
Gender                       77
Customer_Type              8951
Age                          33
Type_Travel                9226
Travel_Class                  0
Travel_Distance               0
Departure_Delay_in_Mins      57
Arrival_Delay_in_Mins       357
dtype: int64

# IMPUTATION Option 1 --> AFTER THE MODE (MOST FREQUENT VALUE) WE SHOULD TRY KNN OR LINEAR REGRESSION IN THE NEXT ITERATION (But for now this is good)

In [61]:
# List the column names of X_v3
X_v3.columns

Index(['ID', 'Gender', 'Customer_Type', 'Age', 'Type_Travel', 'Travel_Class',
       'Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins'],
      dtype='object')

In [62]:
# Columns handed to the imputer
reqd_col_for_impute = [ 'Gender', 'Customer_Type', 'Age', 'Type_Travel', 'Travel_Class',
       'Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins']

# SimpleImputer with strategy="most_frequent": every missing value is replaced
# by the mode of its column.
# NOTE(review): the mode is also used for the numeric columns in the list
# (Age, Travel_Distance and the two delay columns) — confirm this is intended.
imputer = SimpleImputer(strategy="most_frequent")

# Fit the imputer on the train data and overwrite the listed columns with the imputed values
X_v3[reqd_col_for_impute] = imputer.fit_transform(X_v3[reqd_col_for_impute])


# Check that no column of the train data has missing values left
print(X_v3.isna().sum())
print("-" * 30)
#print(X_test.isna().sum())

ID                         0
Gender                     0
Customer_Type              0
Age                        0
Type_Travel                0
Travel_Class               0
Travel_Distance            0
Departure_Delay_in_Mins    0
Arrival_Delay_in_Mins      0
dtype: int64
------------------------------


# Up to here the Work can be assumed Correct (you can Check Later On) ---> 

# From this point STARTS THE ACTUAL PREDICTION

# I save the result of my work in a CSV file here so it can be reused immediately without re-running all this code

In [63]:
# Persist the cleaned data. index=False keeps the auto-generated row index out of
# the file, so reloading it does not produce a spurious "Unnamed: 0" column.
X_v3.to_csv('Travel_Shin_cleaned_dataset.csv', index=False)