                         TITANIC SPACESHIP BINARY CLASSIFICATION
   

# Libraries

In [1]:
#import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import accuracy_score,plot_roc_curve
from pycaret.classification import *
from catboost import CatBoostClassifier
from sklearn.feature_selection import mutual_info_classif
import xgboost
from xgboost import XGBClassifier
import re

# Extract and Analysis 

In [2]:
# Extract both Titanic train and test datase. In this case from my local directory
df = pd.read_csv('data/titanic.train.csv')
train_labels = df.pop('Transported')


In [3]:
df_test = pd.read_csv('data/titanic.test.csv')

In [4]:
#check train dataset
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent


In [5]:
#check test dataset
df_test.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus


In [6]:
#check train columns, dtypes and non-nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
dtypes: float64(6), object(7)
memory usage: 883.0+ KB


In [7]:
#check test columns, dtypes and non-nulls
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


# Data Processing

Let's convert all categorical columns to numerical...

In [8]:
#deleting cabin and name categorical columns since their convertion don't contribute to analysis
df.drop(['Cabin','Name'],axis=1,inplace=True)
df_test.drop(['Cabin','Name'],axis=1,inplace=True)

In [9]:
#Select categorical cols to transform
cat_cols = df.select_dtypes(['bool_','object_']).columns
cat_cols1 = df_test.select_dtypes(['bool_','object_']).columns

In [10]:
#deleting passenger id column
cat_cols= cat_cols.drop('PassengerId')
cat_cols1= cat_cols1.drop('PassengerId')

In [11]:
#separate numerical columns on train
num_cols = df.select_dtypes(exclude=['bool','object']).columns

In [12]:
#separate numerical columns on test
num_cols1 = df_test.select_dtypes(exclude=['bool','object']).columns

In [13]:
#Trasnform categorical to numerical cols on train
OE = OrdinalEncoder()
df[cat_cols] = OE.fit_transform(df[cat_cols])

In [14]:
#Trasnform categorical to numerical cols on test
OE = OrdinalEncoder()
df_test[cat_cols1] = OE.fit_transform(df_test[cat_cols1])

In [15]:
#dealing with nulls sat numerical columns using experimental iterative imputer on train
iterative_imputer = IterativeImputer()
df[num_cols] = pd.DataFrame(iterative_imputer.fit_transform(df[num_cols]), columns= num_cols)

In [16]:
#dealing with nulls at numerical columns using experimental iterative imputer on test
iterative_imputer = IterativeImputer()
df_test[num_cols1] = pd.DataFrame(iterative_imputer.fit_transform(df_test[num_cols1]), columns= num_cols1)

In [17]:
#dealing with nulls at categorical columns using simple imputer on train applying most frequent strategy
impute=SimpleImputer(strategy='most_frequent')
df[cat_cols]=pd.DataFrame(impute.fit_transform(df[cat_cols]),columns=cat_cols)

In [18]:
#dealing with nulls at categorical columns using simple imputer on test applying most frequent strategy
impute=SimpleImputer(strategy='most_frequent')
df_test[cat_cols1]=pd.DataFrame(impute.fit_transform(df_test[cat_cols1]),columns=cat_cols1)

In [19]:
#separating passengerId train in groups, explained in documentation
df['Group'] = df.PassengerId.apply(lambda x: str(x)[:4]).astype(int)

In [20]:
#separating passengerId test in groups, explained in documentation
df_test['Group'] = df_test.PassengerId.apply(lambda x: str(x)[:4]).astype(int)

In [21]:
#deleting PassengerId on train
df = df.drop(columns=['PassengerId'])

In [39]:
#Applying standarization scales at numerical columns on train
norm = StandardScaler()
num_cols_norm = [col+'_norm' for col in num_cols]
df[num_cols_norm] = norm.fit_transform(df[num_cols])


In [40]:
#Applying standarization scales at numerical columns on train
num_cols_norm1 = [col+'_norm' for col in num_cols1]
df_test[num_cols_norm] = norm.transform(df_test[num_cols])


In [24]:
#Checking if all columns are numerical and non nulls on train
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HomePlanet         8693 non-null   float64
 1   CryoSleep          8693 non-null   float64
 2   Destination        8693 non-null   float64
 3   Age                8693 non-null   float64
 4   VIP                8693 non-null   float64
 5   RoomService        8693 non-null   float64
 6   FoodCourt          8693 non-null   float64
 7   ShoppingMall       8693 non-null   float64
 8   Spa                8693 non-null   float64
 9   VRDeck             8693 non-null   float64
 10  Group              8693 non-null   int64  
 11  Age_norm           8693 non-null   float64
 12  RoomService_norm   8693 non-null   float64
 13  FoodCourt_norm     8693 non-null   float64
 14  ShoppingMall_norm  8693 non-null   float64
 15  Spa_norm           8693 non-null   float64
 16  VRDeck_norm        8693 

In [25]:
#checking train dataset
df.head(3)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group,Age_norm,RoomService_norm,FoodCourt_norm,ShoppingMall_norm,Spa_norm,VRDeck_norm
0,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.709373,-0.34042,-0.286919,-0.290836,-0.276256,-0.26814
1,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,2,-0.336374,-0.17521,-0.281279,-0.248989,0.21162,-0.229322
2,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,3,2.033985,-0.275245,1.954387,-0.290836,5.691115,-0.224911


In [26]:
#Checking if all columns are numerical and non nulls on test
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PassengerId        4277 non-null   object 
 1   HomePlanet         4277 non-null   float64
 2   CryoSleep          4277 non-null   float64
 3   Destination        4277 non-null   float64
 4   Age                4277 non-null   float64
 5   VIP                4277 non-null   float64
 6   RoomService        4277 non-null   float64
 7   FoodCourt          4277 non-null   float64
 8   ShoppingMall       4277 non-null   float64
 9   Spa                4277 non-null   float64
 10  VRDeck             4277 non-null   float64
 11  Group              4277 non-null   int64  
 12  Age_norm           4277 non-null   float64
 13  RoomService_norm   4277 non-null   float64
 14  FoodCourt_norm     4277 non-null   float64
 15  ShoppingMall_norm  4277 non-null   float64
 16  Spa_norm           4277 

In [27]:
#checking test dataset
df_test.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group,Age_norm,RoomService_norm,FoodCourt_norm,ShoppingMall_norm,Spa_norm,VRDeck_norm
0,0013_01,0.0,1.0,2.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,13,-0.118309,-0.364654,-0.291306,-0.320285,-0.274065,-0.251195
1,0018_01,0.0,0.0,2.0,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,18,-0.688479,-0.364654,-0.28535,-0.320285,2.282661,-0.251195
2,0019_01,1.0,1.0,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,19,0.166777,-0.364654,-0.291306,-0.320285,-0.274065,-0.251195


In [28]:
#using mutual info classif to check every feature importance
mi_scores = mutual_info_classif(df, train_labels)
mi_scores = pd.Series(mi_scores, name="MI Scores", index= df.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

CryoSleep            0.110827
RoomService          0.081646
Spa                  0.078448
Spa_norm             0.070380
RoomService_norm     0.069861
VRDeck_norm          0.063923
VRDeck               0.061616
ShoppingMall         0.057944
ShoppingMall_norm    0.050364
FoodCourt            0.048798
FoodCourt_norm       0.044522
Group                0.020567
HomePlanet           0.015281
Age_norm             0.012384
Age                  0.009570
VIP                  0.003976
Destination          0.000373
Name: MI Scores, dtype: float64

In [29]:
#dropping least important VIP and Destination columns
df.drop(['VIP','Destination','Age_norm'],axis=1,inplace=True)

# Modeling

In [30]:
#applying train test split
X_train, X_test, y_train, y_test = train_test_split(df, train_labels, train_size = 0.8)

In [31]:
#Applying Extreme Gradient Boosting algorithm 
clf = XGBClassifier(n_estimators=1000, learning_rate=0.01, early_stopping_rounds=25, objective="binary:logistic")
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])


[0]	validation_0-logloss:0.68913
[1]	validation_0-logloss:0.68519
[2]	validation_0-logloss:0.68134
[3]	validation_0-logloss:0.67754
[4]	validation_0-logloss:0.67386
[5]	validation_0-logloss:0.67023
[6]	validation_0-logloss:0.66670
[7]	validation_0-logloss:0.66317
[8]	validation_0-logloss:0.65974
[9]	validation_0-logloss:0.65635
[10]	validation_0-logloss:0.65303
[11]	validation_0-logloss:0.64981
[12]	validation_0-logloss:0.64661
[13]	validation_0-logloss:0.64347
[14]	validation_0-logloss:0.64037
[15]	validation_0-logloss:0.63737
[16]	validation_0-logloss:0.63435
[17]	validation_0-logloss:0.63148
[18]	validation_0-logloss:0.62859
[19]	validation_0-logloss:0.62578
[20]	validation_0-logloss:0.62301
[21]	validation_0-logloss:0.62033
[22]	validation_0-logloss:0.61761
[23]	validation_0-logloss:0.61498
[24]	validation_0-logloss:0.61240
[25]	validation_0-logloss:0.60983
[26]	validation_0-logloss:0.60734
[27]	validation_0-logloss:0.60486
[28]	validation_0-logloss:0.60243
[29]	validation_0-loglos

In [32]:
#Finding predictions
preds = [x == 1 for x in clf.predict(df_test[df.columns])]

In [33]:
#Creating Transported submission column
df_test['Transported']=preds

In [34]:
# Creating submission dataframe
submission = pd.DataFrame(data=df_test,columns=['PassengerId','Transported'])

In [35]:
#checking submission dataframe
submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [36]:
# Deploying it
submission.to_csv('data/submission.csv',index = False)