# Import Libraries

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns

from pycaret.classification import setup,compare_models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold,cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder

# Load Train and Test Data

In [2]:
# Single place to configure where the Titanic CSV files live.
# Set the TITANIC_DATA_DIR environment variable to run the notebook on another machine;
# the fallback is the original author's local folder.
DATA_DIR=Path(os.environ.get("TITANIC_DATA_DIR","C:\\Users\\admin\\Desktop\\DS Docs\\Titanic"))

train=pd.read_csv(DATA_DIR/"train.csv")
test=pd.read_csv(DATA_DIR/"test.csv")

In [3]:
train.shape

(891, 12)

In [4]:
test.shape

(418, 11)

# Merge Train and Test Data for Preprocessing

In [5]:
# Set the label and the test passenger ids aside before the columns are dropped.
target=train['Survived']
test_id=test['PassengerId']

# Remove the identifier/text columns from both frames; train additionally loses the label.
train1=train.drop(columns=['PassengerId','Name','Ticket','Cabin','Survived'])
test1=test.drop(columns=['PassengerId','Name','Ticket','Cabin'])

# Stack train on top of test under a fresh 0..n-1 index so both share one preprocessing pass.
data=pd.concat([train1,test1],axis=0,ignore_index=True)

In [6]:
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
1304,3,male,,0,0,8.0500,S
1305,1,female,39.0,0,0,108.9000,C
1306,3,male,38.5,0,0,7.2500,S
1307,3,male,,0,0,8.0500,S


In [7]:
data1=data.copy()

# Null Value Treatment For Numeric and Categorical Data

In [8]:
data1.isna().sum()

Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          1
Embarked      2
dtype: int64

In [9]:
data1.dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [10]:
#Impute null values for numeric columns using KNN regressor
def knn_impute(df,na_target,n_neighbors=5):
    """Fill the missing values of one numeric column with KNN-regression predictions.

    The predictors are all numeric columns of ``df`` that contain no missing
    values. Rows where ``na_target`` is observed train the regressor; rows where
    it is missing receive the predicted value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    na_target : str
        Name of the numeric column whose missing values are imputed.
    n_neighbors : int, default 5
        Number of neighbours used by ``KNeighborsRegressor``. It is capped at
        the number of rows available for training.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing entries of ``na_target`` filled in.

    Raises
    ------
    ValueError
        If every value of ``na_target`` is missing, so there is nothing to learn from.
    """
    df=df.copy()

    missing_mask=df[na_target].isna()
    # Nothing to impute: predicting on zero rows would make the regressor raise.
    if not missing_mask.any():
        return df
    if missing_mask.all():
        raise ValueError(f"Column {na_target!r} has no observed values to impute from")

    numeric_df=df.select_dtypes(np.number)
    # Fully observed numeric columns act as predictors; na_target itself is
    # excluded automatically because it contains missing values.
    complete_columns=numeric_df.columns[numeric_df.notna().all().to_numpy()]

    X_train=numeric_df.loc[~missing_mask,complete_columns]
    y_train=numeric_df.loc[~missing_mask,na_target]
    X_test=numeric_df.loc[missing_mask,complete_columns]

    # The regressor cannot look up more neighbours than it has training rows.
    knn=KNeighborsRegressor(n_neighbors=min(n_neighbors,len(X_train)))
    knn.fit(X_train,y_train)

    df.loc[missing_mask,na_target]=knn.predict(X_test)

    return df

In [11]:
# Order matters: knn_impute only uses numeric columns without missing values as
# predictors, so Age is filled first and can then help predict the missing Fare.
for column in ('Age','Fare'):
    data1=knn_impute(data1,column)

In [12]:
data1.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [13]:
#Impute null values for categorical columns using mode
for column in ('Embarked',):
    # Overwrite only the missing entries with the column's most frequent value.
    data1.loc[data1[column].isna(),column]=data1[column].mode().iloc[0]


In [14]:
data1.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [15]:
data1

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
1304,3,male,22.0,0,0,8.0500,S
1305,1,female,39.0,0,0,108.9000,C
1306,3,male,38.5,0,0,7.2500,S
1307,3,male,22.0,0,0,8.0500,S


In [16]:
data2=data1.copy()

In [17]:
print(data2.Sex.unique())
print(data2.Embarked.unique())

['male' 'female']
['S' 'C' 'Q']


In [18]:
#Encoding categorical values
le=LabelEncoder()

# fit_transform is called once per column, so the single encoder is refitted each
# time and ends up holding only the mapping of the last column processed.
for column in ('Sex','Embarked'):
    data2[column]=le.fit_transform(data2[column])


In [19]:
data2

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.2500,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.9250,2
3,1,0,35.0,1,0,53.1000,2
4,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...
1304,3,1,22.0,0,0,8.0500,2
1305,1,0,39.0,0,0,108.9000,0
1306,3,1,38.5,0,0,7.2500,2
1307,3,1,22.0,0,0,8.0500,2


In [20]:
print(data2.Sex.unique())
print(data2.Embarked.unique())

[1 0]
[2 0 1]


In [21]:
data2.dtypes

Pclass        int64
Sex           int32
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int32
dtype: object

In [22]:
data3=data2.copy()

# Feature Transformation

In [23]:
scipy.stats.skew(data3.Fare)

4.3645897748925995

In [24]:
#Fare is strongly right-skewed, so apply a log transform.
#log1p computes log(1+x), which stays finite when a fare is 0.
data3['Fare']=np.log1p(data3['Fare'])

In [25]:
scipy.stats.skew(data3.Fare) #Skewness of Fare column is dropped to 0.54

0.5415352880019132

In [27]:
data3

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,2.110213,2
1,1,0,38.0,1,0,4.280593,0
2,3,0,26.0,0,0,2.188856,2
3,1,0,35.0,1,0,3.990834,2
4,3,1,35.0,0,0,2.202765,2
...,...,...,...,...,...,...,...
1304,3,1,22.0,0,0,2.202765,2
1305,1,0,39.0,0,0,4.699571,0
1306,3,1,38.5,0,0,2.110213,2
1307,3,1,22.0,0,0,2.202765,2


In [28]:
data4=data3.copy()

# Split Train and Test Data

In [29]:
data4.loc[:train.index.max(),:]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.000,1,0,2.110213,2
1,1,0,38.000,1,0,4.280593,0
2,3,0,26.000,0,0,2.188856,2
3,1,0,35.000,1,0,3.990834,2
4,3,1,35.000,0,0,2.202765,2
...,...,...,...,...,...,...,...
886,2,1,27.000,0,0,2.639057,2
887,1,0,19.000,0,0,3.433987,2
888,3,0,19.234,1,2,3.196630,2
889,1,1,26.000,0,0,3.433987,0


In [30]:
data4.loc[train.index.max()+1:,:]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
891,3,1,34.5,0,0,2.178064,1
892,3,0,47.0,1,0,2.079442,2
893,2,1,62.0,0,0,2.369075,1
894,3,1,27.0,0,0,2.268252,2
895,3,0,22.0,1,1,2.586824,2
...,...,...,...,...,...,...,...
1304,3,1,22.0,0,0,2.202765,2
1305,1,0,39.0,0,0,4.699571,0
1306,3,1,38.5,0,0,2.110213,2
1307,3,1,22.0,0,0,2.202765,2


In [31]:
# Split the combined frame back by position: the first len(train) rows are the
# training rows, the remainder are the test rows. Positional slicing does not
# depend on `train` keeping a default 0..n-1 index, unlike a label slice.
n_train=len(train)
train_final=data4.iloc[:n_train].copy()
test_final=data4.iloc[n_train:].reset_index(drop=True)

# Guard against a mis-split: features must line up with the labels and the test ids.
assert len(train_final)==len(target)
assert len(test_final)==len(test)

In [32]:
train_final

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.000,1,0,2.110213,2
1,1,0,38.000,1,0,4.280593,0
2,3,0,26.000,0,0,2.188856,2
3,1,0,35.000,1,0,3.990834,2
4,3,1,35.000,0,0,2.202765,2
...,...,...,...,...,...,...,...
886,2,1,27.000,0,0,2.639057,2
887,1,0,19.000,0,0,3.433987,2
888,3,0,19.234,1,2,3.196630,2
889,1,1,26.000,0,0,3.433987,0


In [33]:
test_final

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,2.178064,1
1,3,0,47.0,1,0,2.079442,2
2,2,1,62.0,0,0,2.369075,1
3,3,1,27.0,0,0,2.268252,2
4,3,0,22.0,1,1,2.586824,2
...,...,...,...,...,...,...,...
413,3,1,22.0,0,0,2.202765,2
414,1,0,39.0,0,0,4.699571,0
415,3,1,38.5,0,0,2.110213,2
416,3,1,22.0,0,0,2.202765,2


In [34]:
target

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

# Model Selection

In [35]:
# Pin session_id so the experiment is reproducible across kernel restarts;
# 2582 is the id PyCaret drew at random in the recorded run of this notebook.
_=setup(
    data=pd.concat([train_final,target],axis=1),
    target='Survived',
    session_id=2582,
)

Unnamed: 0,Description,Value
0,Session id,2582
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 8)"
4,Transformed data shape,"(891, 8)"
5,Transformed train set shape,"(623, 8)"
6,Transformed test set shape,"(268, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


In [36]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8412,0.8893,0.7569,0.8161,0.7835,0.6585,0.6614,0.042
lightgbm,Light Gradient Boosting Machine,0.8316,0.8854,0.7368,0.8081,0.769,0.6373,0.6405,0.036
dt,Decision Tree Classifier,0.8204,0.805,0.7576,0.7728,0.7639,0.6191,0.6205,0.018
ridge,Ridge Classifier,0.8124,0.0,0.7025,0.7846,0.7393,0.5937,0.5977,0.019
rf,Random Forest Classifier,0.8124,0.8787,0.7455,0.76,0.7498,0.6003,0.6031,0.101
lr,Logistic Regression,0.8092,0.8568,0.7109,0.7705,0.7372,0.5883,0.5914,0.88
lda,Linear Discriminant Analysis,0.8091,0.855,0.7025,0.778,0.7362,0.5876,0.5914,0.018
ada,Ada Boost Classifier,0.8073,0.8438,0.7322,0.7587,0.7429,0.5893,0.5919,0.048
qda,Quadratic Discriminant Analysis,0.8059,0.8558,0.7406,0.7507,0.7425,0.5872,0.5903,0.021
et,Extra Trees Classifier,0.7965,0.8585,0.7328,0.7359,0.7282,0.5667,0.5722,0.073


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

# Baseline Model

In [37]:
# Fix random_state so the baseline fit (and the predictions written to the
# submission file) are identical on every Restart & Run All.
gbc=GradientBoostingClassifier(random_state=42)
gbc.fit(train_final,target)


In [38]:
# 10-fold cross-validated accuracy of the baseline model on the training data.
# Only n_splits is set on KFold; every other option keeps the library default.
kf=KFold(n_splits=10)
result=cross_val_score(
    estimator=gbc,
    X=train_final,
    y=target,
    scoring='accuracy',
    cv=kf,
)

In [39]:
result

array([0.76666667, 0.82022472, 0.80898876, 0.83146067, 0.86516854,
       0.82022472, 0.82022472, 0.76404494, 0.87640449, 0.84269663])

In [40]:
np.mean(result)

0.8216104868913858

In [41]:
gbc_pred=gbc.predict(test_final)

In [42]:
gbc_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [43]:
# Pair ids and predictions by position. Building the frame from plain arrays
# avoids index-label alignment, which would introduce NaNs if test_id ever
# carried a non-default index.
baseline_submission=pd.DataFrame({
    'PassengerId':test_id.to_numpy(),
    'Survived':gbc_pred,
})
baseline_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [44]:
# Write the submission file: header row included, DataFrame index left out.
baseline_submission.to_csv(
    "C:\\Users\\admin\\Desktop\\DS Docs\\Titanic\\baseline.csv",
    index=False,
    header=True,
)