# Import the Important Libraries

In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier

In [24]:
df = pd.read_csv('titanic.csv')

# As we've known from the Analysis, we will do some data cleaning

In [25]:
df.drop(['alive','deck','who','class','embark_town'],axis = 1 , inplace=True)
df['age'] = df.groupby(['pclass', 'sex', 'alone'])['age'].transform('median')
df.dropna(axis = 0 , inplace = True)

#Data Preparing

We will use Ordinal Encoder type because 'sex' and 'embarked' features had an impact on the survival people

In [26]:
ord_encoder = OrdinalEncoder()
categorical = ['sex','embarked']
df[categorical] = ord_encoder.fit_transform(df[categorical])

In [27]:
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,adult_male,alone
0,0,3,1.0,17.5,1,0,7.25,2.0,True,False
1,1,1,0.0,35.0,1,0,71.2833,0.0,False,False


We can see that 'sex' and 'embarked' are float type, we've got to transfer them to int type

In [28]:
df[['sex','embarked']] = df[['sex','embarked']].astype('int')

In [29]:
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,adult_male,alone
0,0,3,1,17.5,1,0,7.25,2,True,False
1,1,1,0,35.0,1,0,71.2833,0,False,False


#Sepreate the target of the data

In [30]:
data = df.drop('survived',axis = 1)
target = df['survived']

#Split the data

In [31]:
x_train , x_test , y_train , y_test = train_test_split(data,target,test_size = 0.2,random_state = 42)

In [32]:
x_train.shape , y_train.shape , x_test.shape , y_test.shape

((711, 9), (711,), (178, 9), (178,))

#Scaling the data for better results

In [33]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

#We can now train models and evaluate them

In [34]:
#Logistic Regression

log_reg = LogisticRegression()
log_reg.fit(x_train,y_train)
y_pred = log_reg.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       109
           1       0.74      0.80      0.77        69

    accuracy                           0.81       178
   macro avg       0.80      0.81      0.81       178
weighted avg       0.82      0.81      0.82       178

[[90 19]
 [14 55]]


It is a little bit good, but we need something more powerful

#Random Forrest

In [35]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       109
           1       0.72      0.72      0.72        69

    accuracy                           0.79       178
   macro avg       0.78      0.78      0.78       178
weighted avg       0.79      0.79      0.79       178

[[90 19]
 [19 50]]


What a surprise, Logistic Regression now is better than Random Forrest!

#Let us try with Decision Tree

In [36]:
dr = DecisionTreeClassifier()
dr.fit(x_train,y_train)
y_pred = dr.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       109
           1       0.77      0.72      0.75        69

    accuracy                           0.81       178
   macro avg       0.80      0.79      0.80       178
weighted avg       0.81      0.81      0.81       178

[[94 15]
 [19 50]]


#We will use Stacking (Merge number of models to take the decisions together)

In [41]:
base_models = [('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('dt', DecisionTreeClassifier())]
stacking_model = StackingClassifier(estimators=base_models, cv=5)
stacking_model.fit(x_train, y_train)
y_pred = stacking_model.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       109
           1       0.73      0.74      0.73        69

    accuracy                           0.79       178
   macro avg       0.78      0.78      0.78       178
weighted avg       0.79      0.79      0.79       178

[[90 19]
 [18 51]]
