# Importing Libraries and Dataset

In [460]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [461]:
data = pd.read_csv("Titanic-Dataset.csv")
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [462]:
data.shape

(891, 11)

# Data Exploring and Cleaning

In [463]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [464]:
data['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [465]:
data['Sex'].unique()

array(['male', 'female'], dtype=object)

In [466]:
data['Age'].isna().sum()

177

In [467]:
data['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

## Taking Care of Missing Data

In [468]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(data[['Age']])
data[['Age']] = imputer.transform(data[['Age']])

In [469]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [470]:
data.drop(columns="Cabin", inplace=True)

In [471]:
data.drop(columns="Name", inplace=True)

In [472]:
data.drop(columns="Ticket", inplace=True)

In [473]:
data.dropna(subset="Embarked", inplace=True)

In [474]:
data.shape

(889, 8)

In [475]:
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

## Encoding Categorical Data

In [476]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [477]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,6])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [478]:
X.shape

(889, 10)

## Data Splitting

In [479]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [480]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [481]:
x_train

array([[-0.72508694,  0.72508694, -0.47747175, ..., -0.4663365 ,
        -0.47338848, -0.5475599 ],
       [-0.72508694,  0.72508694, -0.47747175, ..., -0.4663365 ,
        -0.47338848, -0.49483135],
       [-0.72508694,  0.72508694, -0.47747175, ..., -0.4663365 ,
        -0.47338848, -0.55501867],
       ...,
       [-0.72508694,  0.72508694,  2.09436473, ...,  0.44206145,
        -0.47338848, -0.38833499],
       [-0.72508694,  0.72508694, -0.47747175, ...,  0.44206145,
        -0.47338848, -0.10127665],
       [-0.72508694,  0.72508694, -0.47747175, ..., -0.4663365 ,
        -0.47338848, -0.5475599 ]])

In [482]:
x_test

array([[ 1.37914496, -1.37914496, -0.47747175, ..., -0.4663365 ,
        -0.47338848, -0.42449015],
       [ 1.37914496, -1.37914496, -0.47747175, ..., -0.4663365 ,
         0.75053591, -0.10127665],
       [-0.72508694,  0.72508694, -0.47747175, ..., -0.4663365 ,
        -0.47338848, -0.55242799],
       ...,
       [-0.72508694,  0.72508694,  2.09436473, ..., -0.4663365 ,
        -0.47338848,  2.62448964],
       [-0.72508694,  0.72508694, -0.47747175, ..., -0.4663365 ,
        -0.47338848, -0.5475599 ],
       [-0.72508694,  0.72508694, -0.47747175, ..., -0.4663365 ,
        -0.47338848, -0.48664659]])

# Machine Learning Model

## Logistic Regression

In [483]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
model1.fit(x_train, y_train)

In [484]:
y_pred1 = model1.predict(x_test)
print(y_pred1)

[1 1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0
 0 1 0 1 0 1 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0
 1 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 0 1 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 1
 1 0 0 0 1 0 1 1 0 1 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0]


In [510]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score, classification_report
print(accuracy_score(y_test, y_pred1)*100)
print(recall_score(y_test, y_pred1)*100)
print(f1_score(y_test, y_pred1)*100)
print(confusion_matrix(y_test, y_pred1))
print(classification_report(y_test, y_pred1))

84.26966292134831
80.82191780821918
80.82191780821918
[[91 14]
 [14 59]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       105
           1       0.81      0.81      0.81        73

    accuracy                           0.84       178
   macro avg       0.84      0.84      0.84       178
weighted avg       0.84      0.84      0.84       178



## Decision Tree

In [486]:
from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier(max_depth=6)
model2.fit(x_train, y_train)

In [487]:
y_pred2 = model2.predict(x_test)

In [511]:
print(accuracy_score(y_test, y_pred2)*100)
print(recall_score(y_test, y_pred2)*100)
print(f1_score(y_test, y_pred2)*100)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

83.70786516853933
69.86301369863014
77.86259541984732
[[98  7]
 [22 51]]
              precision    recall  f1-score   support

           0       0.82      0.93      0.87       105
           1       0.88      0.70      0.78        73

    accuracy                           0.84       178
   macro avg       0.85      0.82      0.82       178
weighted avg       0.84      0.84      0.83       178



## Random Forest

In [489]:
from sklearn.ensemble import RandomForestClassifier
model3 = RandomForestClassifier(n_estimators=100, random_state=2)
model3.fit(x_train, y_train)

In [490]:
y_pred3 = model3.predict(x_test)

In [513]:
print(accuracy_score(y_test, y_pred3)*100)
print(recall_score(y_test, y_pred3)*100)
print(f1_score(y_test, y_pred3)*100)
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

82.58426966292134
73.97260273972603
77.6978417266187
[[93 12]
 [19 54]]
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       105
           1       0.82      0.74      0.78        73

    accuracy                           0.83       178
   macro avg       0.82      0.81      0.82       178
weighted avg       0.83      0.83      0.82       178



## XGboost

In [492]:
from xgboost import XGBClassifier
model4 = XGBClassifier()
model4.fit(x_train, y_train)

In [493]:
y_pred4 = model4.predict(x_test)

In [514]:
print(accuracy_score(y_test, y_pred4)*100)
print(recall_score(y_test, y_pred4)*100)
print(f1_score(y_test, y_pred4)*100)
print(confusion_matrix(y_test, y_pred4))
print(classification_report(y_test, y_pred4))

85.39325842696628
75.34246575342466
80.88235294117648
[[97  8]
 [18 55]]
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       105
           1       0.87      0.75      0.81        73

    accuracy                           0.85       178
   macro avg       0.86      0.84      0.85       178
weighted avg       0.86      0.85      0.85       178



## Neural Network

In [505]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

model5 = Sequential([
    Dense(25, activation="sigmoid", input_dim=10),
    Dense(25, activation="sigmoid"),
    Dense(1, activation="sigmoid")
])
model5.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [506]:
from keras.losses import BinaryCrossentropy
model5.compile(optimizer="Adam", loss=BinaryCrossentropy())

In [507]:
model5.fit(x_train, y_train, epochs=100)

Epoch 1/100


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.7225
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6450
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6523 
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6243 
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6114
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6071 
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.5875
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.5680
Epoch 9/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.5329
Epoch 10/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.5366
Epoch 11

<keras.src.callbacks.history.History at 0x1c82689b710>

In [508]:
y_pred5 = model5.predict(x_test)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


In [515]:
threshold = 0.5
y_pred5 = np.where(y_pred5 >= threshold, 1, 0)

print(accuracy_score(y_test, y_pred5) * 100)
print(recall_score(y_test, y_pred5) * 100)
print(f1_score(y_test, y_pred5) * 100)
print(confusion_matrix(y_test, y_pred5))
print(classification_report(y_test, y_pred5))

81.46067415730337
73.97260273972603
76.59574468085106
[[91 14]
 [19 54]]
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.79      0.74      0.77        73

    accuracy                           0.81       178
   macro avg       0.81      0.80      0.81       178
weighted avg       0.81      0.81      0.81       178

