In [255]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [256]:
titanic_df = pd.read_csv("Titanic-Dataset.csv")
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [257]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [258]:
print(titanic_df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [259]:
# NaN values
titanic_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [260]:
# Filling in NaN values in "Age" column with meaan
titanic_df["Age"].fillna(titanic_df["Age"].mean(), inplace=True)

titanic_df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df["Age"].fillna(titanic_df["Age"].mean(), inplace=True)


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [261]:
titanic_df["Survived"] # Y values to predict

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [263]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder 

label_encoder = LabelEncoder()

titanic_df["Embarked"] = label_encoder.fit_transform(titanic_df["Embarked"])

In [271]:
# Splitting to dataset into test and train
from sklearn.model_selection import train_test_split
x = titanic_df[["Age", "Fare", "Pclass", "Embarked"]].to_numpy() # Choosing the features of interest
y = titanic_df["Survived"].to_numpy()

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.2, random_state=42)

In [272]:
print(x)

[[22.          7.25        3.          2.        ]
 [38.         71.2833      1.          0.        ]
 [26.          7.925       3.          2.        ]
 ...
 [29.69911765 23.45        3.          2.        ]
 [26.         30.          1.          0.        ]
 [32.          7.75        3.          1.        ]]


In [266]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier()

clf.fit(X_train, Y_train)

predictions = clf.predict(X_test)

accuracy = accuracy_score(Y_test, predictions)
print(predictions)
print("Accuracy: ", accuracy)

[0 0 1 1 1 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 1 1
 0 1 0 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1]
Accuracy:  0.7094972067039106


In [267]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=2)

clf.fit(X_train, Y_train)

predictions = clf.predict(X_test)

accuracy = accuracy_score(Y_test, predictions)
print(predictions)
print("Accuracy", accuracy)

[0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 1 1 1
 0 0 0 1 0 0 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1
 0 1 1 1 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0]
Accuracy 0.7262569832402235


In [268]:
# Adjusting threshold for positive results
predicted_probabilities = clf.predict_proba(X_test)

new_threshold = 0.40
binary_predictions = (predicted_probabilities[:, 1] > new_threshold).astype(int)
print(binary_predictions)

accuracy = accuracy_score(Y_test, binary_predictions)
print("Accuracy", accuracy)

[0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1
 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 1 1 1 0 0 1 0 0 1 1 1 1 1
 0 0 0 1 0 0 0 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 1
 0 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 0 0 1 0]
Accuracy 0.7262569832402235


In [273]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

clf = RandomForestClassifier()

clf.fit(X_train, Y_train)

predictions = clf.predict(X_test)

accuracy = accuracy_score(Y_test, predictions)
print(predictions)
print("Accuracy: ", accuracy)

[0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 1 1 1
 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1]
Accuracy:  0.7094972067039106


In [270]:
# Likewise, adjusting threshold for positive results
predicted_probabilities = clf.predict_proba(X_test)

new_threshold = 0.30
binary_predictions = (predicted_probabilities[:, 1] > new_threshold).astype(int)
print(binary_predictions)

accuracy = accuracy_score(Y_test, binary_predictions)
print("Accuracy", accuracy)

[0 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 1 0 1
 1 1 0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1
 0 1 0 1 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0
 0 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 1 1 0 0 0 0 1 0 0 1 1 1
 1 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1]
Accuracy 0.6927374301675978
