In [54]:
import pandas as pd
import numpy as np

# Integer codes for the two sex labels expected in the raw data.
SEX_CODES = {"male": 0, "female": 1}

# Columns kept for modelling; the label column "Survived" is listed last.
MODEL_COLUMNS = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Survived"]

# SibSp is the number of siblings or partners onboard.
# Parch is the number of parents or children onboard.
READABLE_NAMES = {
    "SibSp": "no_of_siblings_or_partners_onboard",
    "Parch": "no_of_parents_or_children_onboard",
}

# Build the modelling frame in a single chain.
# Sex is encoded *before* dropna(): Series.map turns any label that is missing
# from SEX_CODES into NaN, so encoding first means such rows are dropped together
# with the other incomplete rows instead of leaking NaN into the features.
# The astype() keeps Sex as an integer column regardless of what was dropped.
df = (
    pd.read_csv("../data/titanic.csv")[MODEL_COLUMNS]
    .assign(Sex=lambda frame: frame["Sex"].map(SEX_CODES))
    .dropna()
    .astype({"Sex": "int64"})
    .rename(columns=READABLE_NAMES)
)

df.head()


Unnamed: 0,Pclass,Sex,Age,no_of_siblings_or_partners_onboard,no_of_parents_or_children_onboard,Survived
0,3,0,22.0,1,0,0
1,1,1,38.0,1,0,1
2,3,1,26.0,0,0,1
3,1,1,35.0,1,0,1
4,3,0,35.0,0,0,0


Train test split

The Kaggle dataset includes a test.csv file, but it does not contain the labels, so instead I split the labelled train.csv data into our own training and test sets.

In [55]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split -- and with it the saved CSVs and every metric computed
# from them -- is identical on each Restart & Run All.
SPLIT_SEED = 42

# Select the features by name rather than by position so the split does not
# depend on "Survived" being the last column of df.
features = df.drop(columns="Survived")
labels = df["Survived"]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=SPLIT_SEED
)

# Persist each partition with its label column re-attached as the last column.
train = pd.concat([X_train, y_train], axis=1)
train.to_csv("../data/train.csv", index=False)
test = pd.concat([X_test, y_test], axis=1)
test.to_csv("../data/test.csv", index=False)

Logistic regression model

In [57]:
from sklearn.linear_model import LogisticRegression
import pickle

# Fit a logistic regression classifier (default hyperparameters) on the training split.
lr_model = LogisticRegression().fit(X_train, y_train)

# Saving the model
with open("../models/lr_model.pkl", "wb") as model_file:
    pickle.dump(lr_model, model_file)

In [59]:
from sklearn.metrics import classification_report

# Predict on the held-out split and summarise precision / recall / F1 per class.
y_pred = lr_model.predict(X_test)
lr_report = classification_report(y_test, y_pred)

print(lr_report)


              precision    recall  f1-score   support

           0       0.81      0.86      0.83        79
           1       0.81      0.75      0.78        64

    accuracy                           0.81       143
   macro avg       0.81      0.81      0.81       143
weighted avg       0.81      0.81      0.81       143



Random forest model

In [60]:
from sklearn.ensemble import RandomForestClassifier

# Seed for the forest: random_state controls the bootstrap sampling and the
# per-split feature sampling, so fixing it makes the fitted (and pickled) model
# the same on every run for a given training set.
RF_SEED = 42

rf_model = RandomForestClassifier(random_state=RF_SEED)
rf_model.fit(X_train, y_train)

# Save the fitted model next to the logistic regression one.
with open("../models/rf_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)



In [61]:
# Evaluate the random forest on the held-out split.
# In the run recorded below it reaches 0.83 accuracy, compared with 0.81 for
# the logistic regression model.
y_pred = rf_model.predict(X_test)
rf_report = classification_report(y_test, y_pred)

print(rf_report)

              precision    recall  f1-score   support

           0       0.84      0.85      0.84        79
           1       0.81      0.80      0.80        64

    accuracy                           0.83       143
   macro avg       0.82      0.82      0.82       143
weighted avg       0.82      0.83      0.83       143

