In [2]:
# Create a notebook to perform Titanic survival classification using logistic regression (library allowed). To perform the validation, do it in two scenarios (split validation (70:30) and k-fold cross validation (k=5)). Then determine whether the model is fit, underfit, or overfit in each scenario.

In [3]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the passenger records from the CSV file in the working directory.
csv_path = 'titanic.csv'
dataset = pd.read_csv(csv_path)

# Preview the first five rows.
dataset.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocessing, dropping unimportant data

In [5]:
# Summarise each column's dtype and non-null count to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Remove the columns that are not used as model features, then list what is left.
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
dataset = dataset.drop(columns_to_drop, axis='columns')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [7]:
# Fill the missing ages with the mean of the observed ages.
# NOTE(review): the mean is taken over all rows, i.e. before the train/test
# split further down, so held-out rows contribute to the imputed value.
mean_age = dataset['Age'].mean()
dataset['Age'] = dataset['Age'].fillna(mean_age)

In [8]:
# Encode Sex as a binary feature: 1 for 'male', 0 for every other value.
# The dtype guard keeps the cell safe to re-run: once the column is numeric,
# comparing it with 'male' again would silently turn every row into 0.
if not pd.api.types.is_numeric_dtype(dataset['Sex']):
    dataset['Sex'] = (dataset['Sex'] == 'male').astype('int64')

# Modelling

70:30 split

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Target is the Survived column; the features are every other remaining column.
y = dataset['Survived']
X = dataset.drop('Survived', axis=1)

# Hold out 30% of the rows for testing (70:30 split) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit logistic regression on the training rows only.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Accuracy on the held-out rows.
y_pred = model.predict(X_test)
splitPrediction = accuracy_score(y_true=y_test, y_pred=y_pred)

# Accuracy on the rows the model was fitted on, for comparison.
y_train_pred = model.predict(X_train)
splitTrainingPrediction = accuracy_score(y_true=y_train, y_pred=y_train_pred)



K-fold, with k = 5

In [10]:
from sklearn.model_selection import cross_val_score, cross_validate, KFold

# Unfitted estimator that is evaluated once per fold.
model2 = LogisticRegression(random_state=42, max_iter=1000)

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

# cross_validate is used instead of cross_val_score because it can also report
# the accuracy on the training part of every fold; that number is needed to
# compare training vs. validation performance when judging over-/underfitting.
cvResults = cross_validate(
    model2, X, y, cv=cv, scoring='accuracy', return_train_score=True
)

# Per-fold validation accuracies and their mean.
kFolds = cvResults['test_score']
kFoldPrediction = np.mean(kFolds)

# Per-fold training accuracies and their mean.
kFoldTrainScores = cvResults['train_score']
kFoldTrainPrediction = np.mean(kFoldTrainScores)

# Display training and validation accuracy side by side, one row per fold.
pd.DataFrame(
    {'train accuracy': kFoldTrainScores, 'validation accuracy': kFolds}
).rename_axis('fold')




# Prediction Accuracy Score comparison

In [None]:
# Literal separator line printed above and below each report.
rule = '======================================================'


def show_report(title, rows):
    """Print one framed report: a title line followed by `label value` lines."""
    print(rule)
    print(title)
    for label, value in rows:
        print(label, value)
    # The closing separator is followed by one empty line.
    print(rule + '\n')


show_report(
    'The results of the (70:30) split validation are:',
    [
        ('Prediction accuracy on training set:', splitTrainingPrediction),
        ('Prediction accuracy on test set:', splitPrediction),
    ],
)

show_report(
    'The results of the 5-fold cross validation are:',
    [
        ('Folds:', kFolds),
        ('Prediction accuracy:', kFoldPrediction),
    ],
)



The results of the (70:30) split validation are:
Prediction accuracy on training set: 0.7961476725521669
Prediction accuracy on test set: 0.8134328358208955

The results of the 5-fold cross validation are:
Folds: [0.81005587 0.7752809  0.83146067 0.7752809  0.7752809 ]
Prediction accuracy: 0.7934718473416609



# Conclusion

## (70:30) split validation

> Looking at the training and test accuracies, they appear to be very similar to each other, with the model getting an accuracy of ~79.6% on training data and ~81.3% on test data. The test data accuracy being even higher than the training data accuracy might indicate that the model generalizes well to unseen data. Since the test data accuracy isn't significantly lower than the training data accuracy (it's even slightly higher), the model isn't overfit, and since both accuracies are reasonably high, the model isn't underfit either, making it well-fit.

## K-fold cross validation

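> Looking at each fold, the validation accuracies fall in a relatively small range of ~77.5% to ~83%, with a mean of ~79%. This shows that the model performs consistently across different subsets of the dataset. Although the mean accuracy isn't as high as the test accuracy of the (70:30) split validation (~81%), it is still in a similar range and reasonably high, so the model isn't underfit. It is also essentially equal to the training accuracy observed in the split scenario (~79.6%), so there is no sign of overfitting either, which shows that this model is well-fit.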

