In [8]:
from typing import List, Dict, Any
import pandas as pd
import numpy as np
import os

In [9]:
# Read data/train.csv into a DataFrame; the path is relative to the current working directory.
train_df = pd.read_csv(os.path.join('data', "train.csv"))
# Bare last expression so the notebook renders the loaded frame.
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [14]:
def substitute_sex(x: str) -> int:
    """Encode the Sex column as a binary feature.

    Returns 0 for the exact string "male" and 1 for every other value.
    """
    if x == "male":
        return 0
    return 1


def stone_the_adulters(x: int) -> str:
    """Denormalizing function: turn a numeric sex code back into its label.

    Code 0 maps to "male"; any other value maps to "female".
    """
    if x == 0:
        return "male"
    return "female"

def substitute_embarked(x: str) -> int:
    """Encode the Embarked column as an integer code.

    "S" becomes 0 and "C" becomes 1; every other value falls through to 2.
    """
    # The position of each port in this tuple is its numeric code.
    for code, port in enumerate(("S", "C")):
        if x == port:
            return code
    return 2

def disembarque(x: int) -> str:
    """Turn a numeric Embarked code back into its port letter.

    0 gives "S", 1 gives "C", and any other value gives "Q".
    """
    return "S" if x == 0 else ("C" if x == 1 else "Q")

In [None]:
# Discard the columns that are not used as features, then every row that still
# has a missing value.
# NOTE: this cell rebinds train_df and is not safe to run twice. A second run
# fails on the already-dropped columns, and the encoders would map the numeric
# codes to 1 / 2 because they no longer equal "male" / "S" / "C".
train_df = (
    train_df
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'])
    .dropna()
)

# Swap the categorical strings for their integer codes.
train_df = train_df.assign(
    Sex=train_df['Sex'].apply(substitute_sex),
    Embarked=train_df['Embarked'].apply(substitute_embarked),
)

# Standardise Age to zero mean and unit standard deviation. `mean` and `std`
# stay available as globals so the scaling can be reversed.
# NOTE(review): the statistics are taken over every row, before the train/test
# split made further down, so the held-out rows influence the scaling.
mean, std = train_df['Age'].mean(), train_df['Age'].std()
train_df = train_df.assign(Age=(train_df['Age'] - mean) / std)

In [21]:
# Constant-1 column so the intercept can be learned as an ordinary weight.
# pandas broadcasts a scalar to every row, so no per-row list is needed.
train_df['bias'] = 1

In [22]:
# Render the preprocessed frame: encoded Sex/Embarked, standardised Age and the constant bias column.
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,bias
0,0,3,0,-0.527298,1,0,0,1
1,1,1,1,0.576688,1,0,1,1
2,1,3,1,-0.251301,0,0,0,1
3,1,1,1,0.369691,1,0,0,1
4,0,3,0,0.369691,0,0,0,1
...,...,...,...,...,...,...,...,...
885,0,3,1,0.645688,0,5,2,1
886,0,2,0,-0.182302,0,0,0,1
887,1,1,1,-0.734295,0,0,0,1
889,1,1,0,-0.251301,0,0,1,1


In [32]:
from sklearn.model_selection import train_test_split

# The label is the Survived column; the features are every other column.
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])

# Hold out 15% of the rows for validation. The fixed random_state keeps the
# split repeatable. Plain NumPy arrays are passed on because the training
# loops use matrix products directly.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.15,
    random_state=69,
)

In [33]:
# Sanity check: shapes of the train/test feature matrices and label vectors.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((605, 7), (107, 7), (605,), (107,))

In [115]:
def MSELoss(y_true: np.ndarray, y_pred: np.ndarray) -> np.floating:
    """Mean squared error between targets and predictions.

    Both inputs are flattened first, so an (n,) target vector can be compared
    with an (n, 1) prediction column without broadcasting to an (n, n) matrix.

    Args:
        y_true: Ground-truth values (array or array-like), any shape.
        y_pred: Predictions with the same number of elements as ``y_true``.

    Returns:
        The mean of the squared differences as a NumPy floating scalar.

    Raises:
        ValueError: If the inputs are empty or differ in element count.
    """
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()
    if y_true.size == 0:
        raise ValueError("MSELoss requires at least one element")
    # Without this check a length-1 input would broadcast silently against the
    # other one and the sum would be divided by the wrong count.
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    return (1/len(y_true)) * np.sum((y_pred - y_true) ** 2)

def MAELoss(y_true: np.ndarray, y_pred: np.ndarray) -> np.floating:
    """Mean absolute error between targets and predictions.

    Both inputs are flattened first, so an (n,) target vector can be compared
    with an (n, 1) prediction column without broadcasting to an (n, n) matrix.

    Args:
        y_true: Ground-truth values (array or array-like), any shape.
        y_pred: Predictions with the same number of elements as ``y_true``.

    Returns:
        The mean of the absolute differences as a NumPy floating scalar.

    Raises:
        ValueError: If the inputs are empty or differ in element count.
    """
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()
    if y_true.size == 0:
        raise ValueError("MAELoss requires at least one element")
    # Without this check a length-1 input would broadcast silently against the
    # other one and the sum would be divided by the wrong count.
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    return (1/len(y_true)) * np.sum(np.abs((y_pred - y_true)))

def accuracy_with_thresholding(y_true: np.ndarray, y_pred: np.ndarray, threshold: float = 0.5) -> np.floating:
    """Fraction of samples whose thresholded prediction equals the label.

    A prediction counts as class 1 when it is strictly greater than
    ``threshold`` and as class 0 otherwise. Both inputs are flattened first,
    so (n,) labels can be scored against an (n, 1) prediction column.

    Args:
        y_true: Ground-truth 0/1 labels (array or array-like), any shape.
        y_pred: Predicted scores with the same number of elements as ``y_true``.
        threshold: Decision boundary applied to ``y_pred``.

    Returns:
        The share of matching labels as a NumPy floating scalar.

    Raises:
        ValueError: If the inputs are empty or differ in element count.
    """
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()
    if y_true.size == 0:
        raise ValueError("accuracy_with_thresholding requires at least one element")
    # Without this check a length-1 label array would broadcast silently and
    # the function could report an "accuracy" above 1.
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    predicted_labels = (y_pred > threshold).astype(np.int32)
    return np.sum(y_true == predicted_labels) / len(y_true)
    
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    For strongly negative inputs exp(-z) overflows to inf. The quotient
    1 / (1 + inf) is still 0.0, which is the correct limit, so the overflow is
    harmless. NumPy's overflow warning is silenced locally so that it does not
    leak a RuntimeWarning into the training output; returned values are
    unchanged.

    Args:
        z: Scalar or array of pre-activation values.

    Returns:
        The logistic of ``z``, with the same shape as ``z``.
    """
    with np.errstate(over="ignore"):
        return 1 / (1 + np.exp(-z))


# Full-Batch Gradient Descent

In [126]:
# Fixed seed so the random starting weights are identical on every run.
np.random.seed(69)
# Seven uniform [0, 1) draws laid out as a (7, 1) column vector.
W = np.random.rand(7, 1)
learning_rate = 5e-05

In [None]:
epochs = 5000
log_every = 100  # print metrics once per this many epochs
# Loop-invariant: targets as an (n, 1) column so they line up with the (n, 1) predictions.
y_train_col = y_train.reshape(-1, 1)

for epoch in range(epochs):
    # Forward pass over the whole training set.
    y_pred = sigmoid(np.matmul(X_train, W))

    # Gradient step for the squared error (up to a constant factor): residual
    # times the sigmoid derivative y_pred * (1 - y_pred), projected back
    # through the inputs. The gradient is summed over rows, not averaged.
    W = W - learning_rate * np.matmul(X_train.T, (y_pred - y_train_col) * (y_pred * (1 - y_pred)))

    is_log_epoch = (epoch + 1) % log_every == 0
    # Metrics are only consumed when logging, so the validation pass is skipped
    # on other epochs. The last epoch is always evaluated so the variables left
    # behind after the loop describe the final weights.
    if is_log_epoch or epoch == epochs - 1:
        # Training loss of the predictions made before this epoch's update;
        # the validation metrics use the updated weights.
        train_loss = MSELoss(y_train, y_pred)
        y_pred_val = sigmoid(np.matmul(X_test, W))
        validation_loss = MSELoss(y_test, y_pred_val)
        validation_accuracy = accuracy_with_thresholding(y_test, y_pred_val)

    if is_log_epoch:
        print(f"Accuracy: {validation_accuracy}, Training Loss: {train_loss}, Validation Loss: {validation_loss}")
    

Accuracy: 0.6542056074766355, Training Loss: 0.2659838433192597, Validation Loss: 0.2493421244585776
Accuracy: 0.719626168224299, Training Loss: 0.22361600205484658, Validation Loss: 0.21337818861036278
Accuracy: 0.7009345794392523, Training Loss: 0.2101098947839723, Validation Loss: 0.2045607648473325
Accuracy: 0.7009345794392523, Training Loss: 0.20225861098363515, Validation Loss: 0.19985480473574646
Accuracy: 0.7009345794392523, Training Loss: 0.1958965940141562, Validation Loss: 0.19568709709248203
Accuracy: 0.719626168224299, Training Loss: 0.19022805168284393, Validation Loss: 0.1915548249478347
Accuracy: 0.7289719626168224, Training Loss: 0.18515919805569891, Validation Loss: 0.187507923553139
Accuracy: 0.7289719626168224, Training Loss: 0.18071125358891585, Validation Loss: 0.18368681626405506
Accuracy: 0.7383177570093458, Training Loss: 0.1768782999640983, Validation Loss: 0.18020312027510887
Accuracy: 0.7850467289719626, Training Loss: 0.17361345996027738, Validation Loss: 0

# Mini-Batch Gradient Descent

In [134]:
# Re-seed and redraw so mini-batch training starts from the same weights as the
# full-batch run above.
np.random.seed(69)
W = np.random.rand(7, 1)  # (7, 1) column of uniform [0, 1) values
learning_rate = 5e-05

In [135]:
import math

epochs = 5000
batch_size = 100
log_every = 100  # print metrics once per this many epochs
# Round up so a final, shorter batch still gets processed.
n_batches = math.ceil(X_train.shape[0] / batch_size)

for epoch in range(epochs):
    # Batches are consecutive row slices taken in the same order every epoch
    # (no shuffling).
    for batch_no in range(n_batches):
        batch_rows = slice(batch_no * batch_size, (batch_no + 1) * batch_size)
        batch_x_train = X_train[batch_rows, :]
        batch_y_train = y_train[batch_rows]

        # Forward pass and gradient step on this batch only. The gradient is
        # summed over the batch rows, not averaged.
        y_pred = sigmoid(np.matmul(batch_x_train, W))
        W = W - learning_rate * np.matmul(batch_x_train.T, (y_pred - batch_y_train.reshape(-1,1)) * (y_pred * (1-y_pred)))

    is_log_epoch = (epoch + 1) % log_every == 0
    # Metrics are only consumed when logging, so they are computed once per
    # logged epoch instead of after every batch. The last epoch is always
    # evaluated so the variables left behind describe the final weights.
    if is_log_epoch or epoch == epochs - 1:
        # NOTE(review): this is the loss of the LAST mini-batch of the epoch
        # only, using its pre-update predictions. With the 605-row training
        # split and batch_size = 100 that batch has just 5 rows, so the number
        # is not comparable with the full-set validation loss printed next to it.
        train_loss = MSELoss(batch_y_train, y_pred)
        y_pred_val = sigmoid(np.matmul(X_test, W))
        validation_loss = MSELoss(y_test, y_pred_val)
        validation_accuracy = accuracy_with_thresholding(y_test, y_pred_val)

    if is_log_epoch:
        print(f"Accuracy: {validation_accuracy}, Training Loss: {train_loss}, Validation Loss: {validation_loss}")
    

Accuracy: 0.6542056074766355, Training Loss: 0.1754968472295485, Validation Loss: 0.2494405864959342
Accuracy: 0.719626168224299, Training Loss: 0.14341053253149968, Validation Loss: 0.2134183998144617
Accuracy: 0.7009345794392523, Training Loss: 0.1408876812652112, Validation Loss: 0.2045579925243664
Accuracy: 0.7009345794392523, Training Loss: 0.14004271058386566, Validation Loss: 0.1998420226029134
Accuracy: 0.7009345794392523, Training Loss: 0.1379681131188164, Validation Loss: 0.19567460042008283
Accuracy: 0.719626168224299, Training Loss: 0.13473012532267595, Validation Loss: 0.1915450413606585
Accuracy: 0.7289719626168224, Training Loss: 0.1307286215458395, Validation Loss: 0.1875010851636707
Accuracy: 0.7289719626168224, Training Loss: 0.12636575465629346, Validation Loss: 0.1836826352653255
Accuracy: 0.7383177570093458, Training Loss: 0.1219738932342701, Validation Loss: 0.18020116352957835
Accuracy: 0.7850467289719626, Training Loss: 0.11777855368809412, Validation Loss: 0.17