In [3]:
import pandas as pd
import numpy as np
from math import e
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from numpy import random

# Load the dataset
titanic = sns.load_dataset("titanic")


def preprocess_titanic_data(df):

    df = df.drop(['embarked', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], axis=1)

    df['age'].fillna(df['age'].median(), inplace=True)
    
    labelencoder = LabelEncoder()
    df['sex'] = labelencoder.fit_transform(df['sex'])
    df['class'] = labelencoder.fit_transform(df['class'])
    

    X = df.drop('survived', axis=1)
    y = df['survived']
    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = preprocess_titanic_data(titanic)

In [4]:
# Sigmoid function
def sigmoid(logit):
    return 1 / (1 + np.exp(-logit))

# Loss function: binary crossentropy
def loss_calculator(y_pred, y_act):
    # clipping because otherwise the log function can go to infinity
    y_pred= np.clip(y_pred, 1e-15, 1 - 1e-15)
    return -np.mean(y_act * np.log(y_pred) + (1 - y_act) * np.log(1 - y_pred))

# Gradient Descent
def gradient_descent(X, y_pred, y_act, weights, learning_rate=0.01):
    m = len(y_act)
    
    # Transposing X so that the dot product is feasible
    gradient = np.dot(X.T, (y_pred - y_act)) / m
    weights -= learning_rate * gradient
    return weights

# Logistic Regression
def logistic_regression(X, y_act, epochs=1000, learning_rate=0.01):
    
    # adding the intercept term to the dataset so that its easier to take the dot product 
    X = np.hstack((np.ones((X.shape[0], 1)), X))

    # Initialize weights
    weights = random.randn(X.shape[1])
    print(weights)
    for i in range(epochs):
        # Compute logit
        logit = np.dot(X, weights)

        y_pred = sigmoid(logit)
        
        loss = loss_calculator(y_pred, y_act)
        
        weights = gradient_descent(X, y_pred, y_act, weights, learning_rate)
        
        # print loss every 100 epochs
        if i % 100 == 0:
            print(f"Epoch {i}, Loss: {loss}")
            
    return weights

def predict(weights, X_test, y_test):
    test= np.hstack((np.ones((X_test.shape[0],1)), X_test))
    y_prob= sigmoid(np.dot(test,weights))
    output_list=[0 for i in range(len(X_test))]
    for index, i in enumerate(y_prob):
        if i>0.5:
            output_list[index]=1
        else:
            output_list[index]=0
    return output_list
    

# Run logistic regression
weights = logistic_regression(X_train, y_train)
print("Final weights:", weights)

[-0.1283938  -1.19870739 -1.36619563  0.53180196  1.17908404 -0.08609342
 -0.75935275 -0.51763074]
Epoch 0, Loss: 0.7035341736105
Epoch 100, Loss: 0.6524156614404523
Epoch 200, Loss: 0.6125959818403729
Epoch 300, Loss: 0.5815060369338859
Epoch 400, Loss: 0.5571096745897782
Epoch 500, Loss: 0.537865657244996
Epoch 600, Loss: 0.5225744377047781
Epoch 700, Loss: 0.5102903592122138
Epoch 800, Loss: 0.5002988071473173
Epoch 900, Loss: 0.4920850378131642
Final weights: [-0.59324435 -0.87968143 -1.38884711 -0.1237624   0.24395693 -0.20334404
 -0.37459791 -0.19860478]


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_model(y_true, y_pred):

    
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred)
    metrics['Recall'] = recall_score(y_true, y_pred)
    metrics['F1 Score'] = f1_score(y_true, y_pred)
    
    metrics['True Positives']= confusion_matrix(y_true,y_pred)[1][1]
    metrics['True Negatives']= confusion_matrix(y_true,y_pred)[0][0]
    metrics['False Positives']= confusion_matrix(y_true,y_pred)[0][1]
    metrics['False Negatives']= confusion_matrix(y_true,y_pred)[1][0]
    #metrics['Confusion Matrix'] = confusion_matrix(y_true, y_pred)
    
    return metrics

y_pred = predict(weights, X_test, y_test)
metrics = evaluate_model(y_test, y_pred)
print("Model Evaluation Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value}")

Model Evaluation Metrics:
Accuracy: 0.8212290502793296
Precision: 0.7625
Recall: 0.8243243243243243
F1 Score: 0.7922077922077922
True Positives: 61
True Negatives: 86
False Positives: 19
False Negatives: 13


-----------

## The logistic regression model shows a high level of accuracy (82.12%) in predicting the survival outcomes of passengers on the Titanic. The model also demonstrates a good balance between Precision and Recall, as evidenced by the F1 Score of 79.22%. This suggests that the model is both reliable and robust in identifying both classes—those who survived and those who did not