# Logistic Regression on Titanic dataset from Scratch

In [1]:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
from scipy.special import expit
np.seterr(divide = 'ignore') 

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
y_test = pd.read_csv('gender_submission.csv')
df_test = pd.merge(df_test, y_test, on='PassengerId')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [3]:
df = df.drop(columns=['PassengerId','Name', 'Ticket', 'Cabin'])
df_test = df_test.drop(columns=['PassengerId','Name', 'Ticket', 'Cabin'])
df = df.dropna()
df_test = df_test.dropna()
print("Training data shape", df.shape)
print("Testing data shape", df_test.shape)

Training data shape (712, 8)
Testing data shape (331, 8)


In [4]:
df['Age'] = df['Age']/df['Age'].max()
df['Fare'] = df['Fare']/df['Fare'].max()
df_test['Age'] = df_test['Age']/df['Age'].max()
df_test['Fare'] = df_test['Fare']/df['Fare'].max()

In [5]:
df['Sex'].replace('female', 0, inplace=True)
df['Sex'].replace('male', 1, inplace=True)
df['Embarked'].replace('S', 0, inplace=True)
df['Embarked'].replace('C', 1, inplace=True)
df['Embarked'].replace('Q', 2, inplace=True)

In [6]:
df_test['Sex'].replace('female', 0, inplace=True)
df_test['Sex'].replace('male', 1, inplace=True)
df_test['Embarked'].replace('S', 0, inplace=True)
df_test['Embarked'].replace('C', 1, inplace=True)
df_test['Embarked'].replace('Q', 2, inplace=True)

In [7]:
Y_train = df['Survived'].values
X_train = df.drop(['Survived'], axis=1).values
Y_test = df_test['Survived'].values
X_test = df_test.drop(['Survived'], axis=1).values

In [8]:
X_train = X_train.T
Y_train = Y_train.reshape(1, Y_train.shape[0])
X_test = X_test.T
Y_test = Y_test.reshape(1, Y_test.shape[0])

print("Shape of X_train and Y_train is:", (X_train.shape, Y_train.shape))
print("Shape of X_test and Y_test is:", (X_test.shape, Y_test.shape))

Shape of X_train and Y_train is: ((7, 712), (1, 712))
Shape of X_test and Y_test is: ((7, 331), (1, 331))


## Logistic Regression

The formula for each is z = wTX + b, 
a = sig(z)

Initialise w and b

In [9]:
def init(X):
    w = np.random.randn(X.shape[0], 1) * .01
    b = np.zeros((1, 1))
#     print("w transpose has a shape of", w.shape)
#     print("b has a shape of", b.shape)
    return {
        'w': w,
        'b': b
    }

In [10]:
def sigmoid(z):
    return 1/(1 + np.exp(-z))

In [11]:
def propogate(w, b, X, Y):
    
    m = Y.shape[1]
    
    z = np.dot(w.T,X) + b
    a = sigmoid(z)
    J = -np.mean(Y*np.log(a) + (1-Y)*np.log(1-a))
    
    dw = np.dot(X,((a-Y).T))/m
    db = np.sum(a-Y)/m
    
    grad={
        'dw': dw,
        'db': db
    }
    
    return J, grad

In [12]:
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    cost = []
    for i in range(num_iterations):
        J, grad = propogate(w, b, X, Y)
        cost.append(J)
        dw = grad['dw']
        db = grad['db']
        
        w = w - learning_rate*dw
        b = b - learning_rate*db
        
        if print_cost and i%100 == 0:
            print("Cost at %ith iteration is %f" %(i, J))
    
#     plt.plot(cost)
#     plt.ylabel('cost')
#     plt.xlabel('iterations (per hundreds)')
#     plt.show()
    
    params = {
        'w': w,
        'b': b
    }
    return cost, params
        

In [13]:
def predict(params, X):
    m = X.shape[1]
    
    w = params['w']
    b = params['b']
    
    z = np.dot(w.T,X) + b
    a = sigmoid(z)
    
    predict = (a + 0.5)//1
    
    return predict    

In [14]:
def model(X_train, Y_train, X_test, Y_test, num_iterations, learning_rate, print_cost):
#     initialise the parameters
    init_para = init(X_train)
    w = init_para['w']
    b = init_para['b']
    
#     Training the model to get parameters acc to the hyperparameters
    cost, params = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)
    
#     Predict and print accuracy
    Y_train_pred = predict(params, X_train)
    Y_test_pred = predict(params, X_test)
    
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_train_pred - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_test_pred - Y_test)) * 100))
    d = {
        "costs": cost,
         "Y_prediction_test": Y_test_pred, 
         "Y_prediction_train" : Y_train_pred, 
         "w" : params['w'], 
         "b" : params['b'],
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations
    }
    
    return d

In [15]:
d = model(X_train, Y_train, X_test, Y_test, 100, 0.01, False)

train accuracy: 59.55056179775281 %
test accuracy: 65.86102719033232 %


In [16]:
learning_rates = [0.01, 0.001, 0.0001]
models = {}
for i in learning_rates:
    print ("learning rate is: " + str(i))
    models[str(i)] = model(X_train, Y_train, X_test, Y_test, 200, i, False)
    print ('\n' + "-------------------------------------------------------" + '\n')

learning rate is: 0.01
train accuracy: 59.97191011235955 %
test accuracy: 66.46525679758308 %

-------------------------------------------------------

learning rate is: 0.001
train accuracy: 59.55056179775281 %
test accuracy: 61.631419939577036 %

-------------------------------------------------------

learning rate is: 0.0001
train accuracy: 30.47752808988764 %
test accuracy: 37.46223564954683 %

-------------------------------------------------------

