In [128]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are read from the current working directory ('./')
# For example, running this (by clicking run or pressing Shift+Enter) will list all CSV files under the current directory

import os
# Walk the working directory tree and print the path of every CSV file found
for dirname, _, filenames in os.walk('./'):
    csv_paths = [os.path.join(dirname, name) for name in filenames if name.endswith('.csv')]
    for csv_path in csv_paths:
        print(csv_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./gender_submission.csv
./test.csv
./train.csv


In [130]:
# Read the training set (train.csv in the working directory) into a DataFrame
train_data = pd.read_csv(filepath_or_buffer="./train.csv")

# Preview the first five rows; as the last expression of the cell it is rendered as the output
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [132]:
# Read the test set (test.csv in the working directory) into a DataFrame
test_data = pd.read_csv(filepath_or_buffer="./test.csv")

# Preview the first five rows; as the last expression of the cell it is rendered as the output
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [134]:
# Survival rate of female passengers in train.csv.
# The value is a fraction between 0 and 1 (mean of the 0/1 Survived flag), not a 0-100 percentage.
women = train_data.loc[train_data["Sex"] == 'female', "Survived"]
rate_women = sum(women) / len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [183]:
# Survival rate of male passengers in train.csv.
# The value is a fraction between 0 and 1 (mean of the 0/1 Survived flag), not a 0-100 percentage.
men = train_data.loc[train_data["Sex"] == 'male', "Survived"]
rate_men = sum(men) / len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


# Titanic Logistic Regression Model

In [220]:
# Features and target labels
# Only columns that can be expressed as numbers are used as features. The raw
# frame also holds text columns (Name, Ticket, Cabin, Embarked) that the matrix
# products of the model cannot operate on, and PassengerId is only a row
# identifier, so those columns are left out.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
features = train_data[feature_columns].copy()

# Encode Sex numerically: 1.0 for 'female', 0.0 otherwise
features['Sex'] = (features['Sex'] == 'female').astype(float)

# Replace any missing numeric entry with its column median so that no NaN
# reaches the cost and gradient computations
features = features.fillna(features.median())

# X has shape (n_features, m) and Y has shape (1, m): one column per passenger
# NOTE(review): the features are not scaled; consider standardising them if
# gradient descent converges slowly with the chosen learning rate.
X = features.to_numpy(dtype=float).T
Y = train_data['Survived'].values.reshape(1, -1)

In [222]:
# Sigmoid function
import math
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    The value is computed in a numerically stable way: the exponential is only
    ever evaluated on non-positive numbers, so very large |z| yields exactly
    0.0 or 1.0 instead of an OverflowError (scalars) or overflow warnings
    (arrays).

    Parameters
    ----------
    z : scalar or array-like of numbers
        Input value(s).

    Returns
    -------
    numpy float for a scalar input, otherwise an ndarray with the shape of z,
    holding values in [0, 1].
    """
    z = np.asarray(z, dtype=float)

    # exp(-|z|) lies in (0, 1], so it can underflow to 0 but never overflow
    decay = np.exp(-np.abs(z))

    # For z >= 0 use 1 / (1 + e^-z); for z < 0 use the equivalent e^z / (1 + e^z)
    a = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # Scalars in, scalars out: unwrap a 0-d array into a numpy float
    return a if a.ndim else a[()]

In [224]:
# Initializing Parameters
def init_zeros(dim):
    """Create zero-valued starting parameters for the model.

    Parameters
    ----------
    dim : int
        Number of rows of the weight vector.

    Returns
    -------
    (w, b) : w is a (dim, 1) array of zeros, b is the float 0.0.
    """
    weights = np.zeros(shape=(dim, 1))
    bias = 0.0
    return weights, bias

In [226]:
# Forward and Backward Propagation
def forward_backward(X, Y, w, b, eps=1e-15):
    """Run one forward and backward pass of logistic regression.

    Parameters
    ----------
    X : ndarray
        Input matrix with one column per example (m = X.shape[1]).
    Y : ndarray
        0/1 labels, broadcastable against the activations (the notebook uses
        shape (1, m)).
    w : ndarray
        Weight vector; its transpose is multiplied with X.
    b : float
        Bias term.
    eps : float, optional
        Activations are clipped to [eps, 1 - eps] before the logarithms of the
        cost are taken. The gradients use the unclipped activations.

    Returns
    -------
    grads : dict
        {'dw': gradient of the cost w.r.t. w, 'db': gradient w.r.t. b}.
    cost : ndarray (0-d after squeeze)
        Mean cross-entropy cost over the m examples.
    """
    m = X.shape[1]

    # Forward
    A = sigmoid(np.transpose(w) @ X + b)

    # A saturated sigmoid returns exactly 0 or 1, and log(0) would turn the
    # cost into inf/NaN; clipping keeps both logarithms finite.
    A_safe = np.clip(A, eps, 1 - eps)
    cost = -(1/m) * np.sum(Y * np.log(A_safe) + (1 - Y) * np.log(1 - A_safe))

    # Backward
    dZ = A - Y
    dw = (1/m) * X @ np.transpose(dZ)
    db = (1/m) * np.sum(dZ)

    cost = np.squeeze(cost)
    grads = {'dw': dw, 'db': db}

    return grads, cost

In [228]:
# Gradient Descent
def gradient_descent(X, Y, w, b, num_iters, alpha, verbose=False, log_every=100):
    """Fit w and b with batch gradient descent.

    Parameters
    ----------
    X, Y : ndarray
        Inputs and labels, passed unchanged to forward_backward.
    w : ndarray
        Initial weights.
    b : float
        Initial bias.
    num_iters : int
        Number of update steps to perform.
    alpha : float
        Learning rate.
    verbose : bool, optional
        When True, print each recorded cost.
    log_every : int, optional
        A cost is recorded (and optionally printed) on every iteration whose
        index is a multiple of this value. Defaults to 100.

    Returns
    -------
    params : dict
        {'w': final weights, 'b': final bias}.
    costs : list
        Costs recorded every `log_every` iterations. Each one is the cost
        evaluated with the parameters as they were before that iteration's update.

    Raises
    ------
    ValueError
        If log_every is not a positive integer.
    """
    if log_every <= 0:
        raise ValueError("log_every must be a positive integer")

    costs = []

    for i in range(num_iters):
        grads, cost = forward_backward(X, Y, w, b)

        # Step against the gradient; new arrays are created, the caller's w is not modified
        w = w - alpha * grads['dw']
        b = b - alpha * grads['db']

        if i % log_every == 0:
            costs.append(cost)
            if verbose:
                print("Cost after iter {}: {}".format(i, cost))

    params = {'w': w, 'b': b}

    return params, costs

In [230]:
# Predict
def predict(X, w, b):
    """Turn the model's activations into hard 0/1 predictions.

    Parameters
    ----------
    X : ndarray
        Input matrix with one column per example.
    w : ndarray
        Weight vector; its transpose is multiplied with X.
    b : float
        Bias term.

    Returns
    -------
    ndarray of shape (1, m) holding 1.0 where the activation is strictly
    greater than 0.5 and 0.0 otherwise.
    """
    n_examples = X.shape[1]
    labels = np.zeros((1, n_examples))

    probabilities = sigmoid(np.transpose(w) @ X + b)

    # Threshold every column of the first activation row in one vectorised step
    columns = np.arange(probabilities.shape[1])
    labels[0, columns] = np.where(probabilities[0][columns] > 0.5, 1, 0)

    return labels

In [234]:
# Putting it together for the model
def model(X_train, Y_train, X_test, Y_test, num_iters=2000, alpha=0.005, verbose=False):
    """Train logistic regression and report train/test accuracy.

    Parameters
    ----------
    X_train, X_test : ndarray
        Input matrices with one column per example.
    Y_train, Y_test : ndarray
        0/1 labels compared element-wise with the predictions (which have
        shape (1, m)).
    num_iters : int, optional
        Number of gradient-descent steps.
    alpha : float, optional
        Learning rate.
    verbose : bool, optional
        Passed on to gradient_descent to print the recorded costs.

    Returns
    -------
    dict with keys 'w' (learned weights), 'b' (learned bias), 'costs'
    (costs recorded during training) and 'Y_pred_test' (predictions for X_test).
    """
    # Size the weights from the training matrix that was passed in, so the
    # function does not depend on any notebook-level variable
    w, b = init_zeros(X_train.shape[0])

    params, costs = gradient_descent(X_train, Y_train, w, b, num_iters, alpha, verbose)

    w = params['w']
    b = params['b']

    Y_pred_train = predict(X_train, w, b)
    Y_pred_test = predict(X_test, w, b)

    # Accuracy = share of predictions that equal the label
    acc_train = np.mean(Y_pred_train == Y_train)
    acc_test = np.mean(Y_pred_test == Y_test)

    print('train accuracy: {} %'.format(100 * acc_train))
    print('test accuracy: {} %'.format(100 * acc_test))

    result = {
        'w': w,
        'b': b,
        'costs': costs,
        'Y_pred_test': Y_pred_test
    }

    return result