In [109]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        if filename.endswith('.csv'):
            print(os.path.join(dirname, filename))

./gender_submission.csv
./test.csv
./train.csv


In [111]:
# Loading the train.csv file into the notebook
train_data = pd.read_csv("./train.csv")

# Outputs the first five rows of the table in train.csv
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [113]:
# Loading the test.csv file into the notebook
test_data = pd.read_csv("./test.csv")

# Outputs the first five rows of the table in test.csv
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [115]:
# Percentage of female passengers (in train.csv) who survived
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [117]:
# Percentage of male passengers (in train.csv) who survived
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


# Titanic Logistic Regression Model

In [120]:
train_data = pd.get_dummies(train_data, columns=["Sex"], drop_first=True)

X_train = train_data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "Fare", "Embarked", "Survived"])
Y_train = train_data["Survived"]
X_test = test_data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "Fare", "Embarked"])

X_train = np.array(X_train).T
Y_train = np.array(Y_train).reshape(1, -1)
X_test = np.array(X_test).T

print()
print('X_train shape {}'.format(X_train.shape))
print('Y_train shape {}'.format(Y_train.shape))
print('X_test shape {}'.format(X_test.shape))

print(pd.isna(X_train))


X_train shape (5, 891)
Y_train shape (1, 891)
X_test shape (5, 418)
[[False False False ... False False False]
 [False False False ...  True False False]
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]


In [8]:
# Sigmoid function
import math
def sigmoid(z):
    e = math.e
    a = 1/(1 + e**(-z))
    return a

In [9]:
# Evaluating
print('sigmoid(-10) = {}'.format(sigmoid(-10)))
print('sigmoid(10) = {}'.format(sigmoid(10)))

sigmoid(-10) = 4.539786870243442e-05
sigmoid(10) = 0.9999546021312976


In [10]:
# Initializing Parameters
def init_zeros(dim):
    w = np.zeros((dim, 1))
    b = 0.0
    return w, b

In [11]:
# Evaluating
dim = 3
w, b = init_zeros(dim)
print('w = {}'.format(w))
print('b = {}'.format(b))

w = [[0.]
 [0.]
 [0.]]
b = 0.0


In [12]:
# Forward and Backward Propagation
def forward_backward(X, Y, w, b):
    m = X.shape[1]

    # Forward    
    A = sigmoid(w.T @ X + b)    
    
    A = np.nan_to_num(A, nan=1e-8)
    print(A)
    
    cost = -(1/m) * np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A))

    # Backward
    dZ = A - Y
    dw = (1/m) * X @ np.transpose(dZ)
    db = (1/m) * np.sum(dZ)

    cost = np.squeeze(cost)
    grads = {'dw': dw, 'db': db}
    
    return grads, cost

In [13]:
# Evaluating
X = np.array([[1,2,-3,0],[0.5,6,-5,0]])
Y = np.array([[1,0,1,0]])
w = np.array([[1],[2]])
b = 0
grads, cost = forward_backward(X, Y, w, b)

print('dw = {}'.format(grads['dw']))
print('db = {}'.format(grads['db']))
print('cost = {}'.format(cost))

[[8.80797078e-01 9.99999168e-01 2.26032430e-06 5.00000000e-01]]
dw = [[1.22019716]
 [2.73509556]]
db = 0.09519962669353813
cost = 6.9550195708335805


In [14]:
# Gradient Descent
def gradient_descent(X, Y, w, b, num_iters, alpha, verbose=False):
    costs = []
    m = X.shape[1]
    
    for i in range(num_iters):
        grads, cost = forward_backward(X, Y, w, b)

        dw = grads['dw']
        db = grads['db']

        w = w - alpha * dw
        b = b - alpha * db

        if i % 100 == 0:
            costs.append(cost)
        if i % 100 == 0 and verbose:
            print("Cost after iter {}: {}".format(i, cost))
        
    params = {'w': w, 'b': b}
        
    return params, costs

In [15]:
# Evaluating
params, costs = gradient_descent(X, Y, w, b, num_iters=1000, alpha=0.01)
print('w = {}'.format(params['w']))
print('b = {}'.format(params['b']))

[[8.80797078e-01 9.99999168e-01 2.26032430e-06 5.00000000e-01]]
[[8.77951254e-01 9.99998995e-01 2.68563331e-06 4.99762001e-01]]
[[8.75048945e-01 9.99998785e-01 3.19086898e-06 4.99525929e-01]]
[[8.72089562e-01 9.99998532e-01 3.79103014e-06 4.99291819e-01]]
[[8.69072537e-01 9.99998226e-01 4.50392569e-06 4.99059705e-01]]
[[8.65997322e-01 9.99997857e-01 5.35070011e-06 4.98829622e-01]]
[[8.62863392e-01 9.99997410e-01 6.35645701e-06 4.98601604e-01]]
[[8.59670243e-01 9.99996871e-01 7.55099905e-06 4.98375688e-01]]
[[8.56417396e-01 9.99996219e-01 8.96970580e-06 4.98151910e-01]]
[[8.53104399e-01 9.99995432e-01 1.06545752e-05 4.97930304e-01]]
[[8.49730826e-01 9.99994482e-01 1.26554589e-05 4.97710908e-01]]
[[8.46296280e-01 9.99993334e-01 1.50315284e-05 4.97493757e-01]]
[[8.42800394e-01 9.99991947e-01 1.78530126e-05 4.97278889e-01]]
[[8.39242833e-01 9.99990273e-01 2.12032603e-05 4.97066340e-01]]
[[8.35623294e-01 9.99988252e-01 2.51811850e-05 4.96856147e-01]]
[[8.31941510e-01 9.99985811e-01 2.990416

In [16]:
# Predict
def predict(X, w, b):
    m = X.shape[1]
    Y_pred = np.zeros((1, m))

    A = sigmoid(np.transpose(w) @ X + b)

    for i in range(A.shape[1]):
        Y_pred[0, i] = 1 if A[0][i] > 0.5 else 0
    
    return Y_pred

In [17]:
# Evaluating
print('predictions = {}'.format(predict(X, w, b)))
print('predictions = {}'.format(predict(X, params['w'], params['b'])))

predictions = [[1. 1. 0. 0.]]
predictions = [[1. 0. 1. 1.]]


In [18]:
# Putting it together for the model
def model(X_train, Y_train, X_test, num_iters=1000, alpha=0.01, verbose=False):
    w, b = init_zeros(X_train.shape[0])
    params, costs = gradient_descent(X_train, Y_train, w, b, num_iters, alpha, verbose)
    
    w = params['w']
    b = params['b']
    
    Y_pred_train = predict(X_train, w, b)
    Y_pred_test = predict(X_test, w, b)
    
    result = {
        'w': w,
        'b': b,
        'costs': costs,
        'Y_pred_test': Y_pred_test
    }
    
    return result

In [19]:
# Evaluating
res = model(X_train, Y_train, X_test, num_iters=1500, alpha=0.002, verbose=True)

[[0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan
  0.5 nan 0.5 0.5 0.5 0.5 0.5 0.5 nan 0.5 nan nan 0.5 nan nan 0.5 0.5 0.5
  nan 0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 nan nan nan nan 0.5 0.5 0.5 0.5 0.5
  0.5 nan 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan nan 0.5 0.5 0.5 0.5 0.5 0.5
  0.5 0.5 0.5 0.5 nan nan 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5 0.5 nan 0.5 0.5
  0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5 0.5 0.5 nan
  0.5 nan 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5 0.5
  nan 0.5 nan 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5
  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5 nan nan 0.5 0.5
  0.5 0.5 0.5 0.5 nan 0.5 nan 0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5
  nan nan 0.5 0.5 0.5 nan nan 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan 0.5
  nan 0.5 0.5 nan 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan 0.5
  0.5 0.5 0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5 0.5 0.5 nan 0.5 0.5 0.5 0.5
  0.5 nan 0.5 0.5 0.5 0.5 nan nan 0.5 

TypeError: loop of ufunc does not support argument 0 of type float which has no callable log method