In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf


In [2]:
train = pd.read_csv("titanic/train.csv")

In [3]:
train.Embarked

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [4]:
def sanitise_data(data, test=False):
    if not test:
        del data['Survived']
        
    for col in ['PassengerId','Name','Ticket']:
        del data[col]

    data.Cabin = data.Cabin.isna().astype(int)
    data.Embarked = data.Embarked.map({'S':1.,'C':0.6,'Q':0.3,np.nan:0})
    data.Sex = data.Sex.map({'male':0,'female':1})
    
    for col in ['Pclass', 'Age', 'Parch', 'SibSp', 'Fare']:
        data[col] = data[col].replace(np.nan, 0)
        data[col] = (data[col] - data[col].mean()) / data[col].std()
        
    data['bias'] = pd.Series([1 for i in range(len(data))])
    return data

In [5]:
y = train['Survived']
x = sanitise_data(train).to_numpy()

In [6]:
x[61]

array([-1.56522783,  1.        ,  0.80703838, -0.47427882, -0.47340772,
        0.96181313,  0.        ,  0.        ,  1.        ])

In [7]:
theta = np.random.randn((x.shape[1]))

m = x.shape[0]
J = 0
grad = np.zeros(m)

In [8]:
def h(X, theta):
    return 1./(1 + np.exp(-(theta @ X.T)))

In [9]:
def cost(theta, X, y):
    A = h(x, theta)
    J = (1/m) * np.sum((-(y.T) * np.log(A)) - ((1 - y.T) * np.log(1 - A)))
    err = A - y
    grad = (1/m) * (err.T @ X)
    return (J, grad)
 

In [10]:
theta

array([ 0.55936815,  0.98488206, -0.00233559, -0.79965477, -1.60758038,
        0.37660351,  0.60745165, -0.26881754,  0.44734198])

In [11]:
(J, grad) = cost(theta, x, y)

In [12]:
grad

array([ 0.23187194, -0.02295486, -0.00274136, -0.15460187, -0.28306782,
       -0.19846774,  0.34490686,  0.28271769,  0.32141681])

In [13]:
for i in range(10000):
    theta = theta - (0.01 * grad)
    (J, grad) = cost(theta, x, y)
    if i % 1000 == 0:
        print("Cost: {}".format(J))


Cost: 1.2831585249493838
Cost: 0.49582033361271516
Cost: 0.46953646349010875
Cost: 0.4593240640902637
Cost: 0.45422949721560224
Cost: 0.45171020665083006
Cost: 0.4504514068506722
Cost: 0.4497891591652328
Cost: 0.4494157696567433
Cost: 0.44919031887333283


In [25]:
test = pd.read_csv("test.csv")
testIds = test['PassengerId']
t = sanitise_data(test, test=True).to_numpy()
tA = h(t, theta)

array([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,  902,
        903,  904,  905,  906,  907,  908,  909,  910,  911,  912,  913,
        914,  915,  916,  917,  918,  919,  920,  921,  922,  923,  924,
        925,  926,  927,  928,  929,  930,  931,  932,  933,  934,  935,
        936,  937,  938,  939,  940,  941,  942,  943,  944,  945,  946,
        947,  948,  949,  950,  951,  952,  953,  954,  955,  956,  957,
        958,  959,  960,  961,  962,  963,  964,  965,  966,  967,  968,
        969,  970,  971,  972,  973,  974,  975,  976,  977,  978,  979,
        980,  981,  982,  983,  984,  985,  986,  987,  988,  989,  990,
        991,  992,  993,  994,  995,  996,  997,  998,  999, 1000, 1001,
       1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012,
       1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
       1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034,
       1035, 1036, 1037, 1038, 1039, 1040, 1041, 10

In [40]:
res = {'PassengerId': testIds.to_numpy(), 'Survived': (tA > 0.5).astype(int)}
result = pd.DataFrame(data=res)

In [43]:
with open("results.csv", "w") as fh:
    fh.write(result.to_csv(index=False))