In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import maxabs_scale

In [6]:
train = pd.read_csv("titanic/train.csv")

In [7]:
train.SibSp.min()

0

In [8]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
def sanitise_data(data, test=False):
    if not test:
        del data['Survived']
        
    for col in ['PassengerId','Name','Ticket']:
        del data[col]

    data.Cabin = data.Cabin.isna().astype(int)
    data.Cabin = data.Cabin.map({1: 0.75, 0: 0.25})
    data.Embarked = data.Embarked.map({'S':0.65,'C':0.55,'Q':0.45,np.nan:0.35})
    data.Sex = data.Sex.map({'male':0.1,'female':0.9})
    
    for col in ['Pclass', 'Age', 'Parch', 'SibSp', 'Fare']:
        data[col] = data[col].replace(np.nan, 0)
        data[col] = maxabs_scale(data[col])
        data[col] = (data[col] - data[col].mean()) / data[col].std()
        
    data['bias'] = pd.Series([1 for i in range(len(data))])
    return data

In [10]:
def preprocess_data(data, test=False):
    #data.insert(len(data.columns), "nameLength", [len(name) for name in train.Name.replace(to_replace=r'^(\w+),.+$', value=r'\1', regex=True)])
    return data

In [11]:
x = preprocess_data(train)

In [12]:
x.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
y = train['Survived']
x = sanitise_data(train)

AttributeError: 'DataFrame' object has no attribute 'nameLength'

In [15]:
x.loc[3]

Pclass     -1.565228
Sex         0.900000
Age         0.636546
SibSp       0.432550
Parch      -0.473408
Fare        0.420494
Cabin       0.250000
Embarked    0.650000
bias        1.000000
Name: 3, dtype: float64

In [32]:
theta = np.random.normal(0, 0.5, size=(x.shape[1]))

m = x.shape[0]
J = 0
grad = np.zeros(m)

In [33]:
def h(X, theta):
    return 1./(1 + np.exp(-(theta @ X.T)))

In [34]:
def cost(theta, X, y):
    A = h(x, theta)
    J = (1/m) * np.sum((-(y.T) * np.log(A)) - ((1 - y.T) * np.log(1 - A)))
    err = A - y
    grad = (1/m) * (err.T @ X)
    return (J, grad)
 

In [35]:
theta

array([ 0.0387048 ,  0.69123158,  0.57447512,  0.67932365, -0.0760624 ,
       -0.43314446,  0.60258682, -0.59093549,  0.26640867])

In [36]:
(J, grad) = cost(theta, x, y)

In [37]:
grad

Pclass      0.198044
Sex         0.004466
Age         0.074718
SibSp       0.095154
Parch      -0.019807
Fare       -0.180534
Cabin       0.189413
Embarked    0.148661
bias        0.234294
dtype: float64

In [38]:
for i in range(10000):
    theta = theta - (0.05 * grad)
    (J, grad) = cost(theta, x, y)
    if i % 1000 == 0:
        print("Cost: {}".format(J))


Cost: 0.8801473036178437
Cost: 0.4606160919094109
Cost: 0.4511960452274101
Cost: 0.44960111718284224
Cost: 0.44919054130316194
Cost: 0.449012541313096
Cost: 0.4488998399211659
Cost: 0.4488170056736756
Cost: 0.4487534874623088
Cost: 0.4487042447340062


In [39]:
test = pd.read_csv("titanic/test.csv")

In [40]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [41]:
testIds = test['PassengerId']
t = sanitise_data(test, test=True).to_numpy()
tA = h(t, theta)

In [42]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,bias
0,0.872436,0.1,0.594089,-0.498872,-0.399769,-0.496043,0.75,0.45,1
1,0.872436,0.9,1.304333,0.616254,-0.399769,-0.510885,0.75,0.65,1
2,-0.315441,0.1,2.156624,-0.498872,-0.399769,-0.46278,0.75,0.45,1
3,0.872436,0.1,0.167944,-0.498872,-0.399769,-0.481127,0.75,0.65,1
4,0.872436,0.9,-0.116154,0.616254,0.619154,-0.416242,0.75,0.65,1


In [43]:
res = {'PassengerId': testIds.to_numpy(), 'Survived': (tA > 0.5).astype(int)}
result = pd.DataFrame(data=res)

In [44]:
with open("results.csv", "w") as fh:
    fh.write(result.to_csv(index=False))