In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
path = 'data/train.csv'
data = pd.read_csv(path)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
data = data.dropna()
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,455.36612,0.672131,1.191257,35.674426,0.464481,0.47541,78.682469
std,247.052476,0.470725,0.515187,15.643866,0.644159,0.754617,76.347843
min,2.0,0.0,1.0,0.92,0.0,0.0,0.0
25%,263.5,0.0,1.0,24.0,0.0,0.0,29.7
50%,457.0,1.0,1.0,36.0,0.0,0.0,57.0
75%,676.0,1.0,1.0,47.5,1.0,1.0,90.0
max,890.0,1.0,3.0,80.0,3.0,4.0,512.3292


In [5]:
feature_names = ['Pclass', 'Sex', 'SibSp', 'Parch']
X = pd.get_dummies(data[feature_names])
y = data['Survived']
X
#X, y

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male
1,1,1,0,True,False
3,1,1,0,True,False
6,1,0,0,False,True
10,3,1,1,True,False
11,1,0,0,True,False
...,...,...,...,...,...
871,1,1,1,True,False
872,1,0,0,False,True
879,1,0,1,True,False
887,1,0,0,True,False


In [6]:
class MachineLearning:
    def __init__(self):
        self.w = None 
        self.loss_hist = None

    def sigmoid(self, x):
        #return 1/(1+np.exp(-x))
        return 0.5 * (x/(1 + np.abs(x))) + 0.5
        
    def softmax(self, x):
        return np.exp(x)/np.sum(np.exp(x))

    def loss(self, X, y, lam):
        w = self.w
        a = self.sigmoid(X.dot(w))
        loss_0 = -np.mean(y * np.log(a) + (1 - y) * np.log(1-a))
        weight_decay = 0.5 * lam/X.shape[0] * np.sum(w, w)
        return loss_0 + weight_decay
        
    def fit(self, X_train, y_train, lr = 0.15, epochs = 200, lam=0.001):
        N, d = X_train.shape[0], X_train.shape[1] 
        self.w = w_old = np.random.randn(X_train.shape[1])
        #self.loss_hist = [self.loss(X_train, y_train, lam)]
        ep = 0
        while ep < epochs:
            ep += 1
            mix_ids = np.random.permutation(N)
            for i in mix_ids:
                xi = X_train[i]
                yi = y_train[i]
                ai = self.sigmoid(xi.dot(self.w))
                #Update
                self.w = self.w - lr * ((ai - yi) * xi + lam * self.w)
            if np.linalg.norm(self.w - w_old)/d < 1e-6: 
                break
            w_old = self.w
        return self.w
        
    def predict(self, X_test):
        return X_test.dot(self.w)

In [9]:
X_train = np.array(X[['Pclass', 'SibSp', 'Parch', 'Sex_female', 'Sex_male']])
y_train = np.array(y)
X_train.shape

(183, 5)

In [10]:
Xbar = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis = 1)
model = MachineLearning()
model.fit(Xbar, y_train)

array([-0.4411944629216343, 0.47698757088523386, -0.12215302022090434,
       4.053538869924135, -2.009075169978023, 2.033896667518026],
      dtype=object)

In [17]:
path_test = 'data/test.csv'
data_test = pd.read_csv(path_test)
test = pd.get_dummies(data_test[feature_names])
test

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male
0,3,0,0,False,True
1,3,1,0,True,False
2,2,0,0,False,True
3,3,0,0,False,True
4,3,1,1,True,False
...,...,...,...,...,...
413,3,0,0,False,True
414,1,0,0,True,False
415,3,0,0,False,True
416,3,0,0,False,True


In [12]:
X_test = np.array(test[['Pclass', 'SibSp', 'Parch', 'Sex_female', 'Sex_male']])
X_test.shape

(418, 5)

In [13]:
Xbar_test = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis = 1)
predict = model.predict(Xbar_test)
predict

array([-1.2987618912249, 5.2408397195624925, -0.8575674283032653,
       -1.2987618912249, 5.118686699341588, -1.2987618912249,
       4.763852148677258, -0.5027328776389357, 4.763852148677258,
       -0.3447867494544319, -1.2987618912249, -0.41637296538163104,
       6.123228645405761, -0.3805798574180317, 6.123228645405761,
       5.682034182484126, -0.8575674283032653, -1.2987618912249,
       5.2408397195624925, 4.763852148677258, 0.06061460550360276,
       -1.420914911445804, 5.6462410745205265, -0.5385259856025355,
       5.756769584743047, -0.8217743203396659, 5.5240880542996225,
       -1.2987618912249, -0.41637296538163104, -0.3447867494544319,
       -0.3805798574180317, 0.09640771346720234, 4.996533679120684,
       4.996533679120684, 0.06061460550360276, -1.2987618912249,
       4.763852148677258, 4.763852148677258, -1.2987618912249,
       -1.2987618912249, -1.420914911445804, -0.41637296538163104,
       -1.2987618912249, 5.205046611598893, 6.123228645405761,
       -1.2

In [19]:
y = np.sign(predict)
y[y < 0] = 0
y.shape

(418,)

In [18]:
data_test 

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [29]:
with open('output.txt', 'w') as file:
    for i in range(len(data_test)):
        file.write(f"{data_test['PassengerId'][i]} {y[i]}\n")

In [32]:
sub = pd.read_csv('data/gender_submission.csv')
y_test = sub['Survived']

In [33]:
count = 0
for i in range(len(y_test)):
    if y_test[i] == y[i]: count+=1
print('The rate of it:', count/len(y_test) * 100)

The rate of it: 94.49760765550239


In [35]:
output = pd.DataFrame({
    'PassengerId': data_test.PassengerId,
    'Survived': y
})
#output.to_csv('submission.csv', index=False)