In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline

## load data

In [2]:
train_df = pd.read_csv('./titanicTrain.csv')
test_df = pd.read_csv('./titanicQuestion.csv')

## clean data

In [3]:
train_df = train_df[:1000]

In [4]:
print('Train:', len(train_df),
      'Test:', len(test_df))

Train: 1000 Test: 309


## preprocessing function

In [5]:
# Titanic data preprocessing function
def preprocessTitanicData(raw_df):
    df = raw_df[['survived', 'pclass', 'sibsp', 'parch', 'age', 'sex', 'fare', 'embarked']]
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())
    df['sex'] = df['sex'].map({'female':0, 'male':1}).astype(int)
    x_onehot_df = pd.get_dummies(data=df, columns= ['embarked'])
    ndarray = x_onehot_df.values
    feature = ndarray[:,1:]
    label = ndarray[:, 0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(feature)
    
    return scaledFeatures, label

In [6]:
x_train, y_train = preprocessTitanicData(train_df)
x_test, y_test = preprocessTitanicData(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
print(x_train.shape, y_train.shape) #np.array np.array
print(x_test.shape, y_test.shape) # np.array np.array

# reshape y_train and y_test
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print(x_train.shape, y_train.shape) #np.array np.array
print(x_test.shape, y_test.shape) # np.array np.array

(1000, 9) (1000,)
(309, 9) (309,)
(1000, 9) (1000, 1)
(309, 9) (309, 1)


## construct model

In [8]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision.transforms as transforms

In [9]:
class LR(nn.Module): # 繼承torch.nn.module特性
    
    # 架構
    def __init__(self):
        super(LR, self).__init__() # super(LR, self) 繼承自己的__init__()
        self.hidden1 = nn.Linear(9, 100) # input 9, hidden 40  Linear全連接
        self.dropout = nn.Dropout(0.1)
        self.hidden2 = nn.Linear(100, 80) # input 40, hidden 40  Linear全連接
        self.hidden3 = nn.Linear(80, 60) # input 40, hidden 40  Linear全連接
        self.hidden4 = nn.Linear(60, 10) # input 40, hidden 30  Linear全連接
        self.hidden5 = nn.Linear(10, 1) # hidden 10, output 1

    # 運作流程（資料流）
    def forward(self, x):  
        x = F.relu(self.hidden1(x)) # 讓x由hidden1後，進入activation function
        x = F.dropout(self.dropout(x))
        x = F.relu(self.hidden2(x))
        x = F.dropout(self.dropout(x))
        x = F.relu(self.hidden3(x))
        x = F.dropout(self.dropout(x))
        x = F.relu(self.hidden4(x))
        out = self.hidden5(x)
        return out

In [10]:
model = LR()
loss_func = nn.MSELoss()
learning_rate = 0.03
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) #model.parameters() 所有訓練的參數都在這
# model.cuda()   

## training

In [11]:
epochs = 5000

In [12]:
for epoch in range(epochs):
    epoch += 1

    # convert to variables
    x = Variable(torch.from_numpy(x_train).float())
    y = Variable(torch.from_numpy(y_train).float())

    # clear gradient w.r.t. parameters 
    optimizer.zero_grad()
    
    # forward to get output
    prediction = model(x)

    # calculate loss
    loss = loss_func(prediction, y)

    # backward to get gradient
    loss.backward()

    # update parameters
    optimizer.step() 

    if epoch % 5 == 0:
    # plot and show learning process
        print("epoch %d, loss %.8f" % (epoch, loss.data[0]))

epoch 5, loss 0.51323396
epoch 10, loss 0.36527100
epoch 15, loss 0.29833260
epoch 20, loss 0.26864344
epoch 25, loss 0.25512227
epoch 30, loss 0.24934813
epoch 35, loss 0.24629083
epoch 40, loss 0.24543828
epoch 45, loss 0.24443641
epoch 50, loss 0.24368422
epoch 55, loss 0.24369325
epoch 60, loss 0.24284405
epoch 65, loss 0.24277371
epoch 70, loss 0.24187580
epoch 75, loss 0.24227172
epoch 80, loss 0.24162793
epoch 85, loss 0.24082695
epoch 90, loss 0.24040173
epoch 95, loss 0.23981400
epoch 100, loss 0.23985256
epoch 105, loss 0.23946765
epoch 110, loss 0.23902877
epoch 115, loss 0.23768061
epoch 120, loss 0.23743665
epoch 125, loss 0.23687302
epoch 130, loss 0.23649557
epoch 135, loss 0.23615484
epoch 140, loss 0.23490752
epoch 145, loss 0.23445092
epoch 150, loss 0.23381589
epoch 155, loss 0.23270236
epoch 160, loss 0.23248239
epoch 165, loss 0.23252945
epoch 170, loss 0.23110551
epoch 175, loss 0.23038520
epoch 180, loss 0.22988813
epoch 185, loss 0.22883606
epoch 190, loss 0.227

In [13]:
# get filters from parameters:
print(list(model.parameters())[0].size())  

torch.Size([100, 9])


In [14]:
train_prediction = prediction.data.numpy()
train_prediction = np.where(train_prediction>=0.7,1,0)
train_df['prediction'] = train_prediction

In [15]:
# Check some 
# train_df[(train_df['prediction']==1)&(train_df['survived'] == 0)]

## predict

In [16]:
x_test_v = Variable(torch.from_numpy(x_test).float())
test_prediction = model(x_test_v).data.numpy()
# test_prediction = np.where(test_prediction>=0.6,1,0)

In [17]:
test_df['prediction'] = test_prediction

In [18]:
test_df['prediction']

0      0.219108
1      0.780435
2      0.793985
3      0.369818
4      0.985305
5      0.334282
6      0.960066
7      0.870707
8      0.914225
9      0.365510
10     0.255164
11     0.918699
12     0.955548
13     0.982102
14     1.014007
15     0.158954
16     0.265783
17     0.350709
18     0.502637
19     0.194757
20     0.425347
21     0.321741
22     0.353142
23     0.918171
24     0.420204
25     0.524246
26     0.983658
27     0.711760
28     0.967431
29     0.254700
         ...   
279    1.084620
280    0.520777
281    0.484967
282    0.435077
283    0.646125
284    0.372826
285    0.181278
286    0.963175
287    0.119338
288    0.386504
289    0.619683
290    0.953342
291    0.537757
292    0.688097
293    0.280039
294    0.419940
295    0.591007
296    0.314141
297    0.395569
298    0.233905
299    0.364770
300    0.979643
301    0.221562
302    0.527281
303    0.609916
304    1.092152
305    0.937162
306    0.365193
307    0.362107
308    0.281629
Name: prediction, dtype:

In [20]:
test_df['prediction'].to_csv('titanic_answer.csv')

## save model 

In [None]:
# save_model = False
# if save_model == True:
#     torch.save(model.state_dict(), "nonlinear_model.pkl")