In [1]:
import pandas as pd
import numpy as np

In [2]:
train=pd.read_csv('train.csv')
data_test=pd.read_csv('data_test.csv')
label_test=pd.read_csv('label_test.csv')

In [3]:
TRAIN_INDEX = ['Pclass', 'Sex', 'Age', 'SibSp','Parch', 'Fare','Embarked','Survived']
TESTS_INDEX = ['Pclass', 'Sex', 'Age', 'SibSp','Parch', 'Fare','Embarked']

In [4]:
train = train[TRAIN_INDEX]
data_test = data_test[TESTS_INDEX]

### 填補缺失值

In [5]:
train['Age'] = train['Age'].fillna(train['Age'].mean())
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)
train.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64

In [6]:
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].mean())
data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())
data_test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

### 數值轉換

In [7]:
train['Sex']=train['Sex'].replace('female', 0)
train['Sex']=train['Sex'].replace('male', 1)
data_test['Sex']=data_test['Sex'].replace('female', 0)
data_test['Sex']=data_test['Sex'].replace('male', 1)

In [8]:
train['Embarked']=train['Embarked'].replace('C', 0)
train['Embarked']=train['Embarked'].replace('Q', 1)
train['Embarked']=train['Embarked'].replace('S', 2)

data_test['Embarked']=data_test['Embarked'].replace('C', 0)
data_test['Embarked']=data_test['Embarked'].replace('Q', 1)
data_test['Embarked']=data_test['Embarked'].replace('S', 2)

### 特徵比較(資料關聯性):

In [9]:
# 艙等
print(train[["Pclass", "Survived"]].groupby(["Pclass"], as_index = False).mean())

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [10]:
# 性別
print(train[["Sex", "Survived"]].groupby(["Sex"], as_index = False).mean())

   Sex  Survived
0    0  0.742038
1    1  0.188908


In [11]:
# 出發地
print(train[["Embarked", "Survived"]].groupby(["Embarked"], as_index = False).mean())

   Embarked  Survived
0         0  0.553571
1         1  0.389610
2         2  0.339009


### 數據生成

In [12]:
data_train=train[TESTS_INDEX].values
label_train=train[['Survived']].values 

data_test=data_test[TESTS_INDEX].values
label_test=label_test[['Survived']].values

In [13]:
label_train=label_train[:,0] # 改變排序方式
label_test=label_test[:,0]

print(data_train.shape)
print(label_train.shape)
print(data_test.shape)
print(label_test.shape)
print(data_train)

(891, 7)
(891,)
(418, 7)
(418,)
[[ 3.          1.         22.         ...  0.          7.25
   2.        ]
 [ 1.          0.         38.         ...  0.         71.2833
   0.        ]
 [ 3.          0.         26.         ...  0.          7.925
   2.        ]
 ...
 [ 3.          0.         29.69911765 ...  2.         23.45
   2.        ]
 [ 1.          1.         26.         ...  0.         30.
   0.        ]
 [ 3.          1.         32.         ...  0.          7.75
   1.        ]]


### 標準化

In [14]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(data_train) #以 data_train 當作標準化基準

data_train=ss.transform(data_train) #對data_train進行標準化
data_test=ss.transform(data_test) #對data_train進行標準化

print(data_train[0])
print(data_test[0])

[ 0.82737724  0.73769513 -0.5924806   0.43279337 -0.47367361 -0.50244517
  0.58595414]
[ 0.82737724  0.73769513  0.36944878 -0.4745452  -0.47367361 -0.49078316
 -0.67817453]


### NN模型

In [15]:
import torch
import torch.nn.functional as F     # 激勵函數

data_train = torch.from_numpy(data_train).type(torch.FloatTensor) 
label_train = torch.from_numpy(label_train)

data_test = torch.from_numpy(data_test).type(torch.FloatTensor) 
label_test = torch.from_numpy(label_test)

In [16]:
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        # 定義各層樣式
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # nn.Linear(輸入二維張量大小, 輸出二維張量大小)
        self.out = torch.nn.Linear(n_hidden, n_output)       # 上一級的輸出為這級的輸入

    def forward(self, x):
        # 正向傳播輸入值, 神经網路分析出输出值
        x = F.relu(self.hidden(x))      # 激勵函數
        x = self.out(x)                 # 输出值, 但是这个不是预测值, 预测值还需要再另外计算
        return x
                                                 # 類別數 : 2
net = Net(n_feature=7, n_hidden=50, n_output=2)  # 輸入特徵 : 2, 神經元個數 : 10, 輸出特徵 : 2

print(net)  # net 結構

Net(
  (hidden): Linear(in_features=7, out_features=50, bias=True)
  (out): Linear(in_features=50, out_features=2, bias=True)
)


In [32]:
optimizer = torch.optim.SGD(net.parameters(), lr=0.005, momentum = 0.9)
# optimizer = torch.optim.Adam(net.parameters(), lr=0.003)
loss_func = torch.nn.CrossEntropyLoss()

In [35]:
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

best_accuracy = 0
best_loss = 1

for t in range(50):
    # data_train, label_train = shuffle(data_train, label_train)
    
    out = net(data_train)
    loss = loss_func(out, label_train)

    optimizer.zero_grad()   # 清空上一步的數據
    loss.backward()
    optimizer.step()

    prediction = torch.max(F.softmax(out), 1)[1]    # 通過 softmax 的激勵函數(用於多分類神經網絡輸出)
        
    pred_Survived = prediction.data.numpy().squeeze()
    target_Survived = label_train.data.numpy()
    
    
    #accuracy = sum(pred_Survived == target_Survived)/len(data_train)
    accuracy = accuracy_score(pred_Survived, target_Survived)
    
    if (accuracy > best_accuracy):
        best_accuracy = accuracy
    if (loss < best_loss):
        best_loss = loss
        
print(type(pred_Survived))     
print(best_accuracy)
print(best_loss)



<class 'numpy.ndarray'>
0.8058361391694725
tensor(0.4324, grad_fn=<NllLossBackward>)


In [31]:
from sklearn.metrics import accuracy_score

best_accuracy = 0
best_loss = 1

for t in range(50):
    out = net(data_test)
    loss = loss_func(out, label_test)

    optimizer.zero_grad()   # 清空上一步的數據
    loss.backward()
    optimizer.step()

    prediction = torch.max(F.softmax(out), 1)[1]    # 通過 softmax 的激勵函數(用於多分類神經網絡輸出)
        
    pred_Survived = prediction.data.numpy().squeeze()
    target_Survived = label_test.data.numpy()
    
    
    #accuracy = sum(pred_Survived == target_Survived)/len(data_train)
    accuracy = accuracy_score(pred_Survived, target_Survived)
    
    if (accuracy > best_accuracy):
        best_accuracy = accuracy
    if (loss < best_loss):
        best_loss = loss
        
print(type(pred_Survived))     
print(best_accuracy)
print(best_loss)

  


<class 'numpy.ndarray'>
0.9712918660287081
tensor(0.3152, grad_fn=<NllLossBackward>)
