# 使用Pytorch 二层神经网络实现

In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/model.pt
./data/test.csv
./data/submission_byNN.csv
./data/train.csv
./data/sample_submission.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Data
## load Dataset
一共有8693条数据，14列，其中12个特征，Transported为目标预测值

In [3]:
dataset_df = pd.read_csv('./data/train.csv')
print("Full train dataset shape is {}".format(dataset_df.shape))
dataset_df.head(5)
# dataset_df.describe()
# dataset_df.info()
# # label的柱状图显示
# plot_df = dataset_df.Transported.value_counts()
# plot_df.plot(kind="bar")

Full train dataset shape is (8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


### data process

列Cabin(舱位号)的值是一个字符串，格式为Deck/Cabin_num/Side。这里我们将拆分Cabin列并创建3个新列Deck, Cabin_num和Side，因为在这些单独的数据上训练模型会更容易。将“Cabin”列拆分为“Deck”、“Cabin_num”和“Side”列

In [4]:
# 删除PassengerId、Name两个没用的列
dataset_df = dataset_df.drop(['PassengerId', 'Name'], axis=1)
target = dataset_df['Transported'].astype(int)
dataset_df = dataset_df.drop('Transported',axis=1)
dataset_df[["Deck", "Num", "Side"]] = dataset_df["Cabin"].str.split("/", expand=True)
dataset_df = dataset_df.drop('Cabin',axis=1)
dataset_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,F,1,S


缺失值处理: 布尔和连续值赋0，离散数据填充“U”.布尔类型转为int

In [5]:
dataset_df.isnull().sum().sort_values(ascending=False)

CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Deck            199
Num             199
Side            199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
dtype: int64

In [6]:
null_col_dec=['HomePlanet','Destination','Deck','Side']
null_col_con=['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck',]
null_col_bool=['CryoSleep','VIP','Num']

In [7]:
dataset_df[null_col_con] = dataset_df[null_col_con].fillna(value=0)
dataset_df[null_col_bool] = dataset_df[null_col_bool].fillna(value=0)
dataset_df[null_col_dec] = dataset_df[null_col_dec].fillna('U')


dataset_df[null_col_con] = dataset_df[null_col_con].fillna(value=0)
dataset_df['Num'] = dataset_df['Num'].astype(int)
dataset_df[null_col_bool] = dataset_df[null_col_bool].astype(int)
dataset_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,F,1,S


In [8]:
# 自定义离差标准化函数
def minmaxscale(data):
    data=(data-data.min())/(data.max()-data.min())
    return data
for col in null_col_con:
    dataset_df[col] = minmaxscale(dataset_df[col] )
# one-hot编码:最后得到29列特征值
dataset_df = pd.get_dummies(dataset_df, columns=null_col_dec)
dataset_df.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num,HomePlanet_Earth,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U,Side_P,Side_S,Side_U
0,0,0.493671,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0.303797,0,0.007608,0.000302,0.001064,0.0245,0.001823,0,1,...,0,0,0,1,0,0,0,0,1,0
2,0,0.734177,1,0.003001,0.119948,0.0,0.29967,0.00203,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0.417722,0,0.0,0.043035,0.015793,0.148563,0.007997,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0.202532,0,0.021149,0.002348,0.006428,0.025214,8.3e-05,1,1,...,0,0,0,1,0,0,0,0,1,0


In [9]:
dataset = dataset_df.values
target = target.values
X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size = 0.2, random_state = 1234)
X_train.shape

(6954, 29)

## 模型定义

In [10]:
# 定义神经网络模型
class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(29, 15), # 单隐层12个神经元
            nn.ReLU(),
            nn.Linear(15, 2),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        out = torch.sigmoid(logits)
        return out



## 训练模型

In [27]:
def verify(model):
    num_correct = 0
    model.eval()  # 将模型调整为 eval 模式
    with torch.no_grad():
        for i  in range(len(X_test)):
            inputs = torch.tensor(X_test[i],dtype=torch.float)
            labels = torch.tensor(y_test[i],dtype=torch.long)
            outputs = model(inputs)

            _, predictions = outputs.max(0)
            num_correct += (predictions == labels).sum()
        acc = float(num_correct)/len(X_test)*100
        # print(f'Got {num_correct} / {len(X_test)} with accuracy {acc:.2f}')
    return acc


In [44]:
model = NN()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
# 训练模型
total_epoch = 100
best_acc = 0
best_epoch = 0
for epoch in range(total_epoch+1):
    model.train()
    for i  in range(len(X_train)):
        inputs = torch.tensor(X_train[i],dtype=torch.float)
        labels = torch.tensor(y_train[i],dtype=torch.long)

        optimizer.zero_grad()   # 清空梯度
        outputs = model(inputs)  # 正向传播
        loss = loss_fn(outputs, labels)  # 计算损失
        loss.backward()  # 反向传播
        optimizer.step()  # 更新参数

    # 寻找最优epoch
    acc = verify(model)
    if acc > best_acc:
        last_file_name = f'./data/model_{best_epoch}.pth'
        if os.path.isfile(last_file_name):
            os.remove(last_file_name)
        torch.save(model, f'./data/model_{epoch}.pth')
        best_epoch = epoch
        best_acc = acc
    print(f"Epoch [{epoch}/{total_epoch}]:",end='')
    print('█'*int(epoch/total_epoch*30)+'-'*int(30-epoch/total_epoch*30)+f'with accuracy {acc:.2f}',end='\r')
print(best_epoch,best_acc)

14 54.2840713053479███████████████████████████with accuracy 51.06


## 验证准确度

In [39]:
num_correct = 0
model = torch.load(f'./data/model_{best_epoch}.pth')
model.eval()  # 将模型调整为 eval 模式
with torch.no_grad():
    for i  in range(len(X_test)):
        inputs = torch.tensor(X_test[i],dtype=torch.float)
        labels = torch.tensor(y_test[i],dtype=torch.long)
        outputs = model(inputs)

        _, predictions = outputs.max(0)
        num_correct += (predictions == labels).sum()
    acc = float(num_correct)/len(X_test)*100
    print(f'Got {num_correct} / {len(X_test)} with accuracy {acc:.2f}')

Got 1094 / 1739 with accuracy 62.91


## Submission

In [40]:
test_data = pd.read_csv('./data/test.csv')
submission_id = test_data['PassengerId']
test_data = test_data.drop(['PassengerId', 'Name'], axis=1)
test_data[["Deck", "Num", "Side"]] = test_data["Cabin"].str.split("/", expand=True)
test_data = test_data.drop('Cabin',axis=1)
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3,S
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4,S
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0,S
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1,S
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5,S


In [41]:
# 缺失值处理
test_data[null_col_con] = test_data[null_col_con].fillna(value=0)
test_data[null_col_bool] = test_data[null_col_bool].fillna(value=0)
test_data[null_col_dec] = test_data[null_col_dec].fillna('U')


test_data[null_col_con] = test_data[null_col_con].fillna(value=0)
test_data['Num'] = test_data['Num'].astype(int)
test_data[null_col_bool] = test_data[null_col_bool].astype(int)
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
0,Earth,1,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,G,3,S
1,Earth,0,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,0.0,F,4,S
2,Europa,1,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,C,0,S
3,Europa,0,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,585.0,C,1,S
4,Earth,0,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,0.0,F,5,S


In [42]:
# 标准化
for col in null_col_con:
    test_data[col] = minmaxscale(test_data[col] )
# 离散化
test_data = pd.get_dummies(test_data, columns=null_col_dec)
test_data.head()
test_data = test_data.values

In [43]:
# 输出预测结果
n_predictions =[]
model.eval()
with torch.no_grad():
    for i  in range(len(test_data)):
        inputs = torch.tensor(test_data[i],dtype=torch.float)
        outputs = model(inputs)  # 正向传播

        _, predictions = outputs.max(0)
        n_predictions.append(True if predictions.numpy()==1 else False)

submission = pd.DataFrame({'PassengerId': submission_id,
                   'Transported': n_predictions})
submission.to_csv('./data/submission_byNN.csv', index=False)
print(submission)

     PassengerId  Transported
0        0013_01         True
1        0018_01        False
2        0019_01         True
3        0021_01         True
4        0023_01        False
...          ...          ...
4272     9266_02        False
4273     9269_01        False
4274     9271_01        False
4275     9273_01        False
4276     9277_01        False

[4277 rows x 2 columns]
