In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from matplotlib import pyplot as plt
from torch.utils.data import dataloader as DataLoader
from torch.utils.data import dataset as Dataset
import seaborn as sns

In [31]:
raw_data = pd.read_csv('D:/Code_software/Jupyter_Notebook/kaggle2_data/space_ship_train.csv')
# raw_data.isna().sum(), raw_data.dtypes, raw_data.head(5)
#反选不含nan的行
raw_data[~raw_data.isna().any(axis=1)]

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [3]:
pop_data = raw_data.drop(['PassengerId', 'Cabin', 'Name', 'Transported'], axis=1)

In [4]:
label_raw = raw_data['Transported']
label_raw[label_raw==True] = 1
label_raw[label_raw==False] = 0
label_new = label_raw.astype('long').values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_raw[label_raw==True] = 1


In [5]:
pop_data = pop_data.fillna(method='ffill')
pop_data = pd.get_dummies(pop_data).astype('float')

In [6]:
pop_data = pop_data.apply(lambda x: (x - x.mean()) / x.std()).values

In [7]:
train_pct = round(0.7 * len(pop_data))
train_set = pop_data[:train_pct]
train_label = label_new[:train_pct]
test_set = pop_data[train_pct:]
test_label = label_new[train_pct:]

In [8]:
class dataset(Dataset.Dataset):
    def __init__(self, data_set, label_set):
        self.data = torch.Tensor(data_set)
        self.label = torch.from_numpy(label_set).type(torch.long)
    
    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.data)

In [9]:
train_dataset = dataset(train_set, train_label)
test_dataset = dataset(test_set, test_label)

In [10]:
train_iter = DataLoader.DataLoader(dataset=train_dataset, batch_size=100)
test_iter = DataLoader.DataLoader(dataset=test_dataset, batch_size=100)

In [21]:
class Accumulator():
    def __init__(self, n):
        assert type(n) == int
        self.n = n
        self.matrix = [0.0] * self.n

    def add(self, *args):
        self.matrix = [a + float(b) for a, b in zip(self.matrix, args)]

    def reset(self):
        self.matrix = [0.0] * self.n

    def __getitem__(self, index):
        return self.matrix[index]

In [12]:
def evaluate_accuracy(net, data_iter):
    if isinstance(net, nn.Module):
        net.eval()
    matrix = Accumulator(2)
    for X, y in data_iter:
        acc = accuracy(net(X), y)
        matrix.add(acc, y.numel())
    return matrix[0] / matrix[1]
    

def accuracy(y_hat, y):
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(y_hat.dtype).sum())

In [13]:
class MLP(nn.Module):
    def __init__(self, input_size):
        assert type(input_size) == int
        super().__init__()
        self.linear1 = nn.Linear(input_size, 2*input_size)
        self.relu = nn.functional.relu
        self.linear2 = nn.Linear(2*input_size, 3*input_size)
        self.linear3 = nn.Linear(3*input_size, 10)
        self.dense = nn.Linear(10, 2)

        self._init_parameters()
        
    def forward(self, X):
        X = self.relu(self.linear1(X))
        X = self.relu(self.linear2(X))
        X = self.relu(self.linear3(X))
        return self.dense(X)

    def _init_parameters(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

In [14]:
# mlp = nn.Sequential(nn.Linear(20, 40), nn.ReLU(), 
#                     nn.Linear(40, 80), nn.ReLU(),
#                    nn.Linear(80, 10), nn.ReLU(),
#                    nn.Linear(10, 2))

In [15]:
loss = nn.CrossEntropyLoss()

In [25]:
def train(net, loss, train_iter, test_iter, num_epochs, lr, weight_decay):
    train_ls = []
    matrix = Accumulator(2)
    if isinstance(net, nn.Module):
        net.train()
    optimizer = torch.optim.SGD(net.parameters(), lr=lr, weight_decay=weight_decay)
    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
            matrix.add(l * y.numel(), y.numel())
        acc = evaluate_accuracy(net, test_iter)
        if epoch % 5 == 0:
            train_ls.append([acc, matrix[0] / matrix[1]])
        matrix.reset()
    return train_ls

In [27]:
mlp = MLP(14)
train(mlp, loss, train_iter, test_iter, 50, 0.005, 1e-4)

[[0.6625766871165644, 0.6649495033132511],
 [0.7335122699386503, 0.5980349946890554],
 [0.7484662576687117, 0.5670015602513604],
 [0.7691717791411042, 0.5477840894935752],
 [0.7776073619631901, 0.5341466816890481],
 [0.7818251533742331, 0.5234528119792664],
 [0.7894938650306749, 0.5146653785496262],
 [0.7868098159509203, 0.507306529288639],
 [0.7910276073619632, 0.5010755818790908],
 [0.7921779141104295, 0.49573198830701076]]