1. Для датасета животных обучить MLP
2. Использовать Custom Dataset, Sampler, collate_fn
3. Сделать предобработку фичей
4. Попробовать BatchNorm1d, Dropout
4. Подключить для логирования tensorboard и/или mlflow
5. Не забыть разделить выборку на train/valid в соотношении 80/20%
6. Получить точность не ниже 65%.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import torch

print(torch.__version__)

import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils

from torch.utils.data import DataLoader, Dataset, Sampler
from torch.utils.data.dataloader import default_collate
from torch.utils.tensorboard import SummaryWriter
from torchmetrics.classification.accuracy import Accuracy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

1.9.0+cpu


In [2]:
#!pip install tensorboard

In [3]:
import mlflow
import tensorboard

In [30]:
# Tensor Board
from torch.utils.tensorboard import SummaryWriter
train_writer = SummaryWriter('./logs/train')
valid_writer = SummaryWriter('./logs/valid')

In [42]:
INPUT_SIZE = 36
HIDDEN_SIZE = 25
OUTPUT_SIZE = 5
LEARNING_RATE = 1e-2
EPOCHS = 50
BATCH_SIZE = 256
EMBEDDING_SIZE = 5

In [32]:
class TrainDataset(Dataset):
    # Конструктор, где считаем датасет
    def __init__(self):
        X = pd.read_csv('./data/X_cat.csv', sep='\t', index_col=0)
        target = pd.read_csv('./data/y_cat.csv', sep='\t', index_col=0, names=['status'])  # header=-1,

        weekday_columns = ['Weekday_0', 'Weekday_1', 'Weekday_2',
                           'Weekday_3', 'Weekday_4', 'Weekday_5', 'Weekday_6']
        weekdays = np.argmax(X[weekday_columns].values, axis=1)

        X.drop(weekday_columns, axis=1, inplace=True)

        
        X['Weekday_cos'] = np.cos(2 * np.pi / 7.) * weekdays
        X['Weekday_sin'] = np.sin(2 * np.pi / 7.) * weekdays

        X['Hour_cos'] = np.cos(2 * np.pi / 24.) * X['Hour'].values
        X['Hour_sin'] = np.sin(2 * np.pi / 24.) * X['Hour'].values

        X['Month_cos'] = np.cos(2 * np.pi / 12.) * X['Month'].values
        X['Month_sin'] = np.sin(2 * np.pi / 12.) * X['Month'].values

        X['Gender'] = np.argmax(X[['Sex_Female', 'Sex_Male', 'Sex_Unknown']].values, axis=1)
        
        X['Age'] = X['Age'].apply(lambda x: 0 if x <= 350 else 1 if x <= 800 else 2 if x<=2000 else 3)

        X.drop(['Sex_Female', 'Sex_Male', 'Sex_Unknown'], axis=1, inplace=True)

        #print(X.shape)
        #print(X.head())

        target = target.iloc[:, :].values
        target[target == 'Died'] = 'Euthanasia'

        le = LabelEncoder()
        le.fit(target)
        
        X,_,target,_= train_test_split(X,target, train_size=0.8,random_state=0)

        self.y = le.transform(target)
        self.X = X.values

        self.columns = X.columns.values

        self.embedding_column = 'Gender'
        self.nrof_emb_categories = 3
        self.numeric_columns = ['IsDog', 'Age', 'HasName', 'NameLength', 'NameFreq', 'MixColor', 'ColorFreqAsIs',
                                'ColorFreqBase', 'TabbyColor', 'MixBreed', 'Domestic', 'Shorthair', 'Longhair',
                                'Year', 'Day',  'Breed_Chihuahua Shorthair Mix', 'Breed_Domestic Medium Hair Mix',
                                'Breed_Domestic Shorthair Mix', 'Breed_German Shepherd Mix', 'Breed_Labrador Retriever Mix',
                                 'Breed_Pit Bull Mix', 'Breed_Rare',
                                'SexStatus_Flawed', 'SexStatus_Intact', 'SexStatus_Unknown',
                                'Weekday_cos', 'Weekday_sin', 'Hour_cos', 'Hour_sin',
                                'Month_cos', 'Month_sin']
        
        return

    def __len__(self):
        return len(self.X)

    # Переопределяем метод,
    # который достает по индексу наблюдение из датасет
    def __getitem__(self, idx):

        row = self.X[idx, :]

        row = {col: torch.tensor(row[i]) for i, col in enumerate(self.columns)}

        return row, self.y[idx]

In [33]:
class ValidDataset(Dataset):
    # Конструктор, где считаем датасет
    def __init__(self):
        X = pd.read_csv('./data/X_cat.csv', sep='\t', index_col=0)
        target = pd.read_csv('./data/y_cat.csv', sep='\t', index_col=0, names=['status'])  # header=-1,

        weekday_columns = ['Weekday_0', 'Weekday_1', 'Weekday_2',
                           'Weekday_3', 'Weekday_4', 'Weekday_5', 'Weekday_6']
        weekdays = np.argmax(X[weekday_columns].values, axis=1)

        X.drop(weekday_columns, axis=1, inplace=True)

        
        X['Weekday_cos'] = np.cos(2 * np.pi / 7.) * weekdays
        X['Weekday_sin'] = np.sin(2 * np.pi / 7.) * weekdays

        X['Hour_cos'] = np.cos(2 * np.pi / 24.) * X['Hour'].values
        X['Hour_sin'] = np.sin(2 * np.pi / 24.) * X['Hour'].values

        X['Month_cos'] = np.cos(2 * np.pi / 12.) * X['Month'].values
        X['Month_sin'] = np.sin(2 * np.pi / 12.) * X['Month'].values

        X['Gender'] = np.argmax(X[['Sex_Female', 'Sex_Male', 'Sex_Unknown']].values, axis=1)
        
        X['Age'] = X['Age'].apply(lambda x: 0 if x <= 350 else 1 if x <= 800 else 2 if x<=2000 else 3)

        X.drop(['Sex_Female', 'Sex_Male', 'Sex_Unknown'], axis=1, inplace=True)
        
        X,_,target,_= train_test_split(X,target, train_size=0.8,random_state=0)
        

        target = target.iloc[:, :].values
        target[target == 'Died'] = 'Euthanasia'

        le = LabelEncoder()
        le.fit(target)
        
        _,X,_,target= train_test_split(X,target, train_size=0.8,random_state=0)

        self.y = le.transform(target)
        self.X = X.values

        self.columns = X.columns.values

        self.embedding_column = 'Gender'
        self.nrof_emb_categories = 3
        self.numeric_columns = ['IsDog', 'Age', 'HasName', 'NameLength', 'NameFreq', 'MixColor', 'ColorFreqAsIs',
                                'ColorFreqBase', 'TabbyColor', 'MixBreed', 'Domestic', 'Shorthair', 'Longhair',
                                'Year', 'Day',  'Breed_Chihuahua Shorthair Mix', 'Breed_Domestic Medium Hair Mix',
                                'Breed_Domestic Shorthair Mix', 'Breed_German Shepherd Mix', 'Breed_Labrador Retriever Mix',
                                 'Breed_Pit Bull Mix', 'Breed_Rare',
                                'SexStatus_Flawed', 'SexStatus_Intact', 'SexStatus_Unknown',
                                'Weekday_cos', 'Weekday_sin', 'Hour_cos', 'Hour_sin',
                                'Month_cos', 'Month_sin']
        
        return

    def __len__(self):
        return len(self.X)

    # Переопределяем метод,
    # который достает по индексу наблюдение из датасет
    def __getitem__(self, idx):

        row = self.X[idx, :]

        row = {col: torch.tensor(row[i]) for i, col in enumerate(self.columns)}

        return row, self.y[idx]

In [34]:
class MLPNet(nn.Module):

    def __init__(self, input_size, hidden_size, output_size, nrof_cat, emb_dim,
                 emb_columns, numeric_columns):
        super(MLPNet, self).__init__()
        self.emb_columns = emb_columns
        self.numeric_columns = numeric_columns

        self.emb_layer = torch.nn.Embedding(nrof_cat, emb_dim)

        self.feature_bn = torch.nn.BatchNorm1d(input_size)

        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear1.apply(self.init_weights)
        self.bn1 = torch.nn.BatchNorm1d(hidden_size)

        self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear2.apply(self.init_weights)
        self.dp2 = torch.nn.BatchNorm1d(hidden_size)
        
        self.linear3 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear3.apply(self.init_weights)
        self.bn3 = torch.nn.BatchNorm1d(hidden_size)

        self.linear4 = torch.nn.Linear(hidden_size, output_size)

    def init_weights(self, m):
        if type(m) == nn.Linear:
            torch.nn.init.xavier_uniform(m.weight)

    def forward(self, x):
        emb_output = self.emb_layer(torch.tensor(x[self.emb_columns], dtype=torch.int64))
        numeric_feats = torch.tensor(pd.DataFrame(x)[self.numeric_columns].values, dtype=torch.float32)

        concat_input = torch.cat([numeric_feats, emb_output], dim=1)
        output = self.feature_bn(concat_input)

        output = self.linear1(output)
        output = self.bn1(output)
        output = torch.relu(output)
        
        output = self.linear2(output)
        output = self.dp2(output)
        output = torch.relu(output)

        output = self.linear3(output)
        output = self.bn3(output)
        output = torch.relu(output)

        output = self.linear4(output)
        predictions = torch.softmax(output, dim=1)

        return predictions

In [35]:
def run_train(model, train_loader, valid_loader):
    step = 0
    for epoch in range(EPOCHS):
        model.train()

        for features, label in train_loader:
            # Reset gradients
            optimizer.zero_grad()

            output = model(features)
            # Calculate error and backpropagate
            loss = criterion(output, label.type(torch.LongTensor))
            loss.backward()
            acc = accuracy(output, label.type(torch.LongTensor)).item()
            
            # Update weights with gradients
            optimizer.step()

            step += 1

            if step % 100 == 0:
                print('EPOCH %d STEP %d : train_loss: %f train_acc: %f' %
                      (epoch, step, loss.item(), acc))
                
        train_writer.add_scalar('CrossEntropyLoss', loss, epoch)
        train_writer.add_scalar('Accuracy', acc, epoch)
                
        for features, label in valid_loader:
                    
            output = model(features)
            loss = criterion(output, label.type(torch.LongTensor))
            acc = accuracy(output, label.type(torch.LongTensor)).item()
            step += 1
            
        valid_writer.add_scalar('CrossEntropyLoss', loss, epoch)
        valid_writer.add_scalar('Accuracy', acc, epoch)

    return step

In [36]:
def collate(batch):
    return default_collate(batch)

In [37]:
# Пример Loader с Sampler
animal_dataset = CustomDataset()
sampler = Sampler(animal_dataset)
sampled_loader = data_utils.DataLoader(dataset=animal_dataset, sampler=sampler, collate_fn=collate,
                                     batch_size=BATCH_SIZE)

(26729, 34)
   IsDog  Age  HasName  NameLength  NameFreq  MixColor  ColorFreqAsIs  \
0      1    1        1           7  0.000157         1       0.032919   
1      0    1        1           5  0.000655         0       0.008092   
2      1    1        1           6  0.000052         1       0.026293   
3      0    0        0           7  0.285871         0       0.000471   
4      1    1        0           7  0.285871         0       0.023831   

   ColorFreqBase  TabbyColor  MixBreed  ...  SexStatus_Flawed  \
0       0.463624           0         1  ...                 1   
1       0.015005           1         1  ...                 1   
2       0.357521           0         1  ...                 1   
3       0.058418           0         1  ...                 0   
4       0.075353           0         0  ...                 1   

   SexStatus_Intact  SexStatus_Unknown  Weekday_cos  Weekday_sin   Hour_cos  \
0                 0                  0     1.246980     1.563663  13.877134   


In [43]:
animal_dataset = CustomDataset()

train_dataset = TrainDataset()
valid_dataset = ValidDataset()

train_loader = data_utils.DataLoader(dataset=train_dataset, shuffle=True,
                                     batch_size=BATCH_SIZE)
valid_loader = data_utils.DataLoader(dataset=valid_dataset, shuffle=True,
                                     batch_size=BATCH_SIZE)

model = MLPNet(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, animal_dataset.nrof_emb_categories,
               EMBEDDING_SIZE,
               animal_dataset.embedding_column, animal_dataset.numeric_columns)

criterion = nn.CrossEntropyLoss()
accuracy = Accuracy()

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

(26729, 34)
   IsDog  Age  HasName  NameLength  NameFreq  MixColor  ColorFreqAsIs  \
0      1    1        1           7  0.000157         1       0.032919   
1      0    1        1           5  0.000655         0       0.008092   
2      1    1        1           6  0.000052         1       0.026293   
3      0    0        0           7  0.285871         0       0.000471   
4      1    1        0           7  0.285871         0       0.023831   

   ColorFreqBase  TabbyColor  MixBreed  ...  SexStatus_Flawed  \
0       0.463624           0         1  ...                 1   
1       0.015005           1         1  ...                 1   
2       0.357521           0         1  ...                 1   
3       0.058418           0         1  ...                 0   
4       0.075353           0         0  ...                 1   

   SexStatus_Intact  SexStatus_Unknown  Weekday_cos  Weekday_sin   Hour_cos  \
0                 0                  0     1.246980     1.563663  13.877134   


In [44]:
step = run_train(model, train_loader, valid_loader)

EPOCH 16 STEP 1700 : train_loss: 1.180318 train_acc: 0.718518
EPOCH 17 STEP 1800 : train_loss: 1.222556 train_acc: 0.675781
EPOCH 18 STEP 1900 : train_loss: 1.278427 train_acc: 0.609375
EPOCH 19 STEP 2000 : train_loss: 1.199100 train_acc: 0.707031
EPOCH 20 STEP 2100 : train_loss: 1.194548 train_acc: 0.707031
EPOCH 21 STEP 2200 : train_loss: 1.279716 train_acc: 0.621094
EPOCH 22 STEP 2300 : train_loss: 1.280003 train_acc: 0.613281
EPOCH 23 STEP 2400 : train_loss: 1.199502 train_acc: 0.695312
EPOCH 24 STEP 2500 : train_loss: 1.256261 train_acc: 0.636719
EPOCH 25 STEP 2600 : train_loss: 1.230916 train_acc: 0.660156
EPOCH 26 STEP 2700 : train_loss: 1.202373 train_acc: 0.695312
EPOCH 27 STEP 2800 : train_loss: 1.173871 train_acc: 0.722656
EPOCH 28 STEP 2900 : train_loss: 1.209973 train_acc: 0.699219
EPOCH 29 STEP 3000 : train_loss: 1.242350 train_acc: 0.656250
EPOCH 30 STEP 3100 : train_loss: 1.205398 train_acc: 0.695312
EPOCH 31 STEP 3200 : train_loss: 1.225600 train_acc: 0.675781
EPOCH 32

In [51]:
#Sampler(animal_dataset):