In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.image as img

import json

from torchvision import transforms as T
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms.functional as F

import torch
import os

import numpy as np


from tqdm.notebook import tqdm

import cv2

from PIL import Image
import pickle
import torch.nn as nn

import time

In [2]:
def get_files(dir_path):
    res = []
    for path in os.listdir(dir_path):
        if os.path.isfile(os.path.join(dir_path, path)) and 'outputs' in path:
            res.append(os.path.join(dir_path, path))
    res = sorted(res, key=lambda x: int(x.strip('.pkl').split('_')[-1]))
    return res

In [3]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']


def train_model(model, train_loader, val_loader, loss, optimizer, num_epochs, scheduler, device):
    t1 = time.time()
    best_model_name = None
    loss_history = []
    train_history = []
    val_history = []
    top_val_accuracy = 0.64 
    for epoch in range(num_epochs):
        model.train()
        loss_accum = 0
        correct_samples = 0
        total_samples = 0
        for i_step, (x, y) in enumerate(tqdm(train_loader)):
            x = x.to(device)
            y = y.to(device)
            prediction = model(x)    
            loss_value = loss(prediction, y.type(torch.long))
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
            
            _, indices = torch.max(prediction, 1)
            correct_samples += torch.sum(indices == y)
            total_samples += y.shape[0]
            
            loss_accum += loss_value

        ave_loss = loss_accum / (i_step + 1)
        train_accuracy = float(correct_samples) / total_samples
        val_accuracy = compute_accuracy(model, val_loader, device)
        
        loss_history.append(float(ave_loss))
        train_history.append(train_accuracy)
        val_history.append(val_accuracy)
        if scheduler != None:
            scheduler.step()

        print("Epoch: %i; %f sec; lr: %f; Average loss: %f, Train accuracy: %f, Val accuracy: %f" % 
              (epoch, round((time.time() - t1) * 1000, 2), get_lr(optimizer), ave_loss, train_accuracy, val_accuracy))

  
        if val_accuracy > top_val_accuracy:
            top_val_accuracy = val_accuracy
            model_name = f'classifier_{epoch}_{round(val_accuracy, 3)}.ckpt'
            best_model_name = model_name
            torch.save(model, open(model_name, 'wb'))
            print("saved", model_name)

    return loss_history, train_history, val_history, best_model_name
        
    
def compute_accuracy(model, loader, device):
    """
    Computes accuracy on the dataset wrapped in a loader    
    Returns: accuracy as a float value between 0 and 1
    """
    model.eval()
    correct_samples = 0
    total_samples = 0 
    for i_step, (x, y) in enumerate(loader):
        x = x.to(device)
        y = y.to(device)
        prediction = model(x)
        _, indices = torch.max(prediction, 1)
        correct_samples += torch.sum(indices == y)
        total_samples += y.shape[0]            

    val_accuracy = float(correct_samples) / total_samples
         
    return val_accuracy

In [4]:
class FeaturesDataset(torch.utils.data.Dataset):
    def __init__(self, feature_files, label_file):
        self.feature_files = feature_files
        
        with open(label_file, 'rb') as f:
            labels = pickle.load(f)
            self.labels = [{'id':k, 'label':labels[k]['label'], 'text':labels[k]['text']} for i, k in enumerate(labels)]
            
        self.current_chunk_id = -1
        self.chunk_size = None
        self.id2name = []
        self.ln = 0
        
        for i in range(len(self.feature_files)):
            with open(self.feature_files[i], 'rb') as f:
                chunk = pickle.load(f)
                self.ln += len(chunk)
                if self.chunk_size is None or len(chunk) > self.chunk_size:
                    self.chunk_size = len(chunk)
                    
                self.id2name.extend([k for k in chunk])
    
    def __getitem__(self, index: int):
        chunk_num = index // self.chunk_size
        if chunk_num != self.current_chunk_id:
            with open(self.feature_files[chunk_num], 'rb') as f:
                chunk = pickle.load(f)
                
                self.current_chunk = [{'id':k, 'tensor':chunk[k]} for k in chunk]
                
                self.current_chunk_id = chunk_num
                print(f'chunk {self.current_chunk_id} loaded')
        
        id1 = self.current_chunk[index % self.chunk_size]['id']
        id2 = self.labels[index]['id']
        id3 = self.id2name[index]
        
        assert id1 == id2
        assert id2 == id3
        return self.current_chunk[index % self.chunk_size]['tensor'], self.labels[index]['label']
    
    def __len__(self):
        return self.ln

In [5]:
features_val_dataset = FeaturesDataset(get_files('d:\\val'), 'd:\\val\\labels_val.pkl')
len(features_val_dataset), len(features_val_dataset.labels), len(features_val_dataset.id2name)

(498, 498, 498)

In [6]:
for i in range(len(features_val_dataset)):
    features_val_dataset[i]

chunk 0 loaded
chunk 1 loaded
chunk 2 loaded
chunk 3 loaded
chunk 4 loaded
chunk 5 loaded
chunk 6 loaded
chunk 7 loaded
chunk 8 loaded
chunk 9 loaded


In [7]:
features_train_dataset = FeaturesDataset(get_files('d:\\train'), 'd:\\train\\labels_train.pkl')

len(features_train_dataset), len(features_train_dataset.labels), len(features_train_dataset.id2name)

(8464, 8464, 8464)

In [8]:
#assert False

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

device = "cpu"

In [10]:
for x, y in tqdm(DataLoader(features_val_dataset, batch_size=8)):
    print(x.shape, y)
    break

  0%|          | 0/63 [00:00<?, ?it/s]

chunk 0 loaded
torch.Size([8, 5402394]) tensor([1, 1, 1, 1, 1, 1, 1, 1])


#### Обучение нейросети

In [11]:
input_shape = features_val_dataset[0][0].shape[0]
num_classes = 2

In [12]:
torch.manual_seed(1024)

shape = 256
nn_model = nn.Sequential(
            nn.Linear(input_shape, shape, dtype=torch.half),
            #nn.Dropout(0.66),
            #nn.BatchNorm1d(shape, dtype=torch.half),
            nn.ReLU(inplace=True),    
    
            nn.Linear(shape, shape, dtype=torch.half),
            #nn.Dropout(0.66),
            #nn.BatchNorm1d(shape, dtype=torch.half),
            nn.ReLU(inplace=True),    
    
            nn.Linear(shape, num_classes, dtype=torch.half),
            )

nn_model = nn_model.to(device)
print(nn_model)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8)

loss_history, train_history, val_history, best_model_name = train_model(
    nn_model, 
    DataLoader(features_train_dataset, batch_size=500),
    DataLoader(features_val_dataset, batch_size=500),
    loss, optimizer, 5, scheduler, device)
print('end!')


Sequential(
  (0): Linear(in_features=5402394, out_features=256, bias=True)
  (1): ReLU(inplace=True)
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): ReLU(inplace=True)
  (4): Linear(in_features=256, out_features=2, bias=True)
)


  0%|          | 0/17 [00:00<?, ?it/s]

chunk 0 loaded
chunk 1 loaded
chunk 2 loaded
chunk 3 loaded
chunk 4 loaded
chunk 5 loaded
chunk 6 loaded
chunk 7 loaded
chunk 8 loaded
chunk 9 loaded


RuntimeError: "clamp_min_cpu" not implemented for 'Half'

In [None]:
fig = plt.figure(figsize=(10, 8))    
plt.xlabel("#iteration")
plt.ylabel("loss")
plt.plot(loss_history, label='loss')
plt.plot(train_history, label='train accuracy')
plt.plot(val_history, label='val accuracy')
fig.legend()
plt.show()

In [None]:
print("best model:", best_model_name)

best_model = torch.load(open(best_model_name, 'rb'))
print(best_model)

In [None]:
best_model.eval()
for i_step, (x, y) in enumerate(DataLoader(features_val_dataset, batch_size=5000)):
    prediction = best_model(x)

acc_score = accuracy_score(np.array([x.item() for x in labels_val]), torch.max(prediction, 1)[1])
auc_score = roc_auc_score(np.array([x.item() for x in labels_val]), prediction[:,1].detach().numpy())

fpr, tpr, thresh = roc_curve(labels_val, prediction[:,1].detach().numpy(), pos_label=1)

random_probs = [0 for i in range(len(labels_val))]
p_fpr, p_tpr, _ = roc_curve(labels_val, random_probs, pos_label=1)
auc_score = roc_auc_score(labels_val, prediction[:,1].detach().numpy())

print('Accuracy: ', acc_score, '\n', 'ROC AUC: ', auc_score, sep='')

plt.plot(fpr, tpr, linestyle='--',color='orange')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
plt.title('ROC Curve', fontsize=20)
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive rate',fontsize=18)

plt.show();