In [1]:
import pandas as pd 
import numpy as np
import glob
from tqdm import tqdm
import cv2
from sklearn.model_selection import train_test_split
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import datasets, models, transforms
from torchvision.models import resnet18

from PIL import Image
import matplotlib.pyplot as plt
from IPython.display import clear_output

## Датасет

Прежде чем разбираться с моделями, нам надо в первую очередь разобраться с тем, как грузить датасет. Давайте напишем класс в торче для этого.

In [2]:
class ImageDataset(Dataset):
    def __init__(self, data_df, transform=None):

        self.data_df = data_df
        self.transform = transform

    def __getitem__(self, idx):
        # достаем имя изображения и ее лейбл
        image_name = self.data_df.iloc[idx]['img']
        label = self.data_df.iloc[idx]['sing2']

        # читаем картинку. read the image
        image = cv2.imread(f"/media/murad/SSD/krasn/train_dataset_train/train/{image_name}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)
        
        # преобразуем, если нужно. transform it, if necessary
        if self.transform:
            image = self.transform(image)
        
        return image, torch.tensor(label).long()#to(torch.float32)
    
    def __len__(self):
        return len(self.data_df)

In [3]:
# задаем преобразование изображения.

size = 384
train_transform = transforms.Compose([
    transforms.Resize((size, size)), 
    #transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225]),
])

valid_transform = transforms.Compose([
    transforms.Resize((size, size)), 
    #transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225]),
])

In [4]:
# читаем датасет
data_df = pd.read_csv("/media/murad/SSD/krasn/train_dataset_train/train.csv")

In [5]:
data_df.head(3)

Unnamed: 0,id,img,sing1,sing2,sing3,sing4,sing5,sing6,sing7,sing8
0,807,5-avi-frame24_jpg.rf.5dec372f9195e9a88ff7dd3bd...,1,2,37,0,0,0,0,0
1,121,6-avi-frame6431_jpg.rf.1ad48ac0ce545b88cefb946...,8,0,0,0,0,0,0,0
2,1130,9-avi-frame1457_jpg.rf.634a979898a9caa4d106913...,21,0,0,0,0,0,0,0


In [6]:
from os import listdir

print("Обучающей выборки " ,len(listdir("/media/murad/SSD/krasn/train_dataset_train/train")))
print("Тестовой выборки " ,len(listdir("/media/murad/SSD/krasn/test_dataset_test/test")))

Обучающей выборки  778
Тестовой выборки  388


In [7]:
# разделим датасет на трейн и валидацию, чтобы смотреть на качество
train_df, valid_df = train_test_split(data_df, test_size=0.2, random_state=1)

In [9]:
train_dataset = ImageDataset(train_df, train_transform)
valid_dataset = ImageDataset(valid_df, valid_transform)

In [10]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=32,
                                           shuffle=True,
                                           pin_memory=True,
                                           num_workers=8)

valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                           batch_size=32,
                                           # shuffle=True,
                                           pin_memory=True,
                                           num_workers=8)

## Вспомогательные функции

In [11]:
def crossvalid(res_model=None,criterion=None,optimizer=None,dataset=None,k_fold=5):
    
    train_score = pd.Series()
    val_score = pd.Series()
    
    total_size = len(dataset)
    fraction = 1/k_fold
    seg = int(total_size * fraction)
    # tr:train,val:valid; r:right,l:left;  eg: trrr: right index of right side train subset 
    # index: [trll,trlr],[vall,valr],[trrl,trrr]
    for i in range(k_fold):
        trll = 0
        trlr = i * seg
        vall = trlr
        valr = i * seg + seg
        trrl = valr
        trrr = total_size
        
        train_left_indices = list(range(trll,trlr))
        train_right_indices = list(range(trrl,trrr))
        
        train_indices = train_left_indices + train_right_indices
        val_indices = list(range(vall,valr))
        
        train_set = torch.utils.data.dataset.Subset(dataset,train_indices)
        val_set = torch.utils.data.dataset.Subset(dataset,val_indices)
        
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=50,
                                          shuffle=True, num_workers=4)
        val_loader = torch.utils.data.DataLoader(val_set, batch_size=50,
                                          shuffle=True, num_workers=4)
        train_acc = train(res_model,criterion,optimizer,train_loader,val_loader,1)
        train_score.at[i] = train_acc
        #val_acc = valid(res_model,criterion,optimizer,val_loader)
        #val_score.at[i] = val_acc
    
    return train_score,val_score

In [12]:
def plot_history(train_history, val_history, title='loss'):
    plt.figure()
    plt.title('{}'.format(title))
    plt.plot(train_history, label='train', zorder=1)
    
    points = np.array(val_history)
    steps = list(range(0, len(train_history) + 1, int(len(train_history) / len(val_history))))[1:]
    
    plt.scatter(steps, val_history, marker='+', s=180, c='orange', label='val', zorder=2)
    plt.xlabel('train steps')
    
    plt.legend(loc='best')
    plt.grid()

    plt.show()

In [13]:
def train(res_model, criterion, optimizer, train_dataloader, test_dataloader, NUM_EPOCH=15):
    train_loss_log = []
    val_loss_log = []
    
    train_acc_log = []
    val_acc_log = []
    

    for epoch in range(NUM_EPOCH):
        print(f'Epoch - {epoch + 1}/{NUM_EPOCH}')
        model.train()
        train_loss = 0.
        train_size = 0
        
        train_pred = 0.
        
        correct = 0
        l = len(train_dataloader)
        for imgs, labels in tqdm(train_dataloader, total=len(train_dataloader)):
            optimizer.zero_grad()

            imgs = imgs.cuda()
            labels = labels.cuda()
            #print(labels)
            y_pred = model(imgs)

            loss = criterion(y_pred, labels)
            loss.backward()
            
            train_loss += loss.item()
            train_size += y_pred.size(0)
            train_loss_log.append(loss.data / y_pred.size(0))

            correct += (np.argmax(y_pred.cpu().detach().numpy()) == labels).float().sum()

            optimizer.step()
        accuracy = 100 * correct / l
    # trainset, not train_loader
    # probably x in your case

        print("Accuracy = {}".format(accuracy))

        train_acc_log.append(train_pred / train_size)

        val_loss = 0.
        val_size = 0
        
        val_pred = 0.
        
        model.eval()
        val_pred = 0

        with torch.no_grad():
            for imgs, labels in tqdm(test_dataloader, total=len(test_dataloader)):
                
                imgs = imgs.cuda()
                labels = labels.cuda()
                
                pred = model(imgs)
                loss = criterion(pred, labels)
                
                val_loss += loss.item()
                val_size += pred.size(0)
                
                val_pred += (np.argmax(pred.cpu().detach().numpy()) == labels).float().sum()
#                 val_pred += (np.argmax(y_pred.cpu().detach().numpy()) == labels).float().sum()

        l = len(test_dataloader)
        val_pred = 100 * val_pred / l
        val_loss_log.append(val_loss / val_size)
        val_acc_log.append(val_pred / val_size)

        clear_output()
#         plot_history(train_loss_log, val_loss_log, 'loss')
        


        print('Train loss:', (train_loss / train_size)*100)
        print('Val loss:', (val_loss / val_size)*100)
        print('Train acc:', accuracy)
        print('Val acc:', val_pred)
        
    return train_loss_log, train_acc_log, val_loss_log, val_acc_log

## Модель

In [14]:
torch.cuda.empty_cache()

In [15]:
from pytorch_pretrained_vit import ViT

# Подгружаем модель
model = ViT('L_32_imagenet1k', pretrained=True)

model.fc = nn.Linear(1024, 70)

model = model.cuda()
criterion = torch.nn.CrossEntropyLoss()

Loaded pretrained weights.


In [18]:
model

ViT(
  (patch_embedding): Conv2d(3, 1024, kernel_size=(32, 32), stride=(32, 32))
  (positional_embedding): PositionalEmbedding1D()
  (transformer): Transformer(
    (blocks): ModuleList(
      (0): Block(
        (attn): MultiHeadedSelfAttention(
          (proj_q): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_k): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_v): Linear(in_features=1024, out_features=1024, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (pwff): PositionWiseFeedForward(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        )
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (drop): Dropout(p=0.1, inplace=False)
      )
      

In [16]:
optimizer = torch.optim.Adam(model.fc.parameters(), lr=0.001)

In [17]:
train_loss_log, train_acc_log, val_loss_log, val_acc_log = train(model, 
                                                                 criterion, 
                                                                optimizer, 
                                                                 train_loader, 
                                                                 valid_loader, 
                                                                 40)

Train loss: 1.481292672287613
Val loss: 4.1085755213713036
Train acc: tensor(0., device='cuda:0')
Val acc: tensor(0., device='cuda:0')


## Посмотрим метрики нашей итоговой модели на валидации.

In [18]:
valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                           batch_size=1,
                                           # shuffle=True,
                                           pin_memory=True,
                                           num_workers=8)

model.eval()
valid_predicts = []

for imgs, _ in tqdm(valid_loader):
    
    imgs = imgs.cuda()
    pred = model(imgs)

    pred_numpy = pred.cpu().detach().numpy()
#     print(pred_nump6y.shape)
    for class_obj in pred_numpy:
      index, max_value = max(enumerate(class_obj), key=lambda i_v: i_v[1])
      valid_predicts.append(index)

100%|█████████████████████████████████████████| 156/156 [00:04<00:00, 37.95it/s]


In [19]:
np.array(valid_predicts).shape

(156,)

In [20]:
valid_df["pred"] = valid_predicts

# Посчитаем точность модели

In [21]:
from sklearn.metrics import recall_score

In [22]:
valid_df['pred'].values

array([ 4,  4,  5,  0,  0,  4,  4, 35,  4,  0,  0,  0,  0,  0,  1,  0, 35,
        0, 58, 29, 35, 14,  0,  0,  4,  4, 33,  2,  0, 10,  1, 46, 16,  0,
        0, 35,  4, 12, 45, 30, 35, 11,  0,  4,  4,  0, 35, 29,  4,  0, 10,
        0,  4, 29,  2,  4,  0, 29,  4,  4,  0, 47,  4, 29,  1,  7,  0,  7,
        1,  0,  0,  4,  0,  0,  0,  0, 35,  0, 14,  2, 35,  4,  0,  0, 30,
       10, 35,  0,  4, 26,  0,  0,  4, 29,  0, 16,  1,  0,  0, 31,  4,  7,
        0,  0,  0,  7, 29, 29,  0,  4,  2,  4,  4,  0,  4,  0, 19, 35,  4,
        4, 15,  2,  0, 49,  0,  4,  0,  0, 10,  4,  4, 10,  7,  2,  0,  4,
        0,  0,  1,  5, 29, 16, 16,  4,  0,  1, 35,  0, 12, 35,  0,  0,  5,
       35,  0,  0])

In [23]:
val_accuracy = recall_score(valid_df['sing2'].values, valid_df['pred'].values, average='macro', zero_division  =0)
print(f"Validation accuracy = {val_accuracy}")

Validation accuracy = 0.3718917550953146


In [24]:
valid_df['sing2'].values

array([ 4,  4,  5, 14,  0,  4,  4,  0,  4,  5,  0,  0,  0,  0,  1, 10, 35,
        0, 58, 29,  5,  4,  0,  0,  4,  4, 33, 10,  0, 10,  1, 10, 24,  0,
        5,  5,  4,  1, 45, 29, 35,  2, 55,  4,  0,  0,  4, 29,  4,  0, 10,
        0,  4, 29,  2,  4,  0, 29,  4,  5,  0, 16,  4, 29,  0,  7,  0,  1,
        2,  0, 14,  4,  0,  0,  0,  0, 35,  0,  0, 49, 35, 51,  0,  0, 40,
        4,  0,  2,  4, 27,  0,  4, 51, 26,  0, 16,  1,  0,  4, 31,  4,  0,
        0,  0,  1, 10, 29, 29,  0,  0,  2,  4,  4,  5, 10,  0, 11, 35,  4,
        4,  0,  2, 39, 49,  0,  4,  0,  0, 10,  0,  4, 10,  7,  2,  0, 15,
        0,  0,  1,  5, 29, 16,  0,  4,  0,  2, 17,  0,  0, 35,  0,  0,  5,
       35,  0,  0])

In [25]:
class TestDataset(Dataset):
    def __init__(self, data_df, transform=None):

        self.data_df = data_df
        self.transform = transform

    def __getitem__(self, idx):
        image_name = self.data_df.iloc[idx]['img']
        image = cv2.imread(f"/media/murad/SSD/krasn/test_dataset_test_/test/{image_name}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)
        
        # преобразуем, если нужно. transform it, if necessary
        if self.transform:
            image = self.transform(image)
        
        return image
    
    def __len__(self):
        return len(self.data_df)
    
    
test_transform = transforms.Compose([
    transforms.Resize((size, size)),
    #transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225]),
])

# читаем датасет
test_df = pd.read_csv("/media/murad/SSD/krasn/test_dataset_test_/test.csv")
test_df

Unnamed: 0,id,img
0,754,6-avi-frame14887_jpg.rf.bb0bf6b4b122c23e1b33a9...
1,29,5-avi-frame2916_jpg.rf.1ecdbbc129d33896fd25b9b...
2,1157,5-avi-frame2871_jpg.rf.f73998176f8a19ee03f8704...
3,1049,6-avi-frame5752_jpg.rf.a067b0fc55b770c9b10bb7a...
4,715,6-avi-frame5678_jpg.rf.f140419d224703d49fe65db...
...,...,...
383,243,3-avi-frame433_jpg.rf.b29d8fb61048b3016805a62e...
384,921,6-avi-frame4125_jpg.rf.e1dbdc7a7421bc9bc95a58f...
385,881,6-avi-frame1424_jpg.rf.e171549738da66b200b537c...
386,503,5-avi-frame2867_jpg.rf.c0bbb8942eeb6fac582e6b6...


In [26]:
test_dataset = TestDataset(test_df, test_transform)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=1,
                                           # shuffle=True,
                                           pin_memory=True,
                                           num_workers=8)

model.eval()
test_predicts = []

for imgs in tqdm(test_loader):
    imgs = imgs.cuda()
    pred = model(imgs)

    pred_numpy = pred.cpu().detach().numpy()
#     print(pred_nump6y.shape)
    for class_obj in pred_numpy:
      index, max_value = max(enumerate(class_obj), key=lambda i_v: i_v[1])
      test_predicts.append(index)

100%|█████████████████████████████████████████| 388/388 [00:09<00:00, 40.23it/s]


In [27]:
sings_df = pd.read_csv('pred1.csv')

In [28]:
df = pd.DataFrame()
df['id'] = sings_df['id']
df['sing1'] = sings_df['sing1']
df['sing2'] = test_predicts
df['sing3'] = [0 for i in range(len(test_predicts))]
df['sing4'] = [0 for i in range(len(test_predicts))]
df['sing5'] = [0 for i in range(len(test_predicts))]
df['sing6'] = [0 for i in range(len(test_predicts))]
df['sing7'] = [0 for i in range(len(test_predicts))]
df['sing8'] = [0 for i in range(len(test_predicts))]


In [29]:
df.to_csv('pred2.csv', index=False)

In [30]:
torch.save(model.state_dict(), '/home/murad/models/model_sing2.pth')