# Library initialization

In [None]:
import numpy as np
import pandas as pd
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import shutil
import cv2
import time
import torchvision
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, random_split
import albumentations 
from albumentations.pytorch import ToTensorV2 as AT


import matplotlib.pyplot as plt

In [None]:
# Collect image paths. Training images live in one sub-directory per class
# under PATH; test images sit directly inside test_path.
PATH = 'yandex/'

# Keep only real directories so a stray file next to the class folders does
# not make the nested os.listdir() call below fail. Sorting makes the listing
# order deterministic (os.listdir() returns entries in arbitrary order).
train_path = [
    os.path.join(PATH, directory)
    for directory in sorted(os.listdir(PATH))
    if os.path.isdir(os.path.join(PATH, directory))
]

test_path = "test/"

# Paths are still built with '/' exactly as before, because they are used as
# plain strings further down; only regular files are accepted as images.
train_list = [
    directory + '/' + pic
    for directory in train_path
    for pic in sorted(os.listdir(directory))
    if os.path.isfile(os.path.join(directory, pic))
]

test_list = [
    test_path + pic
    for pic in sorted(os.listdir(test_path))
    if os.path.isfile(os.path.join(test_path, pic))
]
print(len(train_list), len(test_list))

17057 1753


In [None]:
class ChartsDataset(Dataset):
    """Image dataset whose class label is derived from the image path.

    Args:
        path: Prefix that is concatenated (as a string) in front of every
            entry of ``img_list`` to form the path that is opened.
        img_list: Sequence of image paths relative to ``path``.
        transform: Optional callable invoked as ``transform(image=image)``;
            the ``"image"`` entry of its result replaces the image.
        mode: ``"train"`` makes ``__getitem__`` return ``(image, label)``;
            any other value makes it return ``(image, image_name)``.
    """

    # Ordered (substring, label) pairs. The first substring found in the
    # image name wins; names matching none of them get label 0 (just_image).
    _LABEL_KEYWORDS = (
        ("bar_chart", 1),
        ("diagram", 2),
        ("flow_chart", 3),
        ("graph", 4),
        ("growth_chart", 5),
        ("pie_chart", 6),
        ("table", 7),
    )

    def __init__(self, path, img_list, transform=None, mode='train'):
        self.path = path
        self.img_list = img_list
        self.transform = transform
        self.mode = mode

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.img_list)

    @staticmethod
    def _label_from_name(image_name):
        """Map an image name to its integer class label (0 when unmatched)."""
        for keyword, label in ChartsDataset._LABEL_KEYWORDS:
            if keyword in image_name:
                return label
        return 0  # just_image

    @staticmethod
    def _read_rgb(full_path):
        """Read ``full_path`` and return it as an RGB array.

        Raises:
            OSError: If OpenCV could not decode the file.
        """
        if full_path.rsplit(".", 1)[-1].lower() == "gif":
            # GIFs are opened as a video stream and only the first frame is
            # used. Release the capture handle even if reading fails.
            gif = cv2.VideoCapture(full_path)
            try:
                _, image = gif.read()
            finally:
                gif.release()
        else:
            image = cv2.imread(full_path)

        # Both readers signal failure by returning None instead of raising.
        if image is None:
            raise OSError(f"Could not read image: {full_path}")

        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def __getitem__(self, idx):
        """Return ``(image, label)`` in train mode, else ``(image, image_name)``."""
        image_name = self.img_list[idx]
        image = self._read_rgb(self.path + image_name)

        # Distribution of pictures into categories
        label = self._label_from_name(image_name)

        if self.transform:
            augmented = self.transform(image=image)
            image = augmented["image"]

        if self.mode == "train":
            return image, label
        return image, image_name

In [None]:
batch_size = 64
# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to 0 so the value is always a valid integer worker count.
num_workers = os.cpu_count() or 0
img_size = 256

In [None]:
# There is not enough data, so the training images are augmented on the fly.
train_steps = [
    albumentations.Resize(img_size, img_size),
    albumentations.CLAHE(),
    albumentations.ChannelShuffle(),
    albumentations.Downscale(),
    albumentations.Cutout(),
    albumentations.ShiftScaleRotate(),
    albumentations.Normalize(),
    AT(),
]
data_transforms = albumentations.Compose(train_steps)

# Test images are only resized, normalized and converted to tensors.
test_steps = [
    albumentations.Resize(img_size, img_size),
    albumentations.Normalize(),
    AT(),
]
data_transforms_test = albumentations.Compose(test_steps)

In [None]:
# Build the datasets: the training set yields (image, label) pairs, the test
# set yields (image, image_name) pairs because of mode="test".
trainset = ChartsDataset(
    path='./',
    img_list=train_list,
    transform=data_transforms,
)
testset = ChartsDataset(
    path='./',
    img_list=test_list,
    transform=data_transforms_test,
    mode="test",
)

In [None]:
# Hold out 10% of the training images for validation.
valid_size = int(len(train_list) * 0.1)

# Draw one random permutation of the image indices and slice it into the two
# subsets, so the split is still random and the subset sizes are unchanged.
shuffled_indices = torch.randperm(len(train_list)).tolist()
train_indices = shuffled_indices[:len(train_list) - valid_size]
valid_indices = shuffled_indices[len(train_list) - valid_size:]

train_set = torch.utils.data.Subset(trainset, train_indices)

# The validation subset reads the same files through the non-augmenting test
# pipeline. Splitting `trainset` directly would apply the random training
# augmentations to validation images and make the validation loss noisy.
valid_set = torch.utils.data.Subset(
    ChartsDataset('./', train_list, transform=data_transforms_test),
    valid_indices,
)

In [None]:
# Options shared by the training and validation loaders.
# Remember to shuffle data.
shared_loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
    num_workers=num_workers,
)

trainloader = DataLoader(train_set, **shared_loader_options)
validloader = DataLoader(valid_set, **shared_loader_options)

# The test loader keeps the dataset order and does not pin memory.
testloader = DataLoader(
    testset,
    batch_size=batch_size,
    num_workers=num_workers,
)

In [None]:
# Run on the first GPU when CUDA is available for faster training; otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

# Model creation

In [None]:
# Start from a ResNet-152 with pretrained weights; progress=True shows a
# progress bar while the weights are downloaded.
model = torchvision.models.resnet152(
    progress=True,
    pretrained=True,
)

In [None]:
# Freeze every parameter currently in the model so that gradients are not
# computed for them.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

# Input width of the existing final fully connected layer.
in_features = model.fc.in_features

In [None]:
# Replace the final layer with a new head that outputs 8 values.
# NOTE(review): there is no activation between the two Linear layers, so the
# head as a whole is still a single affine map - confirm whether a
# nonlinearity was intended here.
hidden_layer = nn.Linear(in_features, 1024)
output_layer = nn.Linear(1024, 8)
model.fc = nn.Sequential(hidden_layer, output_layer)

In [None]:
def train_model(model_conv, train_loader, valid_loader, criterion, optimizer, sheduler, n_epochs):
    """Train ``model_conv`` with checkpointing and early stopping.

    After every epoch the mean validation loss is passed to ``sheduler.step``.
    Whenever it is less than or equal to the best value seen so far, the
    model's ``state_dict`` is written to ``'model.pt'``. Training stops early
    once the validation loss has failed to improve for more than ``patience``
    (10) consecutive epochs.

    Args:
        model_conv: Model to train; it is moved to the module-level ``device``.
        train_loader: Iterable of ``(data, target)`` training batches.
        valid_loader: Iterable of ``(data, target)`` validation batches.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        optimizer: Optimizer that updates the model's parameters.
        sheduler: Learning-rate scheduler whose ``step`` method accepts the
            mean validation loss.
        n_epochs: Maximum number of epochs to run.

    Returns:
        Tuple ``(model_conv, train_loss, val_loss)``. The two lists hold the
        per-batch losses of the last epoch that was run (empty when
        ``n_epochs`` is 0). The returned model carries the weights of that
        last epoch, not necessarily those stored in ``'model.pt'``.
    """
    model_conv.to(device)
    valid_loss_min = float("inf")
    # Number of consecutive non-improving epochs tolerated before stopping.
    patience = 10
    p = 0
    # Defined up front so the return value is valid even when n_epochs is 0.
    train_loss, val_loss = [], []

    # Loop over the epochs
    for epoch in range(1, n_epochs + 1):
        print(time.ctime(), 'Epoch:', epoch)

        # Validation below switches the model to eval mode; switch back to
        # training mode at the start of every epoch, not just the first one.
        model_conv.train()
        train_loss = []

        for data, target in tqdm(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model_conv(data)
            loss = criterion(output, target)
            train_loss.append(loss.item())
            loss.backward()
            optimizer.step()

        # Run validation without recording gradients; they are never used here.
        model_conv.eval()
        val_loss = []
        with torch.no_grad():
            for data, target in valid_loader:
                data, target = data.to(device), target.to(device)
                output = model_conv(data)
                loss = criterion(output, target)
                val_loss.append(loss.item())

        valid_loss = np.mean(val_loss)
        print(f'Epoch {epoch}, train loss: {np.mean(train_loss):.4f}, valid loss: {valid_loss:.4f}.')

        # Step the scheduler that was passed in, not a same-named global.
        sheduler.step(valid_loss)

        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_loss_min,
                valid_loss))
            torch.save(model_conv.state_dict(), 'model.pt')
            valid_loss_min = valid_loss
            p = 0
        else:
            # No improvement this epoch: count it towards early stopping.
            p += 1
            print(f'{p} epochs of increasing val loss')
            if p > patience:
                print('Stopping training')
                break

    return model_conv, train_loss, val_loss

In [None]:
# Cross-entropy loss on the model outputs.
criterion = nn.CrossEntropyLoss()

# Adam over the model's parameters.
optimizer = optim.Adam(model.parameters(), lr=0.0007)

# Multiply the learning rate by 0.8 once the monitored value has not improved
# for more than 3 consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=3,
    factor=0.8,
)

# Training

In [None]:
# Train for at most 80 epochs; train_model may stop earlier.
training_result = train_model(
    model,
    trainloader,
    validloader,
    criterion,
    optimizer,
    scheduler,
    n_epochs=80,
)
model_resnet, train_loss, val_loss = training_result

  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 15:46:47 2020 Epoch: 1


100%|██████████| 240/240 [03:18<00:00,  1.21it/s]


Epoch 1, train loss: 0.6156, valid loss: 0.3697.
Validation loss decreased (inf --> 0.369712).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 15:50:31 2020 Epoch: 2


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]


Epoch 2, train loss: 0.3505, valid loss: 0.3023.
Validation loss decreased (0.369712 --> 0.302332).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 15:54:01 2020 Epoch: 3


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 3, train loss: 0.3282, valid loss: 0.3094.
1 epochs of increasing val loss
Mon Mar  2 15:57:30 2020 Epoch: 4


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]


Epoch 4, train loss: 0.3037, valid loss: 0.2689.
Validation loss decreased (0.302332 --> 0.268940).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 16:01:01 2020 Epoch: 5


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 5, train loss: 0.3020, valid loss: 0.3065.
1 epochs of increasing val loss
Mon Mar  2 16:04:30 2020 Epoch: 6


100%|██████████| 240/240 [03:06<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 6, train loss: 0.2896, valid loss: 0.2757.
2 epochs of increasing val loss
Mon Mar  2 16:07:59 2020 Epoch: 7


100%|██████████| 240/240 [03:06<00:00,  1.28it/s]


Epoch 7, train loss: 0.2862, valid loss: 0.2473.
Validation loss decreased (0.268940 --> 0.247281).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 16:11:29 2020 Epoch: 8


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 8, train loss: 0.2986, valid loss: 0.2917.
1 epochs of increasing val loss
Mon Mar  2 16:14:58 2020 Epoch: 9


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 9, train loss: 0.2753, valid loss: 0.2739.
2 epochs of increasing val loss
Mon Mar  2 16:18:28 2020 Epoch: 10


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 10, train loss: 0.2682, valid loss: 0.2871.
3 epochs of increasing val loss
Mon Mar  2 16:21:57 2020 Epoch: 11


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 11, train loss: 0.2684, valid loss: 0.2602.
4 epochs of increasing val loss
Mon Mar  2 16:25:26 2020 Epoch: 12


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 12, train loss: 0.2548, valid loss: 0.2528.
5 epochs of increasing val loss
Mon Mar  2 16:28:54 2020 Epoch: 13


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]


Epoch 13, train loss: 0.2458, valid loss: 0.2449.
Validation loss decreased (0.247281 --> 0.244897).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 16:32:24 2020 Epoch: 14


100%|██████████| 240/240 [03:06<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 14, train loss: 0.2420, valid loss: 0.2628.
1 epochs of increasing val loss
Mon Mar  2 16:35:53 2020 Epoch: 15


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]


Epoch 15, train loss: 0.2560, valid loss: 0.2214.
Validation loss decreased (0.244897 --> 0.221356).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 16:39:23 2020 Epoch: 16


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 16, train loss: 0.2358, valid loss: 0.2382.
1 epochs of increasing val loss
Mon Mar  2 16:42:52 2020 Epoch: 17


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]


Epoch 17, train loss: 0.2332, valid loss: 0.2195.
Validation loss decreased (0.221356 --> 0.219492).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 16:46:22 2020 Epoch: 18


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 18, train loss: 0.2419, valid loss: 0.2651.
1 epochs of increasing val loss
Mon Mar  2 16:49:50 2020 Epoch: 19


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 19, train loss: 0.2455, valid loss: 0.2263.
2 epochs of increasing val loss
Mon Mar  2 16:53:19 2020 Epoch: 20


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 20, train loss: 0.2358, valid loss: 0.2221.
3 epochs of increasing val loss
Mon Mar  2 16:56:48 2020 Epoch: 21


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 21, train loss: 0.2416, valid loss: 0.2442.
4 epochs of increasing val loss
Mon Mar  2 17:00:17 2020 Epoch: 22


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 22, train loss: 0.2205, valid loss: 0.2260.
5 epochs of increasing val loss
Mon Mar  2 17:03:45 2020 Epoch: 23


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 23, train loss: 0.2169, valid loss: 0.2284.
6 epochs of increasing val loss
Mon Mar  2 17:07:13 2020 Epoch: 24


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]


Epoch 24, train loss: 0.2099, valid loss: 0.2078.
Validation loss decreased (0.219492 --> 0.207761).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 17:10:44 2020 Epoch: 25


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]


Epoch 25, train loss: 0.2191, valid loss: 0.1815.
Validation loss decreased (0.207761 --> 0.181462).  Saving model ...


  0%|          | 0/240 [00:00<?, ?it/s]

Mon Mar  2 17:14:14 2020 Epoch: 26


100%|██████████| 240/240 [03:06<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 26, train loss: 0.2137, valid loss: 0.2244.
1 epochs of increasing val loss
Mon Mar  2 17:17:43 2020 Epoch: 27


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 27, train loss: 0.2078, valid loss: 0.2359.
2 epochs of increasing val loss
Mon Mar  2 17:21:12 2020 Epoch: 28


100%|██████████| 240/240 [03:06<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 28, train loss: 0.2114, valid loss: 0.2292.
3 epochs of increasing val loss
Mon Mar  2 17:24:41 2020 Epoch: 29


100%|██████████| 240/240 [03:06<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 29, train loss: 0.2173, valid loss: 0.2610.
4 epochs of increasing val loss
Mon Mar  2 17:28:10 2020 Epoch: 30


100%|██████████| 240/240 [03:06<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 30, train loss: 0.2066, valid loss: 0.2311.
5 epochs of increasing val loss
Mon Mar  2 17:31:39 2020 Epoch: 31


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 31, train loss: 0.1954, valid loss: 0.2151.
6 epochs of increasing val loss
Mon Mar  2 17:35:07 2020 Epoch: 32


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 32, train loss: 0.2018, valid loss: 0.2673.
7 epochs of increasing val loss
Mon Mar  2 17:38:36 2020 Epoch: 33


100%|██████████| 240/240 [03:06<00:00,  1.29it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 33, train loss: 0.1922, valid loss: 0.1961.
8 epochs of increasing val loss
Mon Mar  2 17:42:04 2020 Epoch: 34


100%|██████████| 240/240 [03:06<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 34, train loss: 0.1843, valid loss: 0.1921.
9 epochs of increasing val loss
Mon Mar  2 17:45:33 2020 Epoch: 35


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

Epoch 35, train loss: 0.1912, valid loss: 0.2168.
10 epochs of increasing val loss
Mon Mar  2 17:49:02 2020 Epoch: 36


100%|██████████| 240/240 [03:07<00:00,  1.28it/s]


Epoch 36, train loss: 0.1892, valid loss: 0.2112.
11 epochs of increasing val loss
Stopping training


# Test

In [None]:
# Predict a label for every test image and write the submission file.
sample_submission = pd.read_csv("sample_submission.csv")

# NOTE(review): this uses the weights currently held in `model`, i.e. those of
# the last training epoch. train_model saves the best-validation weights to
# 'model.pt' - confirm whether those should be loaded here instead.
model.to(device)
model.eval()

pred_list = []
names_list = []

# Inference only: disable autograd once for the whole loop.
with torch.no_grad():
    for images, image_names in testloader:
        output = model(images.to(device))
        # Softmax does not change which class has the largest score, so take
        # the argmax of the raw outputs directly. This also avoids calling
        # F.softmax without an explicit `dim`.
        pred_list.extend(output.argmax(dim=1).cpu().tolist())
        names_list.extend(image_names)

# Assign by column name so real DataFrame columns are always written, and
# keep only the file name of each path (the "test/" prefix is dropped).
sample_submission["image_name"] = [os.path.basename(name) for name in names_list]
sample_submission["label"] = pred_list
sample_submission.to_csv('submission_152_10-3.csv', index=False)

  # Remove the CWD from sys.path while we load stuff.
