In [None]:
import warnings  

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import Subset
from torch.nn.functional import softmax

from efficientnet_pytorch import EfficientNet

from typing import Callable, List, Tuple, Dict
from pathlib import Path

import catalyst
from catalyst.utils import imread
from catalyst.dl import utils
from catalyst.utils import get_dataset_labeling, map_dataframe

from catalyst.utils import split_dataframe_train_test
from catalyst.data.reader import ImageReader, ScalarReader, ReaderCompose
from catalyst.data.augmentor import Augmentor
from catalyst.dl.runner import SupervisedRunner
from catalyst.dl.callbacks import AccuracyCallback, AUCCallback, F1ScoreCallback, CheckpointCallback, EarlyStoppingCallback

from ignite.engine import Engine, _prepare_batch
from ignite.engine import create_supervised_trainer
from ignite.engine import create_supervised_evaluator
from ignite.engine import Events
from ignite.metrics import Loss, Accuracy, Precision, Recall
from ignite.handlers import ModelCheckpoint
from ignite.handlers import EarlyStopping

from transformers import AdamW
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import train_test_split
from collections import defaultdict, OrderedDict
from tqdm.notebook import tqdm
from torchsummary import summary

In [None]:
# Keep the notebook output free of warning noise: ignore the two specific
# categories, then every warning, and export the same preference through
# the PYTHONWARNINGS environment variable.
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.filterwarnings(action="ignore")
os.environ.update(PYTHONWARNINGS="ignore")

In [None]:
EXPERIMENT_NAME = "starter_code_01"


class ConfigExperiment:
    """Namespace holding every tunable setting used by the notebook cells."""

    # --- output locations, all derived from the experiment name ---
    save_dirname = EXPERIMENT_NAME
    logdir = f"./logs/{EXPERIMENT_NAME}"
    submission_file = f"{EXPERIMENT_NAME}.csv"

    # --- input data ---
    root = "../../../data/raw/plant-pathology-2020-fgvc7/"
    root_images = "../../../data/raw/plant-pathology-2020-fgvc7/images/"
    class_names = ["healthy", "multiple_diseases", "rust", "scab"]
    num_classes = 4

    # --- model ---
    model_name = 'efficientnet-b0'
    # Class-level default; the notebook reassigns `size` on the `config`
    # instance right after it is created.
    size = 512
    is_fp16_used = False

    # --- data loading ---
    batch_size = 8
    num_workers = 20

    # --- optimisation schedule ---
    seed = 42
    lr = 0.003
    num_epochs = 200
    patience = 5

    # --- logging: print the training loss every `log_interval` iterations ---
    log_interval = 50
    
# Instantiate the settings object and replace the class-level default `size`
# with the value EfficientNet reports for the configured architecture.
config = ConfigExperiment()
config.size = EfficientNet.get_image_size(config.model_name)

# Set CUDA_VISIBLE_DEVICES to "0" before the device is queried below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Reproducibility helpers from catalyst; presumably they seed the RNGs and
# configure cuDNN — see the catalyst utils documentation for exact effects.
utils.set_global_seed(config.seed)
utils.prepare_cudnn(deterministic=True)
# Device handle reused by the training, evaluation and inference cells.
device = utils.get_device()

In [4]:
class PlantDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs for the rows of a dataframe.

    Every row must provide an ``image_id`` column plus one column per entry of
    ``config.class_names``. The label is the position of the largest value
    among those class columns.
    """

    def __init__(self, df, config, transforms=None):
        """Store the dataframe, the image directory and the optional transforms.

        Args:
            df: dataframe with ``image_id`` and the class columns.
            config: object exposing ``root_images`` (image directory) and
                ``class_names`` (names of the class columns).
            transforms: optional callable invoked as ``transforms(image=...)``
                that returns a mapping with an ``"image"`` entry.
        """
        self.df = df
        self.images_dir = config.root_images
        self.class_names = config.class_names
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load the ``idx``-th image as RGB and return it with its class index.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        # Look the row up once; it is needed for both the path and the labels.
        row = self.df.iloc[idx]
        # os.path.join works whether or not images_dir ends with a separator.
        image_src = os.path.join(self.images_dir, row['image_id'] + '.jpg')
        image = cv2.imread(image_src, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None; fail here with the
            # offending path instead of with an opaque error inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {image_src}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        labels = row[self.class_names].values.astype(np.int8)
        label = torch.argmax(torch.from_numpy(labels))

        if self.transforms:
            transformed = self.transforms(image=image)
            image = transformed['image']

        return image, label

In [5]:
def pre_transforms(image_size=224):
    """Return the resizing steps that bring an image to a square canvas.

    The longest side is scaled to ``image_size`` (aspect ratio preserved) and
    the result is padded to ``image_size`` x ``image_size`` with
    ``border_mode=0``.
    """
    shrink_longest_side = A.LongestMaxSize(max_size=image_size)
    pad_to_square = A.PadIfNeeded(image_size, image_size, border_mode=0)
    return [shrink_longest_side, pad_to_square]

def hard_transforms(image_size=None):
    """Return the list of random training augmentations.

    Args:
        image_size: side length of the random resized crop. When omitted the
            notebook-wide ``config.size`` is used, which is what the original
            zero-argument call did.

    Returns:
        A list of albumentations transforms, ready to be passed to ``compose``.
    """
    if image_size is None:
        image_size = config.size

    return [
        # Geometry: always take a random resized crop, flip with the library's
        # default probability, and shift/scale/rotate 80% of the time.
        A.RandomResizedCrop(height=image_size, width=image_size, p=1.0),
        A.Flip(),
        A.ShiftScaleRotate(rotate_limit=1.0, p=0.8),

        # Pixels: with probability 0.5 apply exactly one of these.
        A.OneOf([
            A.IAAEmboss(p=1.0),
            A.IAASharpen(p=1.0),
            A.Blur(p=1.0),
        ], p=0.5),

        # Local warps: with probability 0.5 apply exactly one of these.
        A.OneOf([
            A.ElasticTransform(p=1.0),
            A.IAAPiecewiseAffine(p=1.0)
        ], p=0.5),
    ]

def post_transforms():
    """Return the final steps: normalisation, then conversion to a tensor.

    ``A.Normalize`` is used with its default statistics (described as ImageNet
    normalisation by the original author) and ``ToTensorV2`` produces the
    ``torch.Tensor``; both are always applied (``p=1.0``).
    """
    normalize = A.Normalize(p=1.0)
    to_tensor = ToTensorV2(p=1.0)
    return [normalize, to_tensor]

def compose(transforms_to_compose):
    """Flatten several lists of transforms, in order, into one ``A.Compose``."""
    flattened = []
    for group in transforms_to_compose:
        flattened.extend(group)
    return A.Compose(flattened)

In [6]:
# Read the labelled training table and hold out 33% of it for validation,
# stratifying on the class columns listed in config.class_names.
train_df = pd.read_csv(config.root + 'train.csv')
train, valid = train_test_split(train_df, test_size=0.33, random_state=config.seed, shuffle=True, stratify=train_df[config.class_names])

# Training pipeline: resize/pad, then normalise and convert to a tensor.
# hard_transforms() is commented out below, so its random augmentations are
# not applied to the training images.
train_transforms = compose([
    pre_transforms(config.size),
#     hard_transforms(), 
    post_transforms()
])
# Validation pipeline: same deterministic steps as the training pipeline.
valid_transforms = compose([
    pre_transforms(config.size), 
    post_transforms()
])

# Resize/pad only, without normalisation or tensor conversion; presumably
# meant for visualising images — it is not used in the visible cells.
show_transforms = compose([
    pre_transforms(config.size),
#     hard_transforms()
])
train_dataset = PlantDataset(train, config, train_transforms)
valid_dataset = PlantDataset(valid, config, valid_transforms)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers)
valid_dataloader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False, num_workers=config.num_workers)


# Random subset of the training set with as many samples as the validation
# set; it is what the per-epoch "train metrics" are computed on.
indices = np.arange(len(train_dataset))
random_indices = np.random.permutation(indices)[:len(valid_dataset)]
train_subset = Subset(train_dataset, indices=random_indices)

# NOTE(review): drop_last=True discards the final incomplete batch, so a few
# subset samples may never contribute to the training metrics.
train_eval_loader = DataLoader(train_subset, batch_size=config.batch_size, shuffle=True,
                                num_workers=config.num_workers, 
                                drop_last=True, pin_memory=True)

In [7]:
def get_model(model_name: str, num_classes: int, pretrained: str = "imagenet") -> "EfficientNet":
    """Build an EfficientNet classifier with a fresh ``num_classes``-way head.

    Args:
        model_name: EfficientNet variant name, e.g. ``"efficientnet-b0"``.
        num_classes: number of outputs of the new final linear layer.
        pretrained: any truthy value (default ``"imagenet"``) loads the
            pretrained weights and freezes them, so only the new head is
            trainable. A falsy value (``None`` / ``""``) builds the
            architecture with freshly initialised weights and leaves every
            parameter trainable. Previously this argument was ignored.

    Returns:
        The model with ``_fc`` replaced by ``nn.Sequential(nn.Linear(...))``.
    """
    if pretrained:
        model = EfficientNet.from_pretrained(model_name)
        # Freeze the pretrained weights; the head created below is new and
        # therefore trainable.
        for param in model.parameters():
            param.requires_grad = False
    else:
        # Freezing randomly initialised features would make training
        # pointless, so everything stays trainable in this branch.
        model = EfficientNet.from_name(model_name)

    num_ftrs = model._fc.in_features
    # Kept wrapped in nn.Sequential so state-dict keys stay `_fc.0.*`.
    model._fc = nn.Sequential(nn.Linear(num_ftrs, num_classes, bias=True))
    return model

# Build the classifier for the configured architecture and number of classes.
model = get_model(config.model_name, config.num_classes)

Loaded pretrained weights for efficientnet-b0


In [8]:
# Cross-entropy between the model's logits and the integer class index
# produced by PlantDataset.
criterion = nn.CrossEntropyLoss()
# All model parameters are handed to Adam, including the ones get_model
# marked with requires_grad=False.
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
# Alternative kept for reference (disabled): reduce the LR when a metric plateaus.
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=config.patience, verbose=True, mode="min", factor=0.3)
# Active schedule: the learning rate is multiplied by 0.8 on every scheduler.step().
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)

In [9]:
# Metrics computed by the evaluators; the keys are the names read back from
# `engine.state.metrics` by the reporting handlers and by score_function.
# NOTE(review): the same metric instances are attached to both evaluators
# below; in this notebook the evaluators are run one after the other.
metrics = {
    'avg_loss': Loss(criterion),
    'avg_accuracy': Accuracy(),
    'avg_precision': Precision(average=True),
    'avg_recall': Recall(average=True)
}

# One engine for the optimisation loop and two evaluation engines: one run on
# the training subset, one on the validation set.
trainer = create_supervised_trainer(model, optimizer, criterion, device)
train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)


@trainer.on(Events.ITERATION_COMPLETED)
def log_training_loss(engine):
    """Print the current batch loss every ``config.log_interval`` iterations.

    The iteration counter is converted to a 1-based position inside the
    current epoch before it is compared with the logging interval.
    """
    batches_per_epoch = len(train_dataloader)
    step_in_epoch = (engine.state.iteration - 1) % batches_per_epoch + 1
    if step_in_epoch % config.log_interval != 0:
        return
    message = "Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(
        engine.state.epoch, step_in_epoch, batches_per_epoch, engine.state.output
    )
    print(message)


def _print_epoch_metrics(template, epoch, epoch_metrics):
    """Fill ``template`` with the epoch number and the four tracked metrics, then print it."""
    print(template.format(epoch,
                          epoch_metrics['avg_loss'],
                          epoch_metrics['avg_accuracy'],
                          epoch_metrics['avg_precision'],
                          epoch_metrics['avg_recall']))


@trainer.on(Events.EPOCH_COMPLETED)
def compute_and_display_offline_train_metrics(engine):
    """After each training epoch, evaluate on the training subset and print the metrics."""
    print("Compute train metrics...")
    # Named `epoch_metrics` so the module-level `metrics` dict is not shadowed.
    epoch_metrics = train_evaluator.run(train_eval_loader).metrics
    _print_epoch_metrics(
        "Training Results - Epoch: {}  Average Loss: {:.4f} | Accuracy: {:.4f} | Precision: {:.4f} | Recall: {:.4f}",
        engine.state.epoch,
        epoch_metrics,
    )


@trainer.on(Events.EPOCH_COMPLETED)
def compute_and_display_val_metrics(engine):
    """After each training epoch, evaluate on the validation set and print the metrics.

    Running ``val_evaluator`` here is also what triggers the handlers attached
    to that evaluator.
    """
    print("Compute validation metrics...")
    epoch_metrics = val_evaluator.run(valid_dataloader).metrics
    _print_epoch_metrics(
        "Validation Results - Epoch: {}  Average Loss: {:.4f} | Accuracy: {:.4f} | Precision: {:.4f} | Recall: {:.4f}",
        engine.state.epoch,
        epoch_metrics,
    )

    
@trainer.on(Events.EPOCH_COMPLETED)
def update_lr_scheduler(engine):
    """Step the LR scheduler once per epoch and print the resulting learning rate(s)."""
    scheduler.step()

    # Report the learning rate values: one line per parameter group, or a
    # single unnumbered line when there is exactly one group.
    groups = optimizer.param_groups
    if len(groups) != 1:
        for position, group in enumerate(groups):
            print("Learning rate (group {}): {}".format(position, float(group['lr'])))
        return
    print("Learning rate: {}".format(float(groups[0]['lr'])))
            

def score_function(engine):
    """Return the ``avg_accuracy`` metric stored on the given engine's state."""
    return engine.state.metrics['avg_accuracy']


# Checkpoint handler that keeps a single file (n_saved=1) chosen by
# score_function, i.e. by validation accuracy.
# NOTE(review): require_empty=True presumably makes ignite refuse a directory
# that already contains files, which would break re-running this cell —
# confirm against the ModelCheckpoint documentation.
best_model_saver = ModelCheckpoint(
    f"{config.save_dirname}_best",  
    filename_prefix="model",
    score_name="val_accuracy",  
    score_function=score_function,
    n_saved=1,
    require_empty=True,
    save_as_state_dict=True,
    create_dir=True
)

# Invoked each time a validation run completes.
val_evaluator.add_event_handler(
    Events.COMPLETED, 
    best_model_saver, 
    {"best_model": model}
)


# Rolling training checkpoint (only the latest one is kept).
training_saver = ModelCheckpoint(
    f"{config.save_dirname}_checkpoint",
    filename_prefix="checkpoint",
    n_saved=1,
    save_as_state_dict=True,
    create_dir=True
)

# Saved every 100 training iterations: model, optimizer and scheduler objects.
to_save = {"model": model, "optimizer": optimizer, "scheduler": scheduler} 
trainer.add_event_handler(Events.ITERATION_COMPLETED(every=100), training_saver, to_save)


# Early stopping driven by the same validation-accuracy score, with
# config.patience as its patience; it acts on `trainer`.
early_stopping = EarlyStopping(
    patience=config.patience,
    score_function=score_function,
    trainer=trainer
)

# Checked at the end of each validation run's epoch.
val_evaluator.add_event_handler(Events.EPOCH_COMPLETED, early_stopping)

<ignite.engine.engine.RemovableEventHandle at 0x7fc538284208>

In [10]:
# Run the training loop for at most config.num_epochs epochs; the handlers
# registered on `trainer` do the logging, evaluation, LR stepping and checkpointing.
output = trainer.run(train_dataloader, max_epochs=config.num_epochs)

Epoch[1] Iteration[50/153] Loss: 1.1698
Epoch[1] Iteration[100/153] Loss: 0.6756
Epoch[1] Iteration[150/153] Loss: 0.7308
Compute train metrics...
Training Results - Epoch: 1  Average Loss: 0.6444 | Accuracy: 0.7567 | Precision: 0.5840 | Recall: 0.6049
Compute validation metrics...
Validation Results - Epoch: 1  Average Loss: 0.6370 | Accuracy: 0.7687 | Precision: 0.5902 | Recall: 0.6106
Learning rate: 0.0024000000000000002
Epoch[2] Iteration[50/153] Loss: 0.7603
Epoch[2] Iteration[100/153] Loss: 0.4699
Epoch[2] Iteration[150/153] Loss: 0.3752
Compute train metrics...
Training Results - Epoch: 2  Average Loss: 0.4938 | Accuracy: 0.8050 | Precision: 0.8202 | Recall: 0.6762
Compute validation metrics...
Validation Results - Epoch: 2  Average Loss: 0.6003 | Accuracy: 0.7854 | Precision: 0.6796 | Recall: 0.6270
Learning rate: 0.0019200000000000003
Epoch[3] Iteration[50/153] Loss: 0.5728
Epoch[3] Iteration[100/153] Loss: 0.7329
Epoch[3] Iteration[150/153] Loss: 0.5498
Compute train metrics.

In [None]:
import torch.nn.functional as F
from ignite._utils import convert_tensor

In [1]:
def _prepare_batch(batch):
    """Split a ``(inputs, index)`` batch and move only the inputs to ``device``.

    Note: this definition shadows the ``_prepare_batch`` imported from
    ``ignite.engine`` in the import cell.
    """
    inputs, index = batch
    return convert_tensor(inputs, device=device), index

def inference_update(engine, batch):
    """Engine step for inference: return class probabilities for one batch.

    Args:
        engine: the ignite engine calling this step (unused).
        batch: a ``(inputs, indices)`` pair as accepted by ``_prepare_batch``.

    Returns:
        dict with ``"y_pred"`` (softmax over dim 1, passed through
        ``convert_tensor(..., device='cpu')``) and ``"indices"`` (second
        element of the batch, unchanged).
    """
    x, indices = _prepare_batch(batch)
    # Make sure dropout / batch-norm are in inference mode even if the model
    # was last used for training, and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        y_pred = F.softmax(model(x), dim=1)
    return {"y_pred": convert_tensor(y_pred, device='cpu'), "indices": indices}

# Switch the model to evaluation mode before building the inference engine.
model.eval()
# Engine whose per-batch step is inference_update.
inferencer = Engine(inference_update)

@inferencer.on(Events.EPOCH_COMPLETED)
def log_tta(engine):
    """Print which test-time-augmentation pass (engine epoch) has just finished."""
    finished_pass = engine.state.epoch
    print("TTA {} / {}".format(finished_pass, n_tta))
    
# Number of test-time-augmentation passes, number of classes and number of
# samples that size the prediction buffer below.
n_tta = 5
num_classes = 4
n_samples = len(valid_dataset)

# Array for storing the predictions, indexed as [sample, class, tta pass]
y_probas_tta = np.zeros((n_samples, num_classes, n_tta), dtype=np.float32)


@inferencer.on(Events.ITERATION_COMPLETED)
def save_results(engine):
    """Copy the probabilities of the batch just processed into ``y_probas_tta``.

    The rows written are derived from the iteration number, so this assumes
    batches arrive in dataset order and with ``valid_dataloader``'s batch
    size. ``valid_dataloader`` is built with ``shuffle=False``;
    NOTE(review): confirm the inferencer is actually run on that loader.
    The engine epoch (1-based) selects the TTA slot.
    """
    # `batch_size` was previously an undefined name here; take it from the
    # loader that is already used to locate the batch within the epoch.
    batch_size = valid_dataloader.batch_size

    output = engine.state.output
    tta_index = engine.state.epoch - 1
    start_index = ((engine.state.iteration - 1) % len(valid_dataloader)) * batch_size
    # The last batch may be shorter than batch_size.
    end_index = min(start_index + batch_size, n_samples)
    batch_y_probas = output['y_pred'].detach().numpy()
    y_probas_tta[start_index:end_index, :, tta_index] = batch_y_probas

NameError: name 'model' is not defined

In [24]:
# NOTE(review): `test_df` is not created in any cell visible in this notebook;
# presumably it comes from an earlier session or a removed cell — TODO load it
# explicitly so the notebook survives Restart & Run All.
test_df.head()

Unnamed: 0,image_id
0,Test_0
1,Test_1
2,Test_2
3,Test_3
4,Test_4


In [25]:
# Collect the test images matching "Test*.jpg" under the image directory.
# NOTE(review): `create_dataset` is neither imported nor defined in the visible
# cells; confirm where it comes from and import/define it explicitly.
dataset_test = create_dataset(root_dir=config.root_images, mask="Test*.jpg")

In [26]:
# Turn the collected test images into a frame with `image_id` and `filepath` columns.
# NOTE(review): `create_dataframe` is not imported in the visible cells
# (presumably a catalyst utility) — confirm and add the import.
df_path_test = create_dataframe(dataset_test, columns=["image_id", "filepath"])

In [27]:
# Add one zero-filled placeholder column per class; the prediction cell
# overwrites them with probabilities later.
for class_column in ("healthy", "multiple_diseases", "rust", "scab"):
    df_path_test[class_column] = 0

In [28]:
# Preview the test frame with the zero-filled class columns.
df_path_test.head()

Unnamed: 0,image_id,filepath,healthy,multiple_diseases,rust,scab
0,Test_0,../../../data/raw/plant-pathology-2020-fgvc7/i...,0,0,0,0
1,Test_1,../../../data/raw/plant-pathology-2020-fgvc7/i...,0,0,0,0
2,Test_10,../../../data/raw/plant-pathology-2020-fgvc7/i...,0,0,0,0
3,Test_100,../../../data/raw/plant-pathology-2020-fgvc7/i...,0,0,0,0
4,Test_1000,../../../data/raw/plant-pathology-2020-fgvc7/i...,0,0,0,0


In [29]:
def make_predict(model: torch.nn.Module, config: "ConfigExperiment", class_names: List[str], images_df: pd.DataFrame, device: torch.device) -> List[np.ndarray]:
    """Predict class probabilities for every image listed in ``images_df``.

    Images are read one at a time, passed through the notebook-level
    ``valid_transforms`` pipeline and fed to the model as a batch of one.

    Args:
        model: classifier returning logits of shape ``(1, n_classes)`` for a
            single-image batch.
        config: unused; kept so existing calls keep working.
        class_names: unused; kept so existing calls keep working.
        images_df: frame with a ``filepath`` column.
        device: device the input tensors are moved to.

    Returns:
        A list with one 1-D probability array per row of ``images_df``, in
        row order (an empty list for an empty frame).
    """
    # Switching to eval mode is loop-invariant, so do it once up front.
    model.eval()

    result = []
    with torch.no_grad():
        for _, row in images_df.iterrows():
            image = imread(row["filepath"])
            # Batch of one: stack adds the leading batch dimension.
            batch = torch.stack([valid_transforms(image=image)["image"]]).to(device)
            probabilities = softmax(model(batch), dim=1)
            result.append(probabilities.cpu().numpy()[0])

    return result

In [30]:
%%time

# Replace the placeholder class columns with the predicted probabilities,
# one row per test image.
df_path_test[['healthy', 'multiple_diseases', 'rust', 'scab']] = make_predict(model, config, class_names=config.class_names, images_df=df_path_test, device=device)

CPU times: user 6min 8s, sys: 1.5 s, total: 6min 10s
Wall time: 4min 36s


In [31]:
# Check the predicted probabilities now stored in the class columns.
df_path_test.head()

Unnamed: 0,image_id,filepath,healthy,multiple_diseases,rust,scab
0,Test_0,../../../data/raw/plant-pathology-2020-fgvc7/i...,0.001694,1e-06,0.998304,5.17547e-09
1,Test_1,../../../data/raw/plant-pathology-2020-fgvc7/i...,0.00018,0.011641,0.988051,0.0001286567
2,Test_10,../../../data/raw/plant-pathology-2020-fgvc7/i...,0.01124,2.3e-05,0.988737,2.179752e-07
3,Test_100,../../../data/raw/plant-pathology-2020-fgvc7/i...,0.000166,0.029997,3.6e-05,0.9698012
4,Test_1000,../../../data/raw/plant-pathology-2020-fgvc7/i...,0.684439,0.02483,0.267419,0.02331294


In [32]:
# Remove the helper column so only `image_id` and the class probabilities
# remain. Rebinding the name (instead of inplace=True) and ignoring an
# already-missing column keeps this cell safe to re-run.
df_path_test = df_path_test.drop(columns=["filepath"], errors="ignore")

In [33]:
# Write the submission file (path from config.submission_file) without the index column.
df_path_test.to_csv(config.submission_file, index=False)

In [34]:
# Final look at the frame that was written to the submission file.
df_path_test.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Test_0,0.001694,1e-06,0.998304,5.17547e-09
1,Test_1,0.00018,0.011641,0.988051,0.0001286567
2,Test_10,0.01124,2.3e-05,0.988737,2.179752e-07
3,Test_100,0.000166,0.029997,3.6e-05,0.9698012
4,Test_1000,0.684439,0.02483,0.267419,0.02331294
