# Get data

In [1]:
#!unzip ./train_metadata.json.zip

# Imports

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import json

# Read metadata

In [3]:
# Load the competition's training metadata JSON into `train_meta`.
# Binary mode works here because json.load accepts a binary file object.
with open('../input/herbarium-2022-fgvc9/train_metadata.json', 'rb') as f:
# Alternative path for a locally unzipped copy of the metadata:
#with open('./train_metadata.json', 'rb') as f:
    train_meta = json.load(f)

In [4]:
#train_meta.keys()

In [5]:
#with open('../input/herbarium-2022-fgvc9/test_metadata.json', 'rb') as f:
#    test_meta = json.load(f)

In [6]:
# test_meta[0].keys()

# Getting family_ids

In [7]:
# Category table built from the metadata's 'categories' entry; the cells below
# read its 'family' and 'category_id' columns.
df = pd.DataFrame(train_meta['categories'])
#df

In [8]:
#df.groupby('family').count()

In [9]:
# Give every distinct family name a dense integer id (0..n-1, in order of first
# appearance) and attach that id to each category row.
maps = {family: idx for idx, family in enumerate(df['family'].unique())}
df['family_id'] = df['family'].map(maps)

Получили разметку категорий по надклассам: каждой категории сопоставлен числовой идентификатор её семейства (`family_id`).

In [10]:
# Lookup table: category_id -> integer family_id.
# The cast is applied with a non-mutating `astype` on the selection instead of
# assigning a column on a slice of `df`; the chained assignment triggers
# SettingWithCopyWarning and is not guaranteed to take effect.
cat_fam = df[['category_id', 'family_id']].astype({'family_id': int})
#cat_fam

# Get image-family-category dataframe

In [11]:
# One row per annotation: which category each image belongs to.
df = pd.DataFrame(train_meta['annotations']).loc[:, ['category_id', 'image_id']]

# Image table; joined in below on 'image_id' to supply 'file_name'.
images = pd.DataFrame(train_meta['images'])

# annotation -> family id (via category) -> image file name, keeping only the
# three columns the rest of the notebook uses.
dataset1 = (
    df
    .merge(cat_fam, on='category_id', how='left')
    .merge(images, on='image_id', how='left')
    .loc[:, ['file_name', 'category_id', 'family_id']]
)
#dataset1

Разделим выборку на обучающую и валидационную и сохраним в отдельные csv-файлы

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the samples for validation, stratified by family so the class
# proportions are kept in both parts; random_state=0 makes the split repeatable.
train_X, validation_X, train_Y, validation_Y = train_test_split(dataset1['file_name'], dataset1['family_id'], test_size=0.01, stratify=dataset1['family_id'], random_state=0)

# Each Series is written together with its index (to_csv default).
# NOTE(review): FamilyDataset later merges the X and Y frames without an explicit
# key, so it appears to rely on this shared index column to pair file names with
# labels — do not pass index=False here without updating that merge.
train_X.to_csv('train_X.csv')
validation_X.to_csv('valid_X.csv')
train_Y.to_csv('train_Y.csv')
validation_Y.to_csv('valid_Y.csv')

In [13]:
#train

Выгрузка csv-файлов на Google Диск (нужна только при работе в Colab; ячейки ниже закомментированы)

In [14]:
#from google.colab import drive
#drive.mount('/content/drive')

In [15]:
#!cp *.csv /content/drive/MyDrive/kaggle_Herbarium22/

# First model building

In [16]:
# Number of family classes; passed as `out_sz` when the network is constructed.
# NOTE(review): hard-coded — presumably the number of unique families in the
# training metadata; confirm it matches len(maps).
n_family = 272

## Imports

In [17]:
# PyTorch model and training necessities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Image datasets and image manipulation
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets

# Image display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from PIL import Image

# Seed PyTorch's random number generator so runs are repeatable.
torch.manual_seed(0)
# Directory prefix that is prepended to each `file_name` when an image is opened.
images_path = '../input/herbarium-2022-fgvc9/train_images/'

In [18]:
# Prefer the first CUDA GPU; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

## Read metadata

In [19]:
# Reload the train/validation split from the CSV files in the working directory,
# so this part of the notebook can start from the saved files.
# NOTE(review): the files were written with their index, so each frame presumably
# carries an extra unnamed index column next to the data column — confirm.
train_X = pd.read_csv('./train_X.csv')
validation_X = pd.read_csv('./valid_X.csv')
train_Y = pd.read_csv('./train_Y.csv')
validation_Y = pd.read_csv('./valid_Y.csv')

In [20]:
#train_X.head(5).merge(train_Y.head(5))

## Data preparing

In [21]:
# Image preprocessing applied to every sample:
#   1. resize to 224x224,
#   2. convert the PIL image to a float tensor,
#   3. normalize with mean 0.5 and std 0.5, i.e. (x - 0.5) / 0.5.
# NOTE(review): a single mean/std value is given; it is presumably applied to
# every channel — confirm this is intended for colour images.
transform = transforms.Compose([
                                transforms.Resize((224, 224)),
                                transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))
                            ])

class FamilyDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(image_tensor, family_id_tensor)`` pairs.

    Args:
        dataframe_X: frame with a ``file_name`` column (image path relative to
            the module-level ``images_path``).
        dataframe_Y: frame with a ``family_id`` column. It is merged with
            ``dataframe_X`` without an explicit key, so pandas joins on every
            column name the two frames share.
        transform: callable applied to the opened PIL image; must return a tensor.
        device: optional target device for the returned tensors. Defaults to
            ``cuda:0`` when CUDA is available, otherwise the CPU, so the class
            also works on machines without a GPU.
    """

    def __init__(self, dataframe_X, dataframe_Y, transform, device=None):
        self.dataframe = dataframe_X.merge(dataframe_Y)
        self.transform = transform
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

    def __len__(self):
        """Return the number of merged (file name, label) rows."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Load, transform and return the sample at positional ``index``."""
        row = self.dataframe.iloc[index]
        # Use the transform given to this instance (not a same-named global) and
        # close the image file as soon as the tensor has been produced.
        with Image.open(images_path + row["file_name"]) as img:
            image = self.transform(img)
        label = torch.tensor(int(row["family_id"]), dtype=torch.long)
        return image.to(self.device), label.to(self.device)


# Wrap the train/validation frames in datasets that share the same preprocessing.
train_dataset = FamilyDataset(train_X, train_Y, transform)
validation_dataset = FamilyDataset(validation_X, validation_Y, transform)

# Training batches of 64 samples, reshuffled every epoch.
training_loader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=64,
                                              shuffle=True)


# Validation batches of 8 samples in a fixed order.
validation_loader = torch.utils.data.DataLoader(validation_dataset,
                                                batch_size=8,
                                                shuffle=False)

## Torch model

In [22]:
# Download the model definition that the next cell imports as `efficientnet`.
# NOTE(review): this fetches the tip of the `main` branch, so the code that gets
# imported can change between runs — consider pinning a commit hash in the URL.
!wget https://raw.githubusercontent.com/Lavabar/kaggle_Herbarium22/main/efficientnet.py

In [23]:
from efficientnet import EfficientNetB0

In [24]:
# n_family = 272
# class LeNet(nn.Module):
#     def __init__(self):
#         super(LeNet, self).__init__()
#         self.conv11 = nn.Conv2d(1, 6, 3, padding=1)
#         self.conv12 = nn.Conv2d(6, 6, 3, padding=1)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.conv21 = nn.Conv2d(6, 16, 3, padding=1)
#         self.conv22 = nn.Conv2d(16, 16, 3, padding=1)
#         self.fc1 = nn.Linear(16 * 64 * 64, 120)
#         self.fc2 = nn.Linear(120, 84)
#         self.fc3 = nn.Linear(84, n_family)
#         self.out = nn.Softmax(dim=1)

#     def forward(self, x):
#         x = self.conv11(x)
#         x = self.pool(F.relu(self.conv12(x)))
#         x = self.conv21(x)
#         x = self.pool(F.relu(self.conv22(x)))
#         x = x.view(-1, 16 * 64 * 64)
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.out(self.fc3(x))
#         return x

In [25]:
# Network with one output per family class.
net = EfficientNetB0(out_sz=n_family)
# Cross-entropy loss over the class outputs and integer class labels.
criterion = nn.CrossEntropyLoss()
# RMSprop over all network parameters with learning rate 1e-3.
optimizer = optim.RMSprop(net.parameters(), lr=0.001)
# Move the model's parameters to the selected device (GPU when available).
net.to(device)

## Setting writer

In [26]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer for this run. The default `log_dir` is "runs"; a dedicated
# sub-directory is used here so the run's event files are kept together.
writer = SummaryWriter('runs/herb_experiment_100')

## Training

In [None]:
# Train for 10 epochs. After every 1000 mini-batches: save a checkpoint, evaluate
# on the whole validation set, and log the average training loss of the last
# 1000 batches together with the average validation loss to TensorBoard.
for epoch in range(10):  # loop over the dataset multiple times
    running_loss = 0.0

    for i, data in enumerate(training_loader, 0):
        # basic training loop
        inputs, labels = data
        # Make sure the batch is on the same device as the model; this is a
        # no-op when the dataset already yields tensors on `device`.
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print('Batch {}'.format(i + 1))
        running_loss += loss.item()
        if i % 1000 == 999:  # Every 1000 mini-batches...
            # Saves the whole pickled model object (not just a state_dict), so it
            # can be restored with a plain torch.load.
            torch.save(net, f'./checkpoint{epoch}_{i+1}.pth')

            # Check against the validation set
            running_vloss = 0.0

            # eval() only switches layers such as dropout/batch-norm to inference
            # behaviour; torch.no_grad() is what actually disables gradient
            # tracking and avoids building autograd graphs during validation.
            net.eval()
            with torch.no_grad():
                for vinputs, vlabels in validation_loader:
                    vinputs, vlabels = vinputs.to(device), vlabels.to(device)
                    voutputs = net(vinputs)
                    vloss = criterion(voutputs, vlabels)
                    running_vloss += vloss.item()
            net.train()  # back to training mode

            avg_loss = running_loss / 1000
            avg_vloss = running_vloss / len(validation_loader)

            # Log the running loss averaged per batch
            writer.add_scalars('Training vs. Validation Loss',
                            { 'Training' : avg_loss, 'Validation' : avg_vloss },
                            epoch * len(training_loader) + i)

            running_loss = 0.0
print('Finished Training')
writer.flush()

In [None]:
#model = torch.load(PATH)
#model.eval()

In [None]:
#!zip -r -0 runs.zip runs

In [None]:
#!tensorboard --logdir=runs

# http://localhost:6006/