In [5]:
import os
import random
from os import walk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from tqdm import tqdm

In [50]:
# pre-setup
# Hyperparameters and run configuration; 'seed' drives every seeded call in the notebook.
HP = {'epochs': 25, 'batch_size': 32, 'learning_rate': 1e-3, 'momentum': 0.9, 'test_size': 0.05,'seed': 1}

# Seed every RNG the pipeline can draw from, not only torch:
# Python's `random` (torchvision's RandomChoice picks its transform with it) and NumPy.
random.seed(HP['seed'])
np.random.seed(HP['seed'])
torch.manual_seed(HP['seed'])

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    # lets cuDNN auto-tune its kernels; input size is fixed by the Resize transform
    torch.backends.cudnn.benchmark = True
print(f'using {device} device')

# Kaggle input/output locations (the directory paths keep their trailing slash).
dataset_dir = '/kaggle/input/paddy-disease-classification/train_images/'
submission_dir = '/kaggle/input/paddy-disease-classification/test_images/'
dataset_file = '/kaggle/input/paddy-disease-classification/train.csv'
submission_sample = '/kaggle/input/paddy-disease-classification/sample_submission.csv'
submission_output = '/kaggle/working/submission.csv'


In [51]:
# data processing block
df = pd.read_csv(dataset_file)

# shuffle dataset (seeded, so the label order derived below is the same on every run)
df = shuffle(df, random_state=HP['seed'])

# replace the 'variety' category strings with integer codes
df['variety'] = pd.factorize(df['variety'])[0]

# create index -> label and label -> index lookups
idx_to_label = df['label'].unique()
label_to_idx = {label: idx for idx, label in enumerate(idx_to_label)}

# create train/val split; seeded like the shuffle above so the held-out rows
# do not change between runs
train_df, test_df = train_test_split(df, test_size=HP['test_size'], random_state=HP['seed'])
print(f'train len: {len(train_df)}, test len: {len(test_df)}')

In [52]:
# preview the first rows of the processed dataframe
df.head(n=5)

In [54]:
# image transforms block
# values shared by the train and test pipelines
IMAGE_SIZE = (224, 224)
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# for train dataset: random flips, then one randomly chosen augmentation,
# then the same resize / tensor / normalize steps as the test pipeline
random_augmentation = transforms.RandomChoice([
    transforms.Pad(padding=10),
    transforms.CenterCrop(480),
    transforms.RandomRotation(20),
    transforms.CenterCrop((576, 432)),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
])
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    random_augmentation,
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# for test dataset: deterministic resize / tensor / normalize only
test_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

In [56]:
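'''
IMPORTANT:

A custom dataset's __getitem__ and a DataLoader's collate_fn can both host the lazy preprocessing
(both start from df['path'] and only convert an image to a tensor when it is actually needed),
so for this notebook either approach works. They are not literally the same thing, though:
- __getitem__ loads and transforms ONE sample; the DataLoader's default collate_fn then stacks the samples into a batch
- a custom collate_fn receives the list of raw samples for one batch and has to build the batch tensors itself
- if you have a custom dataset, __getitem__ does the work your collate_fn would otherwise do
- if you have a collate_fn, the dataset can be a plain list and needs no custom __getitem__

torchvision: mainly custom dataset
torchtext: mainly collate_fn

TODO: Do both implementations
'''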


# custom dataset class (could have used regular dataset if you did preprocessing on the data before loading it in)

# option 1: do all preprocessing on dataframe itself and create a list dataset with zip(), then use default DataLoader (BAD because overflows memory)
# it is better to change to tensor one example at a time!

# option 2: create custom dataset class that does preprocessing for you 

class PaddyDataset(Dataset):
    """Dataset that lazily loads the images listed in a dataframe.

    Each item is a ``(transformed_image, label_index)`` pair. Only file paths are
    kept in memory; an image is opened and transformed when it is requested.
    """

    def __init__(self, dataset_dir, df, label_to_idx, transforms):
        """Build the per-row image paths.

        Args:
            dataset_dir: root directory; each image is looked up at
                ``<dataset_dir>/<label>/<image_id>``.
            df: dataframe with at least ``label`` and ``image_id`` columns.
                It is copied, so the caller's frame is left untouched.
            label_to_idx: mapping from label value to integer class index.
            transforms: callable applied to every loaded PIL image.
        """
        # Work on a copy: adding the 'path' column to the caller's frame would
        # mutate a slice of the original dataframe.
        frame = df.copy()
        frame['path'] = [
            os.path.join(dataset_dir, label, image_id)
            for label, image_id in zip(frame['label'], frame['image_id'])
        ]
        self.label_to_idx = label_to_idx
        self.transforms = transforms
        # Resolve column positions by name instead of hard-coding them, so extra
        # or reordered columns in `df` cannot silently pick the wrong field.
        self._path_col = frame.columns.get_loc('path')
        self._label_col = frame.columns.get_loc('label')
        # plain list of rows, indexed by sample position
        self.df = frame.values.tolist()

    def __len__(self):
        """Return the number of samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, transform and return sample ``idx`` as ``(image, label_index)``."""
        row = self.df[idx]
        # The context manager closes the file handle; convert('RGB') forces the
        # pixel data to be read and guarantees three channels for Normalize.
        with Image.open(row[self._path_col]) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transforms(image)
        label_idx = self.label_to_idx[row[self._label_col]]
        return image, label_idx

# add tensor image column to dataframe and use standard dataset
# df['path'] = dataset_dir + '/' + df.label + '/' + df.image_id
# converter = transforms.ToTensor()
# train_data = list(zip(df['label'], df['path']))

# def collate_fn(batch):
#     label_list, image_list = [], []
#     for label, image in batch:
#         label_list.append(label_to_idx[label])
#         image_list.append(train_transform(Image.open(image)))
#     label_list = torch.tensor(label_list, dtype=torch.int64)
#     image_list = torch.nn.utils.rnn.pad_sequence(image_list)
# #     print(type(image_list))
#     return label_list, image_list

# train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_fn)

    
# image = Image.open(dataset_dir + '/' + df.label + '/' + df.image_id)
# self.df['image'] = transforms.ToTensor(Image.open(dataset_dir + '/' + df.label + '/' + df.image_id))

# creates datasets
train_dataset = PaddyDataset(dataset_dir, train_df, label_to_idx, train_transform)
test_dataset = PaddyDataset(dataset_dir, test_df, label_to_idx, test_transform)

# creates dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=HP['batch_size'], shuffle=True, pin_memory=True)
# evaluation gains nothing from shuffling; keep the held-out batches in a fixed order
test_dataloader = DataLoader(test_dataset, batch_size=HP['batch_size'], shuffle=False, pin_memory=True)

In [60]:
# sanity check: shape of the image tensor in one training batch
sample_images, sample_targets = next(iter(train_dataloader))
sample_images.shape

In [32]:
# model block

# start from a pretrained ResNet-34
model = models.resnet34(pretrained=True)

# replace the final fully connected layer with a dropout + linear head
# that has one output per label
num_classes = len(label_to_idx)
head_in_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Dropout(p=0.1),
    nn.Linear(head_in_features, num_classes),
)
model = model.to(device)

# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=HP['learning_rate'],
    momentum=HP['momentum'],
)

In [33]:
def train(model, criterion, optimizer, train_dataloader, test_dataloader):
    """Run one training epoch followed by one evaluation pass and print both losses.

    Args:
        model: network to optimise; put in train mode for the first loop and
            in eval mode for the second.
        criterion: loss callable invoked as ``criterion(output, targets)``.
            Assumed to return the batch-mean loss (the default reduction);
            the per-sample averaging below relies on that.
        optimizer: optimizer that updates the model's parameters.
        train_dataloader: iterable of ``(images, idxs)`` batches used for the updates.
        test_dataloader: iterable of ``(images, idxs)`` batches used for evaluation only.

    Returns:
        tuple: ``(train_loss, test_loss)``, the mean loss per sample of each pass.
    """
    total_train_loss = 0.0
    total_test_loss = 0.0
    train_samples = 0
    test_samples = 0

    model.train()
    with tqdm(train_dataloader, unit='batch', leave=False) as pbar:
        pbar.set_description('training')
        for images, idxs in pbar:
            images = images.to(device, non_blocking=True)
            idxs = idxs.to(device, non_blocking=True)

            # clear gradients first so nothing left over from outside leaks into this step
            optimizer.zero_grad(set_to_none=True)
            output = model(images)
            loss = criterion(output, idxs)
            loss.backward()
            optimizer.step()

            # weight the batch-mean loss by the batch size so the (possibly
            # smaller) last batch does not skew the epoch average
            batch_size = idxs.size(0)
            total_train_loss += loss.item() * batch_size
            train_samples += batch_size

    model.eval()
    # no_grad: evaluation needs no autograd graph, which saves memory and time
    with torch.no_grad(), tqdm(test_dataloader, unit='batch', leave=False) as pbar:
        pbar.set_description('testing')
        for images, idxs in pbar:
            images = images.to(device, non_blocking=True)
            idxs = idxs.to(device, non_blocking=True)

            output = model(images)
            loss = criterion(output, idxs)

            batch_size = idxs.size(0)
            total_test_loss += loss.item() * batch_size
            test_samples += batch_size

    # divide by the samples actually seen (taken from the loaders passed in,
    # not from notebook globals); max() guards against an empty loader
    train_loss = total_train_loss / max(train_samples, 1)
    test_loss = total_test_loss / max(test_samples, 1)
    print(f'Train loss: {train_loss:.4f} Test loss: {test_loss:.4f} ')
    return train_loss, test_loss

In [34]:
%%time
# run the train + evaluation pass once per configured epoch
total_epochs = HP['epochs']
for epoch in range(1, total_epochs + 1):
    print(f"Epoch {epoch}/{total_epochs}")
    train(model, criterion, optimizer, train_dataloader, test_dataloader)

In [None]:
%%time
# predict a label for every file found under submission_dir
model.eval()
image_ids, labels = [], []
# no_grad: pure inference, no autograd graph needed
with torch.no_grad():
    for dirpath, _dirnames, filenames in walk(submission_dir):
        # sorted() makes the order of the output rows deterministic
        for filename in sorted(filenames):
            # os.path.join also works for nested directories, where `dirpath`
            # carries no trailing slash
            with Image.open(os.path.join(dirpath, filename)) as raw_image:
                # convert('RGB') guarantees the three channels Normalize expects
                image = test_transform(raw_image.convert('RGB'))
            # add the batch dimension the model expects
            image = image.unsqueeze(0).to(device)
            predicted_idx = model(image).argmax(dim=1).item()
            image_ids.append(filename)
            labels.append(idx_to_label[predicted_idx])

In [None]:
# assemble the predictions into the two-column submission table
submission = pd.DataFrame(dict(image_id=image_ids, label=labels))
# submission['label'].value_counts()

In [None]:
# write the submission table to disk, with a header row and without the index column
submission.to_csv(path_or_buf=submission_output, header=True, index=False)