# Starter Code - Classification on Unlabeled and Mislabeled Images

<h4> Import Libraries </h4>

In [None]:
import os
import sys

import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

# Pytorch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

<h4> Training Configuration </h4>
You should experiment with different hyperparameters

In [None]:
# Single table of hyperparameters and runtime settings read by the cells below.
CONFIG = {
    # Seed passed to the RNG seeding calls and to the train/val split
    "seed": 420,

    # Training schedule and input geometry
    "epochs": 20,
    "img_size": 64,
    "num_classes": 30,

    # Data loading
    "train_batch_size": 128,
    "val_batch_size": 128,
    "num_workers": 2,

    # Optimisation
    "learning_rate": 1e-4,

    # First CUDA device when one is available, otherwise the CPU
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),

    # StepLR Scheduler hyperparameters
    "step_size": 10,
    "gamma": 0.95,
}

<h4> Set seed for reproducibility </h4>

In [None]:
# Seed every random number generator this notebook can reach, not only
# PyTorch's, so that repeated runs start from the same random state.
import random

random.seed(CONFIG["seed"])                  # Python's built-in RNG
np.random.seed(CONFIG["seed"])               # NumPy's global RNG
torch.manual_seed(CONFIG["seed"])            # PyTorch RNG
torch.cuda.manual_seed_all(CONFIG["seed"])   # every CUDA device, not just the current one

<h4> Data Directories </h4>

In [None]:
# Root of the competition data; every image directory below is derived from it.
ROOT_DIR = '/kaggle/input/classification-on-unlabeled-and-mislabeled-images/'

# Labeled training images, unlabeled training images, and test images.
TRAIN_LABELED_DIR, TRAIN_UNLABELED_DIR, TEST_DIR = (
    os.path.join(ROOT_DIR, subdir)
    for subdir in (
        'train/train/labeled_images/',
        'train/train/unlabeled_images/',
        'test/test/images/',
    )
)

# File the best-scoring model weights are written to during training.
SAVE_PATH = "best_model.pth"

<h4> Read the CSV </h4>

In [None]:
# Load the annotation table for the labeled training images and preview it.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError for
# this exact path -- confirm the dataset is attached under ROOT_DIR.
annotations_path = os.path.join(ROOT_DIR, 'train_annotations.csv')
df = pd.read_csv(annotations_path)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/classification-on-unlabeled-and-mislabeled-images/train_annotations.csv'

<h4> Create mapping for class_name </h4>

In [None]:
# Build the two lookup tables between class names and integer labels.
# Classes are numbered in the order they first appear in the annotations.
class_names = df.class_name.unique()

# The model head is sized from CONFIG['num_classes'], so a mismatch with the
# annotations would otherwise surface later as an IndexError/KeyError.
if len(class_names) != CONFIG['num_classes']:
    raise ValueError(
        f"Expected {CONFIG['num_classes']} classes but found "
        f"{len(class_names)} distinct class_name values in the annotations"
    )

class_to_index_mapping = {name: idx for idx, name in enumerate(class_names)}
index_to_class_mapping = dict(enumerate(class_names))
class_to_index_mapping

<h4> Dataset Class </h4>

In [None]:
class CustomDataset(Dataset):
    """Labeled image dataset backed by an annotation CSV.

    The CSV is expected to provide an ``image_name`` and a ``class_name``
    column (both are read in ``__getitem__``).

    Args:
        csv_path: Path of the annotation CSV.
        data_dir: Directory that contains the image files.
        transform: Callable applied to each PIL image, or a falsy value to
            return the PIL image unchanged.
        dataset_type: ``'train'`` for a seeded 80% sample of the rows,
            ``'val'`` for the remaining rows, ``'full-train'`` for all rows.

    Raises:
        ValueError: If ``dataset_type`` is not one of the three values above.
    """

    def __init__(self, csv_path, data_dir, transform, dataset_type='train'):
        self.transform = transform
        self.data_dir = data_dir
        df = pd.read_csv(csv_path)

        if dataset_type == 'full-train':
            # For submission use full training set
            self.data = df
        elif dataset_type in ('train', 'val'):
            # Split training set into train and validation. The fixed
            # random_state makes the 'train' and 'val' instances agree on
            # which rows belong to which side.
            train_data = df.sample(frac=0.8, random_state=CONFIG['seed'])
            if dataset_type == 'train':
                self.data = train_data
            else:
                self.data = df.drop(train_data.index)
        else:
            # Fail here instead of leaving self.data unset, which would only
            # surface later as an AttributeError in __len__/__getitem__.
            raise ValueError(
                f"Unknown dataset_type {dataset_type!r}; "
                "expected 'train', 'val' or 'full-train'"
            )

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'img': image, 'label': class index}`` for row ``idx``."""
        row = self.data.iloc[idx]

        img_path = os.path.join(self.data_dir, row['image_name'])
        img_label = class_to_index_mapping[row.class_name]

        # convert() yields an independent image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")

        if self.transform:
            img = self.transform(img)

        return {'img': img, 'label': img_label}

In [None]:
# Dataset for loading unlabeled set and test set

class UnlabeledDataset(Dataset):
    """Image-only dataset over every entry of a directory.

    Entries are taken from ``os.listdir(data_dir)`` in sorted order, so the
    index-to-file assignment is stable between runs.

    Args:
        data_dir: Directory whose entries are treated as image files.
        transform: Callable applied to each PIL image, or a falsy value to
            return the PIL image unchanged.
    """

    def __init__(self, data_dir, transform):
        self.data_dir = data_dir
        self.transform = transform
        self.img_names = sorted(os.listdir(data_dir))

    def __len__(self):
        """Return the number of directory entries found."""
        return len(self.img_names)

    def __getitem__(self, idx):
        """Return ``{'img': image, 'img_name': file name}`` for entry ``idx``."""
        name = self.img_names[idx]
        img = Image.open(os.path.join(self.data_dir, name)).convert("RGB")

        if self.transform:
            img = self.transform(img)

        return {'img': img, 'img_name': name}

<h4> Augmentations </h4>
You should experiment with adding/removing transforms

In [None]:
# Training-time pipeline: random geometric and colour augmentations followed by
# tensor conversion and per-channel normalisation.
train_transforms = transforms.Compose(
    [
        # Geometry: RandomResizedCrop produces an img_size x img_size output
        transforms.Resize(size=CONFIG['img_size']),
        transforms.RandomResizedCrop(size=CONFIG['img_size']),
        transforms.RandomRotation(degrees=90),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        # Photometric jitter
        transforms.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25),
        # PIL image -> float tensor, then normalise with the widely used
        # ImageNet per-channel mean/std values
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Deterministic pipeline for validation and test images.
# Resize with a single int only fixes the shorter edge, so a non-square image
# keeps its aspect ratio. CenterCrop then guarantees an img_size x img_size
# tensor so differently shaped images can still be stacked into one batch;
# for images that are already square the crop changes nothing.
test_transforms = transforms.Compose([
    transforms.Resize(CONFIG['img_size']),
    transforms.CenterCrop(CONFIG['img_size']),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

<h4> Prepare Data loaders </h4>

In [None]:
def prepare_loaders():
    """Build the training and validation data loaders.

    Both datasets read the same annotation CSV and image directory; they
    differ in the split they select and in the transform pipeline applied.

    Returns:
        Tuple ``(train_loader, val_loader)``. The training loader shuffles,
        the validation loader does not.
    """
    annotations_csv = os.path.join(ROOT_DIR, 'train_annotations.csv')

    # Settings shared by both loaders
    shared_opts = {
        'num_workers': CONFIG['num_workers'],
        'pin_memory': True,
    }

    train_loader = DataLoader(
        CustomDataset(
            csv_path=annotations_csv,
            data_dir=TRAIN_LABELED_DIR,
            transform=train_transforms,
            dataset_type='train',
        ),
        batch_size=CONFIG['train_batch_size'],
        shuffle=True,
        **shared_opts,
    )

    val_loader = DataLoader(
        CustomDataset(
            csv_path=annotations_csv,
            data_dir=TRAIN_LABELED_DIR,
            transform=test_transforms,
            dataset_type='val',
        ),
        batch_size=CONFIG['val_batch_size'],
        shuffle=False,
        **shared_opts,
    )

    return train_loader, val_loader


<h4> Create model </h4>
You should experiment with different models by modifying existing ones or constructing new models from scratch

In [None]:
from torchvision.models import resnet18

class ResnetModel(nn.Module):
    """ResNet-18 classifier with a replaced final layer.

    The backbone is created with ``resnet18()`` (no weights argument is
    passed) and its ``fc`` layer is swapped for a linear layer that outputs
    ``num_classes`` values.

    Args:
        num_classes: Number of output neurons of the classification head.
    """

    def __init__(self, num_classes):
        super().__init__()
        net = resnet18()

        # Keep the head's input width, change only its output width
        net.fc = nn.Linear(net.fc.in_features, num_classes, bias=True)
        self.backbone = net

    def forward(self, x):
        """Run ``x`` through the backbone and return its output."""
        return self.backbone(x)

In [None]:
# Instantiate the classifier and move its parameters to the configured device.
num_outputs = CONFIG['num_classes']
model = ResnetModel(num_classes=num_outputs)
model.to(device=CONFIG['device'])

<h4> Training Helpers </h4>
You are encouraged to experiment with different scheduler types and parameters

In [None]:
# Objects used by train_epoch / valid_epoch as module-level globals.

# Classification loss
criterion = nn.CrossEntropyLoss()

# AdamW over all model parameters
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=CONFIG['learning_rate'],
)

# Multiply the learning rate by `gamma` every `step_size` scheduler steps
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=CONFIG['step_size'],
    gamma=CONFIG['gamma'],
)

<h4> Training function </h4>

In [None]:
def train_epoch(model, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Uses the module-level ``criterion``, ``optimizer`` and ``scheduler``.
    The scheduler is stepped once, after the last batch.

    Args:
        model: Network to train; switched to training mode.
        dataloader: Iterable of batches with ``'img'`` and ``'label'`` entries.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        Sample-weighted mean training loss, rounded to two decimals
        (``0.0`` when the dataloader yields no batches).
    """
    model.train()

    total_train_loss = 0.0
    dataset_size = 0
    # Defined up front so an empty dataloader returns a value instead of
    # raising UnboundLocalError at the return statement.
    epoch_loss = 0.0

    bar = tqdm(enumerate(dataloader), total=len(dataloader), colour='cyan', file=sys.stdout)
    for _step, data in bar:
        images = data['img'].to(device)
        labels = data['label'].to(device)

        batch_size = images.shape[0]

        optimizer.zero_grad()
        pred = model(images)
        loss = criterion(pred, labels)

        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean
        total_train_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = np.round(total_train_loss / dataset_size, 2)
        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss)

    scheduler.step()

    return epoch_loss

<h4> Validation function </h4>

In [None]:
def valid_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader``.

    Uses the module-level ``criterion``. Gradient tracking is disabled
    inside the function, so the caller does not need to do it.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        dataloader: Iterable of batches with ``'img'`` and ``'label'`` entries.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        Tuple ``(accuracy, epoch_loss)``: accuracy in percent and the
        sample-weighted mean loss, both rounded to two decimals (both
        ``0.0`` when the dataloader yields no batches).
    """
    model.eval()

    total_val_loss = 0.0
    dataset_size = 0
    correct = 0

    # Defined up front so an empty dataloader returns values instead of
    # raising UnboundLocalError at the return statement.
    accuracy = 0.0
    epoch_loss = 0.0

    bar = tqdm(enumerate(dataloader), total=len(dataloader), colour='cyan', file=sys.stdout)
    for _step, data in bar:
        images = data['img'].to(device)
        labels = data['label'].to(device)

        batch_size = images.shape[0]

        # Evaluation needs no gradients
        with torch.no_grad():
            pred = model(images)
            loss = criterion(pred, labels)

        # Index of the largest output per sample is the predicted class
        _, predicted = torch.max(pred, 1)
        correct += (predicted == labels).sum().item()

        total_val_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = np.round(total_val_loss / dataset_size, 2)
        accuracy = np.round(100 * correct / dataset_size, 2)

        bar.set_postfix(Epoch=epoch, Valid_Acc=accuracy, Valid_Loss=epoch_loss)

    return accuracy, epoch_loss

<h4> Build submission file </h4>

In [None]:
def build_submission(model, dataloader, device, submission_file):
    """Predict a class for every image in ``dataloader`` and write a CSV.

    Predicted indices are translated to class names through the module-level
    ``index_to_class_mapping``. The CSV has the columns ``image_name`` and
    ``class_name`` and no index column.

    Args:
        model: Trained network; switched to evaluation mode.
        dataloader: Iterable of batches with ``'img'`` and ``'img_name'`` entries.
        device: Device the image tensors are moved to.
        submission_file: Path of the CSV file to write.
    """
    model.eval()

    all_predictions = []
    all_image_names = []

    # Inference only: without no_grad every forward pass would record an
    # autograd graph that is never used.
    with torch.no_grad():
        for batch in dataloader:
            images = batch['img'].to(device)
            pred = model(images)
            _, predicted = torch.max(pred, 1)

            all_predictions.extend(predicted.cpu().numpy().tolist())
            all_image_names.extend(batch['img_name'])

    predicted_classes = [index_to_class_mapping[prediction] for prediction in all_predictions]
    rows = list(zip(all_image_names, predicted_classes))
    submission_df = pd.DataFrame(data=rows, columns=['image_name', 'class_name'])
    submission_df.to_csv(submission_file, index=False)
    print(f"Submission saved to {submission_file}")

<h4> Run Training </h4>

In [None]:
def run_training(model, device, num_epochs):
    """Train for ``num_epochs`` epochs and checkpoint the best model.

    After each training epoch the model is validated; whenever the
    validation accuracy exceeds the best value seen so far, the model's
    state dict is written to ``SAVE_PATH``.

    Args:
        model: Network to train.
        device: Device passed on to the train/validation helpers.
        num_epochs: Number of epochs to run.
    """
    gpu_available = torch.cuda.is_available()
    if gpu_available:
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    train_loader, val_loader = prepare_loaders()
    top_accuracy = 0.0

    for epoch in range(num_epochs):
        train_epoch(model, train_loader, device, epoch)

        with torch.no_grad():
            val_accuracy, _ = valid_epoch(model, val_loader, device, epoch)

            # Only a strictly better accuracy triggers a new checkpoint
            if val_accuracy > top_accuracy:
                print(f"Validation Accuracy Improved ({top_accuracy} ---> {val_accuracy})")
                top_accuracy = val_accuracy
                torch.save(model.state_dict(), SAVE_PATH)
                print("Model Saved")

        print()

In [None]:
# Launch training with the device and epoch count taken from CONFIG.
run_training(
    model=model,
    device=CONFIG['device'],
    num_epochs=CONFIG['epochs'],
)

In [None]:
# Restore the best checkpoint written during training and predict the test set.
print("Loading best model for submission")
# map_location remaps the stored tensors to the configured device, so a
# checkpoint saved from a GPU session also loads in a CPU-only session.
model.load_state_dict(torch.load(SAVE_PATH, map_location=CONFIG['device']))

test_set = UnlabeledDataset(TEST_DIR, test_transforms)

# No shuffling: predictions are written in the dataset's (sorted) file order
test_loader = DataLoader(
    test_set,
    batch_size=CONFIG['val_batch_size'],
    shuffle=False,
    num_workers=CONFIG['num_workers'],
    pin_memory=True,
)

build_submission(model, test_loader, CONFIG['device'], 'submission.csv')