# Load CSV files to Pandas DataFrame

# 1 library

In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random
import glob
from matplotlib import pyplot as plt
import seaborn as sns

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torchvision

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# For Image Models
import timm

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "0,1"



In [2]:
train_csv_path = "/kaggle/input/UBC-OCEAN/train.csv"
test_csv_path = "/kaggle/input/UBC-OCEAN/test.csv"
train_pandas_df = pd.read_csv(train_csv_path)
test_pandas_df = pd.read_csv(test_csv_path)
BASE_DIR = ["/kaggle/input/UBC-OCEAN/train_thumbnails/", "/kaggle/input/UBC-OCEAN/test_thumbnails/"]

class_distribution = train_pandas_df['label'].value_counts()


# 2 Model building

In [3]:


CONFIG = {
    "seed": 42,
    "epochs": 20,
    "img_size": 400,
    "model_name": "tf_efficientnet_b0_ns",
    "num_classes": 5,
    "train_batch_size": 32,
    "valid_batch_size": 32,
    "learning_rate": 1e-3,
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 500,
    "weight_decay": 1e-6,
    "fold" : 0,
    "n_fold": 5,
    "n_accumulate": 1,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}


In [4]:
'''
CONFIG = {
    "seed": 42,
    "img_size": 512,
    "model_name": "tf_efficientnetv2_s_in21ft1k",# tf_efficientnetv2_s_in21ft1k swin_large_patch4_window12_384
    "num_classes": 5,
    "valid_batch_size": 32,
    "scheduler": 'CosineAnnealingLR',
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}
'''


'\nCONFIG = {\n    "seed": 42,\n    "img_size": 512,\n    "model_name": "tf_efficientnetv2_s_in21ft1k",# tf_efficientnetv2_s_in21ft1k swin_large_patch4_window12_384\n    "num_classes": 5,\n    "valid_batch_size": 32,\n    "scheduler": \'CosineAnnealingLR\',\n    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),\n}\n'

In [5]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

In [6]:
ROOT_DIR = '/kaggle/input/UBC-OCEAN'
TRAIN_DIR = '/kaggle/input/UBC-OCEAN/train_thumbnails'
TEST_DIR = '/kaggle/input/UBC-OCEAN/test_images'
ALT_TEST_DIR = '/kaggle/input/UBC-OCEAN/test_images'

In [7]:
def get_train_file_path(image_id):
    return f"{TRAIN_DIR}/{image_id}_thumbnail.png"
#    return f"{TRAIN_DIR}/{image_id}.png"

In [8]:
train_images = sorted(glob.glob(f"{TRAIN_DIR}/*.png"))

In [9]:
df = pd.read_csv(f"{ROOT_DIR}/train.csv")
df['file_path'] = df['image_id'].apply(get_train_file_path)
df = df[ df["file_path"].isin(train_images) ].reset_index(drop=True)


In [10]:
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])

with open("label_encoder.pkl", "wb") as fp:
    joblib.dump(encoder, fp)

In [11]:
CONFIG['T_max'] = df.shape[0] * (CONFIG["n_fold"]-1) * CONFIG['epochs'] // CONFIG['train_batch_size'] // CONFIG["n_fold"]
CONFIG['T_max']

256

---
# K-fold
With the model defined, let's move on to the Training step.

For training, I'll use k-fold cross-validation as suggested. During each fold, we'll train the model using the training split and evaluate its performance on the validation split.

In [12]:
skf = StratifiedKFold(n_splits=CONFIG['n_fold'])

for fold, ( _, val_) in enumerate(skf.split(X=df, y=df.label)):
      df.loc[val_ , "kfold"] = int(fold)

# Dataset Class

In [13]:
class UBCDataset(Dataset):
    def __init__(self, df, transforms=None):
        self.df = df
        self.file_names = df['file_path'].values
        self.labels = df['label'].values
        self.transforms = transforms
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        img_path = self.file_names[index]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = self.labels[index]
        
        if self.transforms:
            img = self.transforms(image=img)["image"]
            
        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long)
        }

# Data Augmentations (transforms)

In [14]:
data_transforms = {
    "train": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.ShiftScaleRotate(shift_limit=0.1, 
                           scale_limit=0.15, 
                           rotate_limit=60, 
                           p=0.5),
        A.HueSaturationValue(
                hue_shift_limit=0.2, 
                sat_shift_limit=0.2, 
                val_shift_limit=0.2, 
                p=0.5
            ),
        A.RandomBrightnessContrast(
                brightness_limit=(-0.1,0.1), 
                contrast_limit=(-0.1, 0.1), 
                p=0.5
            ),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.),
    
    "valid": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.)
}

# GeM Pooling

In [15]:
class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1)*p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)
        
    def gem(self, x, p=3, eps=1e-6):
        return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
        
    def __repr__(self):
        return self.__class__.__name__ + \
                '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + \
                ', ' + 'eps=' + str(self.eps) + ')'

# Create EfficientNet_b0 Model

In [16]:
class UBCModel(nn.Module):
    def __init__(self, model_name, num_classes, pretrained=True, checkpoint_path=None):
        super(UBCModel, self).__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained, checkpoint_path=checkpoint_path)

        in_features = self.model.classifier.in_features
        self.model.classifier = nn.Identity()
        self.model.global_pool = nn.Identity()
        self.pooling = GeM()
        self.linear = nn.Linear(in_features, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, images):
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        output = self.linear(pooled_features)
        return output

    
#model = UBCModel(CONFIG['model_name'], CONFIG['num_classes'])#, checkpoint_path=CONFIG['checkpoint_path']
#model.to(CONFIG['device'])

# Loss Function

In [17]:
def criterion(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)

# Model Training

In [18]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()
    
    dataset_size = 0
    running_loss = 0.0
    running_acc  = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)
        
        batch_size = images.size(0)
        
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss = loss #/ CONFIG['n_accumulate']
            
        loss.backward()
        
        
        optimizer.step()

        # zero the parameter gradients
        optimizer.zero_grad()

        if scheduler is not None:
            scheduler.step()
            
        _, predicted = torch.max(model.softmax(outputs), 1)
        acc = torch.sum( predicted == labels )
        
        running_loss += (loss.item() * batch_size)
        running_acc  += acc.item()
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        epoch_acc = running_acc / dataset_size
        
        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss, Train_Acc=epoch_acc,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()
    
    return epoch_loss, epoch_acc

In [19]:
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    running_acc = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:        
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)
        
        batch_size = images.size(0)

        outputs = model(images)
        loss = criterion(outputs, labels)

        _, predicted = torch.max(model.softmax(outputs), 1)
        acc = torch.sum( predicted == labels )

        running_loss += (loss.item() * batch_size)
        running_acc  += acc.item()
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        epoch_acc = running_acc / dataset_size
        
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss, Valid_Acc=epoch_acc,
                        LR=optimizer.param_groups[0]['lr'])   
    
    gc.collect()
    
    return epoch_loss, epoch_acc

In [20]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
    
    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_acc = -np.inf
    history = defaultdict(list)
    
    for epoch in range(1, num_epochs + 1): 
        gc.collect()
        train_epoch_loss, train_epoch_acc = train_one_epoch(model, optimizer, scheduler, 
                                           dataloader=train_loader, 
                                           device=CONFIG['device'], epoch=epoch)
        
        val_epoch_loss, val_epoch_acc = valid_one_epoch(model, valid_loader, device=CONFIG['device'], 
                                         epoch=epoch)
    
        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Train Accuracy'].append(train_epoch_acc)
        history['Valid Accuracy'].append(val_epoch_acc)
        history['lr'].append( scheduler.get_lr()[0] )
        
        # deep copy the model
        if best_epoch_acc <= val_epoch_acc:
            print(f"{b_}Validation Accuracy Improved ({best_epoch_acc} ---> {val_epoch_acc})")
            best_epoch_acc = val_epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "Acc{:.2f}_Loss{:.4f}_epoch{:.0f}.bin".format(best_epoch_acc, val_epoch_loss, epoch)
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved{sr_}")
            
        print()
    
    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Accuracy: {:.4f}".format(best_epoch_acc))
    
    # load best model weights
    model.load_state_dict(best_model_wts)
    
    return model, history

In [21]:
def fetch_scheduler(optimizer):
    if CONFIG['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'], 
                                                   eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == 'CosineAnnealingWarmRestarts':
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=CONFIG['T_0'], 
                                                             eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == None:
        return None
        
    return scheduler

In [22]:
def prepare_loaders(df, fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    train_dataset = UBCDataset(df_train, transforms=data_transforms["train"])
    valid_dataset = UBCDataset(df_valid, transforms=data_transforms["valid"])

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], 
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], 
                              num_workers=2, shuffle=False, pin_memory=True)
    
    return train_loader, valid_loader

In [23]:
train_loader, valid_loader = prepare_loaders(df, fold=CONFIG["fold"])

In [24]:
'''
model, history = run_training(model, optimizer, scheduler,
                              device=CONFIG['device'],
                              num_epochs=CONFIG['epochs'])

'''

"\nmodel, history = run_training(model, optimizer, scheduler,\n                              device=CONFIG['device'],\n                              num_epochs=CONFIG['epochs'])\n\n"

In [25]:
#torch.save(model.state_dict(),'/kaggle/working/Best_EfficientNet_b2_Model.pth')

In [26]:
#model.load_state_dict(torch.load('/kaggle/input/weight/Best_EfficientNet_b0_Model.pth'))

# Loss, Accuracy and Learning Rate

In [27]:
#train 10 models
'''
for i in range(10):
    
    model = UBCModel(CONFIG['model_name'], CONFIG['num_classes'])#, checkpoint_path=CONFIG['checkpoint_path']
    model.to(CONFIG['device'])
    optimizer = optim.Adam(model.parameters(), lr=CONFIG['learning_rate'], 
                       weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)
    model, history = run_training(model, optimizer, scheduler,
                              device=CONFIG['device'],
                              num_epochs=CONFIG['epochs'])
    torch.save(model.state_dict(),f"/kaggle/working/Best_EfficientNet_b0_Model_{i}.pth")
'''

'\nfor i in range(10):\n    \n    model = UBCModel(CONFIG[\'model_name\'], CONFIG[\'num_classes\'])#, checkpoint_path=CONFIG[\'checkpoint_path\']\n    model.to(CONFIG[\'device\'])\n    optimizer = optim.Adam(model.parameters(), lr=CONFIG[\'learning_rate\'], \n                       weight_decay=CONFIG[\'weight_decay\'])\n    scheduler = fetch_scheduler(optimizer)\n    model, history = run_training(model, optimizer, scheduler,\n                              device=CONFIG[\'device\'],\n                              num_epochs=CONFIG[\'epochs\'])\n    torch.save(model.state_dict(),f"/kaggle/working/Best_EfficientNet_b0_Model_{i}.pth")\n'

# Test Data

In [28]:
# Examine the Data Split of training and testing data
train_data = glob.glob('/kaggle/input/UBC-OCEAN/train_images/*.png')
test_data = glob.glob('/kaggle/input/UBC-OCEAN/test_images/*.png')

print(f"The Training Set contains: {len(train_data)} images")
print(f"The Testing Set contains: {len(test_data)} images")

The Training Set contains: 538 images
The Testing Set contains: 1 images


In [29]:
def get_test_file_path(image_id):
    if os.path.exists(f"{TEST_DIR}/{image_id}_thumbnail.png"):
        return f"{TEST_DIR}/{image_id}_thumbnail.png"
    else:
        return f"{ALT_TEST_DIR}/{image_id}.png"

In [30]:
df_test = pd.read_csv(f"{ROOT_DIR}/test.csv")
df_test['file_path'] = df_test['image_id'].apply(get_test_file_path)
df_test['label'] = 0 # dummy
df_test

Unnamed: 0,image_id,image_width,image_height,file_path,label
0,41,28469,16987,/kaggle/input/UBC-OCEAN/test_images/41.png,0


In [31]:
test_dataset = UBCDataset(df_test, transforms=data_transforms["valid"])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'], 
                          num_workers=2, shuffle=False, pin_memory=True)

In [32]:
df_sub = pd.read_csv(f"{ROOT_DIR}/sample_submission.csv")
df_sub

Unnamed: 0,image_id,label
0,41,HGSC


In [34]:

RESULTs=np.array([[]]*len(test_data))
for i in range(10):
    model = UBCModel(CONFIG['model_name'], CONFIG['num_classes'])#, checkpoint_path=CONFIG['checkpoint_path']
    model.to(CONFIG['device'])
    model.load_state_dict(torch.load(f'/kaggle/input/weight/Best_EfficientNet_b0_Model_{i}.pth'))
    preds = []
    with torch.no_grad():
        bar = tqdm(enumerate(test_loader), total=len(test_loader))
        for step, data in bar:        
            images = data['image'].to(CONFIG["device"], dtype=torch.float)        
            batch_size = images.size(0)
            outputs = model(images)
            _, predicted = torch.max(model.softmax(outputs), 1)
            preds.append( predicted.detach().cpu().numpy() )
    preds = np.concatenate(preds).flatten()
    preds = np.array(preds)
    preds = preds.reshape(-1,1)
    RESULTs=np.append(RESULTs,preds,axis=1)


'\nRESULTs=np.array([[]]*len(test_data))\nfor i in range(10):\n    model = UBCModel(CONFIG[\'model_name\'], CONFIG[\'num_classes\'])#, checkpoint_path=CONFIG[\'checkpoint_path\']\n    model.to(CONFIG[\'device\'])\n    model.load_state_dict(torch.load(f\'/kaggle/input/weight/Best_EfficientNet_b0_Model_{i}.pth\'))\n    preds = []\n    with torch.no_grad():\n        bar = tqdm(enumerate(test_loader), total=len(test_loader))\n        for step, data in bar:        \n            images = data[\'image\'].to(CONFIG["device"], dtype=torch.float)        \n            batch_size = images.size(0)\n            outputs = model(images)\n            _, predicted = torch.max(model.softmax(outputs), 1)\n            preds.append( predicted.detach().cpu().numpy() )\n    preds = np.concatenate(preds).flatten()\n    preds = np.array(preds)\n    preds = preds.reshape(-1,1)\n    RESULTs=np.append(RESULTs,preds,axis=1)\n'

In [None]:
RESULTs=RESULTs.astype(int)

In [None]:
RESULT=[]
for line in RESULTs:
    #print(np.argmax(np.bincount(line)))
    RESULT.append(np.argmax(np.bincount(line)))

In [None]:
pred_labels = encoder.inverse_transform(RESULT)

In [None]:
df_sub["label"] = pred_labels
df_sub.to_csv("submission.csv", index=False)