# 1. Load and visualize train data

In [None]:
import pandas as pd
import numpy as np

# Annotation table for the training images (one row per image).
TRAIN_CSV = '/kaggle/input/world-championship-2023-embryo-classification/hvwc23/train.csv'
train_data = pd.read_csv(TRAIN_CSV)
train_data

In [None]:
import matplotlib.pyplot as plt

# Count samples per class. value_counts() orders by frequency, so sort by the
# class label and derive the tick labels from the index; otherwise a bar can
# end up under the wrong class name.
labels = train_data.Class.value_counts().sort_index()
classes = [str(c) for c in labels.index]

fig = plt.figure(figsize=(5, 4))
plt.bar(classes, labels.values, width=0.4)
plt.xticks(rotation=90)

# Write the exact count above each bar.
for index, value in enumerate(labels.values):
    plt.text(index, value, " "*2 + str(value), rotation=0, ha='center')

plt.title('Số lượng mẫu', fontweight='bold')
plt.show()

In [None]:
# Folders holding the image files referenced by the train / test annotation tables.
train_img_path = '/kaggle/input/world-championship-2023-embryo-classification/hvwc23/train'
test_img_path = '/kaggle/input/world-championship-2023-embryo-classification/hvwc23/test'

In [None]:
# Split the annotation rows by class and keep each group as a NumPy array.
Class0 = train_data.loc[train_data['Class'] == 0].to_numpy()
Class1 = train_data.loc[train_data['Class'] == 1].to_numpy()

In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt

# Show a 3 x 6 grid of randomly drawn training images, titled with their class.
fig, ax = plt.subplots(3, 6)
fig.set_size_inches(15, 7)
n_rows = len(train_data.Class)
for panel in ax.flat:  # row-major walk over the 18 axes
    l = np.random.randint(0, n_rows)
    img = Image.open(os.path.join(train_img_path, train_data.Image[l]))
    panel.imshow(img)
    panel.set_title(train_data.Class[l])
    panel.axis("off")
plt.axis('off')

# 2. Create dataset

In [None]:
import os
import torch
from sklearn.model_selection import train_test_split

# define a data class
class TrainDataset:
    """Map-style dataset yielding ``(image, label)`` pairs.

    Rows are read positionally from ``data``: column 0 is used as the image
    file name and column 1 as the label.
    """

    def __init__(self, data, data_path, transform, training=True):
        """Store the annotation table and the loading options.

        Args:
            data (pandas.DataFrame): annotation table. It must have an
                "Image" column; column 0 is read as the file name and
                column 1 as the label.
            data_path (str): folder that contains the image files.
            transform (callable or None): applied to every loaded PIL image;
                when None the PIL image is returned unchanged.
            training (bool, optional): stored on the instance, not used by
                this class itself. Defaults to True.
        """
        self.data = data
        # Distinct file names, kept for reference only.
        self.imgs = data["Image"].unique().tolist()
        self.data_path = data_path
        self.training = training
        self.transform = transform

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        img = Image.open(os.path.join(self.data_path, self.data.iloc[idx, 0]))
        label = self.data.iloc[idx, 1]
        if self.transform is not None:
            img = self.transform(img)
        return img, label

    def __len__(self):
        """Return the number of samples, i.e. the number of annotation rows."""
        # __getitem__ indexes rows of `self.data`, so the length has to be the
        # row count. Counting unique file names would silently drop the
        # trailing rows whenever a file name appears more than once.
        return len(self.data)
    

class TestDataset:
    """Map-style dataset yielding unlabeled images for inference.

    Rows are read positionally from ``data``: column 1 is used as the image
    file name.
    """

    def __init__(self, data, data_path, transform, training=True):
        """Store the annotation table and the loading options.

        Args:
            data (pandas.DataFrame): table describing the test images. It
                must have an "Image" column; column 1 is read as the file
                name.
            data_path (str): folder that contains the image files.
            transform (callable or None): applied to every loaded PIL image;
                when None the PIL image is returned unchanged.
            training (bool, optional): stored on the instance, not used by
                this class itself. Defaults to True.
        """
        self.data = data
        # Distinct file names, kept for reference only.
        self.imgs = data["Image"].unique().tolist()
        self.data_path = data_path
        self.training = training
        self.transform = transform

    def __getitem__(self, idx):
        """Return the (optionally transformed) image for the row at ``idx``."""
        img = Image.open(os.path.join(self.data_path, self.data.iloc[idx, 1]))
        if self.transform is not None:
            img = self.transform(img)
        return img

    def __len__(self):
        """Return the number of samples, i.e. the number of table rows."""
        # One prediction per row is required downstream, so the length follows
        # the table rather than the number of distinct file names.
        return len(self.data)
    
    
def train_val_split(dataset, train_batch_size, validation_split=0.2):
    """Randomly partition ``dataset`` into a training and a validation subset.

    Args:
        dataset ([object]): [dataset supporting ``len()``]
        train_batch_size ([int]): [accepted for call compatibility; not used
            by the split]
        validation_split (float, optional): [fraction of samples assigned to
            the validation subset]. Defaults to 0.2.

    Returns:
        [tuple]: [``(train_set, test_set)`` as produced by
            ``torch.utils.data.random_split``]
    """
    total = len(dataset)
    # Truncate towards zero for the training part; validation gets the rest.
    n_train = int(total * (1 - validation_split))
    lengths = [n_train, total - n_train]
    train_set, test_set = torch.utils.data.random_split(dataset, lengths)
    return train_set, test_set

#def data_split(data, test_size=0.3):
#    x_train, x_test, y_train, y_test = train_test_split(
#        data, data["Class"], test_size=test_size, stratify = data.iloc[:,1]
#    )
#    return x_train, x_test, y_train, y_test

In [None]:
import torchvision.transforms as transform
import torchvision

# Alternative statistics left by the author (presumably measured on this
# dataset - unverified):
#mean = (0.4124234616756439, 0.3674212694168091, 0.2578217089176178)
#std = (0.3268945515155792, 0.29282665252685547, 0.29053378105163574)
IMG_SIZE = (224, 224)
# With mean = std = 0.5, Normalize maps ToTensor's [0, 1] range to [-1, 1].
mean = (0.5, 0.5, 0.5)
std = (0.5, 0.5, 0.5)

# Augmentations that were tried and are currently disabled:
#   transform.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2)
#   transform.RandomRotation(5)
#   transform.RandomAffine(degrees=11, translate=(0.1,0.1), scale=(0.8, 0.8))
_preprocess_steps = [
    transform.Resize(IMG_SIZE),
    transform.ToTensor(),
    transform.Normalize(mean, std),
]
transformer = transform.Compose(_preprocess_steps)

In [None]:
# Wrap the annotation table, then carve out 20% of it for validation.
dataset = TrainDataset(
    data=train_data,
    data_path=train_img_path,
    transform=transformer,
    training=True,
)
# train_batch_size is accepted by train_val_split but does not affect the split.
train_set, valid_set = train_val_split(
    dataset, train_batch_size=16, validation_split=0.2
)

In [None]:
import numpy as np 

# Positions (within `dataset`, hence within `train_data`) of the training samples.
y_train_indices = train_set.indices
# The dataset reads rows positionally, so look the labels up with .iloc as
# well; label-based `Class[i]` only agrees while the index is 0..n-1.
y_train = train_data.Class.iloc[y_train_indices].to_numpy()
# For the sorted distinct classes: each sample's class position and the
# per-class counts. Using the position (instead of the raw label) as the
# lookup key keeps this correct even if labels are not exactly 0..K-1.
train_classes, class_index, class_sample_count = np.unique(
    y_train, return_inverse=True, return_counts=True
)

# Inverse-frequency weight per class, then expanded to one weight per sample.
weight = 1. / class_sample_count
samples_weight = weight[class_index]
samples_weight = torch.from_numpy(samples_weight)

In [None]:
# A WeightedRandomSampler built from `samples_weight` can be used to rebalance
# the classes instead of plain shuffling (sampler and shuffle=True are
# mutually exclusive in DataLoader):
#sampler = torch.utils.data.sampler.WeightedRandomSampler(
#    samples_weight.type('torch.DoubleTensor'), len(samples_weight))
#train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, sampler=sampler)

# Without a sampler the training batches must be reshuffled every epoch;
# shuffle=False would feed the samples in the same order each time.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
# Validation order does not matter, keep it fixed.
val_loader = torch.utils.data.DataLoader(valid_set, batch_size=1, shuffle=False)

In [None]:
def imshow(inp, title=None, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)):
    """Display a normalized, channel-first image tensor with matplotlib.

    Args:
        inp: CPU tensor whose axes are reordered with ``transpose((1, 2, 0))``
            before plotting (channel axis moved last).
        title: optional figure title.
        mean: per-channel mean used to undo the normalization. The default
            matches the ``Normalize(mean, std)`` step of this notebook's
            transform pipeline (0.5 per channel); previously ImageNet
            statistics were hard-coded here, which distorted the colors.
        std: per-channel standard deviation used to undo the normalization.
    """
    inp = inp.numpy().transpose((1, 2, 0))
    # Invert Normalize: x = x_norm * std + mean.
    inp = np.asarray(std) * inp + np.asarray(mean)
    inp = np.clip(inp, 0, 1)
    plt.figure(figsize=(12, 12))
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.axis("off")
    plt.pause(0.001)  # pause a bit so that plots are updated

In [None]:
# Preview: pull the first training batch and draw it as one image grid,
# using the batch labels as the title.
first_batch = next(iter(train_loader))
images, classes = first_batch
out = torchvision.utils.make_grid(images, nrow=8)
imshow(out, title=classes)

# 3. Define Metrics and Optimizers and Loss function

In [None]:
from sklearn import metrics as skmetrics
import numpy

class Metrics:
    """Accumulate scikit-learn metrics over successive evaluation steps.

    Every metric keeps a history list in ``metric_dict``. Each list starts
    with a placeholder ``0`` so that ``last_step_metrics`` is defined before
    the first ``step``; the placeholder is excluded from the ``epoch`` average.
    """

    def __init__(self, metric_names):
        """Create one history list per name in ``metric_names``.

        Args:
            metric_names: names of functions in ``sklearn.metrics``
                (e.g. "accuracy_score").
        """
        self.metric_names = metric_names
        # initialize a metric dictionary
        self.metric_dict = {metric_name: [0] for metric_name in self.metric_names}

    def step(self, labels, preds):
        """Compute every configured metric for ``labels`` vs ``preds``.

        Raises:
            ValueError: if a metric name is not a callable of ``sklearn.metrics``.
        """
        for metric in self.metric_names:
            # get the metric function
            do_metric = getattr(skmetrics, metric, None)
            if not callable(do_metric):
                raise ValueError(
                    "The metric {} is not implemented".format(metric)
                )
            # Metrics that take an averaging method are macro-averaged; those
            # that do not accept `average` reject the keyword with TypeError
            # and are called without it. Any other error is propagated.
            try:
                value = do_metric(labels, preds, average="macro")
            except TypeError:
                value = do_metric(labels, preds)
            self.metric_dict[metric].append(value)

    def epoch(self):
        """Return ``{metric name: mean over all recorded steps}``.

        A metric with no recorded step yields 0.0 instead of dividing by zero.
        """
        metric_as_dict = {}
        for name in self.metric_names:
            history = self.metric_dict[name]
            steps = len(history) - 1  # the leading placeholder is not a step
            metric_as_dict[name] = sum(history) / steps if steps > 0 else 0.0
        return metric_as_dict

    def last_step_metrics(self):
        """Return ``{metric name: value of the most recent step}``."""
        return {
            metric: self.metric_dict[metric][-1] for metric in self.metric_names
        }

In [None]:
# Track the same two metrics for the training and the validation phase;
# each tracker gets its own list of names.
METRIC_NAMES = ("accuracy_score", "f1_score")
train_metrics = Metrics(list(METRIC_NAMES))
val_metrics = Metrics(list(METRIC_NAMES))

In [None]:
import torch
from torch import nn

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
#criterion = nn.CrossEntropyLoss()

device

# 4. Define the Model: Transfer Learning

In [None]:
#model.named_parameters

In [None]:
from torchvision import models
from torch import nn
import torch.optim as optim

# Build the pretrained ViT-B/32 without forcing a device: an unconditional
# .cuda() crashes on CPU-only hosts, and the .to(device) at the end of this
# cell already handles placement for both cases.
model = models.vit_b_32(pretrained=True)

# Fine-tune the whole backbone, not only the new head.
for param in model.parameters():
    param.requires_grad = True

# Replace the classification head with a 2-class head.
# NOTE(review): there is no activation between the two Linear layers, so the
# head is mathematically a single affine map - confirm this is intended.
classifier = nn.Sequential(
    nn.Linear(in_features=model.hidden_dim, out_features=256, bias=True),
    nn.Linear(in_features=256, out_features=2, bias=True)
)
model.heads = classifier
model = model.to(device)

In [None]:
# Loss, optimizer and LR schedule. The scheduler is stepped with the
# validation loss and halves the LR when it stops improving for 2 epochs.
#criterion = LabelSmoothingLoss(weight=weights, epsilon=0.12, num_classes=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=2
)

In [None]:
#!pip uninstall timm -y
#!pip install -q timm==0.4.12

# 5. Define a training epoch

In [None]:
def train_one_epoch(
    model,
    train_loader,
    test_loader,
    device,
    optimizer,
    criterion,
    train_metrics,
    val_metrics,
):
    """Run one training pass followed by one validation pass.

    Args:
        model: network to optimize; left in eval mode when the function returns.
        train_loader: loader yielding ``(data, target)`` training batches.
        test_loader: loader yielding ``(data, target)`` validation batches.
        device: device the batches are moved to.
        optimizer: optimizer stepped once per training batch.
        criterion: loss called as ``criterion(output, target)``.
        train_metrics: object with ``step(labels, preds)`` and
            ``last_step_metrics()``; receives all training labels/predictions.
        val_metrics: same interface, receives the validation labels/predictions.

    Returns:
        tuple: ``(train_loss, valid_loss, train_metrics.last_step_metrics(),
        val_metrics.last_step_metrics())`` where each loss is the
        sample-weighted mean over its loader.
    """
    # training-the-model
    train_loss = 0
    valid_loss = 0
    all_labels = []
    all_preds = []
    model.train()
    for data, target in train_loader:
        # Move and cast directly on the target device. The previous
        # `.type(torch.FloatTensor).to(device)` on logits and targets bounced
        # every batch through CPU memory before computing the loss.
        data = data.to(device, dtype=torch.float32)
        target = target.to(device, dtype=torch.long)
        # clear-the-gradients-of-all-optimized-variables
        optimizer.zero_grad()
        # forward-pass: compute-predicted-outputs-by-passing-inputs-to-the-model
        output = model(data)
        # calculate-the-batch-loss (float() is a no-op for float32 logits)
        loss = criterion(output.float(), target)
        # backward-pass: compute-gradient-of-the-loss-wrt-model-parameters
        loss.backward()
        # perform-a-single-optimization-step (parameter-update)
        optimizer.step()
        # Weight the batch mean by the batch size so the final division by
        # the sample count yields a per-sample average.
        train_loss += loss.item() * data.size(0)
        # collect labels / predictions for the epoch-level training metrics
        all_labels.extend(target.tolist())
        all_preds.extend(torch.argmax(output, dim=1).tolist())

    train_metrics.step(all_labels, all_preds)

    # validate-the-model
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for data, target in test_loader:
            # same device / dtype handling as in the training pass
            data = data.to(device, dtype=torch.float32)
            target = target.to(device, dtype=torch.long)
            output = model(data)
            all_labels.extend(target.tolist())
            all_preds.extend(torch.argmax(output, dim=1).tolist())
            loss = criterion(output, target)

            # update-average-validation-loss
            valid_loss += loss.item() * data.size(0)

    val_metrics.step(all_labels, all_preds)
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(test_loader.sampler)

    return (
        train_loss,
        valid_loss,
        train_metrics.last_step_metrics(),
        val_metrics.last_step_metrics(),
    )

# 6. Training

In [None]:
from tqdm import tqdm
import logging
import numpy as np

num_epoch = 20
best_val_acc = 0

# Per-epoch history, plotted after training.
train_losses, valid_losses = [], []
train_accuracies, valid_accuracies = [], []

print("begin training process")

for i in tqdm(range(num_epoch)):
    loss, val_loss, train_result, val_result = train_one_epoch(
        model=model,
        train_loader=train_loader,
        test_loader=val_loader,
        device=device,
        optimizer=optimizer,
        criterion=criterion,
        train_metrics=train_metrics,
        val_metrics=val_metrics,
    )

    # ReduceLROnPlateau follows the validation loss.
    scheduler.step(val_loss)

    val_acc = val_result["accuracy_score"]
    train_losses.append(loss)
    valid_losses.append(val_loss)
    train_accuracies.append(train_result["accuracy_score"])
    valid_accuracies.append(val_acc)

    print(
        "Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(
            i + 1, num_epoch, loss
        )
    )
    print(train_result)
    print(
        " \n Validation loss : {} - Other validation metrics:".format(val_loss)
    )
    print(val_result)
    print("\n")

    # Keep only the weights of the epoch with the best validation accuracy.
    if float(val_acc) > best_val_acc:
        print("Validation accuracy= " + str(val_acc) + "===> Save best epoch")
        best_val_acc = val_acc
        torch.save(model.state_dict(), "./" + "best_no_tuning.pt")
    else:
        print("Validation accuracy= " + str(val_acc) + "===> No saving")

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch.
for curve, curve_name in ((train_losses, "Train loss"), (valid_losses, "Val loss")):
    plt.plot(curve, label=curve_name)
#plt.title('Loss Graph')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
for curve, curve_name in ((train_accuracies, "Train acc"), (valid_accuracies, "Val acc")):
    plt.plot(curve, label=curve_name)
#plt.title('Accuracy Graph')
plt.xlabel('Epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

# 7. Predict validation

In [None]:
# Collect ground-truth labels and predicted classes for the validation set.
preds = []
labels = []

# Put the model in inference mode explicitly so this cell gives the same
# result no matter which cell was executed last.
model.eval()
with torch.no_grad():
    for data, target in val_loader:
        # move-tensors-to-GPU
        data = data.to(device)
        # forward-pass: compute-predicted-outputs-by-passing-inputs-to-the-model
        output = model(data)
        labels.extend(target.tolist())
        # Softmax is monotonic, so argmax over the logits selects the same
        # class as argmax over the probabilities.
        preds.extend(torch.argmax(output, dim=1).tolist())

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Sanity check: list the distinct classes present in the validation labels.
np.unique(labels)

In [None]:
# Per-class precision / recall / F1 on the validation split.
report = classification_report(y_true=labels, y_pred=preds, digits=2)
print(report)

In [None]:
# Confusion matrix of the validation predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_true=labels, y_pred=preds)
disp = ConfusionMatrixDisplay(cm)
disp.plot()

# 8. Test the results

In [None]:
# Table describing the test images to predict.
TEST_CSV = '/kaggle/input/world-championship-2023-embryo-classification/hvwc23/test.csv'
test_data = pd.read_csv(TEST_CSV)
test_data

In [None]:
# Inference preprocessing: resize, tensor conversion and the same
# normalization constants as the training pipeline.
test_transform = transform.Compose([
    transform.Resize(IMG_SIZE),
    transform.ToTensor(),
    transform.Normalize(mean, std),
])

In [None]:
# One image per batch, in table order, so predictions line up with test_data rows.
testset = TestDataset(
    data=test_data,
    data_path=test_img_path,
    transform=test_transform,
    training=False,
)
test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False)

In [None]:
import copy

# Work on a copy so the in-memory training model keeps its last-epoch weights.
test_model = copy.deepcopy(model)
# Load from the same relative path the training loop saves to ("./"), instead
# of an absolute Kaggle-only path. map_location lets a checkpoint written on
# a GPU be restored on a CPU-only host.
test_model.load_state_dict(
    torch.load("./best_no_tuning.pt", map_location=device)
)
test_model = test_model.to(device)

In [None]:
def test_result(model, test_loader, device):
    """Predict a class index for every sample served by ``test_loader``.

    Args:
        model: network called as ``model(batch)``; switched to eval mode here.
        test_loader: iterable yielding input batches only (no labels).
        device: device each batch is moved to before the forward pass.

    Returns:
        list[int]: predicted class indices, in loader order.
    """
    # testing the model by turning model "Eval" mode
    model.eval()
    preds = []
    with torch.no_grad():
        for data in test_loader:
            # forward-pass on the target device
            output = model(data.to(device))
            # Softmax is monotonic, so argmax over the logits selects the same
            # class as argmax over the probabilities. The per-batch
            # probability list that was collected here was never returned,
            # so it is no longer built.
            preds.extend(torch.argmax(output, dim=1).tolist())
    return preds


# Despite its name this is an inference helper, not a unit test: opt out of
# pytest collection in case this code is exported to a module.
test_result.__test__ = False

In [None]:
# Predicted class index for every test image, using the restored checkpoint model.
preds = test_result(test_model, test_loader, device)

In [None]:
# Assemble the submission table (one row per test ID) and write it without
# the DataFrame index.
submission = pd.DataFrame({"ID": test_data.ID, "Class": preds})
submission.to_csv("task1_submission.csv", index=False)
# Sum of the predicted class column, printed as a quick sanity check.
print(submission['Class'].sum())