# Load Libraries

In [None]:
!pip install -U kaleido
import kaleido

import seaborn as sns
import os
import timm
import time
import numpy as np
from PIL import Image
import PIL.ImageOps
import matplotlib.pyplot as plt
import random
import math
import pickle as pkl
import pandas as pd
import inspect
from logging import exception
from prettytable import PrettyTable


import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
import torch.nn.functional as F
import torchvision
import torchvision.utils
import torchvision.transforms as transforms
from torchvision import models
import albumentations as A
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import h5py

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
# Create a temporary directory and point TORCH_HOME at it, so files that torch
# downloads (e.g. pretrained weights) are cached under /kaggle/temp.
os.makedirs("/kaggle/temp",exist_ok=True)
os.environ['TORCH_HOME'] = "/kaggle/temp"

# Configuration

In [None]:
class CFG:
    """Notebook-wide configuration, read as class attributes (never instantiated)."""
    # DataLoader settings
    BATCH_SIZE = 16
    num_workers = 16
    # When True the training loader draws samples with a WeightedRandomSampler
    WeightSampler = True
    # Seed passed to set_seed()
    seed = 122
    
    # Name looked up in torchvision.models to build the backbone
    model_name = "swin_v2_t"
    pretrained = True

    # Adam optimizer hyper-parameters
    learning_rate = 5e-6
    weight_decay = 1e-6
        
    # Output size of the replacement fc layer (used only when replace_fc is True)
    embedding_size = 512
    # "ContrastiveLoss" selects the contrastive loss; any other value selects BCEWithLogitsLoss
    loss_function = "" # ContrastiveLoss else BCE loss
    replace_fc = True
    
    # Scores strictly greater than this are classified as 1 during testing
    threshold = 0.5
    # margin for contrastive loss
    margin = 1.5
    
    EPOCHS = 10
    # NOTE(review): not referenced anywhere in the visible code -- confirm whether early stopping is implemented
    early_stopping = 25
    
    model_path = "/kaggle/input/bigdataset-models" # Dir to load model to test
    output_dir = "/kaggle/working/" # Dir to save model after train
    # "Training" only trains, "Inference" only tests, any other value does both
    mode = "" # Training or Inference else All  
                        #if mode is All set model_path and output_dir to ""
    # Dataset
    # NOTE(review): ratio and SKFold are not referenced in the visible code -- confirm they are still needed
    ratio = 0.80
    SKFold = True
    train_dir = "/kaggle/input/spamv2/SelectedP6"
    train_csv = "/kaggle/input/dataset-csvs/paired_csv_v4/paired_train_big.csv"
    
    test_dir = "/kaggle/input/spamv1/spamimg_train/Products"
    test_csv = "/kaggle/input/dataset-csvs/paired_csv_v4/paired_train_small.csv"
    
    # When True, increase_data() is run on train_csv with the two ratios below before splitting
    online_pairing = False
    neg_ratio = 1
    pos_ratio = 1

    # Use the GPU when one is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            

In [None]:
def print_class(cfg_class):
    """Print each public member of ``cfg_class`` and log it to ``configuration.txt``.

    Members are the ``(name, value)`` tuples returned by ``inspect.getmembers``;
    any member whose name starts with an underscore is skipped. Each kept tuple
    is printed and appended (one per line) to ``configuration.txt`` in the
    current working directory.
    """
    for member in inspect.getmembers(cfg_class):
        member_name = member[0]
        # Skip private / protected / dunder members
        if member_name.startswith('_'):
            continue
        print(member)
        with open("configuration.txt", "a") as log_file:
            log_file.write(str(member) + "\n")
# Print the active configuration and append it to configuration.txt
print_class(CFG)

In [None]:
def set_seed(seed = 42):
    """Seed every random number generator used by the notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and all CUDA
    devices), configures cuDNN for deterministic behaviour and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU, not only the current one
    torch.cuda.manual_seed_all(seed)

    # When running on the cuDNN backend, two further options must be set.
    # benchmark must be False: with benchmark=True cuDNN auto-tunes and may
    # pick different (non-deterministic) kernels from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    

# Seed all random number generators with the configured seed
set_seed(CFG.seed)

# Helper Functions

In [None]:
def imshow(img, text=None, should_save=False):
    """Display a normalized image tensor with matplotlib.

    The normalization with mean (0.485, 0.456, 0.406) and std
    (0.229, 0.224, 0.225) is reversed before plotting.

    Args:
        img: Channel-first image tensor (it is transposed with (1, 2, 0) for plotting).
        text: Optional caption drawn on the figure.
        should_save: Accepted but not used by this function.
    """
    # Step 1 multiplies each channel by its std, step 2 adds the mean back
    undo_std = transforms.Normalize(mean = [ 0., 0., 0. ],
                                    std = [ 1/0.229, 1/0.224, 1/0.225 ])
    undo_mean = transforms.Normalize(mean = [ -0.485, -0.456, -0.406 ],
                                     std = [ 1., 1., 1. ])
    restored = undo_mean(undo_std(img))
    pixels = np.transpose(restored.numpy(), (1, 2, 0))

    plt.axis("off")
    if text:
        caption_box = {"facecolor": "white", "alpha": 0.8, "pad": 10}
        plt.text(75, 8, text, style="italic", fontweight="bold", bbox=caption_box)
    plt.imshow(pixels)
    plt.show()

def plot_graph(avr_loss, val_avr_loss, test_loss, acc_list):
    """Plot the three loss curves and the F1 curve on a two-row figure.

    Args:
        avr_loss: Per-epoch training losses.
        val_avr_loss: Per-epoch validation losses.
        test_loss: Per-epoch test losses.
        acc_list: Per-epoch values plotted under the "F1_Score" label.
    """
    plt.figure(figsize=(20,10))

    # Top row: losses
    plt.subplot(2,1,1)
    loss_curves = (
        (avr_loss, "Train_loss"),
        (val_avr_loss, "Val_loss"),
        (test_loss, "Test_loss"),
    )
    for series, curve_label in loss_curves:
        plt.plot(series, label = curve_label)
    plt.legend()
    # NOTE(review): the "{}" placeholder is never formatted, and this save
    # happens before the title and the second subplot are drawn.
    plt.savefig("Loss graph_{}")
    plt.title("Loss graph")

    # Bottom row: F1 score
    plt.subplot(2,1,2)
    plt.plot(acc_list, label = "F1_Score")
    plt.legend()
    plt.title("F1_Score")
    plt.savefig("F1_Score{}")
    plt.show()

def plot_distribution(distributions, epoch=0):
    """Plot overlaid histograms of the scores for label 0 and label 1.

    Args:
        distributions: Sequence of ``(score, label)`` pairs.
        epoch: Number inserted into the saved figure's file name.
    """
    def scores_for(wanted_label):
        # Collect the score of every entry carrying the requested label
        return [entry[0] for entry in distributions if entry[1] == wanted_label]

    negative_scores = scores_for(0)
    positive_scores = scores_for(1)

    plt.figure(figsize=(20,10))
    plt.hist(negative_scores, bins=100, label='Negatives')
    plt.hist(positive_scores, bins=100, label='Positives', alpha = 0.7,color='r')
    plt.xlabel('Score of being Positive Class Distribution', fontsize=25)
    plt.ylabel('Number of records in each bucket', fontsize=25)
    plt.legend(fontsize=15)
    plt.tick_params(axis='both', labelsize=25, pad=5)
    plt.savefig("Distributions_of_Epoch{}".format(epoch))
    plt.show()

def float_format(*args):
    """Return each argument formatted as a string with three decimal places."""
    to_three_decimals = "{:.3f}".format
    return [to_three_decimals(value) for value in args]
    

In [None]:
# Function to save checkpoint
def save_checkpoint(model, optimizer, filename):
    """Write the model and optimizer state dicts to ``filename`` with ``torch.save``.

    The saved object is a dict with the keys ``"state_dict"`` (model) and
    ``"optimizer"`` (optimizer).
    """
    print("==> Saving checkpoint")
    torch.save(
        {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()},
        filename,
    )

# Function to load checkpoint
def load_checkpoint(checkpoint_file):
    """Restore the global ``model`` and ``optimizer`` from a checkpoint file.

    The checkpoint must be a dict with the keys ``"state_dict"`` and
    ``"optimizer"`` (the layout written by ``save_checkpoint``).

    Args:
        checkpoint_file: Path of the checkpoint to load.
    """
    print("==> Loading checkpoint")
    checkpoint = torch.load(checkpoint_file, map_location=CFG.device)
    # Copy the weights into the existing parameters (no assign=True):
    # assign=True would replace the parameter objects, leaving the already
    # constructed global optimizer pointing at stale tensors.
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    # Make sure every optimizer state tensor lives on the configured device
    for state in optimizer.state.values():
        for key, value in state.items():
            if isinstance(value, torch.Tensor):
                state[key] = value.to(CFG.device)


def save_pkl(pkl_list,filename):
    """Pickle ``pkl_list`` into the file at ``filename`` (overwriting it)."""
    with open(filename, 'wb') as handle:
        pkl.dump(pkl_list, handle)


def load_pkl(filename):
    """Return the object unpickled from the file at ``filename``.

    NOTE: unpickling executes arbitrary code; only load files you trust.
    """
    with open(filename, 'rb') as handle:
        return pkl.load(handle)


In [None]:
def increase_data(csv=CFG.train_csv,neg_ratio=1,pos_ratio=1,sample_column=0,num_samplers_per_image=1,saved_dir=''):
  """Append extra label-1 pairs to a pair CSV and write the enlarged CSV.

  The input CSV must have a ``label`` column as its last column. Images are
  taken from column ``sample_column`` of the rows whose label is not 1; the
  text before the first "/" of each path is used as the product identifier.
  New pairs are formed between distinct images and appended with label 1
  until the requested number of extra pairs is reached.

  Args:
    csv: Path of the source pair CSV.
    neg_ratio: Used together with ``pos_ratio`` to compute how many pairs to add.
    pos_ratio: See ``neg_ratio``.
    sample_column: Positional index of the column the images are drawn from.
    num_samplers_per_image: Number of new pairs attempted per anchor image per pass.
    saved_dir: Directory the output CSV is written to ('' = current directory).

  Returns:
    Path of the written CSV (``paired_<name>_<neg_ratio>_<pos_ratio>_.csv``).

  Raises:
    AssertionError: If there are no label-0 rows or the requested ratio is
      not smaller than the ratio already present in the data.
  """
  df0 = pd.read_csv(csv)
  print("Start increase dataset by create dissimilar pairs")
  saved_csv = csv.split('/')[-1].split('.')[0]
  # Build the output path once. The previous code joined saved_dir twice
  # (saved_dir/paired_saved_dir/...), which broke any non-empty saved_dir.
  csv_name = os.path.join(saved_dir,'paired_'+saved_csv+'_'+str(neg_ratio)+'_'+str(pos_ratio)+'_.csv')

  # Calculate Number of Samplers
  labels =df0.iloc[:,-1]
  pos_pairs = np.sum(labels,axis = 0)      # number of rows labelled 1
  neg_pairs = len(labels) - pos_pairs      # number of remaining rows
  assert neg_pairs != 0, "Too Few Number of Negative sample"
  assert neg_ratio/pos_ratio - neg_pairs/pos_pairs < 0, "The Ratio in the dataset already smaller"+str(neg_ratio/pos_ratio)
  num_samplers = neg_pairs/neg_ratio*pos_ratio-pos_pairs
  print("Number Sampling Pairing: ",num_samplers)

  # Encoding Product to it own class, instead of pair
  le = LabelEncoder()
  indices = df0.loc[df0.label == 1].index
  df1 = df0.drop(indices, axis=0)
  product_series = df1.iloc[:,sample_column]
  product_df = pd.DataFrame(product_series)
  product_df = product_df.drop_duplicates(product_df.columns[0])
  product_df.reset_index(drop=True, inplace=True)
  product_encoding = product_df.iloc[:,0].str.split("/", n=1, expand=True)[0]
  product_df["id"] = le.fit_transform(product_encoding)
  product_df.sort_values(by=['id'], inplace=True, ignore_index=True)

  #Create Loop sampling: cycle over every image enough times to reach num_samplers
  loops = np.arange(int(len(product_df))).tolist()*(int(num_samplers//len(product_df))+1)

  # Convert relevant columns to NumPy arrays for faster access
  product_ids = product_df['id'].values

  # Sampling new pairs
  pair_sample = []
  # Unordered pairs already emitted; gives O(1) duplicate checks instead of
  # scanning pair_sample for both orientations.
  seen_pairs = set()
  sample_index = 0
  sample_count = 0
  looping = tqdm(loops)
  for product_index in looping:
    if sample_count > num_samplers:
      break
    pair_product = product_df[product_df['id'] == product_ids[product_index]].index.values
    if sample_index == 0:
      # Start the partner cursor past the images that share the first product id
      sample_index = sample_index+len(pair_product)
    for _ in range(num_samplers_per_image):
      while sample_count <= num_samplers:
          if sample_index >= len(product_df):
            sample_index = 0
          pair = [product_df.iloc[product_index,0],product_df.iloc[sample_index,0],1]
          sample_index += 1
          if pair[0] == pair[1]:
            continue
          pair_key = frozenset((pair[0], pair[1]))
          if pair_key not in seen_pairs:
            seen_pairs.add(pair_key)
            pair_sample.append(pair)
            sample_count += 1
            looping.set_description(f"Looping")
            looping.set_postfix(sample_count=sample_count)
            break

  # Save csv
  new_pair = np.concatenate([df0.values, pair_sample], axis=0)

  new_pair = pd.DataFrame(new_pair)
  print("Duplicate Pair", new_pair[new_pair.duplicated(keep=False)])
  print("Original Csv File Length: ",len(df0))
  print("New Csv File Length: ",len(new_pair))
  new_pair.columns = ['product','review','label']
  new_pair.to_csv(csv_name, index=False)
  return csv_name

In [None]:
class Evaluate_Metrics():
    """Binary-classification metrics and curve plots for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_preds: Continuous scores (used for the ROC / PR curves).
        y_cls: Thresholded class predictions (used for the confusion matrix,
            accuracy and F1).
    """

    def __init__(self, y_true, y_preds, y_cls):
        self.y_true = y_true
        self.y_preds = y_preds
        self.y_cls = y_cls

    def plot_curve(self, curve_type, x_value, y_value, auc, fig_name):
        """Plot, save and show a ROC (``'roc'``) or precision-recall (``'pr'``) curve.

        Raises:
            ValueError: If ``curve_type`` is neither ``'roc'`` nor ``'pr'``.
        """
        if curve_type == 'roc':
            xlabel = 'False positive rate'
            ylabel = 'True positive rate'
            title = 'ROC curve'
        elif curve_type == 'pr':
            xlabel = 'Recall'
            ylabel = 'Precision'
            title = 'PR curve'
        else:
            # Previously fell through and crashed later on an unbound label name
            raise ValueError("curve_type must be 'roc' or 'pr', got {!r}".format(curve_type))

        plt.figure(figsize=(20,10))
        if curve_type == 'roc':
            # Chance-level diagonal for reference
            plt.plot([0, 1], [0, 1],linestyle='dashed')
        plt.plot(x_value, y_value, label='AUC(area = {:.3f} )'.format(auc))
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend(loc='best')
        plt.savefig(fig_name)
        plt.show()
        plt.close()

    def plot_roc(self,name):
        """Plot the ROC curve to ``name``; return ``(fpr, tpr, thresholds, roc_auc)``."""
        fpr, tpr, thres = metrics.roc_curve(self.y_true, self.y_preds, pos_label=1)
        roc_auc = metrics.auc(x=fpr, y=tpr)
        self.plot_curve(curve_type="roc",
                   x_value=fpr,
                   y_value=tpr,
                   auc = roc_auc,
                   fig_name = name)

        return fpr, tpr, thres, roc_auc

    def plot_pr(self,name):
        """Plot the PR curve to ``name``; return ``(precision, recall, thresholds, pr_auc)``."""
        precision, recall, thresholds = metrics.precision_recall_curve(self.y_true, self.y_preds, pos_label=1)
        pr_auc = metrics.auc(x=recall, y=precision)
        self.plot_curve(curve_type="pr",
                   x_value=recall,
                   y_value=precision,
                   auc = pr_auc,
                   fig_name = name)
        return precision, recall, thresholds, pr_auc

    def confusion_matrix(self):
        """Plot the confusion matrix and return ``(tn, fp, fn, tp)``."""
        # labels=[0, 1] forces a 2x2 matrix even when only one class occurs,
        # so the four-way unpacking below cannot fail.
        cm = metrics.confusion_matrix(self.y_true, self.y_cls, labels=[0, 1])
        tn, fp, fn, tp =cm.ravel()
        disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot()
        return tn, fp, fn, tp

    def accuracy_score(self):
        """Return the accuracy of ``y_cls`` against ``y_true``."""
        return metrics.accuracy_score(self.y_true, self.y_cls)

    def f1_score(self):
        """Return the F1 score of ``y_cls``; 0.0 when precision + recall is 0."""
        precision = metrics.precision_score(self.y_true, self.y_cls)
        recall = metrics.recall_score(self.y_true, self.y_cls)
        denominator = precision + recall
        if denominator == 0:
            # Avoid 0/0 (NaN) when there are no true positives at all
            return 0.0
        return 2*precision*recall/denominator

In [None]:
def compute_class_freqs(labels):
    """Return ``(positive_frequency, negative_frequency)`` for 0/1 labels.

    The positive frequency is the sum of ``labels`` along axis 0 divided by
    the number of rows; the negative frequency is its complement.
    """
    label_array = np.array(labels)
    positive = label_array.sum(axis=0) / label_array.shape[0]
    return positive, 1 - positive

In [None]:
def label_freq(csv = CFG.train_csv):
  """Print and return the positive / negative label frequencies of a pair CSV.

  Args:
    csv: Path of a CSV file with a ``label`` column.

  Returns:
    Tuple ``(freq_pos, freq_neg)`` as computed by ``compute_class_freqs``.
  """
  # The DataFrames that used to be built here only fed a commented-out
  # bar plot, so they have been removed.
  freq_pos, freq_neg = compute_class_freqs(labels = pd.read_csv(csv)['label'])
  print("pos", freq_pos,"--:--","neg",freq_neg)
  return freq_pos, freq_neg

In [None]:
def labels_to_class_weights_sampler(csv=CFG.train_csv):
    """Return one sampling weight per row: the inverse frequency of the row's label.

    Args:
        csv: Path of a CSV file with a ``label`` column.

    Returns:
        List of floats, one per CSV row, suitable as ``weights`` for a
        ``WeightedRandomSampler``.
    """
    df = pd.read_csv(csv)
    num_labels = df['label'].value_counts().sort_index()
    print("count label: ", num_labels)

    total_rows = len(df)
    # Key the weights by the label value itself. Indexing a list with
    # int(label) silently picked the wrong weight (or raised IndexError)
    # whenever the labels were not exactly 0..k-1.
    weight_by_label = {
        label: 1 / (count / total_rows) for label, count in num_labels.items()
    }
    print("weight of dataset: ", list(weight_by_label.values()))
    return [weight_by_label[label] for label in df['label']]

In [None]:
def GroupKDataset(k = 5, csv = CFG.train_csv, fold_id = 1):
    """Split a pair CSV into train / validation CSVs using one of ``k`` folds.

    Rows are assigned to folds with ``StratifiedKFold``, stratified on the
    product identifier (the text before the first "/" of ``image1``). Rows of
    fold ``fold_id`` form the validation split, the rest the training split.

    NOTE(review): despite the name, this uses StratifiedKFold rather than
    GroupKFold, so rows of the same product can land in both splits --
    confirm this is intended.

    Args:
        k: Number of folds.
        csv: Path of the source pair CSV (three columns, header row replaced).
        fold_id: Index of the fold used for validation.

    Returns:
        Tuple ``(train_path, val_path)``: "train_split.csv" and
        "val_split.csv", written to the current working directory.
    """
    df_train = pd.read_csv(csv,  names = ['image1','image2','label'], header=0)

    # Keep the product id / label as local arrays. Assigning them as
    # DataFrame attributes (df.product_id = ...) does not create columns and
    # triggers a pandas warning.
    product_id = df_train['image1'].str.split("/", n=1, expand=True)[0]
    product_label = LabelEncoder().fit_transform(product_id)

    skf = StratifiedKFold(n_splits=k)
    df_train['fold'] = -1
    for fold, ( _, val_) in enumerate(skf.split(X=df_train, y=product_label)):
      df_train.loc[val_ , "fold"] = fold

    # drop() returns new frames, avoiding an in-place drop on a slice
    df_train_this = df_train[df_train['fold'] != fold_id].drop(columns="fold")
    df_valid_this = df_train[df_train['fold'] == fold_id].drop(columns="fold")

    train_path = "train_split.csv"
    val_path = "val_split.csv"
    df_train_this.to_csv(train_path,index=False)
    df_valid_this.to_csv(val_path,index=False)
    return train_path, val_path

In [None]:
def reset_weights(m):
    """Reload the backbone weights from disk and re-initialise the classification head.

    The backbone state dict is loaded from the first file listed in
    ``/kaggle/temp/hub/checkpoints``. Every module inside ``m.cls_head`` that
    defines ``reset_parameters`` has it called.

    NOTE(review): ``os.listdir`` order is arbitrary; this presumably relies on
    the directory holding exactly one checkpoint -- confirm.
    """
    checkpoint_dir = "/kaggle/temp/hub/checkpoints"
    first_entry = os.listdir(checkpoint_dir)[0]
    m.backbone.load_state_dict(torch.load(os.path.join(checkpoint_dir, first_entry)))

    for _, head_layer in m.cls_head.named_children():
        for _, module in head_layer.named_modules():
            if not hasattr(module, 'reset_parameters'):
                continue
            module.reset_parameters()

# Dataset

In [None]:
class SiameseDataset():
    """Map-style dataset of image pairs described by a three-column CSV.

    The CSV columns are renamed to ``image1``, ``image2`` and ``label``; the
    two image columns hold paths relative to ``data_dir``.

    Args:
        csv: Path of the pair CSV.
        data_dir: Directory the image paths are relative to.
        transform: Optional callable applied to each loaded RGB image.
    """

    def __init__(self,csv=None,data_dir=None,transform=None):
        # used to prepare the labels and images path
        self.df=pd.read_csv(csv)
        self.df.columns =["image1","image2","label"]
        self.dir = data_dir
        self.transform = transform

    def _load_image(self, relative_path):
        """Load one image as RGB and apply the transform, closing the file afterwards."""
        # Every '_cropped' occurrence is stripped from the full path
        image_path = os.path.join(self.dir, relative_path).replace('_cropped', '')
        # The context manager releases the file handle; convert() returns an
        # independent in-memory copy, so the image stays usable after closing.
        with Image.open(image_path) as handle:
            image = handle.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image

    def __getitem__(self,index):
        """Return ``(img0, img1, label, image1_path, image2_path)`` for row ``index``.

        ``label`` is a float32 tensor of shape (1,); the two paths are the raw
        CSV values.
        """
        first_path = self.df.iat[index,0]
        second_path = self.df.iat[index,1]
        img0 = self._load_image(first_path)
        img1 = self._load_image(second_path)
        label = torch.from_numpy(np.array([int(self.df.iat[index,2])],dtype=np.float32))
        return img0, img1 , label, first_path, second_path

    def __len__(self):
        """Return the number of pairs."""
        return len(self.df)

# Network

In [None]:
class SiameseNetwork(nn.Module):
    """Siamese network built on a torchvision model.

    Both images go through the same backbone, pooling layer, flatten and fc
    layer. With ``CFG.loss_function == "ContrastiveLoss"`` the two embeddings
    are returned; otherwise their element-wise product is passed through a
    small classification head that outputs one raw logit per pair.

    Args:
        model_name: Name of a constructor in ``torchvision.models``.
        pretrained: Forwarded to the torchvision constructor.

    Raises:
        ValueError: If ``model_name`` is unknown, or if ``CFG.replace_fc`` is
            False and the model's iterable final block has no ``nn.Linear``.
    """

    def __init__(self, model_name="resnet18", pretrained=True):
        super().__init__()
        if model_name not in models.__dict__:
            raise ValueError("No model named {} exists in torchvision.models.".format(model_name))

        # Create a model network
        model = models.__dict__[model_name](pretrained=pretrained, progress=True)
        children = list(model.children())

        # Input width of the model's last module (assumed to be its final Linear layer)
        in_features = list(model.modules())[-1].in_features
        # Everything except the last two children is the feature extractor
        self.backbone = nn.Sequential(*children[:-2])
        self.flatten1 = nn.Flatten()
        self.pooling = children[-2]

        original_head = children[-1]
        if CFG.replace_fc:
            self.fc = nn.Linear(in_features, CFG.embedding_size)
            out_features = self.fc.out_features
        elif hasattr(original_head, '__iter__'):
            # Keep only the Linear layers of a multi-layer head
            classifier = [layer for layer in original_head if isinstance(layer, nn.Linear)]
            if not classifier:
                raise ValueError("The final block of {} contains no nn.Linear layer.".format(model_name))
            self.fc = nn.Sequential(*classifier)
            # nn.Sequential has no out_features attribute; read it from the last Linear
            out_features = classifier[-1].out_features
        else:
            self.fc = original_head
            out_features = self.fc.out_features

        self.cls_head = nn.Sequential(
            nn.Linear(out_features, 128),
            nn.LayerNorm(128),
            nn.ReLU(),

            nn.Linear(128, 1),
        )

    def _embed(self, img):
        """Run one image batch through backbone, pooling, flatten and fc."""
        features = self.backbone(img)
        features = self.pooling(features)
        features = self.flatten1(features)
        return self.fc(features)

    def forward(self, img1, img2):
        """Return ``(emb1, emb2)`` in contrastive mode, else one logit per pair."""
        feat1, feat2 = self.extraction(img1, img2)

        if CFG.loss_function == "ContrastiveLoss":
            return feat1, feat2

        # Combine the embeddings element-wise and score the pair. The output
        # is a raw logit (no sigmoid is applied here).
        return self.cls_head(feat1 * feat2)

    def extraction(self, img1, img2):
        """Return the fc embeddings of both images, without the classification head."""
        return self._embed(img1), self._embed(img2)

# Loss Functions

In [None]:
class ContrastiveLoss(torch.nn.Module):
    """Contrastive loss over pairs of embeddings.

    For label 0 the squared Euclidean distance is penalised; for label 1 the
    squared shortfall of the distance below ``margin`` is penalised. The mean
    over all elements is returned.
    """

    def __init__(self, margin=CFG.margin):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        distance = F.pairwise_distance(output1, output2, keepdim=True)
        # Label 0: pull the embeddings together
        pull_term = (1 - label) * torch.pow(distance, 2)
        # Label 1: push the embeddings at least `margin` apart
        shortfall = torch.clamp(self.margin - distance, min=0.0)
        push_term = label * torch.pow(shortfall, 2)
        return torch.mean(pull_term + push_term)

# Training and Evaluating the Model

In [None]:
def training_one_epoch(train_dataloader,epoch):
    """Train the global ``model`` for one pass over ``train_dataloader``.

    Uses the global ``criterion`` and ``optimizer``.

    Args:
        train_dataloader: Loader yielding ``(img0, img1, label, path0, path1)`` batches.
        epoch: Epoch number, used only for progress and log messages.

    Returns:
        Mean of the batch losses of this epoch.
    """
    model.train()
    progress = tqdm(train_dataloader)
    batch_losses = []
    for img0, img1, label, _, _ in progress:
        img0 = img0.to(CFG.device)
        img1 = img1.to(CFG.device)
        label = label.to(CFG.device)

        # Clear the gradients left over from the previous step
        for parameter in model.parameters():
            parameter.grad = None

        if CFG.loss_function == "ContrastiveLoss":
            emb0, emb1 = model(img0, img1)
            loss = criterion(emb0, emb1, label)
        else:
            loss = criterion(model(img0, img1), label)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        mean_loss = np.mean(batch_losses)
        progress.set_description(f"Epoch [{epoch}/{CFG.EPOCHS}]")
        progress.set_postfix(Loss=mean_loss)

    print("Epoch {}\n Current loss {}".format(epoch,loss.item()))
    print(" Avarage loss {}\n".format(mean_loss))
    return mean_loss

In [None]:
def evaluating_one_epoch(eval_dataloader,epoch):
    """Compute the mean loss of the global ``model`` over ``eval_dataloader``.

    Runs in eval mode under ``torch.no_grad()`` so that no autograd graph is
    built. Uses the global ``criterion``.

    Args:
        eval_dataloader: Loader yielding ``(img0, img1, label, path0, path1)`` batches.
        epoch: Epoch number, used only for progress and log messages.

    Returns:
        Mean of the batch losses (NaN if the loader yields no batches).
    """
    progress = tqdm(eval_dataloader)
    batch_losses = []
    mean_loss = float("nan")
    last_loss = float("nan")
    model.eval()
    # Evaluation never back-propagates, so gradient tracking is switched off
    with torch.no_grad():
        for img0, img1, label, _, _ in progress:
            img0, img1 , label = img0.to(CFG.device), img1.to(CFG.device) , label.to(CFG.device)

            if CFG.loss_function == "ContrastiveLoss":
                emb0,emb1 = model(img0,img1)
                loss = criterion(emb0,emb1,label)
            else:
                output = model(img0,img1)
                loss = criterion(output, label)

            last_loss = loss.item()
            batch_losses.append(last_loss)
            mean_loss = np.mean(batch_losses)
            progress.set_description(f"Epoch [{epoch}/{CFG.EPOCHS}]")
            progress.set_postfix(loss=mean_loss)

    print("Epoch {}\n Eval Current loss {}".format(epoch,last_loss))
    print(" Eval Avarage loss {}\n".format(mean_loss))
    return mean_loss

In [None]:
def testing_one_epoch(test_dataloader,epoch):
    """Score every pair of ``test_dataloader`` with the global ``model``.

    In contrastive mode the score of a pair is the Euclidean distance between
    its embeddings. Otherwise the model's logit is passed through a sigmoid,
    so that ``CFG.threshold`` is compared against a probability rather than a
    raw logit. A pair is classified as 1 when its score is strictly greater
    than ``CFG.threshold``.

    Args:
        test_dataloader: Loader yielding ``(img0, img1, label, path0, path1)`` batches.
        epoch: Epoch number, used only for progress and log messages.

    Returns:
        Tuple ``(mean_loss, probs, y_label, y_pred, results)`` where ``probs``
        is a list of ``[score, label]`` pairs, ``y_label`` the labels,
        ``y_pred`` the scores and ``results`` the thresholded 0/1 predictions.
    """
    model.eval()
    losses=[]
    mean_loss = float("nan")
    last_loss = float("nan")
    threshold_accuracy = 0
    results = []
    probs = []
    y_label = []
    y_pred = []
    loop = tqdm(test_dataloader)
    with torch.no_grad():
        for img0, img1, label, _, _ in loop:
            img0, img1 , label = img0.to(CFG.device), img1.to(CFG.device) , label.to(CFG.device)

            if CFG.loss_function == "ContrastiveLoss":
                emb0,emb1 = model(img0,img1)
                loss = criterion(emb0,emb1,label)
                scores = F.pairwise_distance(emb0,emb1)
            else:
                logits = model(img0,img1)
                loss = criterion(logits, label)
                # The model returns raw logits (the loss is BCEWithLogitsLoss);
                # convert to probabilities before thresholding.
                scores = torch.sigmoid(logits)

            last_loss = loss.item()
            losses.append(last_loss)
            mean_loss = np.mean(losses)

            batch_scores = scores.reshape(-1).tolist()
            batch_labels = label.reshape(-1).tolist()
            for score, target in zip(batch_scores, batch_labels):
                results.append(1 if score > CFG.threshold else 0)
                probs.append([score, target])
                y_label.append(target)
                y_pred.append(score)

            # Accuracy = fraction of predictions that agree with the labels
            # (previously this was the mean of the predictions themselves).
            threshold_accuracy = np.mean(np.array(results) == np.array(y_label))
            loop.set_description(f"Epoch [{epoch}/{CFG.EPOCHS}]")
            loop.set_postfix(loss=mean_loss)

        print("Epoch {}\n Current loss {}".format(epoch,last_loss))
        print(f"Test loss {mean_loss}")
        print(f"Test threshold {CFG.threshold} accuracy: {threshold_accuracy}")
    return mean_loss, probs, y_label, y_pred, results

In [None]:
def dataloader(train_csv, val_csv, test_csv):
    """Build the train, validation and test DataLoaders and preview one test batch.

    Args:
        train_csv: Pair CSV of the training split (images under ``CFG.train_dir``).
        val_csv: Pair CSV of the validation split (images under ``CFG.train_dir``).
        test_csv: Pair CSV of the test set (images under ``CFG.test_dir``).

    Returns:
        Tuple ``(train_dataloader, eval_dataloader, test_dataloader)``.
    """
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # Augmenting pipeline
    augmenting_steps = [
        transforms.Resize((256,256)),
        transforms.RandomPerspective(distortion_scale=0.2, p=0.3),
        transforms.RandomHorizontalFlip(p=0.3),
        transforms.RandomVerticalFlip(p=0.3),
        transforms.CenterCrop(224),
        transforms.RandomRotation(30),
        transforms.ColorJitter(brightness=0.5, contrast=0.1,
                               saturation=0.1, hue=0.1),
        transforms.ToTensor(),
        normalize,
    ]
    train_transforms = transforms.Compose(augmenting_steps)

    # Deterministic pipeline
    test_transform = transforms.Compose([
        transforms.Resize((256,256)),
        transforms.ToTensor(),
        normalize,
    ])

    # Settings shared by the three main loaders
    common = dict(num_workers=CFG.num_workers, batch_size=CFG.BATCH_SIZE, pin_memory=True)

    test_set = SiameseDataset(test_csv, CFG.test_dir, test_transform)
    test_dataloader = DataLoader(test_set, shuffle=False, **common)

    train_set = SiameseDataset(train_csv, CFG.train_dir, train_transforms)
    # NOTE(review): the validation split also uses the random augmentations -- confirm this is intended
    val_set = SiameseDataset(val_csv, CFG.train_dir, train_transforms)

    print("Test Dataset length: ", len(test_set))
    print("Train Dataset length: ",len(train_set))
    print("Val Dataset length: ",len(val_set))

    #Load dataset
    if not CFG.WeightSampler:
        train_dataloader = DataLoader(train_set, shuffle=True, **common)
    else:
        print("Weight Sampling Dataset !!!")
        sampler_weights = labels_to_class_weights_sampler(train_csv)
        print("Sampling Dataset with length: ", len(sampler_weights))
        sampler = WeightedRandomSampler(weights=sampler_weights, num_samples=len(sampler_weights), replacement=True)
        # shuffle must stay False when a sampler is supplied
        train_dataloader = DataLoader(train_set, shuffle=False, sampler=sampler, **common)

    eval_dataloader = DataLoader(val_set, shuffle=False, **common)

    # Show one random batch of eight test pairs together with their labels
    vis_dataloader = DataLoader(test_set, shuffle=True, batch_size=8)
    example_batch = next(iter(vis_dataloader))
    both_sides = torch.cat((example_batch[0], example_batch[1]), 0)
    imshow(torchvision.utils.make_grid(both_sides))
    print(example_batch[2].numpy())
    return train_dataloader, eval_dataloader, test_dataloader

In [None]:
def training_model(epochs= CFG.EPOCHS):
  """Train for ``epochs`` epochs, checkpointing every epoch and the best epoch.

  Uses the global ``model``, ``optimizer``, ``train_dataloader`` and
  ``eval_dataloader``. After each epoch it writes, under ``CFG.output_dir``:
  ``checkpoint epoch{N}.pth.tar``, ``train_loss.pkl`` and ``eval_loss.pkl``,
  plus ``Best Model Checkpoint.pth.tar`` whenever the validation loss improves.

  Args:
    epochs: Number of epochs to run.
  """
  train_loss=[]
  eval_loss=[]
  # Any finite first validation loss counts as an improvement
  best_model_stop = float("inf")
  # Save the best checkpoint next to the other artifacts rather than in the
  # current working directory.
  best_checkpoint = os.path.join(CFG.output_dir, "Best Model Checkpoint.pth.tar")
  for epoch in range(0,epochs):
        trained = training_one_epoch(train_dataloader,epoch)
        filename = os.path.join(CFG.output_dir, "checkpoint epoch{}.pth.tar".format(epoch))
        save_checkpoint(model,optimizer,filename)

        evaluated = evaluating_one_epoch(eval_dataloader,epoch)
        if evaluated < best_model_stop:
            save_checkpoint(model,optimizer,best_checkpoint)
            best_model_stop = evaluated
            print("Save best model with evaluated loss:", best_model_stop)

        train_loss.append(trained)
        eval_loss.append(evaluated)
        # Persist the loss history every epoch so an interrupted run keeps its logs
        save_pkl(train_loss,os.path.join(CFG.output_dir, 'train_loss.pkl'))
        save_pkl(eval_loss,os.path.join(CFG.output_dir, 'eval_loss.pkl'))


In [None]:
def testing_model(epochs= CFG.EPOCHS):
  """Evaluate every per-epoch checkpoint on the test set and write the reports.

  For each epoch the checkpoint ``checkpoint epoch{N}.pth.tar`` is loaded from
  ``CFG.model_path`` and scored with ``testing_one_epoch``. ROC / PR curves,
  the score distribution and the confusion matrix are plotted. Afterwards the
  metric tables are printed, the test loss, accuracy and F1 lists are pickled
  under ``CFG.output_dir``, and ``confusion_matrix.csv`` and ``test_log.csv``
  are written to the current working directory.

  Uses the global ``test_dataloader``.

  Args:
    epochs: Number of checkpoints (epochs) to evaluate.
  """
  confusion_columns = ["Epoch","True Negative","False Positive", "False Negative","True Positive"]
  test_loss =[]
  acc_list = []
  f1_list = []
  confusion_list = []
  log_list = []
  metric_table = PrettyTable(["epoch", "pr_auc", "roc_auc", "f1_score", "accuracy score"])
  confusion_table = PrettyTable(confusion_columns)
  for epoch in range(0,epochs):
      checkpoint = os.path.join(CFG.model_path, "checkpoint epoch{}.pth.tar".format(epoch))
      load_checkpoint(checkpoint)
      tested, probs, y_label, y_pred, y_cls = testing_one_epoch(test_dataloader,epoch)

      evaluate_metrics = Evaluate_Metrics(y_label, y_pred, y_cls)
      tn, fp, fn, tp = evaluate_metrics.confusion_matrix()
      accuracy_score = evaluate_metrics.accuracy_score()
      _, _, _, roc_auc = evaluate_metrics.plot_roc(name = f"ROC_Curve-epoch{epoch}.png")
      _, _, _, pr_auc = evaluate_metrics.plot_pr(name = f"PR_Curve-epoch{epoch}.png")
      f1_score = evaluate_metrics.f1_score()

      # Three-decimal strings are used for display and the CSV log only; the
      # lists that get plotted and pickled keep the numeric values.
      pr_auc_txt, roc_auc_txt, f1_txt, acc_txt = float_format(pr_auc, roc_auc, f1_score, accuracy_score)
      metric_table.add_row([epoch, pr_auc_txt, roc_auc_txt, f1_txt, acc_txt])
      confusion_table.add_row([epoch, tn, fp, fn, tp])
      test_loss.append(tested)

      # NOTE: this file is overwritten every epoch, so it ends up holding the last epoch only
      df = pd.DataFrame(probs, columns = ['dist','label'])
      df.to_csv('result distribution.csv',index=False)
      plot_distribution(probs,epoch)

      print("f1 score: ", f1_txt )
      f1_list.append(float(f1_score))
      print("Accuracy score:",acc_txt)
      acc_list.append(float(accuracy_score))
      confusion_list.append([epoch, tn, fp, fn, tp])
      log_list.append([epoch, pr_auc_txt, roc_auc_txt, f1_txt, acc_txt, tested])

  print(metric_table)
  print(confusion_table)
  train_loss = load_pkl(os.path.join(CFG.model_path,'train_loss.pkl'))
  eval_loss = load_pkl(os.path.join(CFG.model_path,'eval_loss.pkl'))

  save_pkl(test_loss,os.path.join(CFG.output_dir, 'test_loss.pkl'))
  save_pkl(acc_list,os.path.join(CFG.output_dir, 'acc_list.pkl'))
  save_pkl(f1_list,os.path.join(CFG.output_dir, 'f1_score.pkl'))

  # Flat column names (a nested list would create a MultiIndex)
  confusion_df = pd.DataFrame(confusion_list, columns=confusion_columns)
  confusion_df.to_csv("confusion_matrix.csv", index = False)

  log_df = pd.DataFrame(
      log_list,
      columns=["epoch", "pr_auc", "roc_auc", "f1_score", "accuracy score", "test_loss"],
  )
  # The loaded loss histories must contain one entry per evaluated epoch
  log_df["train_loss"] = train_loss
  log_df["eval_loss"] = eval_loss
  log_df.to_csv("test_log.csv", index = False)
  plot_graph(train_loss, eval_loss, test_loss, f1_list)


# Main

In [None]:
# Build the Siamese network selected in CFG, move it to the configured device
# and create its Adam optimizer. `model` and `optimizer` are module-level
# globals that the training, evaluation and checkpoint helpers read directly.
model = SiameseNetwork(CFG.model_name, CFG.pretrained)
model = model.to(CFG.device)
optimizer = torch.optim.Adam(model.parameters(), lr=CFG.learning_rate, weight_decay=CFG.weight_decay)
print(model)

In [None]:
# Optionally enlarge the training CSV with extra pairs before splitting
if CFG.online_pairing:
    CFG.train_csv = increase_data(csv = CFG.train_csv, neg_ratio=CFG.neg_ratio, pos_ratio=CFG.pos_ratio)

# Split into train / validation CSVs and report the label balance of the training split
train_csv, val_csv = GroupKDataset(csv = CFG.train_csv, k = 5)
freq_pos, freq_neg = label_freq(train_csv)

# These loaders are module-level globals used by training_model() / testing_model()
train_dataloader, eval_dataloader, test_dataloader = dataloader(train_csv=train_csv, val_csv = val_csv, test_csv = CFG.test_csv)    

# Select the loss: contrastive loss on embeddings, or BCE-with-logits on the classification head output
if CFG.loss_function == "ContrastiveLoss":    
    criterion =  ContrastiveLoss()
else:    
    criterion = nn.BCEWithLogitsLoss()

# Run the requested phase(s)
if CFG.mode == "Training":
    training_model()
elif CFG.mode == "Inference":
    testing_model()
else:
    # Train then test: both paths are cleared so checkpoints are written to
    # and read back from the current working directory
    CFG.model_path = ""
    CFG.output_dir = ""  
    training_model()
    testing_model()

#reset_all_weights(model)


In [None]:
# Restore the best checkpoint from CFG.model_path and switch to eval mode.
checkpoint = os.path.join(CFG.model_path, "Best Model Checkpoint.pth.tar")
load_checkpoint(checkpoint)
model.eval()

count = 0

# Demo preprocessing: resize to 256x256, convert to a tensor and normalise
# with the standard ImageNet channel statistics.
# (transforms.CenterCrop(224) is disabled.)
transform = transforms.Compose(
    [
        transforms.Resize(size=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Test pairs wrapped in a shuffled, batched loader.
demo_dataset = SiameseDataset(CFG.test_csv, CFG.test_dir, transform=transform)
demo_dataloader = DataLoader(
    demo_dataset,
    batch_size=CFG.BATCH_SIZE,
    shuffle=True,
    num_workers=CFG.num_workers,
    pin_memory=True,
)

In [None]:
# Preview up to `max_pairs` random test pairs: show both images together with
# the model output, embedding distance, cosine similarity and the true label.
max_pairs = 50

# Deliberately NOT named `dataloader`: that name is the helper function the
# Main cell calls, and rebinding it here would break re-running that cell.
pair_loader = DataLoader(
    demo_dataset,
    num_workers=CFG.num_workers,
    batch_size=1,
    shuffle=True,
    pin_memory=True,
)

# Pure inference: no autograd graph is needed, so skip building one.
with torch.no_grad():
    for i, data in enumerate(pair_loader):
        img0, img1, label, _, _ = data
        # Move each image to the device once and reuse it below.
        dev0 = img0.to(CFG.device)
        dev1 = img1.to(CFG.device)

        if CFG.loss_function == "ContrastiveLoss":
            emb0, emb1 = model(dev0, dev1)
            output = F.pairwise_distance(emb0, emb1)
        else:
            output = model(dev0, dev1)

        emb0, emb1 = model.extraction(dev0, dev1)
        distance = F.pairwise_distance(emb0, emb1)
        similarity = F.cosine_similarity(emb0, emb1)

        # batch_size is 1, so the label tensor holds a single value.
        if label.item() == 0:
            label_text = "Same Pair Of Product"
        else:
            label_text = "Different Pair Of Product"

        # Stack the two CPU images along the batch axis and show them as a grid.
        concat = torch.cat((img0, img1), 0)
        imshow(torchvision.utils.make_grid(concat))

        print("Predicted: ", output.item())
        print("Distance: ", distance.item())
        print("Similarity: ", similarity.item())

        print("Actual Label: ", label_text)

        # Stop on the loop index rather than a counter owned by another cell,
        # so re-running this cell always shows the same number of pairs.
        if i + 1 >= max_pairs:
            break

In [None]:
# Collect, for every test pair, both image embeddings (tagged with the leading
# path component of their id) and the element-wise product of the two
# embeddings (tagged with the pair label).
embedding_list = []
embedding_hadamant = []
loop = tqdm(demo_dataloader)

# Inference only: disable autograd so no graph is kept alive per batch.
with torch.no_grad():
    for img0, img1, label, product_id, review_id in loop:
        emb0, emb1 = model.extraction(img0.to(CFG.device), img1.to(CFG.device))
        # One device-to-host transfer per batch instead of several per row.
        emb0 = emb0.detach().cpu().numpy()
        emb1 = emb1.detach().cpu().numpy()
        for e0, e1, pair_label, p, r in zip(emb0, emb1, label, product_id, review_id):
            embedding_list.append([e0, p.split('/')[0]])
            embedding_list.append([e1, r.split('/')[0]])
            embedding_hadamant.append([e0 * e1, pair_label.item()])

# Both frames use the same column names; in df_hadamard the 'id' column holds
# the pair label rather than a product id.
df = pd.DataFrame(embedding_list, columns = ['embedding_list', 'id'])
df_hadamard  = pd.DataFrame(embedding_hadamant, columns = ['embedding_list', 'id'])
# df.to_csv("embedding_df.csv", index = False)
# df_hadamard.to_csv("embedding_hadamard.csv", index = False)

In [None]:
def pca_embedding(df, n_components=3):
    """Project the stored embeddings onto their leading principal components.

    Args:
        df: DataFrame with an ``embedding_list`` column holding one embedding
            array per row.
        n_components: Number of principal components to keep.

    Returns:
        A copy of ``df`` with an ``embed_vis`` column that holds each row's
        projected coordinates as a list. The input frame is left untouched.
    """
    embeddings = df['embedding_list'].to_list()
    # One flat row per embedding; -1 lets NumPy infer the width, so this also
    # works when the embedding width is not CFG.embedding_size.
    flat = np.reshape(embeddings, (len(embeddings), -1))
    vis_dims = PCA(n_components=n_components).fit_transform(flat)
    # Write to a copy: the same frame is also handed to tsne_embedding, and an
    # in-place write would let one projection silently replace the other.
    result = df.copy()
    result["embed_vis"] = vis_dims.tolist()
    return result

def tsne_embedding(df, n_components=3):
    """Embed the stored vectors into a low-dimensional space with t-SNE.

    Args:
        df: DataFrame with an ``embedding_list`` column holding one embedding
            array per row.
        n_components: Dimension of the t-SNE output space.

    Returns:
        A copy of ``df`` with an ``embed_vis`` column that holds each row's
        t-SNE coordinates as a list. The input frame is left untouched.
    """
    embeddings = df['embedding_list'].to_list()
    # One flat row per embedding; -1 lets NumPy infer the width, so this also
    # works when the embedding width is not CFG.embedding_size.
    flat = np.reshape(embeddings, (len(embeddings), -1))
    # Fixed random_state keeps the layout reproducible between runs.
    tsne = TSNE(n_components=n_components, random_state=42)
    vis_tsne = tsne.fit_transform(flat)
    # Write to a copy: the same frame is also handed to pca_embedding, and an
    # in-place write would let one projection silently replace the other.
    result = df.copy()
    result["embed_vis"] = vis_tsne.tolist()
    return result

def visualize_embedding_3d(df, fig_name='pca'):
    """Draw a 3D scatter of the projected embeddings, one trace per id.

    Args:
        df: DataFrame with an ``id`` column and an ``embed_vis`` column whose
            entries are coordinate lists with at least three values.
        fig_name: Prefix of the image file; the figure is written to
            ``<fig_name>-3d.png``.

    Side effects:
        Writes the PNG via ``fig.write_image`` and displays the figure.
    """
    # tolist() turns NumPy scalars into plain Python values and leaves strings
    # as they are; calling .item() on each value failed for string ids.
    cates = df['id'].unique().tolist()

    fig = go.Figure()
    for i, cat in enumerate(cates):
        # Rows of this id, stacked into an (n_points, n_dims) array.
        sub_matrix = np.array(df[df["id"] == cat]["embed_vis"].to_list())
        x = sub_matrix[:, 0]
        y = sub_matrix[:, 1]
        z = sub_matrix[:, 2]

        fig.add_trace(
            go.Scatter3d(
                x=x,
                y=y,
                z=z,
                mode="markers",
                marker=dict(size=5, color=i, colorscale="Viridis", opacity=0.8),
                # Legend entries are text; make the conversion explicit.
                name=str(cat),
            )
        )

    fig.update_layout(
        autosize=False,
        title="3D Scatter Plot of Labels",
        width=800,
        height=500,
        margin=dict(l=50, r=50, b=100, t=100, pad=10),
        scene=dict(
            xaxis=dict(title="x"),
            yaxis=dict(title="y"),
            zaxis=dict(title="z"),
        ),
    )
    fig.write_image(fig_name+'-3d.png')
    fig.show()

def visualize_embedding_2d(df,fig_name='pca'):
    """Draw a 2D scatter of the projected embeddings, one trace per id.

    Args:
        df: DataFrame with an ``id`` column and an ``embed_vis`` column whose
            entries are coordinate lists with at least two values.
        fig_name: Prefix of the image file; the figure is written to
            ``<fig_name>-2d.png``.

    Side effects:
        Writes the PNG via ``fig.write_image`` and displays the figure.
    """
    # tolist() turns NumPy scalars into plain Python values and leaves strings
    # as they are; calling .item() on each value failed for string ids.
    cates = df['id'].unique().tolist()

    fig = go.Figure()
    for i, cat in enumerate(cates):
        # Rows of this id, stacked into an (n_points, n_dims) array.
        sub_matrix = np.array(df[df["id"] == cat]["embed_vis"].to_list())
        x = sub_matrix[:, 0]
        y = sub_matrix[:, 1]
        fig.add_trace(
            go.Scatter(
                x=x,
                y=y,
                mode="markers",
                marker=dict(size=5, color=i, colorscale="Viridis", opacity=0.8),
                # Legend entries are text; make the conversion explicit.
                name=str(cat),
            )
        )
    fig.update_layout(
        title="2D Scatter Plot of Labels",
        xaxis_title='x',
        yaxis_title='y',
    )
    fig.write_image(fig_name+'-2d.png')
    fig.show()

In [None]:
print("PCA VISUALIZE")
# The plots only read the first three principal components, so request just
# those. A larger count is wasted work, and PCA rejects more components than
# there are samples, which breaks the cell on small test sets.
df_pca = pca_embedding(df_hadamard, n_components=3)
visualize_embedding_3d(df_pca)
visualize_embedding_2d(df_pca)

In [None]:
print("T-SNE VISUALIZE")
# Three t-SNE dimensions feed both plots: the 3D view uses all of them, the 2D
# view only the first two.
df_tsne = tsne_embedding(df_hadamard, n_components=3)
for render in (visualize_embedding_3d, visualize_embedding_2d):
    render(df_tsne, fig_name='tsne')