In [1]:
%matplotlib inline
import logging

import os
import random
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
from glob import glob
import wandb

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader, random_split, Dataset
from torchvision import datasets, transforms, models
from torchvision.datasets import ImageFolder
from torchvision.utils import make_grid
from torchvision.transforms import ToTensor, RandomCrop

from tqdm import tqdm
from sklearn.manifold import TSNE
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
import statistics


import torchvision
import umap
from cycler import cycler

import pytorch_metric_learning
import pytorch_metric_learning.utils.logging_presets as logging_presets
from pytorch_metric_learning import losses, miners, samplers, testers, trainers
from pytorch_metric_learning.utils.accuracy_calculator import AccuracyCalculator
from pytorch_metric_learning.distances import CosineSimilarity
from pytorch_metric_learning.utils.inference import InferenceModel, MatchFinder
from pytorch_metric_learning.utils import common_functions as c_f

logging.getLogger().setLevel(logging.INFO)
logging.info("VERSION %s" % pytorch_metric_learning.__version__)

import fuzzymatcher
from fuzzymatcher import link_table, fuzzy_left_join

from scipy import stats


INFO:root:VERSION 2.1.2


In [2]:
path = '/var/scratch/mxiao/data/'
os.chdir(path)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Data load

In [4]:
def remove_tabs(value):
    if isinstance(value, str):
        return value.lstrip('\t')
    return value

In [5]:
df = pd.read_excel(path + 'TINTIN_bibliography_05082023.xlsx') # loaded image information csv and created df
df.head()

Unnamed: 0,Global Region,Country,Publication Date,Document Name,Document Directory,Title,Format,Genre,Style,Language,Original Language
0,\tAfrica,\tAlgeria,2020,The Legend of Tteryel,ee628c6877967dd5e03e59a09a0dccb6,\tThe Legend of Tteryel,\tComic,"\tsupernatural, action",\tmanga,\t,\t
1,\tAfrica,\tAngola,2021,A Resistencia Da Pena,8ded1ce5013fd3b3a019f7d8b499492a,\tA Resistencia Da Pena,\tShort story,\tPolitical commentary,\t,\tPortuguese,\t
2,\tAfrica,\tAngola,2021,Explorando,2bddf2fcafdb960d372dec0322317bbd,\tExplorando,\tShort story,\tSlice of life,\t,\tPortuguese,\t
3,\tAfrica,\tAngola,2021,Identidade Abandonada,453289c004860e8162bdfcb585cffe69,\tIdentidade Abandonada,\tShort story,\tPolitical commentary,\t,\tPortuguese,\t
4,\tAfrica,\tBenin,2015,Teddy Bear,cebc8f67b388c5e271e007ef340dbd84,\tTeddy Bear,\tGraphic Novel,\tSlice of life,\tmanga,\t,\t


In [6]:
namel = list()
subfolder = list()
my_folder = 'model/train' 

for root, dirs, files in os.walk(my_folder):
    for name in dirs:
        for root, dirs, files in os.walk(os.path.join(my_folder,name)):
            for filename in files:
                namel.append(filename)
                subfolder.append(root)

df1 = pd.DataFrame(zip(namel, subfolder), columns = ['name', 'subfolder']) # generated dataframe df1 which contains images' name and comics subfolder name

In [7]:
df1

Unnamed: 0,name,subfolder
0,1986_1.png,model/train/1986
1,1986_2.png,model/train/1986
2,1986_3.png,model/train/1986
3,1986_4.png,model/train/1986
4,1986_7.png,model/train/1986
...,...,...
35637,Zorro_66.png,model/train/Zorro
35638,Zorro_67.png,model/train/Zorro
35639,Zorro_68.png,model/train/Zorro
35640,Zorro_69.png,model/train/Zorro


In [8]:
col_list = ['Document Name','Global Region','Country','Publication Date',]
df_info = df[col_list].drop_duplicates()
df_info.rename({'Document Name': 'document_name'}, axis=1, inplace=True) # preprocess(subset&rename) comic information df
df1['subfolder_name'] = df1.subfolder.str.split('/', expand=True)[2]  # preprocess 

In [9]:
df_final = fuzzymatcher.link_table(df1, df_info, 'subfolder_name', 'document_name') # fuzzy join 2 tables using comic's names, since comic on both files are not exactly the same
df_final = df_final.reset_index(drop=True)

In [10]:
df_final1 = df_final[df_final.match_rank == 1] #fuzzy match creates multiple join, here only subset the top match result

In [11]:
fuzzy_match_pairs = df_final1[['subfolder_name','document_name']].drop_duplicates() 

In [12]:
df1_match = pd.merge(df1,fuzzy_match_pairs, left_on='subfolder_name', right_on='subfolder_name', how='inner')

In [13]:
df1_info_fuzzy_match = pd.merge(df1_match,df_info, left_on='document_name', right_on='document_name', how='left')

In [14]:
def strip_tabs(value):
    if isinstance(value, str):
        return value.strip('\t').strip(' ')
    return value

In [15]:
df1_info_fuzzy_match = df1_info_fuzzy_match.applymap(strip_tabs)

In [16]:
# Define the custom bars
custom_bars = [1930, 1990, 2000, 2010, 2020, 2023]

# Create bins based on column values and custom bars
#  pd.cut(df1_info_fuzzy_match['Values'], bins=custom_bars)
df1_info_fuzzy_match['Date_EFB'] = pd.cut(df1_info_fuzzy_match['Publication Date'],bins=custom_bars)

In [17]:
def class_map(df, new_class_col):
    class_map_df = df[['subfolder_name',new_class_col]].drop_duplicates()
    class_map_dict =  dict(zip(class_map_df['subfolder_name'], class_map_df[new_class_col]))
    return class_map_dict

In [18]:
year_class_map_dict = class_map(df1_info_fuzzy_match, 'Date_EFB')
country_class_map_dict = class_map(df1_info_fuzzy_match, 'Country')
region_class_map_dict = class_map(df1_info_fuzzy_match, 'Global Region')

In [19]:
# Set the image transforms
normalize = transforms.Normalize(mean=[0.6195012,0.6195012,0.6195012], std=[0.3307451,0.3307451,0.3307451])
# normalize = transforms.Normalize(mean=[0.53997546,0.53997546,0.53997546], std=[0.36844322,0.36844322,0.36844322])

train_transform = transforms.Compose([
#         transforms.RandomRotation(10),      # rotate +/- 10 degrees
#         transforms.RandomHorizontalFlip(),  # reverse 50% of images
#         transforms.RandomVerticalFlip(p=0.5),
#         transforms.Resize(224),             # resize shortest side to 224 pixels
#         transforms.CenterCrop(224),         # crop longest side to 224 pixels at center
        transforms.RandomCrop(size=(224,224),pad_if_needed=True), 
        transforms.ToTensor(),
        normalize
    ])

test_transform = transforms.Compose([
        transforms.RandomCrop((224,224),pad_if_needed=True),
        transforms.ToTensor(),
        normalize
    ])

In [20]:
excluded_subfolders = [] #['Sidequest', 'Stand Still Stay Silent']  # Subfolders to be skipped

class CustomImageFolder(ImageFolder):
    def __init__(self, root, transform=None, target_transform=None, class_mapping=None):
        self.valid_classes = [subfolder for subfolder in os.listdir(root) if subfolder not in excluded_subfolders]
        super().__init__(root, transform=transform, target_transform=target_transform)
        self.classes = [class_mapping[subfolder] for subfolder in self.valid_classes]
        self.class_mapping = class_mapping

    def __getitem__(self, index):
        original_tuple = super().__getitem__(index)
        path, _ = self.samples[index]
        folder_name = os.path.basename(os.path.dirname(path))
        
        if folder_name in excluded_subfolders:
            pass
#             raise RuntimeError(f"Skipping images from subfolder: {folder_name}")
            
        
        new_target = self.class_mapping[folder_name]
        # Extract unique values from the original dictionary
        unique_values = list(set(self.class_mapping.values()))

        # Sort the unique values
        sorted_values = sorted(unique_values)

        # Generate the new dictionary with unique and sorted values as keys
        year_class_mapping = {value: index for index, value in enumerate(sorted_values)}
        new_target_idx = year_class_mapping[new_target]
        return original_tuple[0], new_target_idx #, new_target, folder_name

In [21]:
unique_values = list(set(year_class_map_dict.values()))
# Sort the unique values
sorted_values = sorted(unique_values)
# Generate the new dictionary with unique and sorted values as keys
year_class_mapping = {index: value for index, value in enumerate(sorted_values)}

In [125]:
region_class_map_dict

{'1986': 'Europe',
 '13th Boy': 'East Asia',
 '1714 Baluarte': 'Europe',
 '14 days in the desert': 'East Asia',
 '26 Needled Girl': 'West Asia',
 '40 Years of Silence': 'West Asia',
 '5-nen Hibarigumi': 'East Asia',
 '6People - Alti Adam 1': 'Central Asia',
 '6People - Alti Adam 2': 'Central Asia',
 '6People - Alti Adam 3': 'Central Asia',
 '6People - Alti Adam 4': 'Central Asia',
 '6People - Alti Adam 5': 'Central Asia',
 '6People - Alti Adam 6': 'Central Asia',
 '6People - Alti Adam 7': 'Central Asia',
 '6People - Alti Adam  8': 'Central Asia',
 'Abena': 'Africa',
 'Abisul Clujulu': 'Europe',
 'Ace Hart in Men of Rubber': 'Europe',
 'Action-Truyen': 'Southeast Asia',
 'A Deal With Lucifer': 'Southeast Asia',
 'Adventures of Amitabh Bachchan-The Lost Idol': 'South Asia',
 'Adventures of Bongoman': 'Africa',
 'After Yesterday': 'Europe',
 'Aganza': 'Africa',
 'Age of Bronze': 'North America',
 'Age of Ivory': 'Africa',
 'AGGA': 'Southeast Asia',
 'Aguia Negra': 'South America',
 'Ah Ah

In [43]:
# train_dataset = CustomImageFolder(root="model/train", transform=train_transform, class_mapping = region_class_map_dict)
# val_dataset = CustomImageFolder(root="model/val", transform=test_transform, class_mapping = region_class_map_dict)
test_dataset = CustomImageFolder(root="model/test", transform=test_transform, class_mapping = year_class_map_dict)

In [44]:
# valid_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

### Pretrain embedding

In [25]:
import faiss


In [26]:
# Efficientnet
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_name('efficientnet-b0',include_top=False).to(device)

In [27]:
model

EfficientNet(
  (_conv_stem): Conv2dStaticSamePadding(
    3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
    (static_padding): ZeroPad2d((0, 1, 0, 1))
  )
  (_bn0): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  (_blocks): ModuleList(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d((1, 1, 1, 1))
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
        32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False
    

In [28]:
import torch
torch.cuda.empty_cache()

In [29]:
def create_faiss_index(feature_dim):
    res = faiss.StandardGpuResources()
    index = faiss.IndexFlatL2(feature_dim)
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
    return gpu_index

In [30]:
def create_embeddings_and_index(model, dataloader, index):
    model.eval()
    with torch.no_grad():  # No need to track gradients
        for i, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            embeddings = model(inputs)
            index.add(embeddings.cpu().numpy())
    return index


In [45]:
model = model.to(device)
model.eval()
batch_size = 32
embeddings = []
labels = []

with torch.no_grad():
    for batch in test_dataloader:
        inputs, targets = batch
        inputs = inputs.to(device)
        outputs = model(inputs)
        embeddings.append(outputs.cpu().numpy().reshape(outputs.shape[0], -1))  # Corrected

#         embeddings.append(outputs.cpu().numpy())
        labels.append(targets.numpy())

embeddings = np.vstack(embeddings)
# labels = np.concatenate(labels)

In [46]:
labels = np.hstack(labels)

In [47]:
embeddings.shape

(17958, 1280)

In [48]:
dimension = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)
index = faiss.index_cpu_to_all_gpus(index)  # Make use of all GPUs

In [49]:
index.add(embeddings)

In [50]:
# Now you can use the index for k-NN search
k = 5
D, I = index.search(embeddings, k)  # sanity check
print(I[:5])  # Nearest neighbors of the first 5 queries
# print(D[:5])  # Distances to the nearest neighbors of the first 5 queries

[[    0  4930  9724  7643  3665]
 [    1  5426 17765 16269  1454]
 [    2 10448 13509   691  2452]
 [    3 13930  6595 16135 11414]
 [    4  1186 10416   349 10591]]


In [51]:
I[:5][:,:]

array([[    0,  4930,  9724,  7643,  3665],
       [    1,  5426, 17765, 16269,  1454],
       [    2, 10448, 13509,   691,  2452],
       [    3, 13930,  6595, 16135, 11414],
       [    4,  1186, 10416,   349, 10591]])

In [52]:
neighbor_labels = labels[I]

In [53]:
neighbor_labels[:5]

array([[3, 3, 1, 3, 3],
       [3, 3, 3, 4, 3],
       [3, 3, 2, 4, 3],
       [3, 4, 3, 4, 3],
       [3, 2, 3, 3, 3]])

In [54]:
predicted_labels = stats.mode(neighbor_labels, axis=1)[0]

In [55]:
from sklearn import metrics

# Assuming you have the true labels (y_true) and predicted labels (y_pred)
# y_true: true labels of shape (n_samples,)
# y_pred: predicted labels of shape (n_samples,)
y_pred = predicted_labels.reshape(-1)
y_true = labels
# Calculate accuracy
accuracy = metrics.accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

# Calculate precision, recall, and F1-score for each class
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
f1_score = metrics.f1_score(y_true, y_pred, average=None)

# Print precision, recall, and F1-score for each class
for i in range(len(precision)):
    print("Class {}: Precision={}, Recall={}, F1-score={}".format(i, precision[i], recall[i], f1_score[i]))

# Calculate macro-average precision, recall, and F1-score
macro_precision = metrics.precision_score(y_true, y_pred, average='macro')
macro_recall = metrics.recall_score(y_true, y_pred, average='macro')
macro_f1_score = metrics.f1_score(y_true, y_pred, average='macro')
print("Macro-average Precision:", macro_precision)
print("Macro-average Recall:", macro_recall)
print("Macro-average F1-score:", macro_f1_score)

# Calculate weighted-average precision, recall, and F1-score
weighted_precision = metrics.precision_score(y_true, y_pred, average='weighted')
weighted_recall = metrics.recall_score(y_true, y_pred, average='weighted')
weighted_f1_score = metrics.f1_score(y_true, y_pred, average='weighted')
print("Weighted-average Precision:", weighted_precision)
print("Weighted-average Recall:", weighted_recall)
print("Weighted-average F1-score:", weighted_f1_score)


Accuracy: 0.6070275086312507
Class 0: Precision=0.546242774566474, Recall=0.4184887904788265, F1-score=0.47390691114245415
Class 1: Precision=0.4638009049773756, Recall=0.2122153209109731, F1-score=0.2911931818181818
Class 2: Precision=0.4306083650190114, Recall=0.24539544962080173, F1-score=0.31262939958592134
Class 3: Precision=0.6426800871328776, Recall=0.8799753162604135, F1-score=0.7428372981420386
Class 4: Precision=0.45691906005221933, Recall=0.09668508287292818, F1-score=0.15959872321021432
Macro-average Precision: 0.5080502383495916
Macro-average Recall: 0.37055199202878863
Macro-average F1-score: 0.396033102779762
Weighted-average Precision: 0.5731324115396701
Weighted-average Recall: 0.6070275086312507
Weighted-average F1-score: 0.5614273802247016


In [42]:
from sklearn import metrics

# Assuming you have the true labels (y_true) and predicted labels (y_pred)
# y_true: true labels of shape (n_samples,)
# y_pred: predicted labels of shape (n_samples,)
y_pred = predicted_labels.reshape(-1)
y_true = labels
# Calculate accuracy
accuracy = metrics.accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

# Calculate precision, recall, and F1-score for each class
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
f1_score = metrics.f1_score(y_true, y_pred, average=None)

# Print precision, recall, and F1-score for each class
for i in range(len(precision)):
    print("Class {}: Precision={}, Recall={}, F1-score={}".format(i, precision[i], recall[i], f1_score[i]))

# Calculate macro-average precision, recall, and F1-score
macro_precision = metrics.precision_score(y_true, y_pred, average='macro')
macro_recall = metrics.recall_score(y_true, y_pred, average='macro')
macro_f1_score = metrics.f1_score(y_true, y_pred, average='macro')
print("Macro-average Precision:", macro_precision)
print("Macro-average Recall:", macro_recall)
print("Macro-average F1-score:", macro_f1_score)

# Calculate weighted-average precision, recall, and F1-score
weighted_precision = metrics.precision_score(y_true, y_pred, average='weighted')
weighted_recall = metrics.recall_score(y_true, y_pred, average='weighted')
weighted_f1_score = metrics.f1_score(y_true, y_pred, average='weighted')
print("Weighted-average Precision:", weighted_precision)
print("Weighted-average Recall:", weighted_recall)
print("Weighted-average F1-score:", weighted_f1_score)


Accuracy: 0.46759104577347144
Class 0: Precision=0.32655551037006914, Recall=0.46550724637681157, F1-score=0.38384321223709367
Class 1: Precision=0.2727272727272727, Recall=0.18032786885245902, F1-score=0.21710526315789475
Class 2: Precision=0.3630246715796219, Recall=0.5415869980879541, F1-score=0.4346825244580856
Class 3: Precision=0.5628651806968189, Recall=0.7392354696603667, F1-score=0.6391055961668407
Class 4: Precision=0.42023346303501946, Recall=0.13989637305699482, F1-score=0.20991253644314867
Class 5: Precision=0.4088397790055249, Recall=0.09622886866059818, F1-score=0.15578947368421053
Class 6: Precision=0.4578313253012048, Recall=0.11746522411128284, F1-score=0.18696186961869618
Class 7: Precision=0.4439918533604888, Recall=0.16232315711094564, F1-score=0.23773173391494004
Class 8: Precision=0.43154761904761907, Recall=0.22979397781299524, F1-score=0.2998965873836608
Class 9: Precision=0.35799086757990867, Recall=0.2530664945125888, F1-score=0.29652042360060515
Class 10: Pr