In [None]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import shutil
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as tt
import json
import plotly.express as px
import plotly.subplots
import plotly.graph_objs as go
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.utils import make_grid
from torchvision.utils import save_image
from tqdm.notebook import tqdm
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
import ast
from utils.plots import *
from utils.train import *

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the experiment log (one JSON record per line) and pick the compute device.
df = pd.read_json('logs.json', lines=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
"""train_X, train_Y = get_data("train") 
test_X, test_Y = get_data('test')
data = train_X, train_Y, test_X, test_Y"""

In [None]:
def short_title(title):
    """Return a compact form of ``title`` for legends and plot titles.

    Args:
        title: Any value. Its string form decides whether it is shortened.

    Returns:
        ``title`` itself (same object, same type) when its string form has at
        most 20 characters. Otherwise a ``str``: the text before the first
        space, without its first character.
    """
    text = str(title)
    if len(text) <= 20:
        return title
    # Split the string form rather than `title` itself, so long non-string
    # values (numbers, objects) are shortened instead of raising AttributeError.
    first_token = text.split(' ')[0]
    # NOTE(review): dropping the leading character is kept from the original
    # implementation; confirm it is intended for the labels used here.
    return first_token[1:]

In [None]:
# Show the `train_losses_epoches` entry of the first logged row.
df.iloc[0]['train_losses_epoches']

In [None]:
# Pretrained model names handed to load_models_gen in the cells below.
model_list = [
    'xlm-roberta-base',
    'xlm-roberta-large',
    'bert-base-uncased',
    'roberta-base',
    'roberta-large',
]

In [None]:
def plot(df, exp_title, column_to_check, exp_list=False):
    """Show the loss curves and execution times of every run of one experiment.

    Selects the rows of ``df`` whose ``exp_title`` column equals ``exp_title``
    and displays two figures: a 2x2 grid of line plots (one panel per loss
    column, one trace per selected row) and a bar chart of ``execution_time``.

    Args:
        df: Log frame. The columns read here are ``exp_title``,
            ``pretrained_model_name``, ``model_type``, ``execution_time``,
            ``column_to_check`` and the four loss columns listed in ``columns``.
            ``df`` itself is not modified.
        exp_title: Experiment name used to select rows.
        column_to_check: Column whose stringified value labels each run when
            ``exp_list`` is falsy.
        exp_list: Optional sequence of labels, one per selected row in row
            order. When truthy it replaces the ``column_to_check`` labels.

    Returns:
        None. Both figures are rendered with ``.show()``.
    """
    columns = ['train_losses_avg', 'valid_losses_avg', 'train_losses_epoches', 'valid_losses_epoches']
    n_cols = 2
    n_rows = (len(columns) + n_cols - 1) // n_cols

    # Work on a copy: the column assignments below must not target a slice of `df`.
    filtered_df = df[df['exp_title'] == exp_title].copy()
    filtered_df[column_to_check] = filtered_df[column_to_check].astype(str)

    # Only as many rows as there are panels, so no empty subplots are drawn.
    fig = plotly.subplots.make_subplots(rows=n_rows,
                                        cols=n_cols,
                                        subplot_titles=columns)

    # One colour per run; cycling with a modulo keeps the lookup valid for any number of runs.
    if len(filtered_df) > 10:
        palette = px.colors.qualitative.Alphabet
    else:
        palette = px.colors.qualitative.Plotly

    for i, column in enumerate(columns):
        # Axis titles and traces must address the same subplot cell.
        subplot_row = i // n_cols + 1
        subplot_col = i % n_cols + 1
        fig.update_xaxes(title_text='Epoch', row=subplot_row, col=subplot_col)
        fig.update_yaxes(title_text=column, row=subplot_row, col=subplot_col)

        for index, (_, row) in enumerate(filtered_df.iterrows()):
            y_values = row[column]
            if isinstance(y_values, str):
                # Loss series stored as text are parsed back into Python lists.
                y_values = ast.literal_eval(y_values)

            model_label = row['pretrained_model_name'] + row['model_type']
            if exp_list:
                run_label = exp_list[index]
                trace_name = f'{short_title(run_label)}'
            else:
                run_label = row[column_to_check]
                trace_name = f"{model_label} {short_title(run_label)}"

            # x spans the parsed series, not the raw cell (which may be a string).
            fig.add_trace(go.Scatter(x=list(range(len(y_values))),
                                     y=y_values,
                                     mode='lines',
                                     name=trace_name,
                                     line=dict(color=palette[index % len(palette)]),
                                     legendgroup=f'{model_label}_{run_label}'),
                          row=subplot_row,
                          col=subplot_col)

    # Each run appears once per panel; keep a single legend entry per trace name.
    seen_names = set()

    def _hide_repeated_legend_entry(trace):
        if trace.name in seen_names:
            trace.update(showlegend=False)
        else:
            seen_names.add(trace.name)

    fig.for_each_trace(_hide_repeated_legend_entry)

    fig.update_layout(
        title_text=f'Exp Title: {exp_title}',
        width=1000,
        height=1000
    )

    if exp_list:
        filtered_df['true_labels'] = exp_list
        bar_fig = px.bar(filtered_df, x='true_labels', y='execution_time', color='pretrained_model_name', title=f'Time {short_title(column_to_check)}')
    else:
        bar_fig = px.bar(filtered_df, x=column_to_check, y='execution_time',facet_col='pretrained_model_name', facet_col_wrap=1, title=f'Time {short_title(column_to_check)}')

    fig.show()
    bar_fig.show()

In [None]:
# Re-import utils.plots so edits to the module on disk are picked up by the running kernel.
from importlib import reload
import utils.plots

reload(utils.plots)
# NOTE(review): the star-import rebinds every public name of utils.plots; if that module
# defines `plot` or `short_title`, the notebook-level definitions are replaced here — confirm.
from utils.plots import *

# Test lr_stable

In [None]:
exp_title = 'test_lr_stable'
tmp_df = df[df['exp_title'] == exp_title]

# One label per run: "<pretrained_model_name>_<model_type>_<lr>".
plots_names = [
    f'{pretrained_name}_{model_name}_{lr}'
    for pretrained_name, model_name, lr in tmp_df[['pretrained_model_name', 'model_type', 'lr']].values
]
# Pair every label with the weighted-average F1 stored in that run's report.
scores = [
    (label, report['weighted avg']['f1-score'])
    for label, report in zip(plots_names, tmp_df['report'])
]
# Per-run settings, in the same row order as the labels.
max_lengths, paddings, truncations, batch_sizes = (
    list(tmp_df[column])
    for column in ('max_length', 'padding', 'truncation', 'batch_size')
)


## Plot

In [None]:
# Loss-curve grid and execution-time bars for the runs of `exp_title`, labelled by their `lr` value.
plot(df, exp_title, 'lr')

## PCA

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=0, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    models_gen=generator,
    device=device,
    transform_model=model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

## TSNE 2

In [None]:
# t-SNE (2 components, random_state=0) latent plot for this experiment; rep_idx=2, test=False.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = TSNE(n_components=2, random_state=0)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=False,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=2,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)


# Test padding

In [None]:
exp_title = 'test_padding'
tmp_df = df[df['exp_title'] == exp_title]

# One label per run: "<pretrained_model_name>_<model_type>_<padding>".
plots_names = [
    f'{pretrained_name}_{model_name}_{padding}'
    for pretrained_name, model_name, padding in tmp_df[['pretrained_model_name', 'model_type', 'padding']].values
]
# Pair every label with the weighted-average F1 stored in that run's report.
scores = [
    (label, report['weighted avg']['f1-score'])
    for label, report in zip(plots_names, tmp_df['report'])
]
# Per-run settings, in the same row order as the labels.
max_lengths, paddings, truncations, batch_sizes = (
    list(tmp_df[column])
    for column in ('max_length', 'padding', 'truncation', 'batch_size')
)

In [None]:
# Display the run labels built in the previous cell.
plots_names

## Plot

In [None]:
# Loss-curve grid and execution-time bars for the runs of `exp_title`, labelled by their `padding` value.
plot(df, exp_title, 'padding')

## PCA

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=0, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

## TSNE

In [None]:
# t-SNE (2 components, random_state=0) latent plot for this experiment; rep_idx=0, test=False.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = TSNE(n_components=2, random_state=0)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=False,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)


# Test max_len

In [None]:
exp_title = 'test_len'
tmp_df = df[df['exp_title'] == exp_title]

# One label per run: "<pretrained_model_name>_<model_type>_<max_length>".
plots_names = [
    f'{pretrained_name}_{model_name}_{max_length}'
    for pretrained_name, model_name, max_length in tmp_df[['pretrained_model_name', 'model_type', 'max_length']].values
]
# Pair every label with the weighted-average F1 stored in that run's report.
scores = [
    (label, report['weighted avg']['f1-score'])
    for label, report in zip(plots_names, tmp_df['report'])
]
# Per-run settings, in the same row order as the labels.
max_lengths, paddings, truncations, batch_sizes = (
    list(tmp_df[column])
    for column in ('max_length', 'padding', 'truncation', 'batch_size')
)


## Plot

In [None]:
# Loss-curve grid and execution-time bars for the runs of `exp_title`, labelled by their `max_length` value.
plot(df, exp_title, 'max_length')

## PCA

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=0, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

## TSNE

In [None]:
# t-SNE (2 components, random_state=0) latent plot for this experiment; rep_idx=0, test=False.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = TSNE(n_components=2, random_state=0)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=False,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)


# Test extensions

In [None]:
exp_title = 'test_ext3'
tmp_df = df[df['exp_title'] == exp_title]

# One label per run: "<pretrained_model_name>_<model_type>".
plots_names = [
    f'{pretrained_name}_{model_name}'
    for pretrained_name, model_name in tmp_df[['pretrained_model_name', 'model_type']].values
]
# Pair every label with the weighted-average F1 stored in that run's report.
scores = [
    (label, report['weighted avg']['f1-score'])
    for label, report in zip(plots_names, tmp_df['report'])
]
# Per-run settings, in the same row order as the labels.
max_lengths, paddings, truncations, batch_sizes = (
    list(tmp_df[column])
    for column in ('max_length', 'padding', 'truncation', 'batch_size')
)


## Plot

In [None]:
# Loss-curve grid and execution-time bars for the runs of `exp_title`, labelled by their `model_type` value.
plot(df, exp_title, 'model_type')

## PCA

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=2, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=2,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=1, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=1,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=0, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

## TSNE

In [None]:
# t-SNE (2 components, random_state=0) latent plot for this experiment; rep_idx=0, test=False.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = TSNE(n_components=2, random_state=0)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=False,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)


# Test optimizer

In [None]:
exp_title = 'test_optimizer'
tmp_df = df[df['exp_title'] == exp_title]

# One label per run: "<pretrained_model_name>_<model_type>_<first word of the optimizer text>".
plots_names = [
    f'{pretrained_name}_{model_name}_{optimizer.split(" ")[0]}'
    for pretrained_name, model_name, optimizer in tmp_df[['pretrained_model_name', 'model_type', 'optimizer']].values
]
# Pair every label with the weighted-average F1 stored in that run's report.
scores = [
    (label, report['weighted avg']['f1-score'])
    for label, report in zip(plots_names, tmp_df['report'])
]
# Per-run settings, in the same row order as the labels.
max_lengths, paddings, truncations, batch_sizes = (
    list(tmp_df[column])
    for column in ('max_length', 'padding', 'truncation', 'batch_size')
)


## Plot

In [None]:
# Loss-curve grid and execution-time bars for the runs of `exp_title`, labelled by their `optimizer` value.
plot(df, exp_title, 'optimizer')

## PCA

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=0, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

## TSNE

In [None]:
# t-SNE (2 components, random_state=0) latent plot for this experiment; rep_idx=0, test=False.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = TSNE(n_components=2, random_state=0)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=False,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)


# Test batch

In [None]:
exp_title = 'test_batch'
tmp_df = df[df['exp_title'] == exp_title]

# One label per run: "<pretrained_model_name>_<model_type>_<batch_size>".
plots_names = [
    f'{pretrained_name}_{model_name}_{batch_size}'
    for pretrained_name, model_name, batch_size in tmp_df[['pretrained_model_name', 'model_type', 'batch_size']].values
]
# Pair every label with the weighted-average F1 stored in that run's report.
scores = [
    (label, report['weighted avg']['f1-score'])
    for label, report in zip(plots_names, tmp_df['report'])
]
# Per-run settings, in the same row order as the labels.
max_lengths, paddings, truncations, batch_sizes = (
    list(tmp_df[column])
    for column in ('max_length', 'padding', 'truncation', 'batch_size')
)


## Plot

In [None]:
# Loss-curve grid and execution-time bars for the runs of `exp_title`, labelled by their `batch_size` value.
plot(df, exp_title, 'batch_size')

## PCA

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=0, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

## TSNE

In [None]:
# t-SNE (2 components, random_state=0) latent plot for this experiment; rep_idx=0, test=False.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = TSNE(n_components=2, random_state=0)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=False,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)


# Test epochs

In [None]:
exp_title = 'test_epochs'
tmp_df = df[df['exp_title'] == exp_title]

# One label per run: "<pretrained_model_name>_<model_type>_<epochs>".
plots_names = [
    f'{pretrained_name}_{model_name}_{epochs}'
    for pretrained_name, model_name, epochs in tmp_df[['pretrained_model_name', 'model_type', 'epochs']].values
]
# Pair every label with the weighted-average F1 stored in that run's report.
scores = [
    (label, report['weighted avg']['f1-score'])
    for label, report in zip(plots_names, tmp_df['report'])
]
# Per-run settings, in the same row order as the labels.
max_lengths, paddings, truncations, batch_sizes = (
    list(tmp_df[column])
    for column in ('max_length', 'padding', 'truncation', 'batch_size')
)


## Plot

In [None]:
# Loss-curve grid and execution-time bars for the runs of `exp_title`, labelled by their `epochs` value.
plot(df, exp_title, 'epochs')

## PCA

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=0, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

## TSNE

In [None]:
# t-SNE (2 components, random_state=0) latent plot for this experiment; rep_idx=0, test=False.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = TSNE(n_components=2, random_state=0)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=False,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)


# Test lr_unstable

In [None]:
exp_title = 'test_lr_unstable'
tmp_df = df[df['exp_title'] == exp_title]

# One label per run: "<pretrained_model_name>_<model_type>_<lr>".
plots_names = [
    f'{pretrained_name}_{model_name}_{lr}'
    for pretrained_name, model_name, lr in tmp_df[['pretrained_model_name', 'model_type', 'lr']].values
]
# Pair every label with the weighted-average F1 stored in that run's report.
scores = [
    (label, report['weighted avg']['f1-score'])
    for label, report in zip(plots_names, tmp_df['report'])
]
# Per-run settings, in the same row order as the labels.
max_lengths, paddings, truncations, batch_sizes = (
    list(tmp_df[column])
    for column in ('max_length', 'padding', 'truncation', 'batch_size')
)


## Plot

In [None]:
# Loss-curve grid and execution-time bars for the runs of `exp_title`, labelled by their `lr` value.
plot(df, exp_title, 'lr')

## PCA

In [None]:
# PCA (2 components) latent plot for this experiment; rep_idx=0, test=True.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = PCA(n_components=2)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=True,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)

## TSNE

In [None]:
# t-SNE (2 components, random_state=0) latent plot for this experiment; rep_idx=0, test=False.
# load_models_gen / plot_latent_multi_gen presumably come from the utils star-imports.
model_transform = TSNE(n_components=2, random_state=0)
generator = load_models_gen(exp_title, model_list, device)

plot_latent_multi_gen(
    generator,
    device,
    model_transform,
    test=False,
    max_lengths=max_lengths,
    paddings=paddings,
    truncations=truncations,
    batch_sizes=batch_sizes,
    rep_idx=0,
    true_scores=scores,
    plots_names=plots_names,
    num_batches=10,
)
