In [1]:
# Library imports
import torch
import os
import torch.nn as nn
from torch.utils.data import DataLoader as TorchDataLoader
import pandas as pd

# Our imports
from DL_vs_HateSpeech.loading_data.dataloader import DataLoader
from DL_vs_HateSpeech.training.training import collate_fn
from DL_vs_HateSpeech.utils import check_frozen_params
from DL_vs_HateSpeech.models.utils import load_model_from_path
from DL_vs_HateSpeech.evaluation.evaluate import evaluate


# Configuration shared by every evaluation in this notebook
DATA_SUBSET = "us_pol"
BATCH_SIZE = 1


# Build the train/test datasets first, then wrap each one in a torch loader.
# Order is kept fixed (shuffle=False) and batches are assembled by collate_fn.
train_dataset, test_dataset = (
    DataLoader(type=split, subset=DATA_SUBSET) for split in ("train", "test")
)
train_loader, test_loader = (
    TorchDataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    for dataset in (train_dataset, test_dataset)
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to evaluate all the models
def create_metric_df(path, data_loader=None, device="cpu"):
    """
    Create a DataFrame with the metrics of all models in the given path.

    Every ``*.pth`` file directly inside ``path`` is loaded with
    ``load_model_from_path``, switched to eval mode, passed to
    ``check_frozen_params`` and scored with ``evaluate`` using a
    ``BCEWithLogitsLoss`` criterion. One row per checkpoint is collected, in
    sorted file-name order, and the table is also written to
    ``<path>/model_metrics.csv`` (without the index).

    Args:
        path: Directory containing the checkpoints; the CSV is saved there too.
        data_loader: Loader handed to ``evaluate``. When omitted, the
            notebook-level ``test_loader`` is used.
        device: Device string forwarded to ``load_model_from_path`` and
            ``evaluate``. Defaults to ``"cpu"``.

    Returns:
        pandas.DataFrame with the columns ``accuracy``, ``f1_score_0``,
        ``f1_score_1`` and ``avg_loss``; empty (header only) when the
        directory holds no ``.pth`` file.
    """
    if data_loader is None:
        # Fall back to the loader built at the top of the notebook.
        data_loader = test_loader

    columns = ["accuracy", "f1_score_0", "f1_score_1", "avg_loss"]

    # os.listdir returns entries in arbitrary order; sort so the rows of the
    # CSV come out in the same order on every run and every machine.
    model_files = sorted(f for f in os.listdir(path) if f.endswith('.pth'))

    # The criterion holds no per-model state, so one instance serves all models.
    criterion = nn.BCEWithLogitsLoss()

    records = []
    for file_name in model_files:
        model = load_model_from_path(path, file_name=file_name, device=device)
        model.eval()

        # Check how many parameters are frozen
        check_frozen_params(model)

        # Evaluate the model
        avg_loss_test, accuracy_test, f1_test = evaluate(model, data_loader, criterion, device=device)

        # Store plain floats so the resulting columns are numeric.
        row = {
            "accuracy": float(accuracy_test),
            "f1_score_0": float(f1_test[0]),
            "f1_score_1": float(f1_test[1]),
            "avg_loss": float(avg_loss_test),
        }

        # Print the results
        print(f"Model: {file_name}")
        print(f"Accuracy: {row['accuracy']:.4f}")
        print(f"F1 Score (0): {row['f1_score_0']:.4f}")
        print(f"F1 Score (1): {row['f1_score_1']:.4f}")
        print(f"Average Loss: {row['avg_loss']:.4f}")

        records.append(row)

    # Build the frame once instead of enlarging it row by row; passing
    # `columns` keeps the header even when no checkpoint was found.
    df = pd.DataFrame(records, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv(os.path.join(path, "model_metrics.csv"), index=False)
    return df


# Function to compute average and standard deviation of metrics
def compute_average_metrics(df):
    """
    Compute the average and standard deviation of the metrics in the DataFrame.

    The per-class columns ``f1_score_0`` and ``f1_score_1`` are averaged into
    a single ``f1_score`` column; the mean and standard deviation (pandas
    defaults) of every remaining column are then printed as ``mean ± std``,
    each multiplied by 100. The DataFrame passed in is not modified.

    Args:
        df: DataFrame with one row per evaluated model, containing at least
            the ``f1_score_0`` and ``f1_score_1`` columns.

    Returns:
        None. The summary is only printed.
    """
    # Average the F1 scores into a single metric. `assign` works on a copy,
    # so the caller's DataFrame keeps its original columns.
    metrics = df.assign(
        f1_score=(df['f1_score_0'] + df['f1_score_1']) / 2
    ).drop(columns=['f1_score_0', 'f1_score_1'])

    # Compute average and standard deviation for each metric
    avg_metrics = metrics.mean()
    std_metrics = metrics.std()

    # Print avg ± std.
    # NOTE(review): every column is scaled by 100, including avg_loss, so a
    # printed loss of 69.33 corresponds to a raw value of 0.6933.
    print("\nAverage Metrics with Standard Deviation:")
    for metric in avg_metrics.index:
        print(f"{metric}: {avg_metrics[metric] * 100:.2f} ± {std_metrics[metric] * 100:.2f}")

# Best Models

In [3]:
# Set this to True to re-evaluate every checkpoint and rewrite the cached
# model_metrics.csv files; when False, the cells below only read those CSVs.
# Note: This will take a while to run
EVALUATE_MODELS = False

## Best model using CLIP of type 16

### With augmentation

In [4]:
# Folder with the .pth checkpoints and the cached model_metrics.csv for this configuration
path = "ModelV2_clip_16_aug_True"

In [5]:
# Optional and slow: re-evaluate all checkpoints in `path` and rewrite its model_metrics.csv
if EVALUATE_MODELS:
    create_metric_df(path)


In [6]:
# Load the cached per-model metrics and print mean ± std across models (values are printed x100)
df = pd.read_csv(os.path.join(path, "model_metrics.csv"))
compute_average_metrics(df)


Average Metrics with Standard Deviation:
accuracy: 50.42 ± 4.42
avg_loss: 69.33 ± 0.10
f1_score: 38.22 ± 10.03


### Without augmentation

In [7]:
# Folder with the .pth checkpoints and the cached model_metrics.csv for this configuration
path = "ModelV2_clip_16_aug_False"

In [8]:
# Optional and slow: re-evaluate all checkpoints in `path` and rewrite its model_metrics.csv
if EVALUATE_MODELS:
    create_metric_df(path)

In [9]:
# Load the cached per-model metrics and print mean ± std across models (values are printed x100)
df = pd.read_csv(os.path.join(path, "model_metrics.csv"))
compute_average_metrics(df)


Average Metrics with Standard Deviation:
accuracy: 60.73 ± 2.00
avg_loss: 161.80 ± 69.68
f1_score: 59.50 ± 2.76


## Best model using CLIP of type 32

### With augmentation

In [10]:
# Folder with the .pth checkpoints and the cached model_metrics.csv for this configuration
path = "ModelV2_clip_32_aug_True"

In [11]:
# Optional and slow: re-evaluate all checkpoints in `path` and rewrite its model_metrics.csv
if EVALUATE_MODELS:
    create_metric_df(path)

In [12]:
# Load the cached per-model metrics and print mean ± std across models (values are printed x100)
df = pd.read_csv(os.path.join(path, "model_metrics.csv"))
compute_average_metrics(df)


Average Metrics with Standard Deviation:
accuracy: 48.79 ± 0.92
avg_loss: 69.33 ± 0.02
f1_score: 35.87 ± 4.77


### Without augmentation

In [13]:
# Folder with the .pth checkpoints and the cached model_metrics.csv for this configuration
path = "ModelV2_clip_32_aug_False"

In [14]:
# Optional and slow: re-evaluate all checkpoints in `path` and rewrite its model_metrics.csv
if EVALUATE_MODELS:
    create_metric_df(path)

In [15]:
# Load the cached per-model metrics and print mean ± std across models (values are printed x100)
df = pd.read_csv(os.path.join(path, "model_metrics.csv"))
compute_average_metrics(df)


Average Metrics with Standard Deviation:
accuracy: 57.18 ± 1.70
avg_loss: 144.78 ± 57.45
f1_score: 57.04 ± 1.77
