In [None]:
# Install sentence-transformers through the shell's pip (no version is pinned).
!pip install sentence-transformers

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import os
import pickle
import random
import math
import joblib

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# Set WANDB_DISABLED before any training starts
# (presumably to keep the trainer from reporting to Weights & Biases — confirm).
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Colab-only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator and all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        ``PYTHONHASHSEED`` is read at interpreter start-up, so exporting it
        here can only influence child processes, not the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
# Data
# All paths are relative to the notebook's working directory.
train_path = "../Dataset/Augmented/train_augmented_Swap.csv"  # training CSV (read into df_train)
test_path = "../Dataset/valid.csv"                            # CSV read into df_test
MODEL_PATH = "../BERT_CL"                                     # root folder for models, graphs and pickles
DICT_PATH = f"{MODEL_PATH}/embeddings_dictionary.pkl"         # pickle holding the task/variety embeddings

# Model
MODEL_NAME = "bert-large-uncased"  # name passed to models.Transformer
BATCH_SIZE = 16                    # batch size of the training DataLoader
N_EPOCHS = 5                       # epochs passed to model.fit

# Reproducibility
SEED = 42
# Seed the RNGs once, right after the constants are defined.
set_seed(SEED)

In [None]:
# Report whether a CUDA device is available; the value is only printed in this cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Device selected: {device}")

In [None]:
# Training data: drop rows missing any of the columns used for grouping and training.
df_train = pd.read_csv(train_path).dropna(subset=['text', 'label', 'task', 'variety'])
print(f"Train rows: {len(df_train)}")
# Test data is loaded as-is (no NaN filtering is applied here).
df_test = pd.read_csv(test_path)
print(f"Test rows: {len(df_test)}")

## Separate languages

In [None]:
# Training function
def train_specific_model(task, variety, subset_df):
    """Fine-tune one sentence encoder for a single (task, variety) pair.

    A fresh ``MODEL_NAME`` transformer with a default pooling layer is trained
    with ``BatchHardTripletLoss`` on every row of ``subset_df`` and saved under
    ``MODEL_PATH/<task>_<variety>`` (spaces replaced by underscores).

    No evaluator or held-out split is used: the whole subset is the training
    set. (An unused stratified 90/10 split used to be computed here; it only
    added a failure mode for classes with a single sample.)

    Args:
        task: Task name; used for logging and for the output folder name.
        variety: Language variety; used for logging and the output folder name.
        subset_df: DataFrame with at least ``text`` and ``label`` columns,
            already filtered to this task and variety.

    Returns:
        The path of the directory the trained model was saved to.

    Raises:
        ValueError: If ``subset_df`` contains no rows.
    """
    if len(subset_df) == 0:
        raise ValueError(f"No training samples for task={task!r}, variety={variety!r}")

    model_name_id = f"{task}_{variety}".replace(" ", "_")
    output_path = f"{MODEL_PATH}/{model_name_id}"

    print(f"\nTraining: TASK={task.upper()} | VARIETY={variety.upper()}")
    print(f"Samples: {len(subset_df)}")

    # Creating dataset for contrastive learning: one single-text example per
    # row, labelled with its (integer) class.
    train_examples = [
        InputExample(texts=[text], label=int(label))
        for text, label in zip(subset_df['text'], subset_df['label'])
    ]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

    word_embedding_model = models.Transformer(MODEL_NAME)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Loss
    train_loss = losses.BatchHardTripletLoss(model=model)

    # Training
    # NOTE(review): save_best_model presumably only matters when an evaluator
    # is supplied — confirm against the installed sentence-transformers version.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=N_EPOCHS,
        # Warm up over 1% of the batches of one epoch.
        warmup_steps=int(len(train_dataloader) * 0.01),
        output_path=output_path,
        save_best_model=True,
        show_progress_bar=True
    )

    print(f"Model saved in: {output_path}")
    return output_path

In [None]:
# Train one encoder per (task, variety) combination found in the training set.
tasks = df_train['task'].unique()
varieties = df_train['variety'].unique()

# Maps "task|variety" -> directory of the trained model.
trained_models = {}

for task in tasks:
    for variety in varieties:
        subset = df_train[
            (df_train['task'] == task) &
            (df_train['variety'] == variety)
        ]

        # The loops walk the full task x variety product, but not every
        # combination has to exist in the data: skip the empty ones.
        if subset.empty:
            print(f"Skipping TASK={task} | VARIETY={variety}: no training samples")
            continue

        # Start training
        path = train_specific_model(task, variety, subset)
        trained_models[f"{task}|{variety}"] = path

print("\nEnd training")

In [None]:
# Nested store for every generated embedding set.
# Layout: all_embeddings[task][variety] = {"X": matrix, "y": labels, "texts": raw texts}
all_embeddings = {}

print("\nEmbeddings generation")

for key, model_path in trained_models.items():
    # Keys were built as "task|variety" when the models were trained.
    task, variety = key.split("|")
    per_task = all_embeddings.setdefault(task, {})

    model = SentenceTransformer(model_path)

    # Encode exactly the rows this model was trained on.
    in_group = (df_train['task'] == task) & (df_train['variety'] == variety)
    subset = df_train[in_group]
    texts = subset['text'].tolist()
    labels = subset['label'].tolist()

    emb = model.encode(texts, show_progress_bar=True)

    per_task[variety] = dict(
        X=emb,                # vectors
        y=np.array(labels),   # labels
        texts=texts,          # original text
    )
    print(f"{task}-{variety}: generated {len(emb)} embeddings.")

# Persist the whole dictionary in a single pickle file.
with open(DICT_PATH, "wb") as f:
    pickle.dump(all_embeddings, f)

print("All embeddings saved")

### Plots

In [8]:
# Reload the task/variety embeddings written by the generation cell.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
print("Loading embeddings...")
with open(DICT_PATH, "rb") as f:
    all_embeddings = pickle.load(f)

print(f"Task found: {list(all_embeddings.keys())}")

Loading embeddings...
Task found: ['Sentiment', 'Sarcasm']


In [13]:
# Folder where embeddings_visualization saves its figures; created if missing.
output_graph = f"{MODEL_PATH}/Graphs"
os.makedirs(output_graph, exist_ok=True)

In [None]:
def _legend_class_name(raw_label):
    """Turn a legend entry such as '1' or '1.0' into 'Class 1'."""
    try:
        value = float(raw_label)
    except ValueError:
        return f'Class {raw_label}'
    return f'Class {int(value)}' if value.is_integer() else f'Class {raw_label}'


def embeddings_visualization(task_name, all_embeddings):
    """Plot and save a 2-D t-SNE view of the embeddings of one task.

    One subplot is drawn per variety (two columns), points are coloured and
    styled by label, and the figure is written to
    ``{output_graph}/graph_dict_{task_name}.png`` before being shown.

    Args:
        task_name: Key of the task inside ``all_embeddings``.
        all_embeddings: Mapping ``task -> variety -> {"X": array, "y": array}``;
            ``X`` holds one embedding per row and ``y`` the matching labels.

    Returns:
        None. Returns early (after printing a message) when the task is
        missing or has no varieties.
    """
    print(f"\nTask: {task_name.upper()}")

    if task_name not in all_embeddings:
        print(f"Task '{task_name}' not found in the dictionary")
        return

    varieties = list(all_embeddings[task_name].keys())
    num_plots = len(varieties)

    if num_plots == 0:
        print("No variety found")
        return

    num_cols = 2
    num_rows = math.ceil(num_plots / num_cols)

    # squeeze=False always returns a 2-D array of Axes, so flattening is valid
    # for any number of varieties — including a single one, where the grid
    # still has two cells.
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(15, 6 * num_rows), squeeze=False
    )
    axes = axes.flatten()

    fig.suptitle(f'Vector space: {task_name.upper()}', fontsize=20, y=1.02)

    for ax, variety in zip(axes, varieties):
        data = all_embeddings[task_name][variety]
        X = data['X']
        y = data['y']

        n_samples = X.shape[0]

        ax.set_title(f'Variety: {variety} (n={n_samples})', fontsize=14, fontweight='bold')
        ax.set_xlabel('')
        ax.set_ylabel('')

        # t-SNE needs 0 < perplexity < n_samples, impossible with < 2 points.
        if n_samples < 2:
            ax.text(0.5, 0.5, 'Not enough samples for t-SNE',
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        perp_val = min(30, n_samples - 1)

        tsne = TSNE(n_components=2, random_state=42, perplexity=perp_val, init='pca', learning_rate='auto')
        X_2d = tsne.fit_transform(X)

        # Plot
        sns.scatterplot(
            x=X_2d[:, 0],
            y=X_2d[:, 1],
            hue=y,
            palette='bright',
            style=y,
            s=60,
            alpha=0.7,
            ax=ax
        )

        # Name the legend entries after the classes actually present instead
        # of assuming that both class 0 and class 1 occur in this slice.
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(
            handles,
            [_legend_class_name(label) for label in labels],
            title='Label',
            loc='upper right'
        )
        ax.grid(True, linestyle='--', alpha=0.3)

    # Hide the grid cells that did not receive a variety.
    for unused_ax in axes[num_plots:]:
        unused_ax.axis('off')

    plt.tight_layout()

    save_path = f"{output_graph}/graph_dict_{task_name}.png"
    plt.savefig(save_path, dpi=600, bbox_inches='tight')
    plt.show()
    print(f"Graph saved in: {save_path}")

#### Sarcasm

In [None]:
# Draw and save the t-SNE grid for the "Sarcasm" task.
embeddings_visualization("Sarcasm", all_embeddings)

#### Sentiment

In [None]:
# Draw and save the t-SNE grid for the "Sentiment" task.
embeddings_visualization("Sentiment", all_embeddings)

### Competition

In [None]:
# Folder receiving the winning classifiers and the results CSV; created if missing.
OUTPUT_COMPETITION_DIR=f"{MODEL_PATH}/competition_models"
os.makedirs(OUTPUT_COMPETITION_DIR, exist_ok=True)

In [None]:
def dataset_embeddig(df, task, variety, source):
    """Encode the texts of one (task, variety, source) slice of ``df``.

    The slice is embedded with the fine-tuned encoder stored under
    ``MODEL_PATH/<task>_<variety>`` (spaces replaced by underscores, the same
    naming rule ``train_specific_model`` uses when saving).

    Args:
        df: DataFrame with ``task``, ``variety``, ``source``, ``text`` and
            ``label`` columns.
        task: Task value to select.
        variety: Variety value to select.
        source: Source value to select.

    Returns:
        ``(embeddings, labels)`` where ``embeddings`` is the output of
        ``SentenceTransformer.encode`` and ``labels`` the matching list of
        labels, or ``(None, None)`` when the slice has no usable rows, the
        model directory is missing, or loading/encoding fails.
    """
    selected = (
        (df["task"] == task) &
        (df["variety"] == variety) &
        (df["source"] == source)
    )
    # Rows without text or label can neither be encoded nor scored.
    group = df[selected].dropna(subset=["text", "label"])

    # Nothing to encode: report it instead of loading a model for nothing and
    # handing an empty matrix to the classifiers.
    if group.empty:
        print(f"No samples found for: {task} - {variety} - {source}")
        return None, None

    # compute embeddings
    model_name_id = f"{task}_{variety}".replace(" ", "_")
    bert_path = f"{MODEL_PATH}/{model_name_id}"

    if not os.path.exists(bert_path):
        print(f"BERT model not found in: {bert_path}")
        return None, None

    try:
        bert_model = SentenceTransformer(bert_path)
        embeddings = bert_model.encode(group["text"].tolist(), show_progress_bar=False)
    except Exception as e:
        # Best effort: the caller skips this slice when it receives None.
        print(f"Error loading the model: {e}")
        return None, None

    # Drop the model reference first so that clearing the CUDA cache can
    # actually give its memory back.
    del bert_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return embeddings, group["label"].tolist()

In [None]:
# Classifiers
# Candidate models for the competition, keyed by display name. They are
# iterated in insertion order and the winner is picked with a strict ">"
# comparison, so on equal F1 the earlier entry wins.
# All candidates share SEED as random_state.
classifiers = {
    "LogReg": LogisticRegression(
            max_iter=1000,
            random_state=SEED
    ),
    "SVM": SVC(
            kernel='rbf',
            C=1.0,
            probability=True,
            random_state=SEED
    ),
    "XGBoost": XGBClassifier(
            n_estimators=100,
            eval_metric='logloss',
            random_state=SEED
    ),
    "MLP": MLPClassifier(
            hidden_layer_sizes=(128, 64),
            max_iter=500,
            activation='relu',
            solver='adam',
            random_state=SEED
    )
}

#### Redistribution by source, for testing

In [None]:
# Load the task -> variety embeddings that will be regrouped by source.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(DICT_PATH, "rb") as f:
    old_embeddings = pickle.load(f)

In [None]:
# Destination pickle for the per-source regrouping of the embeddings.
newDict = f"{MODEL_PATH}/embeddings_by_source.pkl"

# Target layout: new_embeddings[task][variety][source] = {'X', 'y', 'texts'}
new_embeddings = {}

for task, varieties_data in old_embeddings.items():
    new_embeddings.setdefault(task, {})

    for variety, data in varieties_data.items():
        print(f" Elaborazione: {task} - {variety}")

        X_all, y_all, texts_all = data["X"], data["y"], data["texts"]

        # One source per distinct text of this task/variety (first occurrence wins).
        in_group = (df_train["task"] == task) & (df_train["variety"] == variety)
        df_lookup = (
            df_train.loc[in_group, ["text", "source"]]
            .drop_duplicates(subset=['text'])
        )

        # Left join against the de-duplicated lookup: one row per stored text,
        # in the stored order, so the result lines up with X_all / y_all.
        df_emb = pd.DataFrame({"text": texts_all})
        merged = df_emb.merge(df_lookup, on='text', how='left')

        # Texts that cannot be matched back to the training frame.
        merged['source'] = merged['source'].fillna('Unknown')

        sources_aligned = merged['source'].values

        by_source = {}
        for source in np.unique(sources_aligned):
            mask = (sources_aligned == source)
            by_source[source] = {
                'X': X_all[mask],
                'y': y_all[mask],
                'texts': [text for text, keep in zip(texts_all, mask) if keep]
            }

        new_embeddings[task][variety] = by_source

In [None]:
# Saving
# Write the task -> variety -> source dictionary to its own pickle file.
with open(newDict, "wb") as f:
    pickle.dump(new_embeddings, f)


#### Start classifiers competition

In [None]:
# Reload the per-source embeddings for the classifier competition.
# NOTE: this rebinds `all_embeddings` to the task -> variety -> source layout,
# which differs from the task -> variety layout embeddings_visualization reads;
# reload DICT_PATH before plotting again.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
newDict = f"{MODEL_PATH}/embeddings_by_source.pkl"

with open(newDict, "rb") as f:
    all_embeddings = pickle.load(f)

In [None]:
# One result row per (task, variety, source) group that could be evaluated.
results_list = []

# Only single quotes inside the f-string: reusing the enclosing quote
# character is a SyntaxError before Python 3.12.
print(f"{'Task':<12} | {'Variety':<15} | {'Source':<15} | {'Winner':<15} | {'Acc':<8} | {'F1-Score':<8}")
print("-" * 90)


for task in all_embeddings:
    for variety in all_embeddings[task]:
        for source in all_embeddings[task][variety]:

            data = all_embeddings[task][variety][source]
            X_train = data['X']
            y_train = data['y']

            X_test, y_test = dataset_embeddig(df_test, task, variety, source)

            # No test embeddings for this group (dataset_embeddig returned None).
            if X_test is None:
                continue

            best_acc = 0
            best_model_name = ""
            best_clf_obj = None
            best_f1 = 0

            for clf_name, clf in classifiers.items():
                try:
                    # Training
                    clf.fit(X_train, y_train)

                    # Evaluation
                    preds = clf.predict(X_test)
                    acc = accuracy_score(y_test, preds)
                    f1 = f1_score(y_test, preds, average='binary')
                except Exception as e:
                    # Best effort: one failing classifier must not stop the others.
                    print(f"Error with {clf_name} on {variety}_{source}: {e}")
                    continue

                # Updating scores. The first classifier that runs becomes the
                # provisional winner even with F1 == 0, so a group where every
                # model scores zero still has a model to save.
                if best_clf_obj is None or f1 > best_f1:
                    best_acc = acc
                    best_f1 = f1
                    best_model_name = clf_name
                    best_clf_obj = clf

            print(f"{task:<12} | {variety:<15} | {source:<15} | {best_model_name:<15} | {best_acc:.4%}   | {best_f1:.4%}")

            # Saving results and model
            results_list.append({
                'Task': task,
                'Variety': variety,
                'Source': source,
                'Best_Model': best_model_name,
                'Accuracy': best_acc,
                'F1_Score': best_f1,
                'Test_Samples': len(X_test)
            })

            # Every classifier failed: keep the (empty) result row but do not
            # write a file that would only contain None.
            if best_clf_obj is None:
                print(f"No classifier could be trained for {task} - {variety} - {source}; nothing saved")
                continue

            # The winner is the shared instance from `classifiers`; it is dumped
            # here, before the next group fits that instance again.
            filename = f"{task}_{variety}_{source}_{best_model_name}.joblib".replace(" ", "_")
            joblib.dump(best_clf_obj, os.path.join(OUTPUT_COMPETITION_DIR, filename))

In [None]:
print("\nFinal results")
# Declare the columns explicitly so an empty results_list still yields a frame
# that can be sorted and exported (sort_values would raise KeyError otherwise).
df_results = pd.DataFrame(
    results_list,
    columns=['Task', 'Variety', 'Source', 'Best_Model', 'Accuracy', 'F1_Score', 'Test_Samples']
)

# Group rows by task, with the weakest F1 first inside each task.
df_results = df_results.sort_values(by=['Task', 'F1_Score'], ascending=[True, True])

print(df_results)

# Saving results report
df_results.to_csv(f"{OUTPUT_COMPETITION_DIR}/best_models_results.csv", index=False)
print(f"Saved in'{OUTPUT_COMPETITION_DIR}'")