In [None]:
!pip install -q sentence-transformers pandas scikit-learn

In [None]:
!pip install datasets

In [None]:
# Attach Google Drive to the Colab filesystem so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, mean_squared_error
from scipy.stats import spearmanr, pearsonr

In [None]:
# Location of the CSV that is read with pd.read_csv; the path lives under
# /content/drive, i.e. the Google Drive mount point used by this notebook.
DATA_PATH = '/content/drive/MyDrive/Datasets/cleaned_dataset.csv'

In [None]:
# Load the paired-text dataset, subsample it, and build the train/validation data.
# The CSV is expected to provide the columns "text1", "text2" and "label".
N_SAMPLES = 3000  # upper bound on the number of rows used in this run
SEED = 42         # shared seed for sampling and for the train/validation split

raw_df = pd.read_csv(DATA_PATH)

# Drop incomplete rows BEFORE sampling so NaN rows do not eat into the sample,
# and cap the sample size at the rows available: DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
df = raw_df.dropna(subset=["text1", "text2", "label"])
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED)
train_df, val_df = train_test_split(df, test_size=0.15, random_state=SEED)

# One InputExample per training row: the two texts plus the label as a float.
train_samples = [
    InputExample(texts=[text1, text2], label=float(label))
    for text1, text2, label in zip(
        train_df['text1'], train_df['text2'], train_df['label']
    )
]

# Validation data is kept as plain Python lists: (text1, text2) tuples and float labels.
val_pairs = list(zip(val_df['text1'].tolist(), val_df['text2'].tolist()))
val_labels = val_df['label'].astype(float).tolist()

In [None]:
# Training configuration: the values a reader is most likely to tune.
MODEL_NAME = 'distiluse-base-multilingual-cased-v1'  # checkpoint to fine-tune
EPOCHS = 1
BATCH_SIZE = 128
SAVE_DIR = '/content/fine-tuned-model'  # where the best checkpoint is written

# `device` has to exist before the model is constructed; use the GPU when one
# is available and fall back to the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer(MODEL_NAME, device=device)

# Batches of InputExample objects, reshuffled on every pass over the data.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=BATCH_SIZE)

# Loss applied to the text pairs and their float labels during fine-tuning.
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
def evaluate_model(model, pairs, true_labels):
    """Score text pairs with ``model`` and report how well they match the labels.

    Both sides of every pair are encoded, the cosine similarity between the two
    embeddings of a pair is taken as its predicted score, and four metrics
    comparing those scores with ``true_labels`` are printed.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=...,
            show_progress_bar=...)`` that returns tensors.
        pairs: Items whose elements 0 and 1 are the two texts to compare.
        true_labels: Reference values, one per pair.

    Returns:
        The ROC AUC of the cosine scores against ``true_labels``.
    """
    encode_options = dict(convert_to_tensor=True, show_progress_bar=False)

    left_texts = [pair[0] for pair in pairs]
    left_vectors = model.encode(left_texts, **encode_options)
    right_texts = [pair[1] for pair in pairs]
    right_vectors = model.encode(right_texts, **encode_options)

    # Pairwise similarity, moved to host memory as a NumPy array for the metric functions.
    similarity = torch.nn.functional.cosine_similarity(left_vectors, right_vectors)
    predicted = similarity.cpu().numpy()

    auc = roc_auc_score(true_labels, predicted)
    squared_error = mean_squared_error(true_labels, predicted)
    rank_corr, _ = spearmanr(true_labels, predicted)
    linear_corr, _ = pearsonr(true_labels, predicted)

    print("\n🔍 Validation Metrics:")
    print(f"ROC AUC:    {auc:.4f}")
    print(f"MSE:        {squared_error:.4f}")
    print(f"Spearman:   {rank_corr:.4f}")
    print(f"Pearson:    {linear_corr:.4f}")
    return auc

In [None]:
# Fine-tune one epoch at a time so the validation metrics can be inspected, and
# the best checkpoint saved, after every epoch.
#
# Each fit() call below runs exactly one epoch and builds its own learning-rate
# schedule (see the sentence-transformers fit() documentation), so warmup is
# sized as 10% of ONE epoch's steps rather than 10% of the whole run.
warmup_steps = int(len(train_dataloader) * 0.1)

# Start below every achievable score so the first evaluated epoch is always saved.
best_roc_auc = float('-inf')

for epoch in range(EPOCHS):
    print(f"\n🚀 Epoch {epoch + 1}/{EPOCHS}")
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        warmup_steps=warmup_steps,
        show_progress_bar=True
    )

    roc_auc = evaluate_model(model, val_pairs, val_labels)

    # Keep only the checkpoint with the highest validation ROC AUC seen so far.
    if roc_auc > best_roc_auc:
        best_roc_auc = roc_auc
        model.save(SAVE_DIR)
        print(f"Model saved to {SAVE_DIR}")

In [None]:
import shutil
from google.colab import files

# Pack the contents of the saved model directory into fine-tuned-model.zip ...
shutil.make_archive(base_name="fine-tuned-model", format='zip', root_dir=SAVE_DIR)
# ... and hand the archive to the Colab file-download helper.
files.download("fine-tuned-model.zip")