In [None]:
!pip install -U sentence-transformers
!pip install -U scikit-learn



In [None]:
!pip install datasets



In [None]:
import os
import pandas as pd
import torch
import shutil
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score, mean_squared_error
from scipy.stats import spearmanr, pearsonr

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/... paths
# used for the model archive and the dataset resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# MODEL_ZIP: zip archive that is extracted in the next cell.
# UNZIP_DIR: directory the archive is extracted into; the training cell also saves the
# model here and the final cells re-zip this directory.
MODEL_ZIP = "/content/drive/MyDrive/Datasets/fine-tuned-model.zip"
UNZIP_DIR = "/content/drive/MyDrive/Datasets/fine-tuned-model"

In [None]:
# Unpack every member of the model archive into UNZIP_DIR.
with ZipFile(MODEL_ZIP, mode='r') as model_archive:
    model_archive.extractall(path=UNZIP_DIR)

In [None]:
# CSV read by the data-preparation cell, which uses its "text1", "text2" and "label" columns.
DATA_PATH = "/content/drive/MyDrive/Datasets/data_part_7.csv"

In [None]:
# Load the labelled text pairs and drop rows missing either text or the label.
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["text1", "text2", "label"])

# Hold out 15% of the rows for validation; the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

# One InputExample per training row. Zipping the three columns yields the same values as
# iterrows() without materialising a Series for every row.
train_samples = [
    InputExample(texts=[text1, text2], label=float(label))
    for text1, text2, label in zip(train_df['text1'], train_df['text2'], train_df['label'])
]

# Validation data stays as plain Python lists: (text1, text2) tuples plus float labels,
# the form evaluate_model() consumes.
val_pairs = list(zip(val_df['text1'], val_df['text2']))
val_labels = val_df['label'].astype(float).tolist()

# `model` is used by the loss below and by the training cell but was never defined in the
# notebook, so a fresh kernel failed with NameError. Load it from UNZIP_DIR: the directory
# the archive was extracted into and the one model.save() writes back to.
model = SentenceTransformer(UNZIP_DIR)

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=8)
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
def evaluate_model(model, pairs, true_labels):
    """Score text pairs by embedding cosine similarity and print validation metrics.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=..., show_progress_bar=...)``
            (presumably the SentenceTransformer being fine-tuned -- confirm at the call site).
        pairs: Collection of pairs; element 0 and element 1 of every pair are encoded
            separately. It is iterated twice.
        true_labels: Reference labels aligned with ``pairs``; passed as the first argument
            to each metric function.

    Returns:
        The ROC AUC of the cosine scores against ``true_labels``.
    """
    first_texts = [pair[0] for pair in pairs]
    second_texts = [pair[1] for pair in pairs]

    # Both sides are encoded with identical options.
    encode_options = dict(convert_to_tensor=True, show_progress_bar=False)
    first_embeddings = model.encode(first_texts, **encode_options)
    second_embeddings = model.encode(second_texts, **encode_options)

    # Row-wise cosine similarity, moved to the CPU and converted to NumPy for the metric functions.
    similarity = torch.nn.functional.cosine_similarity(first_embeddings, second_embeddings)
    scores = similarity.cpu().numpy()

    # Metrics are computed in the same order they are reported.
    roc = roc_auc_score(true_labels, scores)
    mse = mean_squared_error(true_labels, scores)
    spearman = spearmanr(true_labels, scores)[0]
    pearson, _ = pearsonr(true_labels, scores)

    print(f"\n Validation Metrics:")
    print(f"ROC AUC:    {roc:.4f}")
    print(f"MSE:        {mse:.4f}")
    print(f"Spearman:   {spearman:.4f}")
    print(f"Pearson:    {pearson:.4f}")
    return roc

In [None]:
EPOCHS = 1
# Each loop iteration below is its own one-epoch fit() call, so the warm-up budget is
# 10% of a single epoch's steps. Scaling it by EPOCHS (as before) gives the same value
# for EPOCHS = 1 but would make warm-up cover an entire fit() call once EPOCHS >= 10.
warmup_steps = int(len(train_dataloader) * 0.1)
best_roc_auc = 0

for epoch in range(EPOCHS):
    print(f"\n Epoch {epoch + 1}/{EPOCHS}")
    # Train for exactly one epoch so the model can be validated between epochs.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        warmup_steps=warmup_steps,
        show_progress_bar=True
    )

    # evaluate_model() prints all metrics and returns the ROC AUC.
    roc_auc = evaluate_model(model, val_pairs, val_labels)

    # Keep only the best-scoring checkpoint; it overwrites the files in UNZIP_DIR.
    if roc_auc > best_roc_auc:
        best_roc_auc = roc_auc
        model.save(UNZIP_DIR)
        print(f" Model updated and saved to {UNZIP_DIR}")

In [None]:
# Zip the contents of UNZIP_DIR into "<UNZIP_DIR>.zip". That is the same path as
# MODEL_ZIP, so the archive the notebook started from is replaced.
shutil.make_archive(UNZIP_DIR, 'zip', UNZIP_DIR)

'/content/drive/MyDrive/Datasets/fine-tuned-model.zip'

In [None]:
# Hand the re-packed archive to Colab's download helper (only available inside Colab).
from google.colab import files
files.download(f"{UNZIP_DIR}.zip")