In [None]:
!pip install evaluate
!pip install optuna
!pip install -U transformers
!pip install wandb -qU

In [None]:
# Log in to Weights & Biases (no API key is passed explicitly here)
import wandb
wandb.login()

In [None]:
#Data handling & visualization
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#PyTorch core
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch.nn.utils.prune as prune
import torch.nn.functional as F

# Training utilities
from tqdm.auto import tqdm

# Transformers (Hugging Face)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    get_scheduler,
    default_data_collator,
    AutoConfig # Import AutoConfig here

)


#Evaluation with HF Evaluate library
import evaluate

#Hyperparameter tuning
import optuna

from optuna import Trial

#Experiment tracking
import wandb

# File paths (optional)
from pathlib import Path

from datetime import datetime

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths below live under it.
from google.colab import drive
drive.mount('/content/drive')


## Code for Part B1: Fine-Tuning using the "full code" from exercise 4

In [None]:
# Define model names
MODEL_NAME_ELECTRA = "google/electra-base-discriminator"

# Load tokenizers, keyed by model name so a tokenizer can be looked up per model
tokenizers = {
    MODEL_NAME_ELECTRA: AutoTokenizer.from_pretrained(MODEL_NAME_ELECTRA)
}

In [None]:
# Pick the compute device once: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
# Label mapping: sentiment string -> integer class id (5 classes)
label2id = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}

train_path = "/content/drive/MyDrive/תואר שני/deep/Corona_NLP_train_clean.xls" #Change location where you saved the cleaned data
test_path = "/content/drive/MyDrive/תואר שני/deep/Corona_NLP_test_clean.xls" #Change location where you saved the cleaned data

# Load the CSVs (the files carry an .xls extension but are parsed as CSV text)
train_df = pd.read_csv(train_path, encoding='ISO-8859-1')
test_df = pd.read_csv(test_path, encoding='ISO-8859-1')

# Uncomment for a quick smoke run on a small subset
#train_df = train_df.sample(n=50, random_state=42).reset_index(drop=True)

# Map sentiment labels to integers
train_df["label"] = train_df["Sentiment"].map(label2id)
test_df["label"] = test_df["Sentiment"].map(label2id)

# Fail fast on sentiment strings that are not in label2id: .map() turns them into NaN,
# which makes the label column float and breaks np.bincount / label tensors later on.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    unmapped = split_df.loc[split_df["label"].isna(), "Sentiment"].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped Sentiment values in {split_name} data: {list(unmapped)}")

# Keep only the model inputs; .copy() yields independent frames so later column
# assignments do not operate on a slice of the raw frame.
train_df = train_df[["cleaned_tweets", "label"]].copy()
test_df = test_df[["cleaned_tweets", "label"]].copy()
train_df["label"] = train_df["label"].astype("int64")
test_df["label"] = test_df["label"].astype("int64")

#Split TRAIN into train + validation (stratified on the label, fixed seed)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df["label"], random_state=42)
train_df = train_df.copy()
val_df = val_df.copy()

test_df_final = test_df.copy()


In [None]:
# Run right after loading/subsetting the data and BEFORE building TweetDataset:
# missing texts become "", every value is cast to str, and outer whitespace is trimmed.
for df in (train_df, val_df, test_df_final):
    tweets = df["cleaned_tweets"].fillna("")
    df["cleaned_tweets"] = tweets.astype(str).str.strip()

In [None]:
# Define dataset
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Reads the ``cleaned_tweets`` and ``label`` columns of ``dataframe`` into plain
    Python lists and tokenizes lazily on access, padding/truncating to ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = dataframe["cleaned_tweets"].tolist()
        self.labels = dataframe["label"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # A list of indices yields a list of encoded items; anything else is one item.
        if not isinstance(idx, list):
            return self._encode_single(idx)
        return [self._encode_single(position) for position in idx]

    def _encode_single(self, idx):
        """Tokenize the text at ``idx`` and return input_ids, attention_mask and labels."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size 1; take its first row.
        item = {key: encoded[key][0] for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

In [None]:
def early_stop_check(patience, best_val_accuracy,
                     best_val_accuracy_epoch, current_val_accuracy,
                     current_val_accuracy_epoch):
    """Update the best-accuracy bookkeeping and decide whether to stop early.

    A strictly higher ``current_val_accuracy`` becomes the new best and never triggers
    a stop. Otherwise training stops only when the number of epochs elapsed since the
    best epoch is strictly greater than ``patience``.

    Returns:
        (best_val_accuracy, best_val_accuracy_epoch, early_stop_flag)
    """
    if current_val_accuracy > best_val_accuracy:
        # New best: adopt the current epoch's values.
        return current_val_accuracy, current_val_accuracy_epoch, False

    epochs_since_best = current_val_accuracy_epoch - best_val_accuracy_epoch
    should_stop = True if epochs_since_best > patience else False
    return best_val_accuracy, best_val_accuracy_epoch, should_stop

In [None]:
def train_model_with_hyperparams(model, train_loader, val_loader,
                                  optimizer, criterion, epochs, patience, trial):
    """Train ``model`` with early stopping and return the best validation accuracy.

    Each epoch runs one pass over ``train_loader`` (stepping a linear LR schedule after
    every optimizer step), evaluates on ``val_loader``, and logs metrics to W&B.
    The weights of the best-scoring epoch are snapshotted; when training ends they are
    loaded back into ``model`` and written to ``best_model_trial_{trial.number}.pt``.

    Args:
        model: Sequence-classification model whose forward output exposes ``.logits``.
        train_loader: Iterable of batches with ``input_ids``, ``attention_mask``, ``labels``.
        val_loader: Same batch format as ``train_loader``; used for validation.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        epochs: Maximum number of epochs.
        patience: Forwarded to ``early_stop_check``.
        trial: Object with a ``number`` attribute, used in the checkpoint file name.

    Returns:
        The best validation accuracy observed.
    """
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    early_stop_flag = False
    best_model_state = None

    model = model.to(device)

    #Total training steps for scheduler
    total_steps = len(train_loader) * epochs
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    for epoch in range(1, epochs + 1):
        ### Training ###
        model.train()
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()
            lr_scheduler.step()  # Step the scheduler AFTER optimizer

            # Weight the batch loss by batch size so the epoch average is per-sample.
            train_loss += loss.item() * input_ids.size(0)
            total_train_samples += input_ids.size(0)
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        ### Validation ###
        model.eval()
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0
        all_val_labels = []
        all_val_preds = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)
                preds = logits.argmax(dim=1)

                val_loss += loss.item() * input_ids.size(0)
                total_val_samples += input_ids.size(0)
                correct_val_predictions += (preds == labels).sum().item()

                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(preds.cpu().numpy())

        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        # zero_division=0 keeps the same value sklearn would report for a never-predicted
        # class (0) without emitting a warning every epoch.
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_recall = recall_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)

        # Early stopping
        previous_best = best_val_accuracy
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )

        # Snapshot the weights on improvement (and always after the first epoch).
        # state_dict() hands back references to the live tensors, so they must be cloned;
        # otherwise later epochs would silently overwrite the "best" snapshot.
        if val_accuracy > previous_best or best_model_state is None:
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

        # W&B logging
        wandb.log({
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1,
            "Learning Rate": optimizer.param_groups[0]['lr']  # Log LR
        })

        if early_stop_flag:
            break

    if best_model_state is not None:
        # Restore the best epoch so callers that save `model` persist the best weights,
        # not the weights of whichever epoch happened to run last.
        model.load_state_dict(best_model_state)
        torch.save(best_model_state, f"best_model_trial_{trial.number}.pt")

    return best_val_accuracy


In [None]:
def objective(trial, model_name, tokenizer):
    """Optuna objective: fine-tune ``model_name`` with sampled hyperparameters.

    Samples the hyperparameters, builds fresh tokenizer/datasets/model/optimizer for the
    trial, trains via ``train_model_with_hyperparams``, saves the model and tokenizer
    under ``./runs_electra_fullcode/trial-<n>`` and records that path in the trial's
    ``best_dir`` user attribute.

    Args:
        trial: Optuna trial used for sampling and for storing ``best_dir``.
        model_name: Hugging Face model id/path for the tokenizer, config and model.
        tokenizer: Unused; kept for backward compatibility. A fresh tokenizer is loaded
            per trial because special tokens are added to it.

    Returns:
        The validation accuracy returned by ``train_model_with_hyperparams``.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 4e-5, log=True)
    max_length = trial.suggest_categorical("max_length", [128, 192])
    batch_size = trial.suggest_categorical("batch_size", [16, 32])
    epochs = trial.suggest_int("epochs", 2, 4)
    patience = trial.suggest_int("patience", 1, 2)
    weight_decay = trial.suggest_categorical("weight_decay", [0.0, 0.01, 0.02])
    dropout = trial.suggest_float("dropout", 0.1, 0.3)
    # Single-choice parameters: not used for training here, but still sampled so they
    # keep being recorded in trial.params.
    trial.suggest_categorical("learning_rate_trainer", [2e-5])
    trial.suggest_categorical("per_device_train_batch_size_trainer", [8])

    # --- Fresh tokenizer per trial ---
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    tokenizer.add_special_tokens({"additional_special_tokens": ["<url>", "<user>", "<number>"]})

    # --- Data (tokenize with this tokenizer) ---
    train_dataset = TweetDataset(train_df, tokenizer, max_length=max_length)
    val_dataset   = TweetDataset(val_df,   tokenizer, max_length=max_length)
    train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    val_loader    = DataLoader(val_dataset,   batch_size=batch_size,               pin_memory=True)

    # --- Model ---
    config = AutoConfig.from_pretrained(model_name, num_labels=5, hidden_dropout_prob=dropout, attention_probs_dropout_prob=dropout)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    # Always resize to tokenizer length (safe even if no tokens were added)
    model.resize_token_embeddings(len(tokenizer))

    # --- Class-weighted loss (inverse class frequency; empty classes count as 1) ---
    counts = np.bincount(train_df["label"].values, minlength=5)
    weights = 1.0 / np.maximum(counts, 1)
    class_weights = torch.tensor(weights, dtype=torch.float, device=device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # --- Discriminative LRs (4x head) ---
    head_params, backbone_params = [], []
    for n, p in model.named_parameters():
        (head_params if "classifier" in n else backbone_params).append(p)

    base_lr = learning_rate
    head_lr = base_lr * 4.0
    optimizer = AdamW(
        [
            {"params": backbone_params, "lr": base_lr, "weight_decay": weight_decay},
            {"params": head_params,     "lr": head_lr,  "weight_decay": 0.0},
        ]
    )

    trial_dir = Path(f"./runs_electra_fullcode/trial-{trial.number}")
    trial_dir.mkdir(parents=True, exist_ok=True)

    # --- W&B (sanitize slashes) ---
    safe_model_name = model_name.replace("/", "-")

    try:
        # Started inside the try so the cleanup below also runs if W&B init fails.
        wandb.init(
            project=f"{safe_model_name}-fullcode-3",
            name=f"trial-{trial.number}",
            reinit=True,
            config=dict(
                learning_rate=learning_rate, head_lr=head_lr, batch_size=batch_size,
                patience=patience, max_length=max_length,
                weight_decay=weight_decay, epochs=epochs, dropout=dropout,
            ),
        )

        val_accuracy = train_model_with_hyperparams(
            model, train_loader, val_loader, optimizer, criterion,
            epochs, patience, trial
        )

        model.save_pretrained(trial_dir.as_posix())
        tokenizer.save_pretrained(trial_dir.as_posix())

        # Lets the caller locate this trial's saved model/tokenizer afterwards.
        trial.set_user_attr("best_dir", str(trial_dir))

        return val_accuracy
    finally:
        # Clean up GPU even on prune/error
        try:
            wandb.finish()
        except Exception:
            # Best effort: a failure to close the W&B run must not mask the real outcome.
            pass
        model.to("cpu"); del model, optimizer
        import gc; gc.collect()
        torch.cuda.empty_cache()

In [None]:
  # RoBERTa
#study = optuna.create_study(direction="maximize")
#study.optimize(lambda trial: objective(trial, model_name=MODEL_NAME_ROBERTA, tokenizer=tokenizers[MODEL_NAME_ROBERTA]), n_trials=10)

# Electra
from transformers import set_seed

# Seed the RNGs that set_seed covers and the Optuna sampler so the hyperparameter
# search is repeatable. TPESampler is Optuna's default sampler, so only the seed changes.
SEED = 42
set_seed(SEED)
study_electra = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_electra.optimize(
    lambda trial: objective(trial, model_name=MODEL_NAME_ELECTRA, tokenizer=tokenizers[MODEL_NAME_ELECTRA]),
    n_trials=10,
)

In [None]:
# copytree is not imported anywhere else in the notebook, so bring it into scope here.
from shutil import copytree

# Locate the directory the best trial saved its model/tokenizer to.
best_trial = study_electra.best_trial
best_dir = best_trial.user_attrs.get("best_dir", None)
assert best_dir is not None, "No best_dir found on the best trial. Did you add trial.set_user_attr('best_dir', ...)?"

# Copy it to a stable location; dirs_exist_ok=True makes the cell safe to re-run.
FINAL_DIR = Path("./best_model_autosaved")
copytree(best_dir, FINAL_DIR, dirs_exist_ok=True)
print("✅ Best model copied to:", FINAL_DIR)

## Compression techniques



In [None]:
import os, torch, copy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch import nn
import torch.nn.utils.prune as prune
from torch.utils.data import DataLoader

# Device name shared by the compression and evaluation cells.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Point to your saved checkpoint folder (where config.json + pytorch_model.bin live)
BASE_DIR = "./best_model_autosaved"

# Reload the fine-tuned model (moved to DEVICE) and its tokenizer from BASE_DIR.
model = AutoModelForSequenceClassification.from_pretrained(BASE_DIR)
model = model.to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(BASE_DIR)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_on(df, model, tokenizer, max_length=192, batch_size=64, device=DEVICE, desc="Eval"):
    """Evaluate ``model`` on ``df`` and print accuracy plus a classification report.

    Args:
        df: DataFrame with ``cleaned_tweets`` (text) and ``label`` (int) columns.
        model: Sequence-classification model whose forward output exposes ``.logits``.
            It is moved to ``device`` and put in eval mode.
        tokenizer: Tokenizer called on each text with padding/truncation to ``max_length``.
        max_length: Sequence length every text is padded/truncated to.
        batch_size: Evaluation batch size.
        device: Device the model and batches are moved to.
        desc: Label prefixed to the printed accuracy line.

    Returns:
        None. Results are printed.
    """
    class _EvalTweetDataset(torch.utils.data.Dataset):
        """Tokenizes one row of the evaluated frame per item."""

        def __init__(self, dataframe, tokenizer, max_length=128):
            self.texts = dataframe["cleaned_tweets"].tolist()
            self.labels = dataframe["label"].tolist()
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            enc = self.tokenizer(
                str(self.texts[idx]),  # str() guards against non-string cells
                truncation=True,
                padding="max_length",
                max_length=self.max_length,
                return_tensors="pt",
            )
            return {
                "input_ids": enc["input_ids"][0],
                "attention_mask": enc["attention_mask"][0],
                "labels": torch.tensor(self.labels[idx], dtype=torch.long),
            }

    # Evaluate the frame that was passed in (previously the global test_df_final was
    # always used, so the `df` argument had no effect).
    ds = _EvalTweetDataset(df, tokenizer, max_length=max_length)
    dl = DataLoader(ds, batch_size=batch_size, pin_memory=True)

    model = model.to(device).eval()
    all_y, all_p = [], []
    with torch.no_grad():
        for b in dl:
            ids = b["input_ids"].to(device)
            att = b["attention_mask"].to(device)
            y   = b["labels"].to(device)
            logits = model(ids, attention_mask=att).logits
            p = logits.argmax(dim=1)
            all_y.extend(y.cpu().numpy().tolist())
            all_p.extend(p.cpu().numpy().tolist())

    print(f"{desc} accuracy:", accuracy_score(all_y, all_p))
    print(classification_report(all_y, all_p, digits=4))

In [None]:
def compress_model(base_model):
    """Build three compressed variants of ``base_model`` from independent deep copies.

    Returns:
        dict with keys ``"quantized_cpu"`` (dynamic int8 quantization of nn.Linear, on
        CPU), ``"pruned"`` (global L1 unstructured pruning of nn.Linear weights with
        amount=0.40, on DEVICE) and ``"fp16"`` (half-precision copy on DEVICE).
    """
    # 1) Dynamic Quantization (CPU-only module)
    quantized = torch.quantization.quantize_dynamic(
        copy.deepcopy(base_model).to("cpu"),
        {nn.Linear},
        dtype=torch.qint8,
    )

    # 2) Pruning (unstructured L1 across Linear layers)
    pruned = copy.deepcopy(base_model).to(DEVICE)
    linear_weights = [
        (module, "weight")
        for module in pruned.modules()
        if isinstance(module, nn.Linear)
    ]
    if len(linear_weights) > 0:
        prune.global_unstructured(
            linear_weights,
            pruning_method=prune.L1Unstructured,
            amount=0.40,  # 40% sparsity
        )
        # IMPORTANT: remove the reparametrization so the pruned values live in .weight
        for module, _ in linear_weights:
            try:
                prune.remove(module, "weight")
            except Exception:
                pass

    # 3) FP16 (good for GPU inference)
    half_model = copy.deepcopy(base_model).half().to(DEVICE)

    # quantized is kept in memory here; it is saved later through its state_dict
    return {
        "quantized_cpu": quantized,
        "pruned": pruned,
        "fp16": half_model,
    }

compressed_models = compress_model(model)

In [None]:
# Evaluate the base model and each compressed variant with identical settings.
# Only the quantized model overrides the device, because it is a CPU-only module.
for variant_model, extra_kwargs, variant_label in (
    (model, {}, "Base FP32"),                                                  # Base FP32
    (compressed_models["fp16"], {}, "FP16"),                                   # FP16 (GPU)
    (compressed_models["pruned"], {}, "Pruned"),                               # Pruned (GPU)
    (compressed_models["quantized_cpu"], {"device": "cpu"}, "Quantized CPU"),  # Quantized (CPU)
):
    evaluate_on(test_df_final, variant_model, tokenizer, max_length=192, desc=variant_label, **extra_kwargs)

In [None]:
OUT_DIR = "/content/drive/MyDrive/תואר שני/deep"
os.makedirs(OUT_DIR, exist_ok=True)

# A) FP16 and B) pruned: both are saved with save_pretrained next to a copy of the
# tokenizer (the pruned model works with save_pretrained because prune.remove was applied).
fp16_dir = os.path.join(OUT_DIR, "fp16")
pruned_dir = os.path.join(OUT_DIR, "pruned")
for variant_key, variant_dir in (("fp16", fp16_dir), ("pruned", pruned_dir)):
    os.makedirs(variant_dir, exist_ok=True)
    compressed_models[variant_key].save_pretrained(variant_dir)
    tokenizer.save_pretrained(variant_dir)

# C) Quantized CPU model: saved as a state_dict plus a tiny loader script
q_dir = os.path.join(OUT_DIR, "quantized_cpu")
os.makedirs(q_dir, exist_ok=True)
quantized_state = compressed_models["quantized_cpu"].state_dict()
torch.save(quantized_state, os.path.join(q_dir, "quantized_state_dict.pt"))

# Source of the small loader so the quantized weights can be reloaded later
loader_source = (
    "import torch\n"
    "from torch import nn\n"
    "from transformers import AutoModelForSequenceClassification\n"
    "def load_quantized(model_dir, state_path):\n"
    "    model = AutoModelForSequenceClassification.from_pretrained(model_dir)\n"
    "    model = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)\n"
    "    sd = torch.load(state_path, map_location='cpu')\n"
    "    model.load_state_dict(sd, strict=False)\n"
    "    model.eval()\n"
    "    return model\n"
)
with open(os.path.join(q_dir, "load_quantized.py"), "w") as f:
    f.write(loader_source)
