In [None]:
# ✅ INSTALL DEPENDENCIES
# Colab setup: install `evaluate` and `optuna`, and upgrade `transformers` to the newest release.
# NOTE(review): no versions are pinned, so the environment depends on whatever is current at
# install time; consider `%pip install pkg==x.y.z` once a known-good combination is confirmed.
!pip install evaluate
!pip install optuna
!pip install -U transformers

In [None]:
# ✅ IMPORTS
import os
import re
import wandb
import optuna
import evaluate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AutoConfig,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    default_data_collator
)
from transformers.integrations import WandbCallback
from datasets import Dataset
from datetime import datetime

from optuna import trial


In [None]:
# ✅ MOUNT DRIVE
# Colab only: mount Google Drive at /content/drive, the root of the dataset and output
# paths used later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Authenticate with Weights & Biases before any run is created
# (the Optuna objective opens one W&B run per trial).
import wandb
wandb.login()

In [None]:
# ✅ CONSTANTS
# Checkpoint that is fine-tuned throughout the notebook.
MODEL_NAME_TWITTER = "google/electra-base-discriminator"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Optional: safer SDPA fallback on some Colab combos
if torch.cuda.is_available():
    # `torch.backends.cuda.sdp_kernel(...)` is a context manager: calling it without
    # `with` selects nothing. These setters apply the same backend choice globally
    # (flash and memory-efficient kernels off, math kernel on).
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)


In [None]:
# ✅ LOAD & CLEAN DATA
# Sentiment names listed in class-id order: 0 = "Extremely Negative" ... 4 = "Extremely Positive".
label2id = {
    sentiment: class_id
    for class_id, sentiment in enumerate(
        ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]
    )
}

train_path = "/content/drive/MyDrive/תואר שני/deep/Corona_NLP_train_clean.xls" #Change location where you saved the cleaned data
test_path = "/content/drive/MyDrive/תואר שני/deep/Corona_NLP_test_clean.xls" #Change location where you saved the cleaned data

# Both files are parsed as CSV text (despite the .xls extension). The Sentiment name is
# mapped to its class id, only the text and label columns are kept, and rows without
# text are dropped.
train_df = pd.read_csv(train_path, encoding="ISO-8859-1")
train_df = train_df.assign(label=train_df["Sentiment"].map(label2id))
train_df = train_df.loc[:, ["cleaned_tweets", "label"]].dropna(subset=["cleaned_tweets"])

test_df = pd.read_csv(test_path, encoding="ISO-8859-1")
test_df = test_df.assign(label=test_df["Sentiment"].map(label2id))
test_df = test_df.loc[:, ["cleaned_tweets", "label"]].dropna(subset=["cleaned_tweets"])

def normalize_tweet(t: str) -> str:
    """Replace URLs, @mentions and digit runs with placeholder tokens.

    The input is coerced to ``str`` first. Substitutions run in a fixed order
    (URLs, then mentions, then digits) and the result is stripped of leading
    and trailing whitespace.
    """
    text = str(t)
    substitutions = (
        (r"http\S+", "<url>"),
        (r"@\w+", "<user>"),
        (r"\d+", "<number>"),
    )
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)
    return text.strip()

# Normalise every tweet, then keep only rows whose text is at least 3 characters long.
# New frames are built instead of mutating in place (no `inplace=True`).
train_df, test_df = (
    frame.assign(cleaned_tweets=frame["cleaned_tweets"].astype(str).map(normalize_tweet))
    .loc[lambda kept: kept["cleaned_tweets"].str.len() >= 3]
    for frame in (train_df, test_df)
)

# A Sentiment value absent from label2id maps to NaN; report that explicitly rather than
# letting the integer cast below fail with a generic conversion error.
for split_name, frame in (("train", train_df), ("test", test_df)):
    n_unmapped = int(frame["label"].isna().sum())
    if n_unmapped:
        raise ValueError(
            f"{n_unmapped} {split_name} rows have a Sentiment value that is missing from label2id"
        )

train_df = train_df.assign(label=train_df["label"].astype(int))
test_df = test_df.assign(label=test_df["label"].astype(int))

# Split: stratified 80/20 train/validation; the test file is kept as the final hold-out.
train_df, val_df = train_test_split(
    train_df, test_size=0.2, stratify=train_df["label"], random_state=42
)
test_df_final = test_df.copy()

# Hard checks: required columns exist and every split only contains class ids 0..4.
assert {"cleaned_tweets","label"}.issubset(train_df.columns)
assert train_df["label"].between(0,4).all() and val_df["label"].between(0,4).all()
assert test_df_final["label"].between(0,4).all()


In [None]:
# ✅ TOKENIZER + SPECIAL TOKENS (ADD ONCE, OUTSIDE OBJECTIVE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_TWITTER, use_fast=True)

# Guarantee a pad token exists; tokenization later pads every example to max_length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})

# Register the placeholders produced by normalize_tweet, skipping any already in the vocab.
custom_specials = ["<url>", "<user>", "<number>"]
to_add = list(filter(lambda tok: tok not in tokenizer.get_vocab(), custom_specials))
num_added = (
    tokenizer.add_special_tokens({"additional_special_tokens": to_add}) if to_add else 0
)

print(f"Added {num_added} new tokens: {to_add if to_add else '[]'}")

In [None]:
# ✅ DATASETS
# Wrap each split as a Hugging Face Dataset; indices are reset so the old pandas index
# is not carried along.
train_dataset_hf, val_dataset_hf, test_dataset_hf = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (train_df, val_df, test_df_final)
)

In [None]:
# ✅ METRICS
# Load the four `evaluate` metrics once; compute_metrics reuses them on every evaluation.
acc_metric, f1_metric, prec_metric, rec_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Convert a ``(logits, labels)`` evaluation pair into scalar metrics.

    Predictions are the argmax over the last logits axis. The returned dict has
    the keys ``accuracy``, ``f1_macro``, ``precision_weighted`` and
    ``recall_weighted``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    scored = {"predictions": preds, "references": labels}

    accuracy = acc_metric.compute(**scored)["accuracy"]
    # Macro F1 is reported alongside accuracy for monitoring.
    f1_macro = f1_metric.compute(average="macro", **scored)["f1"]
    # Weighted variants are kept for inspection.
    precision_weighted = prec_metric.compute(average="weighted", **scored)["precision"]
    recall_weighted = rec_metric.compute(average="weighted", **scored)["recall"]

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "precision_weighted": precision_weighted,
        "recall_weighted": recall_weighted,
    }

In [None]:
# ✅ MODEL FACTORY (RESIZES EMBEDDINGS TO MATCH TOKENIZER)
def build_model(dropout: float | None = None):
    """Create a fresh 5-class sequence classifier from ``MODEL_NAME_TWITTER``.

    Args:
        dropout: When given, used for the hidden, attention and classifier
            dropout fields of the config. When ``None`` the checkpoint's own
            dropout settings are left untouched.

    Returns:
        The model, with its embedding matrix resized to ``len(tokenizer)`` and
        ``config.pad_token_id`` aligned with the tokenizer's pad token.
    """
    # Only override dropout when a value was supplied: passing None as a keyword
    # would replace the checkpoint's dropout values with None.
    dropout_overrides = {}
    if dropout is not None:
        dropout_overrides = {
            "hidden_dropout_prob": dropout,
            "attention_probs_dropout_prob": dropout,
            "classifier_dropout": dropout,
        }

    config = AutoConfig.from_pretrained(
        MODEL_NAME_TWITTER,
        num_labels=5,
        problem_type="single_label_classification",
        id2label={0:"Extremely Negative",1:"Negative",2:"Neutral",3:"Positive",4:"Extremely Positive"},
        label2id={"Extremely Negative":0,"Negative":1,"Neutral":2,"Positive":3,"Extremely Positive":4},
        **dropout_overrides,
    )
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_TWITTER, config=config)
    # keep tokenizer/model in sync (because <url>/<user>/<number> were added to the tokenizer)
    model.resize_token_embeddings(len(tokenizer))
    if tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id
    return model

In [None]:
# ✅ OPTUNA OBJECTIVE — logs to Weights & Biases per trial
def objective(trial):
    """Train and evaluate one hyperparameter configuration; return its validation accuracy.

    Samples learning rate, max sequence length, epoch count, dropout,
    early-stopping patience, batch size and weight decay from ``trial``,
    fine-tunes a fresh model from ``build_model`` on the training split and
    evaluates it on the validation split. Each trial is logged as its own W&B
    run, which is closed even when training fails.

    Side effects:
        Writes checkpoints under ``./hf_roberta_optuna_FINAL/<trial number>`` and
        stores the directory holding the trial's model in the trial's
        ``best_checkpoint`` user attribute.

    Returns:
        ``eval_accuracy`` from the final evaluation, the same metric used for
        checkpoint selection.
    """
    import gc
    import wandb

    # ---- Hyperparams to search
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 3e-5, log=True)
    max_length    = trial.suggest_categorical("max_length", [96, 128])
    epochs        = trial.suggest_int("epochs", 3, 4)
    dropout       = trial.suggest_float("dropout", 0.08, 0.18)
    patience      = trial.suggest_int("patience", 2, 4)
    batch_size    = trial.suggest_categorical("batch_size", [16, 32])
    weight_decay  = trial.suggest_categorical("weight_decay", [0.0, 0.01, 0.02])

    # ---- Start a fresh W&B run for this trial
    run = wandb.init(
        project="hf-electra",
        name=f"trial-{trial.number}",
        reinit=True,
        config={
            "learning_rate": learning_rate,
            "max_length": max_length,
            "batch_size": batch_size,
            "patience": patience,
            "epochs": epochs,
            "weight_decay": weight_decay,
            "dropout": dropout,
            "model_name": MODEL_NAME_TWITTER,
        },
    )

    model = trainer = None
    try:
        # ---- Tokenize for this trial's max_length
        def preprocess(examples):
            """Tokenize a batch of tweets, padded/truncated to this trial's max_length."""
            enc = tokenizer(
                examples["cleaned_tweets"],
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )
            enc["labels"] = examples["label"]
            return enc

        print(f"[DEBUG] Trial #{trial.number} → max_length = {max_length}")

        train_tok = train_dataset_hf.map(preprocess, batched=True, remove_columns=train_dataset_hf.column_names)
        val_tok   = val_dataset_hf.map(preprocess,   batched=True, remove_columns=val_dataset_hf.column_names)
        train_tok.set_format(type="torch")
        val_tok.set_format(type="torch")

        # ---- Build model (receives this trial's dropout)
        model = build_model(dropout=dropout)

        # ---- Training args with W&B reporting
        args = TrainingArguments(
            output_dir=f"./hf_roberta_optuna_FINAL/{trial.number}",
            run_name=f"trial-{trial.number}",        # W&B run name
            report_to=["wandb"],                     # enable W&B logging

            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=weight_decay,

            # Evaluate and checkpoint on the same step schedule so the best
            # checkpoint can be restored at the end of training.
            eval_strategy="steps",
            save_strategy="steps",
            eval_steps=300,
            save_steps=300,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,

            save_total_limit=1,                      # keep disk usage tiny
            logging_steps=100,
            seed=42,
            fp16=torch.cuda.is_available(),
        )

        # NOTE(review): newer transformers releases deprecate Trainer's `tokenizer=`
        # argument in favour of `processing_class=` — confirm against the installed version.
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_tok,
            eval_dataset=val_tok,
            tokenizer=tokenizer,
            data_collator=default_data_collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)],
        )

        trainer.train()
        metrics = trainer.evaluate()

        # Save the model currently held by the trainer into output_dir.
        trainer.save_model()

        # Record where this trial's model lives. If no checkpoint was ever selected
        # (best_model_checkpoint is None), fall back to output_dir, which
        # save_model() has just populated, so downstream code never receives None.
        best_ckpt = trainer.state.best_model_checkpoint or args.output_dir
        trial.set_user_attr("best_checkpoint", best_ckpt)

        # Log final eval metrics explicitly too
        wandb.log({
            "final_eval/accuracy": metrics.get("eval_accuracy"),
            "final_eval/f1_macro": metrics.get("eval_f1_macro"),
        })

        # Return the SAME metric used for model selection
        return metrics["eval_accuracy"]

    finally:
        # Ensure the run is closed even if an error occurs
        wandb.finish()
        # Drop this trial's model/trainer and release cached GPU memory so that
        # consecutive trials do not accumulate allocations.
        model = trainer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
# ✅ RUN OPTUNA
# Seed the sampler so the sequence of suggested hyperparameters is reproducible across
# notebook runs. TPESampler is Optuna's default sampler, so only the seed changes.
study = optuna.create_study(
    direction="maximize",
    study_name="hf-robertatwitter-attempt3",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)
print("Best value:", study.best_value)
print("Best params:", study.best_params)

In [None]:
from shutil import copytree
from pathlib import Path

# Look up the checkpoint directory that the winning trial recorded in its user attributes.
best_trial = study.best_trial
best_dir = best_trial.user_attrs.get("best_checkpoint", None)
assert best_dir is not None, "No best_checkpoint found on the best trial."

# Copy it to a stable location; dirs_exist_ok=True lets the copy proceed when the
# target directory already exists.
FINAL_DIR = Path("./best_model_autosaved")
copytree(best_dir, FINAL_DIR, dirs_exist_ok=True)
print("✅ Best model copied to:", FINAL_DIR)

Compression techniques

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os, torch, copy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch import nn
import torch.nn.utils.prune as prune
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


# Pick the inference device. Note that this rebinds DEVICE to a plain string
# (the constants cell defined it as a torch.device).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Point to your saved checkpoint folder (where config.json + pytorch_model.bin live)
BASE_DIR = "./best_model_autosaved"

# Reload the fine-tuned model and its tokenizer from disk; from here on `model` and
# `tokenizer` refer to the saved copies.
model = AutoModelForSequenceClassification.from_pretrained(BASE_DIR).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(BASE_DIR)

In [None]:
def evaluate_on(df, model, tokenizer, max_length=192, batch_size=64, device=DEVICE, desc="Eval"):
    """Print accuracy and a per-class classification report for ``model`` on ``df``.

    Args:
        df: DataFrame with a ``cleaned_tweets`` text column and a ``label`` column
            of class ids.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer used to encode each tweet.
        max_length: Every example is padded/truncated to this many tokens.
        batch_size: Evaluation batch size.
        device: Device the model and batches are moved to.
        desc: Label prefixed to the printed accuracy line.

    Returns:
        None; results are printed.
    """
    class TweetDataset(torch.utils.data.Dataset):
        """Tokenizes one tweet per item and pairs it with its label."""

        def __init__(self, dataframe, tokenizer, max_length=128):
            self.texts = dataframe["cleaned_tweets"].tolist()
            self.labels = dataframe["label"].tolist()
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            enc = self.tokenizer(
                str(self.texts[idx]),
                truncation=True,
                padding="max_length",
                max_length=self.max_length,
                return_tensors="pt",
            )
            # return_tensors="pt" adds a leading batch axis of size 1; index it away.
            return {
                "input_ids": enc["input_ids"][0],
                "attention_mask": enc["attention_mask"][0],
                "labels": torch.tensor(self.labels[idx], dtype=torch.long),
            }

    # Evaluate the frame that was passed in (previously the global test_df_final was
    # always used, so the `df` argument had no effect).
    ds = TweetDataset(df, tokenizer, max_length=max_length)
    dl = DataLoader(ds, batch_size=batch_size, pin_memory=True)

    model = model.to(device).eval()
    all_y, all_p = [], []
    with torch.no_grad():
        for batch in dl:
            ids = batch["input_ids"].to(device)
            att = batch["attention_mask"].to(device)
            logits = model(ids, attention_mask=att).logits
            all_y.extend(batch["labels"].tolist())
            all_p.extend(logits.argmax(dim=1).cpu().tolist())

    print(f"{desc} accuracy:", accuracy_score(all_y, all_p))
    print(classification_report(all_y, all_p, digits=4))

In [None]:
def compressed_models(base_model, prune_amount: float = 0.40):
    """Build three compressed variants of ``base_model``; each works on its own deep copy.

    Args:
        base_model: The trained model to compress.
        prune_amount: Fraction of Linear weights zeroed by global L1 pruning
            (default 0.40, i.e. 40% sparsity).

    Returns:
        Dict with the keys ``"quantized_cpu"`` (dynamic int8 quantization of Linear
        layers, on CPU), ``"pruned"`` (global unstructured L1 pruning, on ``DEVICE``)
        and ``"fp16"`` (half precision, on ``DEVICE``). All models are in eval mode.
    """
    import warnings

    compressed = {}

    # 1) Dynamic Quantization (CPU-only module)
    cpu_model = copy.deepcopy(base_model).to("cpu")
    qmodel = torch.quantization.quantize_dynamic(
        cpu_model,
        {nn.Linear},
        dtype=torch.qint8
    )
    compressed["quantized_cpu"] = qmodel.eval()

    # 2) Pruning (unstructured L1 across Linear layers)
    pruned = copy.deepcopy(base_model).to(DEVICE)
    params_to_prune = [
        (module, "weight") for module in pruned.modules() if isinstance(module, nn.Linear)
    ]

    if params_to_prune:
        prune.global_unstructured(
            params_to_prune,
            pruning_method=prune.L1Unstructured,
            amount=prune_amount,
        )
        # Remove the reparametrization to bake in the zeroed-out weights.
        for module, param_name in params_to_prune:
            try:
                prune.remove(module, param_name)
            except ValueError as exc:
                # Best effort: report the module that could not be finalized instead
                # of silently ignoring it.
                warnings.warn(f"Could not remove pruning reparametrization: {exc}")
    compressed["pruned"] = pruned.eval()

    # 3) FP16 (good for GPU inference)
    half_model = copy.deepcopy(base_model).half().to(DEVICE)
    compressed["fp16"] = half_model.eval()

    return compressed

In [None]:
# Base FP32
evaluate_on(test_df_final, model, tokenizer, max_length=192, desc="Base FP32")

# Get the compressed models dictionary
compressed_models_dict = compressed_models(model)

# Evaluate each variant in turn: FP16 and pruned on the default device, then the
# quantized model explicitly on CPU.
for variant_key, variant_desc, variant_kwargs in (
    ("fp16", "FP16", {}),
    ("pruned", "Pruned", {}),
    ("quantized_cpu", "Quantized CPU", {"device": "cpu"}),
):
    evaluate_on(
        test_df_final,
        compressed_models_dict[variant_key],
        tokenizer,
        max_length=192,
        desc=variant_desc,
        **variant_kwargs,
    )

In [None]:
# Persist the three compressed variants (plus the tokenizer where applicable) under OUT_DIR.
OUT_DIR = "/content/drive/MyDrive/תואר שני/deep" #write the output directory here
os.makedirs(OUT_DIR, exist_ok=True)

# A) Save FP16
fp16_dir = os.path.join(OUT_DIR, "fp16")
os.makedirs(fp16_dir, exist_ok=True)
compressed_models_dict["fp16"].save_pretrained(fp16_dir)
tokenizer.save_pretrained(fp16_dir)

# B) Save pruned (after prune.remove, save_pretrained works)
pruned_dir = os.path.join(OUT_DIR, "pruned")
os.makedirs(pruned_dir, exist_ok=True)
compressed_models_dict["pruned"].save_pretrained(pruned_dir)
tokenizer.save_pretrained(pruned_dir)

# C) Save quantized CPU (state_dict + a tiny loader script)
# Only the state_dict is written here; no config or tokenizer files go into q_dir.
q_dir = os.path.join(OUT_DIR, "quantized_cpu")
os.makedirs(q_dir, exist_ok=True)
torch.save(compressed_models_dict["quantized_cpu"].state_dict(), os.path.join(q_dir, "quantized_state_dict.pt"))

# Save a small loader so future-you can reload easily.
# The generated script rebuilds the architecture from `model_dir`, re-applies dynamic
# quantization to Linear layers, then loads the saved state_dict.
# NOTE(review): the loader passes strict=False, so missing/unexpected keys are not
# reported — confirm this is intended.
with open(os.path.join(q_dir, "load_quantized.py"), "w") as f:
    f.write(
        "import torch\n"
        "from torch import nn\n"
        "from transformers import AutoModelForSequenceClassification\n"
        "def load_quantized(model_dir, state_path):\n"
        "    model = AutoModelForSequenceClassification.from_pretrained(model_dir)\n"
        "    model = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)\n"
        "    sd = torch.load(state_path, map_location='cpu')\n"
        "    model.load_state_dict(sd, strict=False)\n"
        "    model.eval()\n"
        "    return model\n"
    )

print("Saved to:", OUT_DIR)