In [5]:
# ✅ INSTALL DEPENDENCIES
!pip install evaluate
!pip install optuna
!pip install -U transformers



In [6]:
# ✅ IMPORTS
import os
import re
import wandb
import optuna
import evaluate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AutoConfig,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    default_data_collator
)
from transformers.integrations import WandbCallback
from datasets import Dataset
from datetime import datetime

from optuna import trial


In [7]:
# ✅ MOUNT DRIVE
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import wandb
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchen-benm[0m ([33mchen-benm-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [9]:
# ✅ CONSTANTS
MODEL_NAME_TWITTER = "google/electra-base-discriminator"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Optional: safer SDPA fallback on some Colab combos
if torch.cuda.is_available():
    torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True)


  self.gen = func(*args, **kwds)


In [10]:
# ✅ LOAD & CLEAN DATA
label2id = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}

train_path = "/content/drive/MyDrive/תואר שני/deep/Corona_NLP_train_clean.xls" #Change location where you saved the cleaned data
test_path = "/content/drive/MyDrive/תואר שני/deep/Corona_NLP_test_clean.xls" #Change location where you saved the cleaned data

train_df = pd.read_csv(train_path, encoding="ISO-8859-1")
test_df  = pd.read_csv(test_path,  encoding="ISO-8859-1")

train_df["label"] = train_df["Sentiment"].map(label2id)
test_df["label"]  = test_df["Sentiment"].map(label2id)

train_df = train_df[["cleaned_tweets","label"]].dropna(subset=["cleaned_tweets"])
test_df  = test_df[["cleaned_tweets","label"]].dropna(subset=["cleaned_tweets"])

def normalize_tweet(t: str) -> str:
    t = str(t)
    t = re.sub(r"http\S+", "<url>", t)
    t = re.sub(r"@\w+", "<user>", t)
    t = re.sub(r"\d+", "<number>", t)
    return t.strip()

for df in (train_df, test_df):
    df["cleaned_tweets"] = df["cleaned_tweets"].astype(str).map(normalize_tweet)
    df.query("cleaned_tweets.str.len() >= 3", engine="python", inplace=True)

train_df["label"] = train_df["label"].astype(int)
test_df["label"]  = test_df["label"].astype(int)

# Split
train_df, val_df = train_test_split(
    train_df, test_size=0.2, stratify=train_df["label"], random_state=42
)
test_df_final = test_df.copy()

# Hard checks
assert {"cleaned_tweets","label"}.issubset(train_df.columns)
assert train_df["label"].between(0,4).all() and val_df["label"].between(0,4).all()


In [11]:
# ✅ TOKENIZER + SPECIAL TOKENS (ADD ONCE, OUTSIDE OBJECTIVE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_TWITTER, use_fast=True)

# Ensure pad token (usually present for RoBERTa; safe to check)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})

# Add your custom specials only if missing
custom_specials = ["<url>", "<user>", "<number>"]
to_add = [t for t in custom_specials if t not in tokenizer.get_vocab()]
num_added = 0
if to_add:
    num_added = tokenizer.add_special_tokens({"additional_special_tokens": to_add})

print(f"Added {num_added} new tokens: {to_add if to_add else '[]'}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Added 3 new tokens: ['<url>', '<user>', '<number>']


In [12]:
# ✅ DATASETS
train_dataset_hf = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset_hf = Dataset.from_pandas(val_df.reset_index(drop=True))
test_dataset_hf = Dataset.from_pandas(test_df_final.reset_index(drop=True))

In [13]:
# ✅ METRICS
acc_metric = evaluate.load("accuracy")
f1_metric  = evaluate.load("f1")
prec_metric = evaluate.load("precision")
rec_metric  = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": acc_metric.compute(predictions=preds, references=labels)["accuracy"],
        # use macro F1 to match metric_for_best_model
        "f1_macro": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"],
        # keep weighted variants if you like viewing them
        "precision_weighted": prec_metric.compute(predictions=preds, references=labels, average="weighted")["precision"],
        "recall_weighted": rec_metric.compute(predictions=preds, references=labels, average="weighted")["recall"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
# ✅ MODEL FACTORY (RESIZES EMBEDDINGS TO MATCH TOKENIZER)
def build_model(dropout: float | None = None):
    config = AutoConfig.from_pretrained(
        MODEL_NAME_TWITTER,
        num_labels=5,
        problem_type="single_label_classification",
        id2label={0:"Extremely Negative",1:"Negative",2:"Neutral",3:"Positive",4:"Extremely Positive"},
        label2id={"Extremely Negative":0,"Negative":1,"Neutral":2,"Positive":3,"Extremely Positive":4},
        # If dropout is provided from Optuna, apply it to all relevant fields
        hidden_dropout_prob=dropout if dropout is not None else None,
        attention_probs_dropout_prob=dropout if dropout is not None else None,
        classifier_dropout=dropout if dropout is not None else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_TWITTER, config=config)
    # keep tokenizer/model in sync (because you added <url>/<user>/<number>)
    model.resize_token_embeddings(len(tokenizer))
    if tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id
    return model

In [15]:
# ✅ OPTUNA OBJECTIVE — logs to Weights & Biases per trial
def objective(trial):
    import wandb

    # ---- Hyperparams to search (your requested space)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 3e-5, log=True)
    max_length    = trial.suggest_categorical("max_length", [96, 128])
    epochs        = trial.suggest_int("epochs", 3, 4)
    dropout       = trial.suggest_float("dropout", 0.08, 0.18)
    patience = trial.suggest_int("patience", 2, 4)
    batch_size = trial.suggest_categorical("batch_size", [16, 32])
    weight_decay = trial.suggest_categorical("weight_decay", [0.0, 0.01, 0.02])
   # warmup_ratio = trial.suggest_categorical("warmup_ratio", [0.05, 0.1])

    # ---- Start a fresh W&B run for this trial
    run = wandb.init(
        project="hf-electra",
        name=f"trial-{trial.number}",
        reinit=True,
        config={
            "learning_rate": learning_rate,
            "max_length": max_length,
            "batch_size": batch_size,
            "patience": patience,
            "epochs": epochs,
            "weight_decay": weight_decay,
            #"warmup_ratio": warmup_ratio,
            "dropout": dropout,
            "model_name": MODEL_NAME_TWITTER,
        },
    )

    try:
        # ---- Tokenize for this trial's max_length
        def preprocess(examples):
            enc = tokenizer(
                examples["cleaned_tweets"],
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )
            enc["labels"] = examples["label"]
            return enc

        print(f"[DEBUG] Trial #{trial.number} → max_length = {max_length}")


        train_tok = train_dataset_hf.map(preprocess, batched=True, remove_columns=train_dataset_hf.column_names)
        val_tok   = val_dataset_hf.map(preprocess,   batched=True, remove_columns=val_dataset_hf.column_names)
        train_tok.set_format(type="torch")
        val_tok.set_format(type="torch")

        # ---- Build model (now receives dropout)
        model = build_model(dropout=dropout)

        # ---- Training args with W&B reporting
        args = TrainingArguments(
            output_dir=f"./hf_roberta_optuna_FINAL/{trial.number}",
            run_name=f"trial-{trial.number}",        # W&B run name
            report_to=["wandb"],                     # enable W&B logging

            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=weight_decay,

            #FIX: correct argument name
            eval_strategy="steps",
            save_strategy="steps",
            eval_steps=300,
            save_steps=300,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,

            save_total_limit=1,                      # keep disk usage tiny
            logging_steps=100,
            seed=42,
            fp16=torch.cuda.is_available(),
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_tok,
            eval_dataset=val_tok,
            tokenizer=tokenizer,
            data_collator=default_data_collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)],
        )

        trainer.train()
        metrics = trainer.evaluate()

        # record best checkpoint dir for this trial
        best_ckpt = trainer.state.best_model_checkpoint
        trial.set_user_attr("best_checkpoint", best_ckpt)

        # ensure best model (already loaded) is saved in output_dir
        trainer.save_model()

        # Log final eval metrics explicitly too
        wandb.log({
            "final_eval/accuracy": metrics.get("eval_accuracy"),
            "final_eval/f1_macro": metrics.get("eval_f1_macro"),
        })

        # 🔧 Return the SAME metric used for model selection
        return metrics["eval_accuracy"]

    finally:
        # Ensure the run is closed even if an error occurs
        wandb.finish()

In [16]:
# ✅ RUN OPTUNA
study = optuna.create_study(direction="maximize", study_name="hf-robertatwitter-attempt3")
study.optimize(objective, n_trials=10)
print("Best value:", study.best_value)
print("Best params:", study.best_params)

[I 2025-08-21 08:38:24,770] A new study created in memory with name: hf-robertatwitter-attempt3


[DEBUG] Trial #0 → max_length = 96


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.4069,1.347675,0.40456,0.358739,0.497156,0.40456
600,1.1989,1.168225,0.506354,0.514336,0.520789,0.506354
900,1.0842,1.092004,0.540743,0.550894,0.552051,0.540743
1200,1.0286,1.058572,0.566658,0.57523,0.582312,0.566658
1500,0.9773,1.047238,0.573386,0.583718,0.590887,0.573386
1800,0.9276,1.088254,0.56778,0.577638,0.589815,0.56778
2100,0.8907,0.951887,0.624844,0.633762,0.634705,0.624844
2400,0.8961,1.001153,0.61089,0.62282,0.624478,0.61089
2700,0.8679,1.038649,0.59706,0.6071,0.615798,0.59706
3000,0.8241,0.994425,0.624969,0.637631,0.639536,0.624969


0,1
eval/accuracy,▁▄▄▅▅▅▇▆▆▇▇▇▇▇█▇▇█▇█
eval/f1_macro,▁▄▅▆▆▆▇▇▆▇█▇▇██▇▇█▇█
eval/loss,█▅▄▄▃▄▂▃▃▃▂▃▂▂▁▂▃▂▂▁
eval/precision_weighted,▁▂▃▄▅▅▆▆▆▆▇▇▇▇█▇▇█▇█
eval/recall_weighted,▁▄▄▅▅▅▇▆▆▇▇▇▇▇█▇▇█▇█
eval/runtime,█▁▅▂▂▂▅█▆▆▅▄▄▃▄▄▃▅▅▆
eval/samples_per_second,▁█▄▇▇▆▄▁▃▃▄▄▅▅▅▅▆▄▄▃
eval/steps_per_second,▁█▄▇▇▆▄▁▃▃▄▄▅▅▅▅▆▄▄▃
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.67705
eval/f1_macro,0.68677
eval/loss,0.88478
eval/precision_weighted,0.68132
eval/recall_weighted,0.67705
eval/runtime,8.3156
eval/samples_per_second,965.175
eval/steps_per_second,60.369
final_eval/accuracy,0.67705
final_eval/f1_macro,0.68677


[I 2025-08-21 08:49:16,974] Trial 0 finished with value: 0.6770495888362821 and parameters: {'learning_rate': 1.2612688241920591e-05, 'max_length': 96, 'epochs': 3, 'dropout': 0.1716848387041739, 'patience': 4, 'batch_size': 16, 'weight_decay': 0.01}. Best is trial 0 with value: 0.6770495888362821.


[DEBUG] Trial #1 → max_length = 96


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.2724,1.261908,0.47371,0.485226,0.509774,0.47371
600,1.0402,1.037554,0.580488,0.593444,0.59132,0.580488
900,0.9641,0.921212,0.641914,0.653684,0.650698,0.641914
1200,0.9188,0.886674,0.665462,0.675667,0.672637,0.665462
1500,0.9075,0.870441,0.670446,0.679645,0.673181,0.670446
1800,0.8473,0.917741,0.647271,0.65911,0.657827,0.647271
2100,0.7636,0.868712,0.686643,0.695401,0.689017,0.686643
2400,0.7888,0.88305,0.679541,0.689603,0.682001,0.679541
2700,0.7457,0.873799,0.678919,0.688084,0.685245,0.678919
3000,0.7143,0.900804,0.690755,0.700915,0.695794,0.690755


0,1
eval/accuracy,▁▄▆▆▇▆▇▇▇▇▇█████████
eval/f1_macro,▁▄▆▆▇▆▇▇▇▇▇█████████
eval/loss,█▅▃▂▂▃▂▂▂▂▂▁▁▂▁▁▂▁▂▁
eval/precision_weighted,▁▄▆▆▆▆▇▇▇▇▇██▇██████
eval/recall_weighted,▁▄▆▆▇▆▇▇▇▇▇█████████
eval/runtime,▄▁▂▁▁▄▂▂█▂▄▅▂▅▃▅▃▄▂▃
eval/samples_per_second,▅▇▇██▅▇▇▁▇▅▄▇▄▆▄▆▅▇▆
eval/steps_per_second,▅▇▇██▅▇▇▁▇▅▄▇▄▆▄▆▅▇▆
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.72265
eval/f1_macro,0.7314
eval/loss,0.81142
eval/precision_weighted,0.7237
eval/recall_weighted,0.72265
eval/runtime,8.3206
eval/samples_per_second,964.599
eval/steps_per_second,60.333
final_eval/accuracy,0.72265
final_eval/f1_macro,0.7314


[I 2025-08-21 09:00:00,041] Trial 1 finished with value: 0.722651383005233 and parameters: {'learning_rate': 2.795687090878637e-05, 'max_length': 96, 'epochs': 3, 'dropout': 0.12261239119691239, 'patience': 4, 'batch_size': 16, 'weight_decay': 0.0}. Best is trial 1 with value: 0.722651383005233.


[DEBUG] Trial #2 → max_length = 128


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.3762,1.374841,0.415774,0.401896,0.49364,0.415774
600,1.0756,1.112051,0.54149,0.548235,0.557087,0.54149
900,0.9987,0.949677,0.629205,0.638526,0.647196,0.629205
1200,0.9402,0.94633,0.642911,0.65267,0.653225,0.642911
1500,0.9232,0.97791,0.621106,0.631813,0.628714,0.621106
1800,0.87,1.013313,0.623598,0.636336,0.633476,0.623598
2100,0.8115,0.885529,0.673187,0.682454,0.676249,0.673187
2400,0.8251,0.917264,0.661974,0.671859,0.666477,0.661974
2700,0.7802,0.9127,0.663344,0.673698,0.671331,0.663344
3000,0.7679,0.928091,0.662347,0.670912,0.673412,0.662347


0,1
eval/accuracy,▁▄▇▇▇▇█████
eval/f1_macro,▁▅▇▇▇▇█████
eval/loss,█▄▂▂▂▃▁▁▁▂▁
eval/precision_weighted,▁▃▇▇▆▆█████
eval/recall_weighted,▁▄▇▇▇▇█████
eval/runtime,█▂▃▃▆▄█▃▆▄▁
eval/samples_per_second,▁▇▆▆▃▅▁▆▃▅█
eval/steps_per_second,▁▇▆▆▃▅▁▆▃▅█
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.67319
eval/f1_macro,0.68245
eval/loss,0.88553
eval/precision_weighted,0.67625
eval/recall_weighted,0.67319
eval/runtime,8.8845
eval/samples_per_second,903.37
eval/steps_per_second,56.503
final_eval/accuracy,0.67319
final_eval/f1_macro,0.68245


[I 2025-08-21 09:06:03,562] Trial 2 finished with value: 0.6731871417891852 and parameters: {'learning_rate': 1.6882760793122672e-05, 'max_length': 128, 'epochs': 4, 'dropout': 0.13519042469057574, 'patience': 3, 'batch_size': 16, 'weight_decay': 0.02}. Best is trial 1 with value: 0.722651383005233.


[DEBUG] Trial #3 → max_length = 96


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.3811,1.431227,0.336033,0.294171,0.31938,0.336033
600,1.1105,1.138462,0.521555,0.528229,0.542915,0.521555
900,1.0228,0.996522,0.588338,0.600228,0.601283,0.588338
1200,0.9563,0.944416,0.633691,0.644677,0.642732,0.633691
1500,0.9355,0.925622,0.638425,0.649544,0.643971,0.638425
1800,0.8899,1.002841,0.610765,0.623344,0.626702,0.610765
2100,0.8117,0.898723,0.663718,0.673975,0.668207,0.663718
2400,0.8413,0.908866,0.662597,0.671974,0.66703,0.662597
2700,0.8012,0.944477,0.648144,0.658575,0.660013,0.648144
3000,0.7791,0.945579,0.649763,0.660825,0.661728,0.649763


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁▅▆▇▇▇█████
eval/f1_macro,▁▅▇▇█▇█████
eval/loss,█▄▂▂▁▂▁▁▂▂▁
eval/precision_weighted,▁▅▇▇█▇█████
eval/recall_weighted,▁▅▆▇▇▇█████
eval/runtime,▂▄▂▁▇▆▇▆█▂▆
eval/samples_per_second,▇▅▆█▂▃▂▃▁▇▃
eval/steps_per_second,▇▅▆█▂▃▂▃▁▇▃
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.66372
eval/f1_macro,0.67397
eval/loss,0.89872
eval/precision_weighted,0.66821
eval/recall_weighted,0.66372
eval/runtime,8.3706
eval/samples_per_second,958.832
eval/steps_per_second,59.972
final_eval/accuracy,0.66372
final_eval/f1_macro,0.67397


[I 2025-08-21 09:11:53,217] Trial 3 finished with value: 0.6637179167704959 and parameters: {'learning_rate': 1.5293400640517605e-05, 'max_length': 96, 'epochs': 3, 'dropout': 0.1482503684458641, 'patience': 3, 'batch_size': 16, 'weight_decay': 0.02}. Best is trial 1 with value: 0.722651383005233.


[DEBUG] Trial #4 → max_length = 128


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.3514,1.276059,0.451408,0.458409,0.485041,0.451408
600,1.0876,1.176451,0.494892,0.510299,0.527474,0.494892
900,1.0225,0.97467,0.615001,0.627753,0.620844,0.615001
1200,0.9548,0.943204,0.624595,0.636584,0.629121,0.624595
1500,0.9254,0.942079,0.630077,0.6417,0.638129,0.630077
1800,0.8881,0.977525,0.619985,0.632072,0.633609,0.619985
2100,0.8294,0.862222,0.68004,0.689983,0.682792,0.68004
2400,0.8454,0.925456,0.657862,0.667701,0.663516,0.657862
2700,0.7868,0.963992,0.643534,0.656014,0.654572,0.643534
3000,0.7762,0.930708,0.659482,0.669847,0.669108,0.659482


0,1
eval/accuracy,▁▂▆▆▆▆█▇▇▇▇█
eval/f1_macro,▁▃▆▆▇▆█▇▇▇▇█
eval/loss,█▆▃▂▂▃▁▂▃▂▃▁
eval/precision_weighted,▁▃▆▆▆▆█▇▇█▇█
eval/recall_weighted,▁▂▆▆▆▆█▇▇▇▇█
eval/runtime,█▂▂▄▃▃▂▅▁▆▂▂
eval/samples_per_second,▁▇▇▅▆▆▇▄█▃▇▇
eval/steps_per_second,▁▇▇▅▆▆▇▄█▃▇▇
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.68004
eval/f1_macro,0.68998
eval/loss,0.86222
eval/precision_weighted,0.68279
eval/recall_weighted,0.68004
eval/runtime,8.9555
eval/samples_per_second,896.213
eval/steps_per_second,56.055
final_eval/accuracy,0.68004
final_eval/f1_macro,0.68998


[I 2025-08-21 09:18:33,488] Trial 4 finished with value: 0.6800398704211313 and parameters: {'learning_rate': 2.5700281758605587e-05, 'max_length': 128, 'epochs': 4, 'dropout': 0.15400119972748433, 'patience': 4, 'batch_size': 16, 'weight_decay': 0.02}. Best is trial 1 with value: 0.722651383005233.


[DEBUG] Trial #5 → max_length = 128


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.4505,1.431319,0.35385,0.329896,0.438336,0.35385
600,1.1734,1.21536,0.459631,0.459545,0.497473,0.459631
900,1.0804,1.046655,0.562796,0.574699,0.572393,0.562796
1200,1.0188,1.056163,0.568029,0.577768,0.583935,0.568029
1500,0.9687,1.034045,0.58622,0.597986,0.599147,0.58622
1800,0.9202,0.984443,0.615998,0.629002,0.622844,0.615998
2100,0.8632,0.93531,0.640917,0.651288,0.647059,0.640917
2400,0.8829,1.011742,0.615873,0.625972,0.630094,0.615873
2700,0.83,1.019314,0.615749,0.625195,0.633711,0.615749


0,1
eval/accuracy,▁▄▆▆▇▇█▇▇█
eval/f1_macro,▁▄▆▆▇██▇▇█
eval/loss,█▅▃▃▂▂▁▂▂▁
eval/precision_weighted,▁▃▅▆▆▇█▇██
eval/recall_weighted,▁▄▆▆▇▇█▇▇█
eval/runtime,█▁▃▃▃▃▃▅▆▅
eval/samples_per_second,▁█▆▆▆▅▆▄▃▄
eval/steps_per_second,▁█▆▆▆▅▆▄▃▄
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.64092
eval/f1_macro,0.65129
eval/loss,0.93531
eval/precision_weighted,0.64706
eval/recall_weighted,0.64092
eval/runtime,8.9647
eval/samples_per_second,895.289
eval/steps_per_second,55.997
final_eval/accuracy,0.64092
final_eval/f1_macro,0.65129


[I 2025-08-21 09:24:01,941] Trial 5 finished with value: 0.6409170196860204 and parameters: {'learning_rate': 1.636786975440083e-05, 'max_length': 128, 'epochs': 3, 'dropout': 0.17978561023304188, 'patience': 2, 'batch_size': 16, 'weight_decay': 0.01}. Best is trial 1 with value: 0.722651383005233.


[DEBUG] Trial #6 → max_length = 96


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.2117,1.133281,0.521306,0.530902,0.522882,0.521306
600,0.9928,1.051533,0.581111,0.594511,0.602365,0.581111
900,0.9345,0.973554,0.614129,0.626815,0.623854,0.614129
1200,0.861,0.921648,0.646399,0.658541,0.651286,0.646399
1500,0.7992,0.857665,0.684401,0.694247,0.686618,0.684401
1800,0.814,0.898756,0.666584,0.676112,0.677026,0.666584
2100,0.7497,0.83984,0.700723,0.710098,0.703776,0.700723
2400,0.7538,0.863669,0.68926,0.697878,0.692643,0.68926
2700,0.7053,0.856694,0.692001,0.701891,0.694877,0.692001
3000,0.7273,0.865241,0.690381,0.700659,0.693304,0.690381


0,1
eval/accuracy,▁▃▅▆▇▇█████
eval/f1_macro,▁▃▅▆▇▇█████
eval/loss,█▆▄▃▁▂▁▂▁▂▁
eval/precision_weighted,▁▄▅▆▇▇█████
eval/recall_weighted,▁▃▅▆▇▇█████
eval/runtime,█▂▃▃▄▃▃▃▁▁▁
eval/samples_per_second,▁▇▆▆▅▆▆▆███
eval/steps_per_second,▁▇▆▆▅▆▆▆███
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.70072
eval/f1_macro,0.7101
eval/loss,0.83984
eval/precision_weighted,0.70378
eval/recall_weighted,0.70072
eval/runtime,6.2186
eval/samples_per_second,1290.647
eval/steps_per_second,40.363
final_eval/accuracy,0.70072
final_eval/f1_macro,0.7101


[I 2025-08-21 09:30:28,455] Trial 6 finished with value: 0.7007226513830053 and parameters: {'learning_rate': 1.743493448310708e-05, 'max_length': 96, 'epochs': 3, 'dropout': 0.14322885794260898, 'patience': 4, 'batch_size': 32, 'weight_decay': 0.02}. Best is trial 1 with value: 0.722651383005233.


[DEBUG] Trial #7 → max_length = 96


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.1992,1.142856,0.522178,0.530846,0.535171,0.522178
600,0.9917,1.147095,0.529031,0.540346,0.567866,0.529031
900,0.9368,1.011207,0.606903,0.618148,0.623995,0.606903
1200,0.8615,0.96549,0.632445,0.644824,0.643977,0.632445
1500,0.7997,0.942235,0.647396,0.656699,0.660211,0.647396
1800,0.8104,0.954085,0.647022,0.656653,0.664672,0.647022
2100,0.7397,0.884595,0.682781,0.692827,0.689688,0.682781
2400,0.7455,0.890738,0.68004,0.688441,0.687315,0.68004
2700,0.6935,0.894576,0.679168,0.68863,0.686281,0.679168
3000,0.7185,0.914191,0.672938,0.682513,0.682034,0.672938


0,1
eval/accuracy,▁▁▅▆▆▆█████
eval/f1_macro,▁▁▅▆▆▆█████
eval/loss,██▄▃▃▃▁▁▁▂▁
eval/precision_weighted,▁▂▅▆▇▇█████
eval/recall_weighted,▁▁▅▆▆▆█████
eval/runtime,█▄▂▅▂▄▃▂▃▁▁
eval/samples_per_second,▁▅▇▄▇▅▆▇▆██
eval/steps_per_second,▁▅▇▄▇▅▆▇▆██
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.68278
eval/f1_macro,0.69283
eval/loss,0.8846
eval/precision_weighted,0.68969
eval/recall_weighted,0.68278
eval/runtime,6.2405
eval/samples_per_second,1286.118
eval/steps_per_second,40.221
final_eval/accuracy,0.68278
final_eval/f1_macro,0.69283


[I 2025-08-21 09:36:53,497] Trial 7 finished with value: 0.6827809618739098 and parameters: {'learning_rate': 2.406015618061144e-05, 'max_length': 96, 'epochs': 3, 'dropout': 0.17145024519350027, 'patience': 3, 'batch_size': 32, 'weight_decay': 0.01}. Best is trial 1 with value: 0.722651383005233.


[DEBUG] Trial #8 → max_length = 128


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.2342,1.147036,0.525417,0.534558,0.530473,0.525417
600,1.0117,1.024984,0.586469,0.597691,0.599024,0.586469
900,0.9364,0.937768,0.629454,0.642058,0.637327,0.629454
1200,0.8675,0.940564,0.633441,0.645524,0.640785,0.633441
1500,0.8028,0.859582,0.677423,0.686538,0.681643,0.677423
1800,0.8067,0.868243,0.678545,0.688662,0.686029,0.678545
2100,0.7589,0.852293,0.693247,0.70272,0.696456,0.693247
2400,0.7518,0.84162,0.699352,0.707506,0.701887,0.699352
2700,0.704,0.825292,0.70471,0.713877,0.705722,0.70471
3000,0.7172,0.847971,0.694244,0.70417,0.698394,0.694244


0,1
eval/accuracy,▁▃▅▅▇▇████████
eval/f1_macro,▁▃▅▅▇▇█████▇██
eval/loss,█▅▃▄▂▂▂▁▁▁▁▂▁▁
eval/precision_weighted,▁▄▅▅▇▇████████
eval/recall_weighted,▁▃▅▅▇▇████████
eval/runtime,█▃▃▂▂▂▃▂▂▂▂▂▂▁
eval/samples_per_second,▁▆▆▇▇▇▆▇▇▇▇▇▇█
eval/steps_per_second,▁▆▆▇▇▇▆▇▇▇▇▇▇█
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.70533
eval/f1_macro,0.71437
eval/loss,0.83502
eval/precision_weighted,0.70583
eval/recall_weighted,0.70533
eval/runtime,8.5246
eval/samples_per_second,941.508
eval/steps_per_second,29.444
final_eval/accuracy,0.70533
final_eval/f1_macro,0.71437


[I 2025-08-21 09:47:32,631] Trial 8 finished with value: 0.7053326688263145 and parameters: {'learning_rate': 1.2789470297634006e-05, 'max_length': 128, 'epochs': 4, 'dropout': 0.10913962878608531, 'patience': 2, 'batch_size': 32, 'weight_decay': 0.01}. Best is trial 1 with value: 0.722651383005233.


[DEBUG] Trial #9 → max_length = 96


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.493,1.400614,0.362322,0.31206,0.446581,0.362322
600,1.1185,1.181964,0.492275,0.508049,0.5327,0.492275
900,0.997,1.011993,0.593571,0.605991,0.602937,0.593571
1200,0.9144,1.095688,0.566035,0.579269,0.589143,0.566035
1500,0.8338,0.934011,0.648019,0.659615,0.654451,0.648019
1800,0.8347,0.996795,0.617618,0.628404,0.642013,0.617618
2100,0.7615,0.887439,0.68141,0.691679,0.686522,0.68141
2400,0.772,0.942976,0.657862,0.666786,0.671166,0.657862
2700,0.7265,0.905586,0.675928,0.685064,0.684685,0.675928
3000,0.7334,0.959002,0.654373,0.664248,0.668479,0.654373


0,1
eval/accuracy,▁▄▆▅▇▇█▇█▇█
eval/f1_macro,▁▅▆▆▇▇███▇█
eval/loss,█▅▃▄▂▂▁▂▁▂▁
eval/precision_weighted,▁▄▆▅▇▇███▇█
eval/recall_weighted,▁▄▆▅▇▇█▇█▇█
eval/runtime,█▁▂▃▂▁▃▄▃▄▃
eval/samples_per_second,▁█▇▆▇█▆▅▆▅▆
eval/steps_per_second,▁█▇▆▇█▆▅▆▅▆
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.68141
eval/f1_macro,0.69168
eval/loss,0.88744
eval/precision_weighted,0.68652
eval/recall_weighted,0.68141
eval/runtime,6.2148
eval/samples_per_second,1291.442
eval/steps_per_second,40.388
final_eval/accuracy,0.68141
final_eval/f1_macro,0.69168


[I 2025-08-21 09:53:56,154] Trial 9 finished with value: 0.6814104161475205 and parameters: {'learning_rate': 2.3213404368784804e-05, 'max_length': 96, 'epochs': 4, 'dropout': 0.1773852280121806, 'patience': 3, 'batch_size': 32, 'weight_decay': 0.0}. Best is trial 1 with value: 0.722651383005233.


Best value: 0.722651383005233
Best params: {'learning_rate': 2.795687090878637e-05, 'max_length': 96, 'epochs': 3, 'dropout': 0.12261239119691239, 'patience': 4, 'batch_size': 16, 'weight_decay': 0.0}


In [18]:
from shutil import copytree
from pathlib import Path

best_trial = study.best_trial
best_dir = best_trial.user_attrs.get("best_checkpoint", None)
assert best_dir is not None, "No best_checkpoint found on the best trial."

FINAL_DIR = Path("./best_model_autosaved")
copytree(best_dir, FINAL_DIR, dirs_exist_ok=True)
print("✅ Best model copied to:", FINAL_DIR)

✅ Best model copied to: best_model_autosaved


Compression techniques

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os, torch, copy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch import nn
import torch.nn.utils.prune as prune
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Point to your saved checkpoint folder (where config.json + pytorch_model.bin live)
BASE_DIR = "./best_model_autosaved"

model = AutoModelForSequenceClassification.from_pretrained(BASE_DIR).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(BASE_DIR)

In [20]:
def evaluate_on(df, model, tokenizer, max_length=192, batch_size=64, device=DEVICE, desc="Eval"):
    class TweetDataset(torch.utils.data.Dataset):
        def __init__(self, dataframe, tokenizer, max_length=128):
            self.texts = dataframe["cleaned_tweets"].tolist()
            self.labels = dataframe["label"].tolist()
            self.tokenizer = tokenizer
            self.max_length = max_length
        def __len__(self): return len(self.texts)
        def __getitem__(self, idx):
            enc = self.tokenizer(
                str(self.texts[idx]),
                truncation=True,
                padding="max_length",
                max_length=self.max_length,
                return_tensors="pt",
            )
            return {
                "input_ids": enc["input_ids"][0],
                "attention_mask": enc["attention_mask"][0],
                "labels": torch.tensor(self.labels[idx], dtype=torch.long),
            }

    ds = TweetDataset(test_df_final, tokenizer, max_length=max_length)
    dl = DataLoader(ds, batch_size=batch_size, pin_memory=True)

    model = model.to(device).eval()
    all_y, all_p = [], []
    with torch.no_grad():
        for b in dl:
            ids = b["input_ids"].to(device)
            att = b["attention_mask"].to(device)
            y   = b["labels"].to(device)
            logits = model(ids, attention_mask=att).logits
            p = logits.argmax(dim=1)
            all_y.extend(y.cpu().numpy().tolist())
            all_p.extend(p.cpu().numpy().tolist())

    print(f"{desc} accuracy:", accuracy_score(all_y, all_p))
    print(classification_report(all_y, all_p, digits=4))

In [21]:
def compressed_models(base_model):
    compressed = {}

    # 1) Dynamic Quantization (CPU-only module)
    cpu_model = copy.deepcopy(base_model).to("cpu")
    qmodel = torch.quantization.quantize_dynamic(
        cpu_model,
        {nn.Linear},
        dtype=torch.qint8
    )
    qmodel = qmodel.eval()  # ✅ Set to eval mode
    compressed["quantized_cpu"] = qmodel

    # 2) Pruning (unstructured L1 across Linear layers)
    pruned = copy.deepcopy(base_model).to(DEVICE)
    params_to_prune = []
    for m in pruned.modules():
        if isinstance(m, nn.Linear):
            params_to_prune.append((m, "weight"))

    if len(params_to_prune) > 0:
        prune.global_unstructured(
            params_to_prune,
            pruning_method=prune.L1Unstructured,
            amount=0.40,  # 40% sparsity
        )
        # Remove reparametrization to bake in the zeroed-out weights
        for (m, _) in params_to_prune:
            try:
                prune.remove(m, "weight")
            except Exception:
                pass
    pruned = pruned.eval()  # ✅ Set to eval mode
    compressed["pruned"] = pruned

    # 3) FP16 (good for GPU inference)
    half_model = copy.deepcopy(base_model).half().to(DEVICE)
    half_model = half_model.eval()  # ✅ Set to eval mode
    compressed["fp16"] = half_model

    return compressed

In [22]:
# Base FP32
evaluate_on(test_df_final, model, tokenizer, max_length=192, desc="Base FP32")

# Get the compressed models dictionary
compressed_models_dict = compressed_models(model)

# FP16 (GPU)
evaluate_on(test_df_final, compressed_models_dict["fp16"], tokenizer, max_length=192, desc="FP16")

# Pruned (GPU)
evaluate_on(test_df_final, compressed_models_dict["pruned"], tokenizer, max_length=192, desc="Pruned")

# Quantized (CPU)
evaluate_on(test_df_final, compressed_models_dict["quantized_cpu"], tokenizer, max_length=192, device="cpu", desc="Quantized CPU")

Base FP32 accuracy: 0.6930979978925185
              precision    recall  f1-score   support

           0     0.7057    0.7534    0.7288       592
           1     0.6351    0.6638    0.6491      1041
           2     0.7632    0.7472    0.7551       617
           3     0.6625    0.6199    0.6405       947
           4     0.7611    0.7446    0.7527       599

    accuracy                         0.6931      3796
   macro avg     0.7055    0.7058    0.7052      3796
weighted avg     0.6937    0.6931    0.6930      3796



For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  qmodel = torch.quantization.quantize_dynamic(


FP16 accuracy: 0.6936248682824026
              precision    recall  f1-score   support

           0     0.7068    0.7534    0.7294       592
           1     0.6364    0.6657    0.6507      1041
           2     0.7645    0.7472    0.7557       617
           3     0.6625    0.6199    0.6405       947
           4     0.7598    0.7446    0.7521       599

    accuracy                         0.6936      3796
   macro avg     0.7060    0.7061    0.7057      3796
weighted avg     0.6942    0.6936    0.6935      3796

Pruned accuracy: 0.5956269757639621
              precision    recall  f1-score   support

           0     0.8317    0.4257    0.5631       592
           1     0.5154    0.8367    0.6379      1041
           2     0.7418    0.6937    0.7169       617
           3     0.5251    0.5966    0.5586       947
           4     0.9667    0.2421    0.3872       599

    accuracy                         0.5956      3796
   macro avg     0.7161    0.5589    0.5727      3796
weighte

In [23]:
OUT_DIR = "/content/drive/MyDrive/תואר שני/deep" #write the output directory here
os.makedirs(OUT_DIR, exist_ok=True)

# A) Save FP16
fp16_dir = os.path.join(OUT_DIR, "fp16")
os.makedirs(fp16_dir, exist_ok=True)
compressed_models_dict["fp16"].save_pretrained(fp16_dir)
tokenizer.save_pretrained(fp16_dir)

# B) Save pruned (after prune.remove, save_pretrained works)
pruned_dir = os.path.join(OUT_DIR, "pruned")
os.makedirs(pruned_dir, exist_ok=True)
compressed_models_dict["pruned"].save_pretrained(pruned_dir)
tokenizer.save_pretrained(pruned_dir)

# C) Save quantized CPU (state_dict + a tiny loader script)
q_dir = os.path.join(OUT_DIR, "quantized_cpu")
os.makedirs(q_dir, exist_ok=True)
torch.save(compressed_models_dict["quantized_cpu"].state_dict(), os.path.join(q_dir, "quantized_state_dict.pt"))

# Save a small loader so future-you can reload easily
with open(os.path.join(q_dir, "load_quantized.py"), "w") as f:
    f.write(
        "import torch\n"
        "from torch import nn\n"
        "from transformers import AutoModelForSequenceClassification\n"
        "def load_quantized(model_dir, state_path):\n"
        "    model = AutoModelForSequenceClassification.from_pretrained(model_dir)\n"
        "    model = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)\n"
        "    sd = torch.load(state_path, map_location='cpu')\n"
        "    model.load_state_dict(sd, strict=False)\n"
        "    model.eval()\n"
        "    return model\n"
    )

print("Saved to:", OUT_DIR)

Saved to: /content/drive/MyDrive/תואר שני/deep
