In [7]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AdamW,
)


In [8]:

# Load data
df = pd.read_csv("/home/liorkob/M.Sc/thesis/data/drugs_3k/tagged_punishment_range_sentences.csv")
texts = df["text"].astype(str).tolist()

# Load HeBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")

# Tokenize and get lengths
token_lengths = [len(tokenizer.encode(t, truncation=False)) for t in texts]

# Stats
max_len = max(token_lengths)
min_len = min(token_lengths)
avg_len = sum(token_lengths) / len(token_lengths)

print(f"📏 Max tokens: {max_len}")
print(f"📉 Min tokens: {min_len}")
print(f"📊 Avg tokens: {avg_len:.2f}")


📏 Max tokens: 1287
📉 Min tokens: 7
📊 Avg tokens: 88.76


In [12]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AdamW,
)

# Load data
df = pd.read_csv("/home/liorkob/M.Sc/thesis/data/drugs_3k/tagged_punishment_range_sentences.csv")
texts = df["text"].astype(str).tolist()
labels = df["Tag"].astype(int).tolist()


train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42
)


# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Dataset
class PunishmentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = PunishmentDataset(train_encodings, train_labels)
val_dataset = PunishmentDataset(val_encodings, val_labels)

# Grid setup
optimizers = {
    "adamw": lambda model, lr: AdamW(model.parameters(), lr=lr),
    "adam": lambda model, lr: torch.optim.Adam(model.parameters(), lr=lr),
    "sgd": lambda model, lr: torch.optim.SGD(model.parameters(), lr=lr),
    "adagrad": lambda model, lr: torch.optim.Adagrad(model.parameters(), lr=lr),
}
epochs_list = [2, 3, 4]
learning_rates = [5e-5, 3e-5, 1e-5]

# Best tracker
best_f1 = 0
best_model_info = ""

def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    f1 = f1_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    return {
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# Grid search
for opt_name, opt_fn in optimizers.items():
    for lr in learning_rates:
        for epochs in epochs_list:
            print(f"\n🔍 Trying: {opt_name}, LR={lr}, Epochs={epochs}")
            
            model = AutoModelForSequenceClassification.from_pretrained("avichr/heBERT", num_labels=2)
            optimizer = opt_fn(model, lr)
            
            args = TrainingArguments(
                output_dir=f"./results_{opt_name}_{lr}_{epochs}",
                num_train_epochs=epochs,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=8,
                evaluation_strategy="epoch",
                save_strategy="no",
                logging_strategy="no",
                report_to="none"
            )

            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=compute_metrics,
                optimizers=(optimizer, None)
            )

            trainer.train()
            metrics = trainer.evaluate()
            f1, precision, recall = metrics["eval_f1"], metrics["eval_precision"], metrics["eval_recall"]

            print(f"✅ F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

            if f1 > best_f1:
                best_f1 = f1
                best_model_info = f"{opt_name} | LR={lr} | Epochs={epochs} | F1={f1:.4f} | Precision={precision:.4f} | Recall={recall:.4f}"
                model.save_pretrained("best_model")
                tokenizer.save_pretrained("best_model")

print(f"\n🏆 Best model:\n{best_model_info}")




🔍 Trying: adamw, LR=5e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.207119,0.905109,0.939394,0.873239
2,No log,0.190336,0.930556,0.917808,0.943662


✅ F1: 0.9306 | Precision: 0.9178 | Recall: 0.9437

🔍 Trying: adamw, LR=5e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.180717,0.913043,0.940299,0.887324
2,No log,0.19305,0.938776,0.907895,0.971831
3,No log,0.202528,0.930556,0.917808,0.943662


✅ F1: 0.9306 | Precision: 0.9178 | Recall: 0.9437

🔍 Trying: adamw, LR=5e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.205849,0.906475,0.926471,0.887324
2,No log,0.192692,0.92517,0.894737,0.957746
3,No log,0.184062,0.923077,0.916667,0.929577
4,No log,0.177948,0.930556,0.917808,0.943662


✅ F1: 0.9306 | Precision: 0.9178 | Recall: 0.9437

🔍 Trying: adamw, LR=3e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.219205,0.906475,0.926471,0.887324
2,No log,0.185799,0.923077,0.916667,0.929577


✅ F1: 0.9231 | Precision: 0.9167 | Recall: 0.9296

🔍 Trying: adamw, LR=3e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.228432,0.906475,0.926471,0.887324
2,No log,0.200726,0.92517,0.894737,0.957746
3,No log,0.21918,0.924138,0.905405,0.943662


✅ F1: 0.9241 | Precision: 0.9054 | Recall: 0.9437

🔍 Trying: adamw, LR=3e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.214012,0.906475,0.926471,0.887324
2,No log,0.188565,0.931507,0.906667,0.957746
3,No log,0.200565,0.921986,0.928571,0.915493
4,No log,0.17814,0.929577,0.929577,0.929577


✅ F1: 0.9296 | Precision: 0.9296 | Recall: 0.9296

🔍 Trying: adamw, LR=1e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.234898,0.901408,0.901408,0.901408
2,No log,0.197412,0.901408,0.901408,0.901408


✅ F1: 0.9014 | Precision: 0.9014 | Recall: 0.9014

🔍 Trying: adamw, LR=1e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.225491,0.901408,0.901408,0.901408
2,No log,0.189335,0.916667,0.90411,0.929577
3,No log,0.193136,0.916667,0.90411,0.929577


✅ F1: 0.9167 | Precision: 0.9041 | Recall: 0.9296

🔍 Trying: adamw, LR=1e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.210102,0.892086,0.911765,0.873239
2,No log,0.179201,0.917808,0.893333,0.943662
3,No log,0.190428,0.923077,0.916667,0.929577
4,No log,0.19243,0.923077,0.916667,0.929577


✅ F1: 0.9231 | Precision: 0.9167 | Recall: 0.9296

🔍 Trying: adam, LR=5e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.219011,0.913043,0.940299,0.887324
2,No log,0.197582,0.923077,0.916667,0.929577


✅ F1: 0.9231 | Precision: 0.9167 | Recall: 0.9296

🔍 Trying: adam, LR=5e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.180013,0.919708,0.954545,0.887324
2,No log,0.20912,0.933333,0.886076,0.985915
3,No log,0.206251,0.924138,0.905405,0.943662


✅ F1: 0.9241 | Precision: 0.9054 | Recall: 0.9437

🔍 Trying: adam, LR=5e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.173946,0.921986,0.928571,0.915493
2,No log,0.162915,0.939597,0.897436,0.985915
3,No log,0.243815,0.920863,0.941176,0.901408
4,No log,0.191472,0.929577,0.929577,0.929577


✅ F1: 0.9296 | Precision: 0.9296 | Recall: 0.9296

🔍 Trying: adam, LR=3e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.245594,0.905109,0.939394,0.873239
2,No log,0.20192,0.923077,0.916667,0.929577


✅ F1: 0.9231 | Precision: 0.9167 | Recall: 0.9296

🔍 Trying: adam, LR=3e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.204276,0.905109,0.939394,0.873239
2,No log,0.196135,0.931507,0.906667,0.957746
3,No log,0.222176,0.924138,0.905405,0.943662


✅ F1: 0.9241 | Precision: 0.9054 | Recall: 0.9437

🔍 Trying: adam, LR=3e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.213879,0.913043,0.940299,0.887324
2,No log,0.192949,0.937931,0.918919,0.957746
3,No log,0.203939,0.921986,0.928571,0.915493
4,No log,0.199345,0.921986,0.928571,0.915493


✅ F1: 0.9220 | Precision: 0.9286 | Recall: 0.9155

🔍 Trying: adam, LR=1e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.239922,0.907801,0.914286,0.901408
2,No log,0.204167,0.895105,0.888889,0.901408


✅ F1: 0.8951 | Precision: 0.8889 | Recall: 0.9014

🔍 Trying: adam, LR=1e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.227121,0.901408,0.901408,0.901408
2,No log,0.192682,0.924138,0.905405,0.943662
3,No log,0.194901,0.924138,0.905405,0.943662


✅ F1: 0.9241 | Precision: 0.9054 | Recall: 0.9437

🔍 Trying: adam, LR=1e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.215229,0.9,0.913043,0.887324
2,No log,0.183644,0.917808,0.893333,0.943662
3,No log,0.195841,0.923077,0.916667,0.929577
4,No log,0.197635,0.923077,0.916667,0.929577


✅ F1: 0.9231 | Precision: 0.9167 | Recall: 0.9296

🔍 Trying: sgd, LR=5e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.642672,0.0,0.0,0.0
2,No log,0.640146,0.0,0.0,0.0


✅ F1: 0.0000 | Precision: 0.0000 | Recall: 0.0000

🔍 Trying: sgd, LR=5e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.667544,0.024691,0.1,0.014085
2,No log,0.660903,0.0,0.0,0.0
3,No log,0.65878,0.0,0.0,0.0


✅ F1: 0.0000 | Precision: 0.0000 | Recall: 0.0000

🔍 Trying: sgd, LR=5e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.64432,0.0,0.0,0.0
2,No log,0.6371,0.0,0.0,0.0
3,No log,0.633032,0.0,0.0,0.0
4,No log,0.631708,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ F1: 0.0000 | Precision: 0.0000 | Recall: 0.0000

🔍 Trying: sgd, LR=3e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.646234,0.0,0.0,0.0
2,No log,0.64464,0.0,0.0,0.0


✅ F1: 0.0000 | Precision: 0.0000 | Recall: 0.0000

🔍 Trying: sgd, LR=3e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.672714,0.063158,0.125,0.042254
2,No log,0.668444,0.02439,0.090909,0.014085
3,No log,0.667058,0.024691,0.1,0.014085


✅ F1: 0.0247 | Precision: 0.1000 | Recall: 0.0141

🔍 Trying: sgd, LR=3e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.649072,0.121951,0.454545,0.070423
2,No log,0.64438,0.0,0.0,0.0
3,No log,0.641657,0.0,0.0,0.0
4,No log,0.640759,0.0,0.0,0.0


✅ F1: 0.0000 | Precision: 0.0000 | Recall: 0.0000

🔍 Trying: sgd, LR=1e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.650001,0.0,0.0,0.0
2,No log,0.64946,0.0,0.0,0.0


✅ F1: 0.0000 | Precision: 0.0000 | Recall: 0.0000

🔍 Trying: sgd, LR=1e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.678201,0.213115,0.254902,0.183099
2,No log,0.676709,0.218487,0.270833,0.183099
3,No log,0.676223,0.176991,0.238095,0.140845


✅ F1: 0.1770 | Precision: 0.2381 | Recall: 0.1408

🔍 Trying: sgd, LR=1e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.654146,0.229885,0.625,0.140845
2,No log,0.652486,0.209302,0.6,0.126761
3,No log,0.651499,0.166667,0.538462,0.098592
4,No log,0.651173,0.166667,0.538462,0.098592


✅ F1: 0.1667 | Precision: 0.5385 | Recall: 0.0986

🔍 Trying: adagrad, LR=5e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.21293,0.907801,0.914286,0.901408
2,No log,0.202431,0.907801,0.914286,0.901408


✅ F1: 0.9078 | Precision: 0.9143 | Recall: 0.9014

🔍 Trying: adagrad, LR=5e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.197802,0.895105,0.888889,0.901408
2,No log,0.182838,0.901408,0.901408,0.901408
3,No log,0.177313,0.901408,0.901408,0.901408


✅ F1: 0.9014 | Precision: 0.9014 | Recall: 0.9014

🔍 Trying: adagrad, LR=5e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.190216,0.9,0.913043,0.887324
2,No log,0.172123,0.923077,0.916667,0.929577
3,No log,0.172017,0.915493,0.915493,0.915493
4,No log,0.173548,0.915493,0.915493,0.915493


✅ F1: 0.9155 | Precision: 0.9155 | Recall: 0.9155

🔍 Trying: adagrad, LR=3e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.19283,0.895105,0.888889,0.901408
2,No log,0.193246,0.901408,0.901408,0.901408


✅ F1: 0.9014 | Precision: 0.9014 | Recall: 0.9014

🔍 Trying: adagrad, LR=3e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.193063,0.888889,0.876712,0.901408
2,No log,0.189606,0.901408,0.901408,0.901408
3,No log,0.188354,0.888889,0.876712,0.901408


✅ F1: 0.8889 | Precision: 0.8767 | Recall: 0.9014

🔍 Trying: adagrad, LR=3e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.173967,0.881119,0.875,0.887324
2,No log,0.173835,0.888889,0.876712,0.901408
3,No log,0.167399,0.901408,0.901408,0.901408
4,No log,0.165802,0.901408,0.901408,0.901408


✅ F1: 0.9014 | Precision: 0.9014 | Recall: 0.9014

🔍 Trying: adagrad, LR=1e-05, Epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.321442,0.880597,0.936508,0.830986
2,No log,0.282405,0.913043,0.940299,0.887324


✅ F1: 0.9130 | Precision: 0.9403 | Recall: 0.8873

🔍 Trying: adagrad, LR=1e-05, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.330328,0.859375,0.964912,0.774648
2,No log,0.258846,0.877698,0.897059,0.859155
3,No log,0.243575,0.887324,0.887324,0.887324


✅ F1: 0.8873 | Precision: 0.8873 | Recall: 0.8873

🔍 Trying: adagrad, LR=1e-05, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avichr/heBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.298555,0.870229,0.95,0.802817
2,No log,0.22272,0.875,0.863014,0.887324
3,No log,0.199802,0.868966,0.851351,0.887324
4,No log,0.194455,0.868966,0.851351,0.887324


✅ F1: 0.8690 | Precision: 0.8514 | Recall: 0.8873

🏆 Best model:
adamw | LR=5e-05 | Epochs=2 | F1=0.9306 | Precision=0.9178 | Recall=0.9437


In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report

# 1. Load model and tokenizer
model_path = "best_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()  # Set to evaluation mode

# 2. Tokenize validation texts again
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# 3. Predict in batches
with torch.no_grad():
    outputs = model(**val_encodings)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).numpy()

# 4. Print predictions vs. ground truth
for i, (text, real, pred) in enumerate(zip(val_texts, val_labels, predictions)):
    print(f"\n📝 Text {i+1}:\n{text}\nReal: {real} | Predicted: {pred}")

# 5. Optional: Classification report
print("\n📊 Classification Report:")
print(classification_report(val_labels, predictions, digits=4))



📝 Text 1:
מאסר על תנאי לתקופה של 4 חודשים, והתנאי הוא שלא יעבור כל עבירה מסוג עוון לפי פקודת הסמים המסוכנים, למשך 3 שנים מהיום;
Real: 0 | Predicted: 0

📝 Text 2:
עיון בפסיקה מלמד אפוא, כי מתחם העונש ההולם את מעשיו של הנאשם בנסיבותיהם, מחייב עונש מאסר בפועל, כשהמתחם נע בין 12 ל-24 חודשי מאסר בפועל. בכל המקרים בהם הושת עונש מאסר בעבודות שירות, הדבר נעשה בסטייה ממתחם העונש ההולם, בשל הליך שיקום משמעותי ובצירוף נסיבות חריגות נוספות. כאמור, בת"פ (מחוזי-ת"א) 38450-09-12 בעניין סקוט, על אף השיקום המשמעותי שהנאשם עבר, ומדובר היה בנאשם בן 61 שצרך סמים מגיל 45 בשל מצב נפשי כתוצאה מנטייתו המינית, ואף ששם הוסכם כי הסם נרכש לשימוש עצמי, בית המשפט הקפיד לציין כי לא היה בכל אלו כדי להחריג את הנאשם ממתחם העונש ההולם המחייב עונש מאסר, והוא עשה זאת רק כי לכל האמור הצטרפו שיקולים רפואיים והומניטאריים מיוחדים. וכאמור, מדובר היה במי שעבר כברת דרך משמעותית בהליך השיקום שהחל מיוזמתו, וגילה מחויבות ומוטיבציה גבוהה להצליח בו. איני סבור כי בענייננו ניתן לומר כי הנאשם עבר הליך שיקומי. בוודאי לא משמעותי. אדרבה, 