In [None]:
# !pip install transformers datasets accelerate
# !pip install --force-reinstall soundfile

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import Trainer, DataCollatorForLanguageModeling
import pandas as pd
import math


2025-04-20 13:19:00.145998: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745144341.426047 3763741 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745144342.023016 3763741 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745144346.831386 3763741 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745144346.831415 3763741 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745144346.831418 3763741 computation_placer.cc:177] computation placer alr

In [4]:
df = pd.read_csv("/home/liorkob/M.Sc/thesis/data/5k/gpt/processed_verdicts_with_gpt.csv")

texts = df["extracted_gpt_facts"].dropna().tolist()
with open("gpt_facts.txt", "w", encoding="utf-8") as f:
    for line in texts:
        f.write(line.strip() + "\n")

        
eval_texts = df["extracted_gpt_facts"].dropna().sample(frac=0.1, random_state=42).tolist()

with open("eval.txt", "w", encoding="utf-8") as f:
    for line in eval_texts:
        f.write(line.strip() + "\n")


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset

# טוען את המודל והטוקנייזר
model_name = "avichr/heBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# טוען את הקבצים
dataset = load_dataset("text", data_files={"train": "gpt_facts.txt"})
eval_dataset = load_dataset("text", data_files={"eval": "eval.txt"})

# פונקציית טוקניזציה עם sliding window
def sliding_tokenize_function(examples):
    return tokenizer(
        examples["text"],
        max_length=512,
        stride=256,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True
    )

# טוקניזציה עם פיצול חכם
tokenized = dataset.map(sliding_tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(sliding_tokenize_function, batched=True, remove_columns=["text"])

# הגדרות אימון
training_args = TrainingArguments(
    output_dir="./hebert-mlm-verdicts",
    logging_dir="./hebert-mlm-verdicts/logs",  # NEW: log dir for TensorBoard
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    save_steps=500,
    logging_steps=100,
    evaluation_strategy="epoch",  # הפעלת הערכה בסוף כל אפוק
    eval_steps=None,              # אם את רוצה שזה יקרה כל X צעדים – תגדירי כאן מספר
    save_total_limit=2            # לשמור רק את 2 המודלים האחרונים
)

# Data collator למסוך טוקנים
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized_eval["eval"],
    data_collator=data_collator
)

# התחלת האימון
trainer.train()
# שמירת המודל המאומן
trainer.save_model("./hebert-mlm-verdicts/final")


import matplotlib.pyplot as plt

log_history = trainer.state.log_history

train_loss = [x["loss"] for x in log_history if "loss" in x]
eval_loss = [x["eval_loss"] for x in log_history if "eval_loss" in x]

plt.plot(train_loss, label="Train Loss")
plt.plot(eval_loss, label="Eval Loss")
plt.xlabel("Logging Steps / Epochs")
plt.ylabel("Loss")
plt.legend()
plt.title("Training vs Eval Loss")
plt.grid(True)
plt.show()


import math

perplexities = [math.exp(loss) for loss in eval_loss]

plt.plot(perplexities, label="Eval Perplexity")
plt.xlabel("Epoch")
plt.ylabel("Perplexity")
plt.title("Model Perplexity Over Epochs")
plt.grid(True)
plt.legend()
plt.show()


Map:   0%|          | 0/366 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,4.787,4.62493
2,4.6307,4.407584
3,4.1224,3.827338
4,3.1471,2.43061
5,2.2035,1.82918
6,1.6517,1.447101
7,1.5456,1.332974
8,1.4283,1.243729
9,1.3671,1.196902
10,1.2979,1.153709


In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # או כל מודל אחר

def get_token_count(example):
    return {"token_count": len(tokenizer(example["text"], add_special_tokens=True)["input_ids"])}

total_tokens = 0
total_texts = 0

for split_name, split_dataset in dataset.items():
    tokenized_dataset = split_dataset.map(get_token_count)
    split_total = sum(tokenized_dataset["token_count"])
    split_count = len(tokenized_dataset)

    avg = split_total / split_count
    print(f"{split_name} - ממוצע טוקנים: {avg:.2f}")

    total_tokens += split_total
    total_texts += split_count

print(f"\nסה\"כ ממוצע טוקנים בכל הדאטה: {total_tokens / total_texts:.2f}")


Map:   0%|          | 0/3676 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors


train - ממוצע טוקנים: 954.30

סה"כ ממוצע טוקנים בכל הדאטה: 954.30


In [7]:
# הערכה וחישוב Perplexity
eval_results = trainer.evaluate(eval_dataset=tokenized_eval["eval"])
perplexity = math.exp(eval_results["eval_loss"])
print("Perplexity:", perplexity)


Perplexity: 2.5685408549047715
