In [None]:
!pip install transformers torch datasets

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
import torch
from datasets import load_dataset, Dataset

In [None]:
# Identifier of the pretrained checkpoint that will be fine-tuned.
model_name = "HooshvareLab/bert-fa-base-uncased"

# Masked-LM weights and the matching tokenizer are both loaded from that checkpoint.
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)


In [None]:
# Mount Google Drive: the training corpus is read from it and the fine-tuned
# model is written back to it in later cells.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
import pandas as pd

src_path = "/content/drive/My Drive/MT/shams.txt"

# Load the corpus, treating every line of the file as one training example.
# Lines are stripped of surrounding whitespace (including the trailing newline
# that readlines() would keep) and blank lines are dropped, so the model is not
# trained on examples that contain nothing but special tokens and padding.
with open(src_path, 'r', encoding='utf-8') as f:
    stripped_lines = (line.strip() for line in f)
    texts = [line for line in stripped_lines if line]

# Fail early with a clear message instead of training on an empty dataset.
if not texts:
    raise ValueError(f"No non-empty lines found in {src_path}")

# Create a DataFrame with a single 'text' column
df = pd.DataFrame({'text': texts})

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text to encode.

    Returns:
        Whatever ``tokenizer`` returns for the given text and options.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encode_options)


In [None]:
# Tokenize the whole corpus; batched=True hands tokenize_function many rows per call.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for the masked-language-modelling objective (mlm=True), configured
# with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# Fine-tuning configuration: 20 epochs, 40 examples per device per step.
# Checkpoints go to ./results (overwriting earlier contents), are saved every
# 10,000 steps, and at most 2 are kept.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    per_device_train_batch_size=40,
    num_train_epochs=20,
    save_total_limit=2,
    save_steps=10_000,
)

# Wire the model, the tokenized corpus and the MLM collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Run the fine-tuning loop.
trainer.train()


In [None]:
# Persist the fine-tuned weights and then the tokenizer files to the same
# Drive folder, so the pair can be reloaded together with from_pretrained().
for artifact in (model, tokenizer):
    artifact.save_pretrained("/content/drive/My Drive/MT/fine-tuned-bert-fa")

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Locations of the two checkpoints being compared.
fine_tuned_model_path = "/content/drive/My Drive/MT/fine-tuned-bert-fa"
parsbert_model_name = "HooshvareLab/bert-base-parsbert-uncased"
# NOTE(review): fine-tuning earlier in this notebook started from
# "HooshvareLab/bert-fa-base-uncased"; confirm that this different checkpoint
# is the intended baseline for the comparison.

# Fine-tuned model and its tokenizer, as saved to Drive.
fine_tuned_tokenizer = BertTokenizer.from_pretrained(fine_tuned_model_path)
fine_tuned_model = BertForMaskedLM.from_pretrained(fine_tuned_model_path)

# Baseline ParsBERT model and its tokenizer.
parsbert_tokenizer = BertTokenizer.from_pretrained(parsbert_model_name)
parsbert_model = BertForMaskedLM.from_pretrained(parsbert_model_name)



In [None]:
def _top_k_mask_token_ids(model, tokenizer, sentence, k=15):
    """Return the ids of the ``k`` highest-scoring tokens for the first mask in ``sentence``.

    The mask position is looked up in the ids produced by ``tokenizer`` itself,
    so it stays correct when two models tokenize the same sentence differently.

    Args:
        model: Masked-LM model whose output exposes ``.logits``.
        tokenizer: Tokenizer paired with ``model``; must define ``mask_token_id``.
        sentence: Text containing at least one mask token.
        k: Number of candidate token ids to return.

    Returns:
        A list of ``k`` vocabulary ids, highest score first.

    Raises:
        ValueError: If no mask token is present after tokenization.
    """
    inputs = tokenizer(sentence, return_tensors="pt")

    # Column indices (sequence positions) of every mask token in the single-row batch.
    mask_positions = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        raise ValueError(f"No {tokenizer.mask_token} token found in: {sentence!r}")

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Only the first mask is scored, matching the single-mask use below.
    first_mask_logits = logits[0, int(mask_positions[0]), :]
    return torch.topk(first_mask_logits, k).indices.tolist()


# Example Farsi sentence with a masked token
masked_sentence = "من [MASK] هستم"

# Top 15 predictions of the fine-tuned model
fine_tuned_top_15_tokens = _top_k_mask_token_ids(
    fine_tuned_model, fine_tuned_tokenizer, masked_sentence
)

print("Fine-tuned Model Predictions:")
for token in fine_tuned_top_15_tokens:
    print(f"Predicted token: {fine_tuned_tokenizer.decode([token])}")

# Top 15 predictions of the ParsBERT model. The mask position is derived from
# the ParsBERT tokenizer's own ids, not reused from the fine-tuned tokenizer.
parsbert_top_15_tokens = _top_k_mask_token_ids(
    parsbert_model, parsbert_tokenizer, masked_sentence
)

print("\nParsBERT Model Predictions:")
for token in parsbert_top_15_tokens:
    print(f"Predicted token: {parsbert_tokenizer.decode([token])}")
