In [None]:
!git lfs install
!git clone https://huggingface.co/HooshvareLab/gpt2-fa-poetry
data_path = '/content/drive/MyDrive/Colab Notebooks/ferdousi.txt'
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
lines = lines[2:]

from transformers import AutoTokenizer, AutoConfig, GPT2LMHeadModel
import torch

def initialize_tokenizer_and_config(model_name_or_path, output_directory):
    tokenizer = create_tokenizer(model_name_or_path, output_directory)
    config = create_config(model_name_or_path, tokenizer, output_directory)
    return tokenizer, config

def create_tokenizer(model_name_or_path, output_directory):
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        bos_token='<s>',
        eos_token='</s>',
        pad_token='<pad>',
        unk_token='<unk>'
    )
    tokenizer.add_special_tokens({
        "bos_token": '</s>',
        "eos_token": '</s>',
        "pad_token": '<pad>',
        "unk_token": '<unk>'
    })
    tokenizer.save_pretrained(output_directory)
    return tokenizer

def create_config(model_name_or_path, tokenizer, output_directory):
    config = AutoConfig.from_pretrained(
        model_name_or_path,
        bos_token_id=tokenizer("<s>")["input_ids"][0],
        eos_token_id=tokenizer("</s>")["input_ids"][0],
        pad_token_id=tokenizer("<pad>")["input_ids"][0],
        unk_token_id=tokenizer("<unk>")["input_ids"][0],
    )
    config.save_pretrained(output_directory)
    return config

model_name_or_path = "HooshvareLab/gpt2-fa"
output_directory = "./gpt2-fa/"

tokenizer, config = initialize_tokenizer_and_config(model_name_or_path, output_directory)
df_input, df_target, df_concat = [], [], []

for i in range(0, len(lines) - 3, 2):
    df_input.append(lines[i])
    df_target.append(' <s> ' + lines[i + 1] + '   ' + lines[i + 2] + '    ' + lines[i + 3] + '  </s>  ')
    df_concat.append(lines[i] + ' <s> ' + lines[i + 1] + '   ' + lines[i + 2] + '    ' + lines[i + 3] + '  </s>  ')

# Sample input text
input_text = df_input[1000]

# Tokenize and encode the input text
input_encoding = tokenizer.encode(input_text)

# Decode the tokenized input back to text
decoded_input = tokenizer.decode(input_encoding)

# Print results
print("Original Text:", input_text)
print("Tokenized Input:", input_encoding)
print("Decoded Input:", decoded_input)

# Set a random seed for reproducibility
torch.manual_seed(42)

# Create a custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_texts, tokenizer, max_length=1024):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in input_texts:
            encodings_dict = tokenizer(
                txt, truncation=True, max_length=max_length, padding="max_length"
            )
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

# Define the maximum sequence length
max_seq = 256

# Create an instance of the CustomDataset
dataset = CustomDataset(df_concat, tokenizer, max_length=max_seq)

# Define the percentage of data to use for testing (e.g., 20%)
test_percentage = 0.3
total_samples = len(dataset)
test_size = int(total_samples * test_percentage)
train_size = total_samples - test_size

# Split the dataset into training and testing sets
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

print(f"Number of samples in the training set: {len(train_dataset)}")
print(f"Number of samples in the validation set: {len(test_dataset)}")

# Loading the model configuration and setting it to the GPT2 standard settings.
model = GPT2LMHeadModel.from_pretrained(model_name_or_path, config=config)
model.resize_token_embeddings(len(tokenizer))


Git LFS initialized.
fatal: destination path 'gpt2-fa-poetry' already exists and is not an empty directory.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Original Text: ز بالا چو پی بر زمین برنهاد
Tokenized Input: [286, 846, 13052, 625, 327, 762, 327, 15156]
Decoded Input: ز بالا چو پی بر زمین برنهاد
Number of samples in the training set: 34725
Number of samples in the validation set: 14882


Embedding(42001, 768)

In [None]:
epochs = 2
warmup_steps = 1e2
sample_every = 300
from transformers import AdamW

import torch
# Define RMSProp optimizer
optimizer = torch.optim.RMSprop(
    model.parameters(),
    lr=5e-4,
    alpha=0.99,
    eps=1e-8,
    centered=False,
)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=8
)

validation_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=8
)
from transformers import get_linear_schedule_with_warmup
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps)


In [None]:
import numpy as np
sample_input = df_input[np.random.randint(0, len(df_input))]
print(sample_input)
sample_input_ids = torch.tensor(tokenizer([sample_input])["input_ids"])
sample_input_ids = sample_input_ids.to('cpu')

sample_outputs = model.generate(
    input_ids=sample_input_ids,
    do_sample=True,
    top_k=50,
    max_length=50,
    top_p=0.95,
    num_return_sequences=5
)
for i, sample_output in enumerate(sample_outputs):
    output = tokenizer.decode(sample_output, skip_special_tokens=False)
    output = output.replace("<|startoftext|>", "\n").replace("<s>", "").replace("</s>", "").replace("<sep>", "\n")

    print(f'output: {output}')

برآنگونه گرد اندر آمد سپاه
output: برآنگونه گرد اندر آمد سپاه وی و سربازان وی در پشت سپاه گرد اندر رفت که این کار به شکست انجامید و سرانجام سپاه به فرماندهی پیروس سردار رومی پس از پنج روز جنگ از رم به شهر رسید و سپاهیان روم به پیروزی دست یافتند.
output: برآنگونه گرد اندر آمد سپاه چین را از آن سو فرستاد، چین را با آنان رو‌به‌رو ساخت و بدین‌سان در پی وی تاخت و تازهای چین‌رو را به‌دنبال آورد. با نزدیک شدن چین، چین
output: برآنگونه گرد اندر آمد سپاه را برای این که به سوی رود، بر ایشان گشوده شود، بر گرد اندر آمد، از سوی غرب، که میان آن‌ها می‌رسد، به سوی مغرب، بر گرد اندر نشست (نزدیک اندرشد
output: برآنگونه گرد اندر آمد سپاه و جنگ است و به جنگ در میان نیست. و چون جنگ رخ نداد، با شمشیر استمداد استمداد و نیزهٔ پربرگردانان، بر او زد و او را گرفت. در سال ۴۶۹ میلادی
output: برآنگونه گرد اندر آمد سپاه در حال فزونی به سوی پایتخت بود. چون فرمانده‌ای در حال طغیان بود، بر او دست درازی می‌شد تا از این رهگذر به جنگ بپردازد. در این میان، سالار او نیز با سپاهیان روبرو گشت


In [None]:
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer
import torch
import random
import time
import datetime

training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    learning_rate=5e-5,
    logging_steps=500,
    save_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
    # Add distributed training options
    per_device_eval_batch_size=32,
    fp16=False,  # Enable mixed-precision training if supported
    dataloader_num_workers=16,  # Adjust based on your system capabilities
    report_to="tensorboard",  # You can use TensorBoard for logging
)

# Initialize Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)
import random
import time
import datetime
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))
total_t0 = time.time()
training_stats = []
# Check if a GPU is available, and if so, use it; otherwise, use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
for epoch_i in tqdm(range(training_args.num_train_epochs),position=0):
    print(f'Beginning epoch {epoch_i + 1} of {training_args.num_train_epochs}')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), position=0):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        model.zero_grad()
        outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks, token_type_ids=None)
        loss = outputs.loss
        loss = loss.mean()
        batch_loss = loss
        total_train_loss += batch_loss
        loss.backward()
        optimizer.step()
        scheduler.step()
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print()
    print(f'Average Training Loss: {avg_train_loss}. Epoch time: {training_time}')
    print()
    t0 = time.time()
    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader, total=len(validation_dataloader), position=0):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_masks, labels=b_labels)
            loss = outputs.loss
            loss = loss.mean()
            batch_loss = loss
        total_eval_loss += batch_loss
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)
    print()
    print(f'Validation loss: {avg_val_loss}. Validation Time: {validation_time}')
    print()
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print(f'Total training took {format_time(time.time()-total_t0)}')



  0%|          | 0/1 [00:00<?, ?it/s]

Beginning epoch 1 of 1


100%|██████████| 4341/4341 [37:27<00:00,  1.93it/s]



Average Training Loss: 0.5336850881576538. Epoch time: 0:37:28



100%|██████████| 1861/1861 [05:23<00:00,  5.76it/s]
100%|██████████| 1/1 [42:50<00:00, 2570.94s/it]


Validation loss: 0.39344531297683716. Validation Time: 0:05:23

Total training took 0:42:51





In [None]:
import numpy as np
import torch

# Define the number of times you want to run the code
num_iterations = 10  # You can change this to the desired number of iterations

for _ in range(num_iterations):
    # Change the name of sample_input to input_data
    input_data = df_input[np.random.randint(0, len(df_input))]
    print(input_data)

    # Change sample_input_ids to input_ids
    input_ids = torch.tensor(tokenizer([input_data])["input_ids"])
    input_ids = input_ids.to(device)

    # Change sample_outputs to generated_sequences
    generated_sequences = model.generate(
        input_ids=input_ids,
        do_sample=True,
        top_k=50,
        max_length=50,
        top_p=0.95,
        num_return_sequences=1
    )

    # Update variable names in the loop as well
    for i, generated_sequence in enumerate(generated_sequences): out = tokenizer.decode(generated_sequence, skip_special_tokens=False).replace("\n", "").replace("<s>", "").replace("</s>", "").replace("<sep>", "\n"); print(f'output: {out}')



خود و سرکشان سوی جیحون کشید
output: خود و سرکشان سوی جیحون کشید  به خون از در تیره اندیشه اندر کشید   سوی بیشهٔ اژدها شد دراز    جهانی برو پر ز آتش همی کرد باز  
چنین داد پاسخ که چرخ بلند
output: چنین داد پاسخ که چرخ بلند  بلند آسمان را دهد بر بلند   همان بد که در بزم جوید همی    سپهرش روانها به زهر آب بفگنیم  
ببردند چیزی که شایسته بود
output: ببردند چیزی که شایسته بود  به یزدان جز از باد هرگز مباد   ز گفتار او شاد شد شهریار    بشد شاد و خندان به ایوان نگار  
سه یک زان نخستین بدرویش داد
output: سه یک زان نخستین بدرویش داد  ازان پس به فرجامش نوید   به مهر اندرون نامه بود    ز گیتی یکی بهره بود  
گر ایدونک با من تو پیمان کنی
output: گر ایدونک با من تو پیمان کنی  به فرمانت گروگان کنی   گر ای دون که او را کنی جای خویش    بدانگه که آید به پیمان خویش  
سپه بودش از جنگیان صدهزار
output: سپه بودش از جنگیان صدهزار  ببردند گردان آن کارزار   بفرمود تا پیش او برنشست    سوی رزمگه لشکر آراستند  
به شاه جهان گفت کای شهریار
output: به شاه جهان گفت کای شهریار  توی پرگنهکار از روزگار   ترا دل به کین از

In [40]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!python -m nltk.downloader popular

# Example usage
reference = "به نام خداوند جان و خرد   	کزین برتر اندیشه برنگذرد     خداوند نام و خداوند جای     خداوند روزی ده رهنمای  "
candidate = "بفرمود و گفت ار بماند یکی  ببایدت رفتن مرا اندکی   کنون هرچ بایستشان پند ما    بباید که گردد به پیوند ما  "
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

def calculate_bleu(reference_texts, candidate_text):
    reference_tokens = [word_tokenize(ref.lower()) for ref in reference_texts]
    candidate_tokens = word_tokenize(candidate_text.lower())
    score = sentence_bleu(reference_tokens, candidate_tokens)
    return score

bleu_score = calculate_bleu(reference, candidate)
print("**************************************************************")
print("**************************************************************")
print("**************************************************************")
print("**************************************************************")
print("**************************************************************")
print("**************************************************************")

print(f"BLEU Score: {bleu_score}")




[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [41]:
!pip install nltk rouge


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [44]:
import re
import math

def tokenize(text):
    # Split text into tokens (words)
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

def get_word_frequency(tokens):
    # Count the frequency of each word in tokens
    word_frequency = {}
    for token in tokens:
        if token in word_frequency:
            word_frequency[token] += 1
        else:
            word_frequency[token] = 1
    return word_frequency

def cosine_similarity(reference, candidate):
    # Tokenize the reference and candidate strings
    ref_tokens = tokenize(reference)
    cand_tokens = tokenize(candidate)

    # Get word frequency vectors for reference and candidate
    ref_word_freq = get_word_frequency(ref_tokens)
    cand_word_freq = get_word_frequency(cand_tokens)

    # Calculate dot product and magnitudes
    dot_product = sum(ref_word_freq.get(word, 0) * cand_word_freq.get(word, 0) for word in set(ref_tokens) & set(cand_tokens))
    ref_magnitude = math.sqrt(sum(ref_word_freq[word] ** 2 for word in ref_tokens))
    cand_magnitude = math.sqrt(sum(cand_word_freq[word] ** 2 for word in cand_tokens))

    # Calculate cosine similarity
    if ref_magnitude == 0 or cand_magnitude == 0:
        return 0  # Handle division by zero
    else:
        return dot_product / (ref_magnitude * cand_magnitude)

# Given reference and candidate strings
reference = "به نام خداوند جان و خرد   	کزین برتر اندیشه برنگذرد     خداوند نام و خداوند جای     خداوند روزی ده رهنمای  "
candidate = "بفرمود و گفت ار بماند یکی  ببایدت رفتن مرا اندکی   کنون هرچ بایستشان پند ما    بباید که گردد به پیوند ما  "

# Calculate cosine similarity
similarity = cosine_similarity(reference, candidate)
print(f"Cosine Similarity: {similarity:.2f}")


Cosine Similarity: 0.06
