# Imports

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import AdamW
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import io
from transformers import GenerationConfig

# Loading Dataset

In [2]:
with io.open('ferdousi.txt', 'r', encoding='utf-8') as file:
    content = file.read()

lines = content.splitlines()

verses = [lines[i] + ' ' +lines[i+1] for i in range(0, len(lines)-1, 2)]

verses[:4]

['به نام خداوند جان و خرد کزین برتر اندیشه برنگذرد',
 'خداوند نام و خداوند جای خداوند روزی ده رهنمای',
 'خداوند کیوان و گردان سپهر فروزنده ماه و ناهید و مهر',
 'ز نام و نشان و گمان برترست نگارندهٔ بر شده پیکرست']

# Loading GPT-2 Model

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model_name = "HooshvareLab/gpt2-fa"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.config.pad_token_id = model.config.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/875k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/14.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/104 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/485M [00:00<?, ?B/s]

# Creating Dataset Class

In [13]:
class PoetryDataset(Dataset):
    def __init__(self, verses):
        self.verses = verses

    def __len__(self):
        return len(self.verses)

    def __getitem__(self, idx):
        tokenized = tokenizer(self.verses[idx], return_tensors="pt", truncation=True, padding=True)
        return {'input_ids': tokenized['input_ids'].squeeze(), 'attention_mask': tokenized['attention_mask'].squeeze()}

def collate_fn(batch):
    return {
        'input_ids': pad_sequence([item['input_ids'] for item in batch], batch_first=True),
        'attention_mask': pad_sequence([item['attention_mask'] for item in batch], batch_first=True)
    }

poetry_dataset = PoetryDataset(verses)
train_dataset, test_dataset = train_test_split(poetry_dataset, test_size=0.2, random_state=42)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


# Implementing Required Functions

In [14]:
def train_one_epoch(dataloader, model, optimizer, epoch):
    model.train()
    total_loss = 0

    # Add progress bar
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Update progress bar
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    average_loss = total_loss / len(dataloader)
    progress_bar.set_postfix({'average_training_loss': '{:.3f}'.format(average_loss)})

    return average_loss


from tqdm import tqdm

def test(dataloader, model):
    model.eval()
    total_bleu_score = 0
    total_loss = 0

    # Add progress bar
    progress_bar = tqdm(dataloader, desc="Testing")

    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].squeeze(dim=1).to(device)
            labels = input_ids.clone()

            # Generate with the specified parameters
            outputs = model.generate(
                input_ids,
                max_length=25,
                num_beams=5,
                no_repeat_ngram_size=2,
                top_k=50,
                attention_mask=torch.ones_like(input_ids),
            )

            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            reference_text = tokenizer.decode(labels[0], skip_special_tokens=True)

            bleu_score = corpus_bleu([[reference_text.split()]], [generated_text.split()])
            total_bleu_score += bleu_score

            # Compute loss for perplexity
            loss_outputs = model(input_ids, attention_mask=torch.ones_like(input_ids), labels=labels)
            total_loss += loss_outputs.loss.item()

            # Update progress bar
            progress_bar.set_postfix({'testing_loss': '{:.3f}'.format(loss_outputs.loss.item()/len(batch))})

    average_bleu_score = total_bleu_score / len(dataloader)
    average_loss = total_loss / len(dataloader)
    perplexity = torch.exp(torch.tensor(average_loss))

    progress_bar.set_postfix({'average_testing_loss': '{:.3f}'.format(average_loss),
                              'average_bleu_score': '{:.3f}'.format(average_bleu_score),
                              'perplexity': '{:.3f}'.format(perplexity)})

    return average_bleu_score, perplexity



# Fine-tuning GPT-2 Model

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = AdamW(model.parameters(), lr=1e-4)

num_epochs = 15
for epoch in range(num_epochs):
    average_loss = train_one_epoch(train_dataloader, model, optimizer, epoch)
    average_bleu_score, perplexity = test(test_dataloader, model)
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss}, Average BLEU Score: {average_bleu_score}, Perplexity: {perplexity}")


Epoch 1: 100%|██████████| 1241/1241 [03:39<00:00,  5.65it/s, training_loss=2.285]
Testing: 100%|██████████| 311/311 [03:26<00:00,  1.51it/s, testing_loss=2.159]


Epoch 1, Average Loss: 4.1271411876924375, Average BLEU Score: 0.8009110347372553, Perplexity: 36.423030853271484


Epoch 2: 100%|██████████| 1241/1241 [03:43<00:00,  5.55it/s, training_loss=2.096]
Testing: 100%|██████████| 311/311 [03:23<00:00,  1.53it/s, testing_loss=1.814]


Epoch 2, Average Loss: 3.2933927393074867, Average BLEU Score: 0.8269928782083084, Perplexity: 25.5499210357666


Epoch 3: 100%|██████████| 1241/1241 [03:44<00:00,  5.53it/s, training_loss=1.202]
Testing: 100%|██████████| 311/311 [03:25<00:00,  1.52it/s, testing_loss=1.776]


Epoch 3, Average Loss: 2.8230474022105274, Average BLEU Score: 0.8405910604670512, Perplexity: 22.662818908691406


Epoch 4: 100%|██████████| 1241/1241 [03:44<00:00,  5.53it/s, training_loss=1.613]
Testing: 100%|██████████| 311/311 [03:25<00:00,  1.51it/s, testing_loss=1.831]


Epoch 4, Average Loss: 2.4712543572080796, Average BLEU Score: 0.824036767053807, Perplexity: 22.356470108032227


Epoch 5: 100%|██████████| 1241/1241 [03:44<00:00,  5.54it/s, training_loss=1.432]
Testing: 100%|██████████| 311/311 [03:24<00:00,  1.52it/s, testing_loss=1.693]


Epoch 5, Average Loss: 2.1607177586059816, Average BLEU Score: 0.8354344764901995, Perplexity: 23.357463836669922


Epoch 6: 100%|██████████| 1241/1241 [03:44<00:00,  5.54it/s, training_loss=0.975]
Testing: 100%|██████████| 311/311 [03:24<00:00,  1.52it/s, testing_loss=1.736]


Epoch 6, Average Loss: 1.8791482352518825, Average BLEU Score: 0.8285210603482946, Perplexity: 24.90880584716797


Epoch 7: 100%|██████████| 1241/1241 [03:43<00:00,  5.54it/s, training_loss=1.009]
Testing: 100%|██████████| 311/311 [03:23<00:00,  1.53it/s, testing_loss=1.789]


Epoch 7, Average Loss: 1.633064536583414, Average BLEU Score: 0.801044701565886, Perplexity: 27.570831298828125


Epoch 8: 100%|██████████| 1241/1241 [03:44<00:00,  5.54it/s, training_loss=0.834]
Testing: 100%|██████████| 311/311 [03:24<00:00,  1.52it/s, testing_loss=1.819]


Epoch 8, Average Loss: 1.407717129955553, Average BLEU Score: 0.8086649098557998, Perplexity: 30.562545776367188


Epoch 9: 100%|██████████| 1241/1241 [03:44<00:00,  5.53it/s, training_loss=0.721]
Testing: 100%|██████████| 311/311 [03:26<00:00,  1.51it/s, testing_loss=1.865]


Epoch 9, Average Loss: 1.2137296231498074, Average BLEU Score: 0.7772703932429421, Perplexity: 34.31254577636719


Epoch 10: 100%|██████████| 1241/1241 [03:44<00:00,  5.54it/s, training_loss=0.590]
Testing: 100%|██████████| 311/311 [03:25<00:00,  1.51it/s, testing_loss=1.923]


Epoch 10, Average Loss: 1.0572316395381494, Average BLEU Score: 0.8086332170017367, Perplexity: 38.33250427246094


Epoch 11: 100%|██████████| 1241/1241 [03:44<00:00,  5.53it/s, training_loss=0.515]
Testing: 100%|██████████| 311/311 [03:25<00:00,  1.51it/s, testing_loss=1.895]


Epoch 11, Average Loss: 0.9292761433710502, Average BLEU Score: 0.7836863768387727, Perplexity: 42.818058013916016


Epoch 12: 100%|██████████| 1241/1241 [03:44<00:00,  5.53it/s, training_loss=0.573]
Testing: 100%|██████████| 311/311 [03:25<00:00,  1.51it/s, testing_loss=1.804]


Epoch 12, Average Loss: 0.8347934387462164, Average BLEU Score: 0.7749109308801122, Perplexity: 46.79035568237305


Epoch 13: 100%|██████████| 1241/1241 [03:44<00:00,  5.54it/s, training_loss=0.457]
Testing: 100%|██████████| 311/311 [03:29<00:00,  1.48it/s, testing_loss=2.034]


Epoch 13, Average Loss: 0.7669834440797687, Average BLEU Score: 0.798193485894769, Perplexity: 51.5494270324707


Epoch 14: 100%|██████████| 1241/1241 [03:44<00:00,  5.53it/s, training_loss=0.457]
Testing: 100%|██████████| 311/311 [03:27<00:00,  1.50it/s, testing_loss=2.016]


Epoch 14, Average Loss: 0.7196963620608512, Average BLEU Score: 0.7547466381969185, Perplexity: 55.431270599365234


Epoch 15: 100%|██████████| 1241/1241 [03:44<00:00,  5.54it/s, training_loss=0.436]
Testing: 100%|██████████| 311/311 [03:24<00:00,  1.52it/s, testing_loss=2.186]

Epoch 15, Average Loss: 0.6828872918314169, Average BLEU Score: 0.7583794321179489, Perplexity: 59.37154769897461





In [28]:
# Function for generating poetry verses
def generate_poetry(input_text, max_length=100):
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate with attention_mask
    output_sequence = model.generate(
        input_ids,
        max_length=18,
        num_beams=5,
        no_repeat_ngram_size=2,
        top_k=50,
        attention_mask=torch.ones_like(input_ids)
    )

    # Decode and print the generated sequence
    generated_verse = tokenizer.decode(output_sequence[0], skip_special_tokens=True)
    print(f"Generated Verse:\n {generated_verse}")

# Example usage
input_sentence = "تو نیکی می کن"
generate_poetry(input_sentence)

# Example usage
input_sentence = "سعدیا مرد نکونام"
generate_poetry(input_sentence)

# Example usage
input_sentence = "سلام من به تو"
generate_poetry(input_sentence)

Generated Verse:
 تو نیکی می کن و مستان مکن با کس مگردان سخن جز به بیداد مکن
Generated Verse:
 سعدیا مرد نکونامش را بخواند بپرسید بسیار و بنشست پیشش
Generated Verse:
 سلام من به تو شادمان شدم ز خردک به جام دمادم شدم؟
