# preparing the data

In [1]:
from torch.utils.data import DataLoader
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# 1. Preprocess Data

In [2]:
df = pd.read_csv('Arabic Poem Comprehensive Dataset (APCD).csv')
df.drop(columns=['البيت' , 'العصر' , 'الديوان' , 'القافية' , 'الشاعر' , 'البحر'], inplace=True)
df.dropna(subset=['الشطر الايمن', 'الشطر الايسر'], inplace=True)
poems = df.rename(columns={"الشطر الايمن": "prompt", "الشطر الايسر": "completion"})[:50000].to_dict(orient="records")

# 2. Create Dataset Class

In [3]:
class PoemDataset(torch.utils.data.Dataset):
    def __init__(self, poems, tokenizer, max_length):
        self.poems = poems
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.poems)

    def __getitem__(self, idx):
        poem = self.poems[idx]
        prompt = poem["prompt"]
        completion = poem["completion"]

        # Tokenize the prompt and completion
        inputs = self.tokenizer.encode(prompt, add_special_tokens=True, max_length=self.max_length, truncation=True, padding='max_length', return_tensors="pt")
        labels = self.tokenizer.encode(completion, add_special_tokens=True, max_length=self.max_length, truncation=True, padding='max_length', return_tensors="pt")

        return {
            "input_ids": inputs.squeeze(0),  # Remove the batch dimension
            "labels": labels.squeeze(0),      # Remove the batch dimension
        }


In [4]:
#class PoemDataset(torch.utils.data.Dataset):
   # def __init__(self, poems):
       # self.poems = poems

  #  def __len__(self):
     #   return len(self.poems)

   # def __getitem__(self, idx):
       #poem = self.poems[idx]
        #prompt = poem["prompt"]
        #completion = poem["completion"]
        #encoded_data = tokenizer(prompt, completion, padding="max_length", truncation=True, return_tensors="pt")
        #return encoded_data



# 3. Model and Tokenizer

In [5]:
model_name = "bakrianoo/t5-arabic-small" 
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/320M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/bakrianoo/t5-arabic-small/a4dfa25896e0801b897f27e5f4b683cbea05f267fb7d4f95522d04d255c1dd65?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1712079249&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMjA3OTI0OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9iYWtyaWFub28vdDUtYXJhYmljLXNtYWxsL2E0ZGZhMjU4OTZlMDgwMWI4OTdmMjdlNWY0YjY4M2NiZWEwNWYyNjdmYjdkNGY5NTUyMmQwNGQyNTVjMWRkNjU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=ciU6pgPXLpQby5xjQAqoN-v-x5yDsmv4wHVXTltvV6BysT6P-dnn5Wcm6L5QwbhynqLUrKWg08voBIDCCw9vT6YEIuh0te%7E0e8dvCOGn-xYBNjKcSsLoh6stwBwAgnry9di268VunAv1hOkEuo9CBYCPBr1LdFsvAKNrjQVtHruy-T97Eh8bUStHsUSxk8tkdqBEl8UC-ZrVh48IloU%7EmjNnEcNCDp%7E%7E%7EnUsjkjvkebcZaTS1xhQtqCSOSD8QHvfZgqY6EBa3uXQ36

pytorch_model.bin:   0%|          | 0.00/320M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/847k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# 4. Create Data Loader

In [6]:
max_length = 64 

# Create dataset and data loader
dataset = PoemDataset(poems, tokenizer, max_length)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [7]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x7f744b773910>

In [8]:
# 5. Data Preparation Function 
#def prepare_data(data):
    #encoded_data = tokenizer(data, padding="max_length", truncation=True, return_tensors="pt")
    #return encoded_data



# 6. Optimizer and Training Loop

In [9]:
num_epochs = 1
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    for data in data_loader:
        inputs = data["input_ids"]
        labels = data["labels"]

        optimizer.zero_grad()
        outputs = model(input_ids=inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [11]:
# 7.  Generation Function 
def generate_poem(prompt, max_length=50, temperature=1.0, num_beams=5):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        num_beams=num_beams,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [12]:
prompt = "وردة حمراء تتفتح"  
generated_poem = generate_poem(prompt)
print(generated_poem)

<extra_id_0> على على رأسِه على رُؤوسِه


In [None]:
model.save('model_soumia.model')

In [14]:
torch.save(model.state_dict(), 'model_soumia_torch.model')