In [165]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

In [166]:
max_length=512
num_meter_classes = 23
batch_size = 1
learning_rate = 3e-4
num_epochs = 10
top = 10000
def get_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

In [167]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('asafaya/bert-base-arabic')
model = BertForSequenceClassification.from_pretrained('asafaya/bert-base-arabic', num_labels=num_meter_classes)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [202]:
meters = pd.read_csv('meter.csv')
#meters = meters[:top]
meters.head()

Unnamed: 0,الشطر,البحر
0,خَليلَيَّ لا تَستَعجِلا أَن تَزَوَّدا,الطويل
1,فَما لَبَثٌ يَوماً بِسابِقٍ مَغنَمٍ,الطويل
2,وَإِن تُنظِراني اليَومَ أَقضِ لُبانَةً,الطويل
3,لَعَمرُكَ ما نَفسٌ بِجِدٍ رَشيدَةٍ,الطويل
4,وَإِن ظَهَرَت مِنهُ قَوارِصُ جَمَّةٌ,الطويل


In [210]:
## print how many each class appears in the dataset
print(meters['البحر'].value_counts())
## drop class which has less than 70000 instances
meters = meters[meters['البحر'] != 'المنسرح']
meters = meters[meters['البحر'] != 'موشح']
meters = meters[meters['البحر'] != 'المجتث']
meters = meters[meters['البحر'] != 'الهزج']
meters = meters[meters['البحر'] != 'المديد']
meters = meters[meters['البحر'] != 'المتدارك']
meters = meters[meters['البحر'] != 'الدوبيت']
meters = meters[meters['البحر'] != 'المواليا']
meters = meters[meters['البحر'] != 'السلسلة']
meters = meters[meters['البحر'] != 'المقتضب']
meters = meters[meters['البحر'] != 'عامي']
meters = meters[meters['البحر'] != 'المضارع']
meters = meters[meters['البحر'] != 'شعر التفعيلة']
meters = meters[meters['البحر'] != 'شعر حر']
print(meters['البحر'].value_counts())
## save the new dataset


البحر
الطويل      516720
الكامل      440698
البسيط      290574
الخفيف      191568
الوافر      170820
الرجز       115762
الرمل        85472
المتقارب     84778
السريع       76166
Name: count, dtype: int64
البحر
الطويل      516720
الكامل      440698
البسيط      290574
الخفيف      191568
الوافر      170820
الرجز       115762
الرمل        85472
المتقارب     84778
السريع       76166
Name: count, dtype: int64


In [217]:
meters.to_csv('meter_filtered.csv', index=False)

In [169]:
class ArabicPoetryDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [170]:
meters['البحر'].unique()
label_dict = {label: i for i, label in enumerate(meters['البحر'].unique())}
label_dict

{'الطويل': 0,
 'المنسرح': 1,
 'المتقارب': 2,
 'الخفيف': 3,
 'الكامل': 4,
 'السريع': 5,
 'الوافر': 6,
 'البسيط': 7,
 'الرجز': 8,
 'الرمل': 9,
 'المجتث': 10,
 'المديد': 11}

In [171]:
text = meters['الشطر']
label = meters['البحر']
label = label.apply(lambda x: label_dict[x])
texts = text.to_list()
labels = label.to_list()

In [201]:
import numpy as np
# Assuming 'labels' is a list or array of your labels
label_counts = np.unique(labels, return_counts=True)
print(dict(zip(label_counts[0], label_counts[1])))

{0: 1}


In [172]:
dataset = ArabicPoetryDataset(texts, labels)
dataloader = DataLoader(dataset, batch_size = batch_size, shuffle=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [174]:
print('Dataset size:', len(dataset))
print('Number of batches:', len(dataloader))

Dataset size: 10000
Number of batches: 10000


In [215]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
losses = torch.tensor([])
for epoch in range(num_epochs):  # number of epochs
    model.train()
    model.to(get_device())
    i = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        input_ids = input_ids.to(get_device())
        attention_mask = attention_mask.to(get_device())
        labels = labels.to(get_device())
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        losses = torch.cat((losses, loss.view(1)))
        loss.backward()
        optimizer.step()
        print(f"iteration {i+1}, Loss: {loss.item()}")
        i += 1
    print(f"Epoch {epoch+1}, Loss: {losses.mean()}")

KeyboardInterrupt: 

In [176]:
def predict_meter(text, model, tokenizer):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}

    # Predict the meter using the model
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)

    # Return the predicted meter class (as integer)
    return predictions.item()

In [216]:
# get random index
text_example = meters.iloc[36]
print(text_example['البحر'])
example_text = "خَليلَيَّ لا تَستَعجِلا أَن تَزَوَّدا"
predicted_meter = predict_meter(text_example['الشطر'], model, tokenizer)
labelling = {v: k for k, v in label_dict.items()}
print(f"Predicted Meter Class: {labelling[predicted_meter]}")

الطويل
Predicted Meter Class: الطويل


NameError: name 'np' is not defined