### find maximum lenght of poem_text in  train,test,validation sets

In [None]:

import pandas as pd
df = pd.read_csv('train_samples.csv')
max_len = df['poem_text'].str.len().max()
print("Maximum length:", max_len)

df = pd.read_csv('validation_samples.csv')
max_len = df['poem_text'].str.len().max()
print("Maximum length:", max_len)


df = pd.read_csv('test_samples.csv')
max_len = df['poem_text'].str.len().max()
print("Maximum length:", max_len)

Maximum length: 58
Maximum length: 51
Maximum length: 49


### find maximum token length in train dataset for deteining in the Roberta training

In [None]:
from transformers import XLMRobertaTokenizer
import pandas as pd

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

train_df = pd.read_csv('train_samples.csv')

max_token_len = 0
for text in train_df['poem_text']:
    tokens = tokenizer.encode(text, add_special_tokens=True)
    max_token_len = max(max_token_len, len(tokens))

print(f"The maximum token length in the training data is: {max_token_len}")

: 

## Main code 

In [None]:
import pandas as pd
import re

# Load and shuffle the dataset
train_df = pd.read_csv('train_samples.csv')
train_df = train_df.sample(frac=1).reset_index(drop=True)

def clean_text(text):
    # Remove unwanted Unicode characters
    text = re.sub(r'\u200c', '', text)  # Remove zero-width non-joiners
    return text

# Apply cleaning to 'poem_text' column
train_df['poem_text'] = train_df['poem_text'].apply(clean_text)

print("Sample of cleaned and shuffled training data:")
print(train_df[['poem_text', 'metre']].head(5))

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
import tqdm


val_df = pd.read_csv('validation_samples.csv')

# Combine unique classes from both train and validation sets
combined_classes = pd.Series(train_df['metre'].tolist() + val_df['metre'].tolist()).unique()
metre_to_id = {metre: idx for idx, metre in enumerate(combined_classes)}

# Print the mapping to check
print("Metre to ID mapping:", metre_to_id)

y_train = torch.tensor([metre_to_id.get(m, -1) for m in train_df['metre']])  
y_val = torch.tensor([metre_to_id.get(m, -1) for m in val_df['metre']])      

class RobertaDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=40):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the input text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label
        }

# Initialize tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

# Prepare DataLoaders
train_dataset = RobertaDataset(train_df['poem_text'].tolist(), y_train, tokenizer)
val_dataset = RobertaDataset(val_df['poem_text'].tolist(), y_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Initialize the model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=len(metre_to_id), ignore_mismatched_sizes=True)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

## we can add compute weights to code using these lines !! 

# class_counts = np.bincount(y_train.numpy())
# class_weights = torch.tensor(len(y_train) / (len(class_counts) * class_counts), dtype=torch.float).to(device)
# loss_fn = CrossEntropyLoss(weight=class_weights)

# Initialize loss function without class weights
loss_fn = CrossEntropyLoss()

# Modify train_epoch function to use the standard loss function
def train_epoch(model, data_loader, optimizer, device, loss_fn):
    model = model.train()
    total_loss = 0
    correct_predictions = 0

    for data in tqdm.tqdm(data_loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits

        loss = loss_fn(logits, labels)  
        total_loss += loss.item()
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

        loss.backward()
        optimizer.step()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

# Early stopping parameters
patience = 4  #
best_val_loss = float('inf')
early_stop_counter = 0

from sklearn.metrics import f1_score, precision_score, recall_score

def eval_model(model, data_loader, device):
    model = model.eval()
    correct_predictions = 0
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct_predictions / len(data_loader.dataset)
    f1 = f1_score(all_labels, all_preds, average='macro')
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')

    return accuracy, f1, precision, recall

# Training loop with early stopping
epochs = 7  # Set the number of epochs
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, loss_fn)
    print(f"Train loss: {train_loss}, accuracy: {train_acc}")
    val_accuracy, val_f1, val_precision, val_recall = eval_model(model, val_loader, device)
    print(f"Validation accuracy: {val_accuracy}, F1: {val_f1}, Precision: {val_precision}, Recall: {val_recall}")

    # Early stopping check
    if val_f1 < best_val_loss:
        best_val_loss = val_f1
        early_stop_counter = 0
        model.save_pretrained('xlm_roberta_large_best.bin')  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break



Sample of cleaned and shuffled training data:
                            poem_text                      metre
0     چو لعل میخری از کان من بخر باری  مفاعلن فعلاتن مفاعلن فعلن
1          ز شاهان به شمشیر بستد خراج      فعولن فعولن فعولن فعل
2  که همه عمر دعاگوی و هوادار تو نیست  فعلاتن فعلاتن فعلاتن فعلن
3         گلی زینسان چمن افروز و دلکش      مفاعیلن مفاعیلن فعولن
4          نیندیشد از هیچ باران و برف      فعولن فعولن فعولن فعل


  from .autonotebook import tqdm as notebook_tqdm


Metre to ID mapping: {'مفاعلن فعلاتن مفاعلن فعلن': 0, 'فعولن فعولن فعولن فعل': 1, 'فعلاتن فعلاتن فعلاتن فعلن': 2, 'مفاعیلن مفاعیلن فعولن': 3, 'مفعول مفاعیل مفاعیل فعل': 4, 'مفعول مفاعلن فعولن': 5, 'فاعلاتن فاعلاتن فاعلن': 6, 'مفتعلن فاعلن مفتعلن فاعلن': 7, 'مفعول فاعلات مفاعیل فاعلن': 8, 'فعلاتن مفاعلن فعلن': 9, 'فعلاتن فعلاتن فعلن': 10, 'مستفعلن مستفعلن مستفعلن مستفعلن': 11, 'مفتعلن مفتعلن فاعلن': 12, 'فعلات فاعلاتن فعلات فاعلاتن': 13, 'مفعول مفاعیلن مفعول مفاعیلن': 14, 'فاعلاتن فاعلاتن فاعلاتن فاعلن': 15, 'مفعول مفاعیل مفاعیل فعولن': 16, 'مفعول مفاعیل فاعلاتن': 17, 'مفاعیلن مفاعیلن مفاعیلن مفاعیلن': 18, 'مفتعلن فاعلات مفتعلن فع': 19, 'مفعول مفاعلن مفاعیلن': 20, 'فعلاتن فعلاتن فعلاتن فعلاتن': 21, 'مفتعلن مفاعلن مفتعلن مفاعلن': 22, 'مفعول فاعلاتن مفعول فاعلاتن': 23, 'فعولن فعولن فعولن فعولن': 24, 'فعلاتن مفاعلن فعلاتن مفاعلن': 25, 'فاعلاتن فاعلاتن فاعلاتن': 26, 'مستفعلن فعلن مستفعلن فعلن': 27, 'مفاعیل مفاعیل مفاعیل فعولن': 28, 'فعلاتن فعلاتن فعلاتن فع': 29, 'مفاعلن فعلاتن مفاعلن فعلاتن

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /home/mh/Documents/NLU-exe/XLM-roberta-large/xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/7


100%|██████████| 11706/11706 [1:56:07<00:00,  1.68it/s]


Train loss: 0.5137073632146557, accuracy: 0.8392557769519904


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation accuracy: 0.9464096838641438, F1: 0.5887457481108713, Precision: 0.6406803700076417, Recall: 0.5957461149380252
Epoch 2/7


100%|██████████| 11706/11706 [1:56:14<00:00,  1.68it/s]


Train loss: 0.11507444930668298, accuracy: 0.9650085960191355


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation accuracy: 0.9672817017275825, F1: 0.7023443653439451, Precision: 0.778921323084603, Recall: 0.6951380161528952
Epoch 3/7


100%|██████████| 11706/11706 [1:56:27<00:00,  1.68it/s]


Train loss: 0.0826995539632388, accuracy: 0.9749741051597471


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation accuracy: 0.9714420025854977, F1: 0.7722337217424986, Precision: 0.852772964213258, Recall: 0.7523141636714571
Epoch 4/7


100%|██████████| 11706/11706 [1:56:17<00:00,  1.68it/s]


Train loss: 0.06477536235804673, accuracy: 0.9801463992824193


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation accuracy: 0.9764484663297684, F1: 0.8159462250365365, Precision: 0.8989265836544597, Recall: 0.7931833763825734
Epoch 5/7


100%|██████████| 11706/11706 [1:56:29<00:00,  1.67it/s]


Train loss: 0.05373352673781375, accuracy: 0.9835447633692124
Validation accuracy: 0.97743565636385, F1: 0.8346389942947354, Precision: 0.9133923071843354, Recall: 0.8139478518520299
Early stopping triggered.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [2]:

val_accuracy, val_f1, val_precision, val_recall = eval_model(model, val_loader, device)
print(f"Validation accuracy: {val_accuracy}, F1: {val_f1}, Precision: {val_precision}, Recall: {val_recall}")

Validation accuracy: 0.97743565636385, F1: 0.8346389942947354, Precision: 0.9133923071843354, Recall: 0.8139478518520299


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import pandas as pd

# Load the test data
test_df = pd.read_csv('/Poem Meter Dataset/test_samples.csv')
test_df['poem_text'] = test_df['poem_text'].apply(clean_text)  # Apply the cleaning function

# Define the custom dataset class
class RobertaDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        
        # Tokenize the input text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        item = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze()
        }

        # Add label only if available
        if self.labels is not None:
            item['label'] = self.labels[idx]

        return item

# Initialize the tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")

# Prepare test dataset and dataloader
test_dataset = RobertaDataset(test_df['poem_text'].tolist(), labels=None, tokenizer=tokenizer)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Load the trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Perform predictions
predictions = []
with torch.no_grad():
    for data in test_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())

# Map predictions back to metre labels
id_to_metre = {idx: metre for metre, idx in metre_to_id.items()}
test_df['predicted_metre'] = [id_to_metre[pred] for pred in predictions]

# Save predictions to CSV
test_df.to_csv('classification-based-Roberta-large-test_predictions.csv', index=False)

print("Predictions saved to classification-based-Roberta-large-test_predictions.csv")


Predictions saved to classification-based-Roberta-large-test_predictions.csv
