In [1]:
import torch
from transformers import AutoModelForTokenClassification
from torch.utils.data import DataLoader , Dataset
from sklearn.model_selection import train_test_split

from keras.optimizers import Adam
from transformers import AutoModel, BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments ,  AutoTokenizer
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
from transformers.data.processors.utils import InputFeatures
import pandas as pd
import numpy as np

Setting the device to CUDA

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
train_path = 'preprocessed_data.txt'
val_path = 'preprocessed_val_data.txt'

In [44]:
df = pd.read_csv(train_path, sep="\t", names=["token", "tag"])
val_df = pd.read_csv(val_path, sep="\t", names=["token", "tag"])

train_labels = df['tag'].unique()
val_labels = val_df['tag'].unique()

all_labels = np.unique(np.concatenate((train_labels, val_labels)))
print(len(all_labels))

95


In [4]:
df

Unnamed: 0,token,tag
0,وعي,O
1,لاوعي,O
2,عم,O
3,شاف,O
4,حلم,O
...,...,...
50049,بازار,O
50050,انطلاقة,O
50051,ضابط,O
50052,احز,O


In [5]:
max_length = df["token"].str.len().max()
max_length

19

In [9]:
model_name = 'xlm-roberta-base'

xlm_model = AutoModelForTokenClassification.from_pretrained(model_name, from_tf=True)

tf_model.h5:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All TF 2.0 model weights were used when initializing XLMRobertaForTokenClassification.

All the weights of XLMRobertaForTokenClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use XLMRobertaForTokenClassification for predictions without further training.


In [45]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
train_label_encoded= label_encoder.transform(train_labels)
val_label_encoded= label_encoder.transform(val_labels)
train_labels = train_label_encoded.tolist()
val_labels= val_label_encoded.tolist()

In [46]:
label_map = {label: index for label, index in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))}
print("Length: ",len(label_map),"\nContent: ",label_map)


Length:  95 
Content:  {'B-AIRPORT': 0, 'B-BOUNDARY': 1, 'B-BUILDING-OR-GROUNDS': 2, 'B-CAMP': 3, 'B-CARDINAL': 4, 'B-CELESTIAL': 5, 'B-CLUSTER': 6, 'B-COM': 7, 'B-CONTINENT': 8, 'B-COUNTRY': 9, 'B-CURR': 10, 'B-Cluster': 11, 'B-DATE': 12, 'B-EDU': 13, 'B-ENT': 14, 'B-EVENT': 15, 'B-GOV': 16, 'B-LAND': 17, 'B-LAND-REGION-NATURAL': 18, 'B-LANGUAGE': 19, 'B-LAW': 20, 'B-MED': 21, 'B-MONEY': 22, 'B-NEIGHBORHOOD': 23, 'B-NONGOV': 24, 'B-NORP': 25, 'B-OCC': 26, 'B-ORDINAL': 27, 'B-ORG': 28, 'B-PATH': 29, 'B-PERCENT': 30, 'B-PERS': 31, 'B-PLANT': 32, 'B-PRODUCT': 33, 'B-QUANTITY': 34, 'B-REGION-GENERAL': 35, 'B-REGION-INTERNATIONAL': 36, 'B-REL': 37, 'B-SCI': 38, 'B-SPO': 39, 'B-SPORT': 40, 'B-STATE-OR-PROVINCE': 41, 'B-SUBAREA-FACILITY': 42, 'B-TIME': 43, 'B-TOWN': 44, 'B-UNIT': 45, 'B-WATER-BODY': 46, 'B-WEBSITE': 47, 'I-AIRPORT': 48, 'I-BOUNDARY': 49, 'I-BUILDING-OR-GROUNDS': 50, 'I-CAMP': 51, 'I-CARDINAL': 52, 'I-CLUSTER': 53, 'I-COM': 54, 'I-CONTINENT': 55, 'I-COUNTRY': 56, 'I-CURR': 57

### Preparing the Dataset

In [47]:
class XLMDataset(Dataset):
    def __init__(self, data, model_name, max_len, label_map):
      super(XLMDataset).__init__()
      self.data = data
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
      self.max_len = max_len
      self.label_map = label_map


    def __len__(self):
      # return len(self.text)
      return len(self.data)

    def __getitem__(self,item):
      text = str(self.data[item][0])
      target = self.data[item][1]

      encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )


      input_ids = encoding['input_ids'].squeeze(0)
      attention_mask = encoding['attention_mask'].squeeze(0)
      label_id = self.label_map[target]

      return{
         'input_ids': input_ids,
          'attention_mask': attention_mask,
            'labels': label_id
      }

## Dividing the dataset into training and validation

In [48]:
train_dataset = XLMDataset(df[['token', 'tag']].values.tolist(), model_name, max_length, label_map)

In [49]:
len(train_dataset)

50054

In [50]:
val_dataset = XLMDataset(val_df[['token', 'tag']].values.tolist(), model_name, max_length, label_map)

In [51]:
len(val_dataset)

14576

In [106]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
print(len(train_dataloader))

783


In [107]:
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True)
print(len(val_dataloader))

228


## Defining the model

In [54]:
learning_rate = 2e-5
epochs = 5 # we changed them later
num_classes = len(label_map)

In [63]:
model=AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
def compute_accuracy(predictions, labels):
    predictions = torch.argmax(predictions, dim=1)
    correct = (predictions == labels).sum().item()
    total = labels.size(0)
    return correct / total

## Testing the model on a small subset

We used here mixed precision technique to speed the training process

In [None]:
subset_size = 1000  # Set the desired size of the subset
train_dataset_subset = torch.utils.data.Subset(train_dataset, range(subset_size))
subset_train_dataloader = DataLoader(train_dataset_subset, batch_size=64, shuffle=True)

val_dataset_subset = torch.utils.data.Subset(val_dataset, range(subset_size))
subset_val_dataloader = DataLoader(train_dataset_subset, batch_size=64, shuffle=True)

In [69]:
from transformers import AdamW, get_scheduler
from tqdm import tqdm
from torch.cuda.amp import autocast

optimizer = AdamW(model.parameters(), lr=2e-5)

accumulation_steps = 4 
accumulated_batch_count = 0

epochs  = 10

for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0
    total_train_acc = 0.0
    train_samples = 0

    print("-------------------------------------------------------")
    with tqdm(total=len(subset_train_dataloader), desc=f"Epoch {epoch+1}", leave=False) as pbar:
        with autocast():
            for batch in subset_train_dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                optimizer.zero_grad()

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = (outputs.loss) / accumulation_steps
                logits = outputs.logits
                loss.backward()
                accumulated_batch_count += 1
                # optimizer.step()
                if accumulated_batch_count % accumulation_steps == 0:
                    # Perform optimizer step and reset accumulated_batch_count
                    optimizer.step()
                    optimizer.zero_grad()
                    accumulated_batch_count = 0  # Reset accumulated_batch_count

                acc = compute_accuracy(logits, labels)

                total_train_loss += loss.item() * labels.size(0)
                total_train_acc += acc * labels.size(0)
                train_samples += labels.size(0)
                pbar.update(1)

        #  Final optimizer step for remaining accumulated gradients
        if accumulated_batch_count > 0:
            optimizer.step()
            optimizer.zero_grad()
            accumulated_batch_count = 0

    # Evaluate on validation set
    model.eval()
    total_val_loss = 0.0
    total_val_acc = 0.0
    val_samples = 0

    with tqdm(total=len(subset_val_dataloader), desc=f"Epoch {epoch+1} (validation)", leave=False) as pbar:
        for batch in subset_val_dataloader:
            with autocast():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits

                acc = compute_accuracy(logits, labels)

                total_val_loss += loss.item() * labels.size(0)
                total_val_acc += acc * labels.size(0)
                val_samples += labels.size(0)
                pbar.update(1)

    train_loss = total_train_loss / train_samples
    train_acc = total_train_acc / train_samples
    val_loss = total_val_loss / val_samples
    val_acc = total_val_acc / val_samples

    print(f"Epoch number: {epoch+1 / epochs} ----> "
          f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f} "
          f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")



-------------------------------------------------------


                                                                   

Epoch number: 0.1 ----> Training Loss: 0.1245, Training Accuracy: 0.9625 Validation Loss: 0.5432, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                   

Epoch number: 1.1 ----> Training Loss: 0.1019, Training Accuracy: 0.9625 Validation Loss: 0.5203, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                   

Epoch number: 2.1 ----> Training Loss: 0.0902, Training Accuracy: 0.9625 Validation Loss: 0.5198, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                   

Epoch number: 3.1 ----> Training Loss: 0.0843, Training Accuracy: 0.9625 Validation Loss: 0.5240, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                   

Epoch number: 4.1 ----> Training Loss: 0.0827, Training Accuracy: 0.9625 Validation Loss: 0.5394, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                   

Epoch number: 5.1 ----> Training Loss: 0.0808, Training Accuracy: 0.9625 Validation Loss: 0.5433, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                   

Epoch number: 6.1 ----> Training Loss: 0.0790, Training Accuracy: 0.9625 Validation Loss: 0.5453, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                   

Epoch number: 7.1 ----> Training Loss: 0.0802, Training Accuracy: 0.9625 Validation Loss: 0.5264, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                   

Epoch number: 8.1 ----> Training Loss: 0.0807, Training Accuracy: 0.9625 Validation Loss: 0.5189, Validation Accuracy: 0.9350
-------------------------------------------------------


                                                                    

Epoch number: 9.1 ----> Training Loss: 0.0790, Training Accuracy: 0.9625 Validation Loss: 0.5257, Validation Accuracy: 0.9350




# **Training the entire dataset**

Here we trained the dataset using samples dump and load method, but dividing the dataset into samples, train and evaluate the model, then store the model into a pickle file for later training

In [74]:
train_subset_size = 5000  # Set the desired size of the subset
train_subset = torch.utils.data.Subset(train_dataset, range(train_subset_size))
subset_train_dataloader = DataLoader(train_subset, batch_size=64, shuffle=True)

val_subset_size = 1000
val_subset = torch.utils.data.Subset(val_dataset, range(val_subset_size))
subset_val_dataloader = DataLoader(val_subset, batch_size=64, shuffle=True)

In [77]:
len(train_subset)

5000

In [78]:
len(val_subset)

1000

In [79]:
from transformers import AdamW, get_scheduler
from tqdm import tqdm
from torch.cuda.amp import autocast

optimizer = AdamW(model.parameters(), lr=2e-5)

accumulation_steps = 4 
accumulated_batch_count = 0

epochs  = 5

for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0
    total_train_acc = 0.0
    train_samples = 0

    print("-------------------------------------------------------")
    with tqdm(total=len(subset_train_dataloader), desc=f"Epoch {epoch+1}", leave=False) as pbar:
        with autocast():
            for batch in subset_train_dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                optimizer.zero_grad()

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = (outputs.loss) / accumulation_steps
                logits = outputs.logits
                loss.backward()
                accumulated_batch_count += 1
                # optimizer.step()
                if accumulated_batch_count % accumulation_steps == 0:
                    # Perform optimizer step and reset accumulated_batch_count
                    optimizer.step()
                    optimizer.zero_grad()
                    accumulated_batch_count = 0  # Reset accumulated_batch_count

                acc = compute_accuracy(logits, labels)

                total_train_loss += loss.item() * labels.size(0)
                total_train_acc += acc * labels.size(0)
                train_samples += labels.size(0)
                pbar.update(1)

        #  Final optimizer step for remaining accumulated gradients
        if accumulated_batch_count > 0:
            optimizer.step()
            optimizer.zero_grad()
            accumulated_batch_count = 0

    # Evaluate on validation set
    model.eval()
    total_val_loss = 0.0
    total_val_acc = 0.0
    val_samples = 0

    with tqdm(total=len(subset_val_dataloader), desc=f"Epoch {epoch+1} (validation)", leave=False) as pbar:
        for batch in subset_val_dataloader:
            with autocast():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits

                acc = compute_accuracy(logits, labels)

                total_val_loss += loss.item() * labels.size(0)
                total_val_acc += acc * labels.size(0)
                val_samples += labels.size(0)
                pbar.update(1)

    train_loss = total_train_loss / train_samples
    train_acc = total_train_acc / train_samples
    val_loss = total_val_loss / val_samples
    val_acc = total_val_acc / val_samples

    print(f"Epoch number: {epoch+1 / epochs} ----> "
          f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f} "
          f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")



-------------------------------------------------------


                                                                     

Epoch number: 0.2 ----> Training Loss: 1.2618, Training Accuracy: 0.0000 Validation Loss: 3.9749, Validation Accuracy: 0.0000
-------------------------------------------------------


                                                                     

Epoch number: 1.2 ----> Training Loss: 0.6378, Training Accuracy: 0.5144 Validation Loss: 0.8998, Validation Accuracy: 0.9390
-------------------------------------------------------


                                                                     

Epoch number: 2.2 ----> Training Loss: 0.1938, Training Accuracy: 0.9322 Validation Loss: 0.5089, Validation Accuracy: 0.9390
-------------------------------------------------------


                                                                     

Epoch number: 3.2 ----> Training Loss: 0.1436, Training Accuracy: 0.9322 Validation Loss: 0.4988, Validation Accuracy: 0.9390
-------------------------------------------------------


                                                                     

Epoch number: 4.2 ----> Training Loss: 0.1373, Training Accuracy: 0.9322 Validation Loss: 0.4954, Validation Accuracy: 0.9390




In [80]:
import pickle

with open('xlm_model_5000.pkl', 'wb') as f:
    pickle.dump(model, f)

In [82]:
with open('xlm_model_5000.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [83]:
i = 2
train_subset = torch.utils.data.Subset(train_dataset, range(train_subset_size, i*train_subset_size))
subset_train_dataloader = DataLoader(train_subset, batch_size=64, shuffle=True)

val_subset = torch.utils.data.Subset(val_dataset, range(val_subset_size, i*val_subset_size))
subset_val_dataloader = DataLoader(val_subset, batch_size=64, shuffle=True)

i+=1

print(len(train_subset))
print(len(val_subset))
print(i)

5000
1000
3


In [84]:
from transformers import AdamW, get_scheduler
from tqdm import tqdm
from torch.cuda.amp import autocast

optimizer = AdamW(model.parameters(), lr=2e-5)

accumulation_steps = 4 
accumulated_batch_count = 0

model = loaded_model

for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0
    total_train_acc = 0.0
    train_samples = 0

    print("-------------------------------------------------------")
    with tqdm(total=len(subset_train_dataloader), desc=f"Epoch {epoch+1}", leave=False) as pbar:
        with autocast():
            for batch in subset_train_dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                optimizer.zero_grad()

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = (outputs.loss) / accumulation_steps
                logits = outputs.logits
                loss.backward()
                accumulated_batch_count += 1
                # optimizer.step()
                if accumulated_batch_count % accumulation_steps == 0:
                    # Perform optimizer step and reset accumulated_batch_count
                    optimizer.step()
                    optimizer.zero_grad()
                    accumulated_batch_count = 0  # Reset accumulated_batch_count

                acc = compute_accuracy(logits, labels)

                total_train_loss += loss.item() * labels.size(0)
                total_train_acc += acc * labels.size(0)
                train_samples += labels.size(0)
                pbar.update(1)

        #  Final optimizer step for remaining accumulated gradients
        if accumulated_batch_count > 0:
            optimizer.step()
            optimizer.zero_grad()
            accumulated_batch_count = 0

    # Evaluate on validation set
    model.eval()
    total_val_loss = 0.0
    total_val_acc = 0.0
    val_samples = 0

    with tqdm(total=len(subset_val_dataloader), desc=f"Epoch {epoch+1} (validation)", leave=False) as pbar:
        for batch in subset_val_dataloader:
            with autocast():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits

                acc = compute_accuracy(logits, labels)

                total_val_loss += loss.item() * labels.size(0)
                total_val_acc += acc * labels.size(0)
                val_samples += labels.size(0)
                pbar.update(1)

    train_loss = total_train_loss / train_samples
    train_acc = total_train_acc / train_samples
    val_loss = total_val_loss / val_samples
    val_acc = total_val_acc / val_samples

    print(f"Epoch number: {epoch+1 / epochs} ----> "
          f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f} "
          f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")



-------------------------------------------------------


                                                                     

Epoch number: 0.2 ----> Training Loss: 0.1383, Training Accuracy: 0.9310 Validation Loss: 0.7253, Validation Accuracy: 0.9070
-------------------------------------------------------


                                                                     

Epoch number: 1.2 ----> Training Loss: 0.1376, Training Accuracy: 0.9310 Validation Loss: 0.7253, Validation Accuracy: 0.9070
-------------------------------------------------------


                                                                     

Epoch number: 2.2 ----> Training Loss: 0.1378, Training Accuracy: 0.9310 Validation Loss: 0.7253, Validation Accuracy: 0.9070
-------------------------------------------------------


                                                                     

Epoch number: 3.2 ----> Training Loss: 0.1378, Training Accuracy: 0.9310 Validation Loss: 0.7253, Validation Accuracy: 0.9070
-------------------------------------------------------


                                                                     

Epoch number: 4.2 ----> Training Loss: 0.1380, Training Accuracy: 0.9310 Validation Loss: 0.7253, Validation Accuracy: 0.9070




In [85]:
import pickle

with open('xlm_model_10000.pkl', 'wb') as f:
    pickle.dump(model, f)

In [87]:
with open('xlm_model_10000.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [88]:

train_subset = torch.utils.data.Subset(train_dataset, range(10000, 20000))
subset_train_dataloader = DataLoader(train_subset, batch_size=64, shuffle=True)

val_subset = torch.utils.data.Subset(val_dataset, range(2000, 4000))
subset_val_dataloader = DataLoader(val_subset, batch_size=64, shuffle=True)

print(len(train_subset))
print(len(val_subset))
print(i)

10000
2000
3


In [89]:
from transformers import AdamW, get_scheduler
from tqdm import tqdm
from torch.cuda.amp import autocast

optimizer = AdamW(model.parameters(), lr=2e-5)

accumulation_steps = 4 
accumulated_batch_count = 0

model = loaded_model

for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0
    total_train_acc = 0.0
    train_samples = 0

    print("-------------------------------------------------------")
    with tqdm(total=len(subset_train_dataloader), desc=f"Epoch {epoch+1}", leave=False) as pbar:
        with autocast():
            for batch in subset_train_dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                optimizer.zero_grad()

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = (outputs.loss) / accumulation_steps
                logits = outputs.logits
                loss.backward()
                accumulated_batch_count += 1
                # optimizer.step()
                if accumulated_batch_count % accumulation_steps == 0:
                    # Perform optimizer step and reset accumulated_batch_count
                    optimizer.step()
                    optimizer.zero_grad()
                    accumulated_batch_count = 0  # Reset accumulated_batch_count

                acc = compute_accuracy(logits, labels)

                total_train_loss += loss.item() * labels.size(0)
                total_train_acc += acc * labels.size(0)
                train_samples += labels.size(0)
                pbar.update(1)

        #  Final optimizer step for remaining accumulated gradients
        if accumulated_batch_count > 0:
            optimizer.step()
            optimizer.zero_grad()
            accumulated_batch_count = 0

    # Evaluate on validation set
    model.eval()
    total_val_loss = 0.0
    total_val_acc = 0.0
    val_samples = 0

    with tqdm(total=len(subset_val_dataloader), desc=f"Epoch {epoch+1} (validation)", leave=False) as pbar:
        for batch in subset_val_dataloader:
            with autocast():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits

                acc = compute_accuracy(logits, labels)

                total_val_loss += loss.item() * labels.size(0)
                total_val_acc += acc * labels.size(0)
                val_samples += labels.size(0)
                pbar.update(1)

    train_loss = total_train_loss / train_samples
    train_acc = total_train_acc / train_samples
    val_loss = total_val_loss / val_samples
    val_acc = total_val_acc / val_samples

    print(f"Epoch number: {epoch+1}/{epochs} ---> "
          f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f} "
          f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")



-------------------------------------------------------


                                                                     

Epoch number: 1/5 ---> Training Loss: 0.5592, Training Accuracy: 0.6901 Validation Loss: 2.0605, Validation Accuracy: 0.7310
-------------------------------------------------------


                                                                     

Epoch number: 2/5 ---> Training Loss: 0.5584, Training Accuracy: 0.6901 Validation Loss: 2.0605, Validation Accuracy: 0.7310
-------------------------------------------------------


                                                                     

Epoch number: 3/5 ---> Training Loss: 0.5576, Training Accuracy: 0.6901 Validation Loss: 2.0605, Validation Accuracy: 0.7310
-------------------------------------------------------


                                                                     

Epoch number: 4/5 ---> Training Loss: 0.5580, Training Accuracy: 0.6901 Validation Loss: 2.0605, Validation Accuracy: 0.7310
-------------------------------------------------------


                                                                     

Epoch number: 5/5 ---> Training Loss: 0.5581, Training Accuracy: 0.6901 Validation Loss: 2.0605, Validation Accuracy: 0.7310




In [90]:
import pickle

with open('xlm_model_20000.pkl', 'wb') as f:
    pickle.dump(model, f)

In [91]:
with open('xlm_model_20000.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [92]:

train_subset = torch.utils.data.Subset(train_dataset, range(20000, 30000))
subset_train_dataloader = DataLoader(train_subset, batch_size=64, shuffle=True)

val_subset = torch.utils.data.Subset(val_dataset, range(4000, 7000))
subset_val_dataloader = DataLoader(val_subset, batch_size=64, shuffle=True)

print(len(train_subset))
print(len(val_subset))
print(i)

10000
3000
3


In [93]:
from transformers import AdamW, get_scheduler
from tqdm import tqdm
from torch.cuda.amp import autocast

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3

accumulation_steps = 4 
accumulated_batch_count = 0

model = loaded_model

for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0
    total_train_acc = 0.0
    train_samples = 0

    print("-------------------------------------------------------")
    with tqdm(total=len(subset_train_dataloader), desc=f"Epoch {epoch+1}", leave=False) as pbar:
        with autocast():
            for batch in subset_train_dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                optimizer.zero_grad()

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = (outputs.loss) / accumulation_steps
                logits = outputs.logits
                loss.backward()
                accumulated_batch_count += 1
                # optimizer.step()
                if accumulated_batch_count % accumulation_steps == 0:
                    # Perform optimizer step and reset accumulated_batch_count
                    optimizer.step()
                    optimizer.zero_grad()
                    accumulated_batch_count = 0  # Reset accumulated_batch_count

                acc = compute_accuracy(logits, labels)

                total_train_loss += loss.item() * labels.size(0)
                total_train_acc += acc * labels.size(0)
                train_samples += labels.size(0)
                pbar.update(1)

        #  Final optimizer step for remaining accumulated gradients
        if accumulated_batch_count > 0:
            optimizer.step()
            optimizer.zero_grad()
            accumulated_batch_count = 0

    # Evaluate on validation set
    model.eval()
    total_val_loss = 0.0
    total_val_acc = 0.0
    val_samples = 0

    with tqdm(total=len(subset_val_dataloader), desc=f"Epoch {epoch+1} (validation)", leave=False) as pbar:
        for batch in subset_val_dataloader:
            with autocast():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits

                acc = compute_accuracy(logits, labels)

                total_val_loss += loss.item() * labels.size(0)
                total_val_acc += acc * labels.size(0)
                val_samples += labels.size(0)
                pbar.update(1)

    train_loss = total_train_loss / train_samples
    train_acc = total_train_acc / train_samples
    val_loss = total_val_loss / val_samples
    val_acc = total_val_acc / val_samples

    print(f"Epoch number: {epoch+1}/{epochs} ---> "
          f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f} "
          f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")



-------------------------------------------------------


                                                                     

Epoch number: 1/3 ---> Training Loss: 0.6805, Training Accuracy: 0.6234 Validation Loss: 3.5797, Validation Accuracy: 0.5323
-------------------------------------------------------


                                                                     

Epoch number: 2/3 ---> Training Loss: 0.6813, Training Accuracy: 0.6234 Validation Loss: 3.5797, Validation Accuracy: 0.5323
-------------------------------------------------------


                                                                     

Epoch number: 3/3 ---> Training Loss: 0.6817, Training Accuracy: 0.6233 Validation Loss: 3.5797, Validation Accuracy: 0.5323




In [94]:
import pickle

with open('xlm_model_30000.pkl', 'wb') as f:
    pickle.dump(model, f)

In [95]:
with open('xlm_model_30000.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [96]:

train_subset = torch.utils.data.Subset(train_dataset, range(30000, 40000))
subset_train_dataloader = DataLoader(train_subset, batch_size=64, shuffle=True)

val_subset = torch.utils.data.Subset(val_dataset, range(7000, 10000))
subset_val_dataloader = DataLoader(val_subset, batch_size=64, shuffle=True)

print(len(train_subset))
print(len(val_subset))
print(i)

10000
3000
3


In [97]:
from transformers import AdamW, get_scheduler
from tqdm import tqdm
from torch.cuda.amp import autocast

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3

accumulation_steps = 4 
accumulated_batch_count = 0

model = loaded_model

for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0
    total_train_acc = 0.0
    train_samples = 0

    print("-------------------------------------------------------")
    with tqdm(total=len(subset_train_dataloader), desc=f"Epoch {epoch+1}", leave=False) as pbar:
        with autocast():
            for batch in subset_train_dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                optimizer.zero_grad()

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = (outputs.loss) / accumulation_steps
                logits = outputs.logits
                loss.backward()
                accumulated_batch_count += 1
                # optimizer.step()
                if accumulated_batch_count % accumulation_steps == 0:
                    # Perform optimizer step and reset accumulated_batch_count
                    optimizer.step()
                    optimizer.zero_grad()
                    accumulated_batch_count = 0  # Reset accumulated_batch_count

                acc = compute_accuracy(logits, labels)

                total_train_loss += loss.item() * labels.size(0)
                total_train_acc += acc * labels.size(0)
                train_samples += labels.size(0)
                pbar.update(1)

        #  Final optimizer step for remaining accumulated gradients
        if accumulated_batch_count > 0:
            optimizer.step()
            optimizer.zero_grad()
            accumulated_batch_count = 0

    # Evaluate on validation set
    model.eval()
    total_val_loss = 0.0
    total_val_acc = 0.0
    val_samples = 0

    with tqdm(total=len(subset_val_dataloader), desc=f"Epoch {epoch+1} (validation)", leave=False) as pbar:
        for batch in subset_val_dataloader:
            with autocast():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits

                acc = compute_accuracy(logits, labels)

                total_val_loss += loss.item() * labels.size(0)
                total_val_acc += acc * labels.size(0)
                val_samples += labels.size(0)
                pbar.update(1)

    train_loss = total_train_loss / train_samples
    train_acc = total_train_acc / train_samples
    val_loss = total_val_loss / val_samples
    val_acc = total_val_acc / val_samples

    print(f"Epoch number: {epoch+1}/{epochs} ---> "
          f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f} "
          f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

-------------------------------------------------------


                                                                     

Epoch number: 1/3 ---> Training Loss: 0.4874, Training Accuracy: 0.7315 Validation Loss: 1.9389, Validation Accuracy: 0.7493
-------------------------------------------------------


                                                                     

Epoch number: 2/3 ---> Training Loss: 0.4886, Training Accuracy: 0.7315 Validation Loss: 1.9389, Validation Accuracy: 0.7493
-------------------------------------------------------


                                                                     

Epoch number: 3/3 ---> Training Loss: 0.4877, Training Accuracy: 0.7315 Validation Loss: 1.9389, Validation Accuracy: 0.7493




In [98]:
import pickle

with open('xlm_model_40000.pkl', 'wb') as f:
    pickle.dump(model, f)

In [99]:
with open('xlm_model_40000.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [100]:

train_subset = torch.utils.data.Subset(train_dataset, range(40000, 50054))
subset_train_dataloader = DataLoader(train_subset, batch_size=64, shuffle=True)

val_subset = torch.utils.data.Subset(val_dataset, range(10000, 14576))
subset_val_dataloader = DataLoader(val_subset, batch_size=64, shuffle=True)

print(len(train_subset))
print(len(val_subset))

10054
4576


In [101]:
from transformers import AdamW, get_scheduler
from tqdm import tqdm
from torch.cuda.amp import autocast

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3

accumulation_steps = 4 
accumulated_batch_count = 0

model = loaded_model

for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0
    total_train_acc = 0.0
    train_samples = 0

    print("-------------------------------------------------------")
    with tqdm(total=len(subset_train_dataloader), desc=f"Epoch {epoch+1}", leave=False) as pbar:
        with autocast():
            for batch in subset_train_dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                optimizer.zero_grad()

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = (outputs.loss) / accumulation_steps
                logits = outputs.logits
                loss.backward()
                accumulated_batch_count += 1
                # optimizer.step()
                if accumulated_batch_count % accumulation_steps == 0:
                    # Perform optimizer step and reset accumulated_batch_count
                    optimizer.step()
                    optimizer.zero_grad()
                    accumulated_batch_count = 0  # Reset accumulated_batch_count

                acc = compute_accuracy(logits, labels)

                total_train_loss += loss.item() * labels.size(0)
                total_train_acc += acc * labels.size(0)
                train_samples += labels.size(0)
                pbar.update(1)

        #  Final optimizer step for remaining accumulated gradients
        if accumulated_batch_count > 0:
            optimizer.step()
            optimizer.zero_grad()
            accumulated_batch_count = 0

    # Evaluate on validation set
    model.eval()
    total_val_loss = 0.0
    total_val_acc = 0.0
    val_samples = 0

    with tqdm(total=len(subset_val_dataloader), desc=f"Epoch {epoch+1} (validation)", leave=False) as pbar:
        for batch in subset_val_dataloader:
            with autocast():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels'].long()

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits

                acc = compute_accuracy(logits, labels)

                total_val_loss += loss.item() * labels.size(0)
                total_val_acc += acc * labels.size(0)
                val_samples += labels.size(0)
                pbar.update(1)

    train_loss = total_train_loss / train_samples
    train_acc = total_train_acc / train_samples
    val_loss = total_val_loss / val_samples
    val_acc = total_val_acc / val_samples

    print(f"Epoch number: {epoch+1}/{epochs} ---> "
          f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f} "
          f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

-------------------------------------------------------


                                                                     

Epoch number: 1/3 ---> Training Loss: 0.5479, Training Accuracy: 0.6972 Validation Loss: 1.9979, Validation Accuracy: 0.7399
-------------------------------------------------------


                                                                     

Epoch number: 2/3 ---> Training Loss: 0.5464, Training Accuracy: 0.6972 Validation Loss: 1.9979, Validation Accuracy: 0.7399
-------------------------------------------------------


                                                                     

Epoch number: 3/3 ---> Training Loss: 0.5477, Training Accuracy: 0.6972 Validation Loss: 1.9979, Validation Accuracy: 0.7399




In [102]:
import pickle

with open('xlm_model_full.pkl', 'wb') as f:
    pickle.dump(model, f)

In [103]:
with open('xlm_model_full.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# **Model Evaluation:**

In [108]:
from sklearn.metrics import precision_recall_fscore_support

model.eval()
all_logits = []
all_labels = []

with tqdm(total=len(val_dataloader), desc=f"Evaluating model", leave=False) as pbar:
    for batch in val_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels'].long()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            all_logits.append(logits.detach().cpu())
            all_labels.append(labels.detach().cpu())
            pbar.update(1)

all_logits = torch.cat(all_logits, dim=0)
all_labels = torch.cat(all_labels, dim=0)

predictions = torch.argmax(all_logits, dim=-1)

precision, recall, f1, _ = precision_recall_fscore_support(all_labels, predictions, average='weighted')

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

                                                                   

Precision: 0.5228, Recall: 0.7230, F1-score: 0.6068


  _warn_prf(average, modifier, msg_start, len(result))
