In [3]:
!cp -r /content/xlm_roberta_large_best.bin /content/drive/MyDrive

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import unicodedata

In [2]:
import json

def create_tokens_and_labels(id, sample):
    intent = sample['intent']
    utt = sample['utt']
    annot_utt = sample['annot_utt']
    tokens = utt.split()
    labels = []
    label = 'O'
    split_annot_utt = annot_utt.split()
    idx = 0
    BIO_SLOT = False
    while idx < len(split_annot_utt):
        if split_annot_utt[idx].startswith('['):
            label = split_annot_utt[idx].lstrip('[')
            idx += 2
            BIO_SLOT = True
        elif split_annot_utt[idx].endswith(']'):
            if split_annot_utt[idx-1] == ":":
                labels.append("B-" + label)
                label = 'O'
                idx += 1
            else:
                labels.append("I-" + label)
                label = 'O'
                idx += 1
            BIO_SLOT = False
        else:
            if split_annot_utt[idx-1] == ":":
                labels.append("B-" + label)
                idx += 1
            elif BIO_SLOT == True:
                labels.append("I-" + label)
                idx += 1
            else:
                labels.append("O")
                idx += 1

    if len(tokens) != len(labels):
        raise ValueError(f"Len of tokens, {tokens}, doesnt match len of labels, {labels}, "
                         f"for id {id} and annot utt: {annot_utt}")
    return tokens, labels, intent


def Read_Massive_dataset(massive_raw):
    sentences_tr, tags_tr, intent_tags_tr = [], [], []
    sentences_val, tags_val, intent_tags_val = [], [], []
    sentences_test, tags_test, intent_tags_test = [], [], []
    all_tags, all_intents = [], []

    for id, sample in enumerate(massive_raw):
        if sample['partition'] == 'train':
            tokens, labels, intent = create_tokens_and_labels(id, sample)
            sentences_tr.append(tokens)
            tags_tr.append(labels)
            intent_tags_tr.append(intent)
            all_tags.extend(labels)
            all_intents.append(intent)
        elif sample['partition'] == 'dev':
            tokens, labels, intent = create_tokens_and_labels(id, sample)
            sentences_val.append(tokens)
            tags_val.append(labels)
            intent_tags_val.append(intent)
        elif sample['partition'] == 'test':
            tokens, labels, intent = create_tokens_and_labels(id, sample)
            sentences_test.append(tokens)
            tags_test.append(labels)
            intent_tags_test.append(intent)

    # Unique labels and intents
    unique_tags = sorted(set(all_tags))
    unique_intents = sorted(set(all_intents))
    tag2id = {tag: i for i, tag in enumerate(unique_tags)}
    intent2id = {intent: i for i, intent in enumerate(unique_intents)}

    return (sentences_tr, tags_tr, intent_tags_tr), \
           (sentences_val, tags_val, intent_tags_val), \
           (sentences_test, tags_test, intent_tags_test), \
           tag2id, intent2id

# Download and extract MASSIVE dataset
!gdown https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.0.tar.gz
!tar -xvf /content/amazon-massive-dataset-1.0.tar.gz

#Read the dataset
massive_raw_fa = []
with open('/content/1.0/data/fa-IR.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        massive_raw_fa.append(json.loads(line))

(train_data, val_data, test_data, tag2id, intent2id) = Read_Massive_dataset(massive_raw_fa)


Downloading...
From: https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.0.tar.gz
To: /content/amazon-massive-dataset-1.0.tar.gz
100% 39.5M/39.5M [00:01<00:00, 26.8MB/s]
1.0/
1.0/CITATION.md
1.0/NOTICE.md
1.0/data/
1.0/data/mn-MN.jsonl
1.0/data/af-ZA.jsonl
1.0/data/el-GR.jsonl
1.0/data/ta-IN.jsonl
1.0/data/ar-SA.jsonl
1.0/data/ur-PK.jsonl
1.0/data/pl-PL.jsonl
1.0/data/ko-KR.jsonl
1.0/data/az-AZ.jsonl
1.0/data/da-DK.jsonl
1.0/data/kn-IN.jsonl
1.0/data/tl-PH.jsonl
1.0/data/is-IS.jsonl
1.0/data/lv-LV.jsonl
1.0/data/it-IT.jsonl
1.0/data/es-ES.jsonl
1.0/data/fr-FR.jsonl
1.0/data/ml-IN.jsonl
1.0/data/km-KH.jsonl
1.0/data/fa-IR.jsonl
1.0/data/sw-KE.jsonl
1.0/data/en-US.jsonl
1.0/data/tr-TR.jsonl
1.0/data/bn-BD.jsonl
1.0/data/he-IL.jsonl
1.0/data/te-IN.jsonl
1.0/data/pt-PT.jsonl
1.0/data/ka-GE.jsonl
1.0/data/ja-JP.jsonl
1.0/data/id-ID.jsonl
1.0/data/ru-RU.jsonl
1.0/data/hy-AM.jsonl
1.0/data/nb-NO.jsonl
1.0/data/ms-MY.jsonl
1.0/data/sq-AL.jsonl
1.0/data/sv-SE.jsonl
1.0/

In [5]:
import json
import random
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, AdamW
from torch.nn import CrossEntropyLoss
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score

# Tokenize the sentences
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

def encode_data(sentences, intents, tokenizer, intent2id):
    encodings = tokenizer(sentences, is_split_into_words=True, padding=True, truncation=True, max_length=128, return_tensors='pt')
    intent_labels = [intent2id[intent] for intent in intents]
    return encodings, torch.tensor(intent_labels)

# Encode data
train_encodings, train_intents = encode_data(train_data[0], train_data[2], tokenizer, intent2id)
val_encodings, val_intents = encode_data(val_data[0], val_data[2], tokenizer, intent2id)

# Convert encodings to tensors for DataLoader
def convert_to_tensor(encodings, labels):
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    labels = labels
    return TensorDataset(input_ids, attention_mask, labels)

train_dataset = convert_to_tensor(train_encodings, train_intents)
val_dataset = convert_to_tensor(val_encodings, val_intents)

# DataLoader for batch processing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Model initialization
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base',
                                                            num_labels=len(intent2id),
                                                            ignore_mismatched_sizes=True)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Initialize loss function without class weights
loss_fn = CrossEntropyLoss()

def train_epoch(model, data_loader, optimizer, device, loss_fn):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for data in tqdm(data_loader):
        input_ids, attention_mask, labels = [t.to(device) for t in data]

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

        loss.backward()
        optimizer.step()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for data in data_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in data]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate both micro and macro metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(all_labels, all_preds, average='macro')

    return {
        'accuracy': accuracy,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro
    }

# Early stopping parameters
patience = 4
best_val_f1 = float('-inf')  
early_stop_counter = 0

# Training loop with early stopping
epochs = 7
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, loss_fn)
    print(f"Train loss: {train_loss}, accuracy: {train_acc}")

    val_metrics = eval_model(model, val_loader, device)
    print(f"Validation Metrics:")
    print(f"  Accuracy: {val_metrics['accuracy']}")
    print(f"  Micro - Precision: {val_metrics['precision_micro']}, Recall: {val_metrics['recall_micro']}, F1: {val_metrics['f1_micro']}")
    print(f"  Macro - Precision: {val_metrics['precision_macro']}, Recall: {val_metrics['recall_macro']}, F1: {val_metrics['f1_macro']}")

    # Early stopping check
    if val_metrics['f1_macro'] > best_val_f1:  
        best_val_f1 = val_metrics['f1_macro']
        early_stop_counter = 0
        model.save_pretrained('xlm_roberta_large_best.bin')  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/7


100%|██████████| 720/720 [02:24<00:00,  4.98it/s]


Train loss: 2.322411572105355, accuracy: 0.4656939378148341


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.7456960157402853
  Micro - Precision: 0.7456960157402853, Recall: 0.7456960157402853, F1: 0.7456960157402853
  Macro - Precision: 0.5397439405315229, Recall: 0.5554048169085285, F1: 0.5305597699284388
Epoch 2/7


100%|██████████| 720/720 [02:26<00:00,  4.92it/s]


Train loss: 0.9307019520137045, accuracy: 0.7864339065485496


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8288243974422036
  Micro - Precision: 0.8288243974422036, Recall: 0.8288243974422036, F1: 0.8288243974422036
  Macro - Precision: 0.7288154775773277, Recall: 0.6939485398637507, F1: 0.6931922175270497
Epoch 3/7


100%|██████████| 720/720 [02:30<00:00,  4.79it/s]


Train loss: 0.5807530314040681, accuracy: 0.8672051415667883


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8603049680275455
  Micro - Precision: 0.8603049680275455, Recall: 0.8603049680275455, F1: 0.8603049680275455
  Macro - Precision: 0.8043833819069833, Recall: 0.80346470870832, F1: 0.7977478029631502
Epoch 4/7


100%|██████████| 720/720 [02:26<00:00,  4.92it/s]


Train loss: 0.4211435177363455, accuracy: 0.90055584505819


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8632562715199213
  Micro - Precision: 0.8632562715199213, Recall: 0.8632562715199213, F1: 0.8632562715199213
  Macro - Precision: 0.817236986014288, Recall: 0.8142945311208737, F1: 0.8083591062485576
Epoch 5/7


100%|██████████| 720/720 [02:26<00:00,  4.92it/s]


Train loss: 0.32587931454181673, accuracy: 0.92313705054716


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8612887358583374
  Micro - Precision: 0.8612887358583374, Recall: 0.8612887358583374, F1: 0.8612887358583374
  Macro - Precision: 0.8494674259353955, Recall: 0.8303031696767489, F1: 0.8282930331212112
Epoch 6/7


100%|██████████| 720/720 [02:26<00:00,  4.92it/s]


Train loss: 0.2661159396637231, accuracy: 0.933993399339934


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8598130841121495
  Micro - Precision: 0.8598130841121495, Recall: 0.8598130841121495, F1: 0.8598130841121495
  Macro - Precision: 0.7920880784142345, Recall: 0.8212252927562588, F1: 0.8025340664860299
Epoch 7/7


100%|██████████| 720/720 [02:26<00:00,  4.92it/s]


Train loss: 0.20553261004647033, accuracy: 0.9502344971339239


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8721101819970487
  Micro - Precision: 0.8721101819970487, Recall: 0.8721101819970487, F1: 0.8721101819970487
  Macro - Precision: 0.8539527728310824, Recall: 0.8429908507409039, F1: 0.8415897999368627


In [8]:
def predict_example(text, tokenizer, model, intent2id):
    model.eval()
    encoding = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_intent = list(intent2id.keys())[list(intent2id.values()).index(predicted_class)]
    return predicted_intent

import random
num_examples = 8  
random_indices = random.sample(range(len(test_data[0])), num_examples)

for idx in random_indices:
    text = ' '.join(test_data[0][idx])  # Join tokens back into a sentence
    true_intent = test_data[2][idx]  # Get the true intent label

    predicted_intent = predict_example(text, tokenizer, model, intent2id)

    print(f"Text: {text}")
    print(f"True Intent: {true_intent}")
    print(f"Predicted Intent: {predicted_intent}")
    print("-" * 50)

Text: وای فای من را خاموش کن
True Intent: iot_wemo_off
Predicted Intent: iot_wemo_off
--------------------------------------------------
Text: به بیمه دی توئیت کن که آنها خیلی بد اند
True Intent: social_post
Predicted Intent: social_post
--------------------------------------------------
Text: در ساری چه کاری می‌توانم انجام دهم
True Intent: recommendation_events
Predicted Intent: recommendation_events
--------------------------------------------------
Text: دستور غذاهایی که می‌توان در یک ساعت پخت
True Intent: cooking_recipe
Predicted Intent: cooking_recipe
--------------------------------------------------
Text: برای این نشانی ایمیل بفرست.
True Intent: email_sendemail
Predicted Intent: email_sendemail
--------------------------------------------------
Text: آیا امروز هیچ ایمیل جدیدی برای من وجود دارد
True Intent: email_query
Predicted Intent: email_query
--------------------------------------------------
Text: عالی را به عنوان نظر من به این اهنگ اضافه کن
True Intent: music_likeness
Pre

In [6]:
# Function to predict for entire test set
def predict_example(text, tokenizer, model, intent2id):
    encoding = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_intent = list(intent2id.keys())[list(intent2id.values()).index(predicted_class)]
    return predicted_intent

import pandas as pd
# Prepare data for prediction
results = []
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
# Load the model
model = XLMRobertaForSequenceClassification.from_pretrained('/content/xlm_roberta_large_best.bin')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

for idx, (sentence, true_intent, id) in enumerate(zip(test_data[0], test_data[2], [entry['id'] for entry in massive_raw_fa if entry['partition'] == 'test'])):
    text = ' '.join(sentence)  # Join tokens into text
    predicted_intent = predict_example(text, tokenizer, model, intent2id)
    results.append({
        'id': id,
        'text': text,
        'true_intent': true_intent,
        'predicted_intent': predicted_intent
    })

df_results = pd.DataFrame(results)
df_results.to_csv('fa_xlm_roberta_test_set_predictions.csv', index=False)

print("Predictions saved to 'test_set_predictions.csv'")

Predictions saved to 'test_set_predictions.csv'


In [10]:
from transformers import XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base',
                                                            num_labels=len(intent2id),
                                                            ignore_mismatched_sizes=True)

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters for xlm-roberta-base (used for persian intent det): {total_params}")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total number of parameters for xlm-roberta-base (used for persian intent det): 278089788


# ---> Joint Bert


In [3]:
import json
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import XLMRobertaTokenizerFast, XLMRobertaConfig, XLMRobertaForTokenClassification, AdamW, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
import numpy as np
from tqdm import tqdm
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
import pandas as pd
from seqeval.scheme import IOB2

# Load data
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

# Convert to BIO format for slot filling
def convert_to_bio(utt, annot_utt):
    tokens = utt.split()
    labels = ['O'] * len(tokens)
    parts = annot_utt.split()

    i = 0
    while i < len(parts):
        if parts[i].startswith('['):
            try:
                slot_type = parts[i].split(':')[0].strip('[')
                value = parts[i].split(':')[1].strip(']')
            except IndexError:
                print(f"Warning: Malformed annotation at {i} in '{annot_utt}'")
                i += 1
                continue

            value_tokens = value.split()

            start_idx = utt.find(value, utt.find(parts[i-1] if i > 0 else '') + len(parts[i-1] if i > 0 else ''))
            if start_idx == -1:
                print(f"Warning: Slot value '{value}' not found in '{utt}'")
                i += 1
                continue
            start_token_idx = len(utt[:start_idx].split())

            for j, token in enumerate(value_tokens):
                if start_token_idx + j < len(tokens):
                    if j == 0:
                        labels[start_token_idx + j] = f'B-{slot_type}'
                    else:
                        labels[start_token_idx + j] = f'I-{slot_type}'

            i += len(value_tokens) + 1
        else:
            i += 1

    return tokens, labels

# Prepare data for both tasks
def prepare_data(data, tokenizer, label2id, intent2id):
    encodings = []
    for entry in data:
        tokens, labels = convert_to_bio(entry['utt'], entry['annot_utt'])
        encoded = tokenizer(tokens, is_split_into_words=True, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

        labels_ids = []
        word_ids = encoded.word_ids(batch_index=0)
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                labels_ids.append(-100)
            elif word_idx != previous_word_idx:
                try:
                    labels_ids.append(label2id[labels[word_idx]])
                except IndexError:
                    labels_ids.append(-100)
            else:
                labels_ids.append(-100)
            previous_word_idx = word_idx

        encodings.append({
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'labels': torch.tensor(labels_ids),
            'intent': torch.tensor([intent2id[entry['intent']]])
        })
    return encodings

# Load data
massive_raw_fa = load_data('/content/1.0/data/fa-IR.jsonl')
train_data, val_data, test_data = [], [], []

for entry in massive_raw_fa:
    if entry['partition'] == 'train':
        train_data.append(entry)
    elif entry['partition'] == 'dev':
        val_data.append(entry)
    elif entry['partition'] == 'test':
        test_data.append(entry)

# Get unique labels and intents
all_labels = [label for entry in train_data for label in convert_to_bio(entry['utt'], entry['annot_utt'])[1]]
unique_labels = sorted(set(all_labels))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

unique_intents = sorted(set([entry['intent'] for entry in train_data]))
intent2id = {intent: i for i, intent in enumerate(unique_intents)}
id2intent = {i: intent for intent, i in intent2id.items()}

# Tokenizer and model
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
config = XLMRobertaConfig.from_pretrained('xlm-roberta-base', output_hidden_states=True, num_labels=len(label2id))

# Model: Joint BERT
class JointBERT(torch.nn.Module):
    def __init__(self, num_slot_labels, num_intent_labels):
        super(JointBERT, self).__init__()
        self.bert = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', config=config)
        self.intent_classifier = torch.nn.Linear(self.bert.config.hidden_size, num_intent_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        if outputs.hidden_states is None:
            raise ValueError("hidden_states are not available in the model outputs. Make sure to set `output_hidden_states=True` in the config.")
        sequence_output = outputs.hidden_states[-1]  # Last hidden state for intent prediction
        pooled_output = sequence_output[:, 0]  # CLS token for intent prediction
        intent_logits = self.intent_classifier(pooled_output)
        return outputs, intent_logits

# Model initialization
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = JointBERT(num_slot_labels=len(label2id), num_intent_labels=len(intent2id))
model = model.to(device)

def convert_to_tensor(encodings):
    input_ids = torch.stack([encoding['input_ids'] for encoding in encodings])
    attention_mask = torch.stack([encoding['attention_mask'] for encoding in encodings])
    slot_labels = torch.stack([encoding['labels'] for encoding in encodings])
    intent_labels = torch.stack([encoding['intent'] for encoding in encodings])
    return TensorDataset(input_ids, attention_mask, slot_labels, intent_labels)

train_encodings = prepare_data(train_data, tokenizer, label2id, intent2id)
val_encodings = prepare_data(val_data, tokenizer, label2id, intent2id)
test_encodings = prepare_data(test_data, tokenizer, label2id, intent2id)

train_dataset = convert_to_tensor(train_encodings)
val_dataset = convert_to_tensor(val_encodings)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# Scheduler
total_steps = len(train_loader) * 10  # Assuming 10 epochs for calculation
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Loss functions
slot_loss_fn = CrossEntropyLoss(ignore_index=-100)
intent_loss_fn = CrossEntropyLoss()

def train_epoch(model, data_loader, optimizer, scheduler, device, slot_loss_fn, intent_loss_fn):
    model.train()
    total_slot_loss = 0
    total_intent_loss = 0
    correct_intent = 0

    for batch in tqdm(data_loader):
        input_ids, attention_mask, slot_labels, intent_labels = [t.to(device) for t in batch]

        optimizer.zero_grad()
        outputs, intent_logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=slot_labels)

        slot_loss = outputs.loss
        intent_loss = intent_loss_fn(intent_logits.view(-1, len(intent2id)), intent_labels.view(-1))

        loss = slot_loss + intent_loss
        total_slot_loss += slot_loss.item()
        total_intent_loss += intent_loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

        _, preds = torch.max(intent_logits, dim=1)
        correct_intent += torch.sum(preds == intent_labels.view(-1)).item()

    return total_slot_loss / len(data_loader), total_intent_loss / len(data_loader), correct_intent / len(train_dataset)

def eval_model(model, data_loader, device, id2label):
    model.eval()
    all_slot_preds, all_slot_true = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, slot_labels, intent_labels = [t.to(device) for t in batch]

            outputs, _ = model(input_ids=input_ids, attention_mask=attention_mask, labels=slot_labels)

            # Slot filling
            slot_predictions = torch.argmax(outputs.logits, dim=-1)
            slot_true = slot_labels.view(-1)
            mask = slot_true != -100

            true_labels = [id2label[l.item()] for l in slot_true[mask]]
            pred_labels = [id2label[l.item()] for l in slot_predictions.view(-1)[mask]]

            all_slot_preds.append(pred_labels)
            all_slot_true.append(true_labels)

    # Convert to list of lists for seqeval
    true_labels = [[label for label in seq if label != 'O'] for seq in all_slot_true]
    pred_labels = [[label for label in seq if label != 'O'] for seq in all_slot_preds]

    # Metrics calculation using seqeval
    precision_micro = precision_score(true_labels, pred_labels, mode='strict', scheme=IOB2)
    recall_micro = recall_score(true_labels, pred_labels, mode='strict', scheme=IOB2)
    f1_micro = f1_score(true_labels, pred_labels, mode='strict', scheme=IOB2)
    precision_macro = precision_score(true_labels, pred_labels, average='macro', mode='strict', scheme=IOB2)
    recall_macro = recall_score(true_labels, pred_labels, average='macro', mode='strict', scheme=IOB2)
    f1_macro = f1_score(true_labels, pred_labels, average='macro', mode='strict', scheme=IOB2)

    slot_report = classification_report(true_labels, pred_labels, mode='strict', scheme=IOB2)


    return {
        'slot_report': slot_report,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
    }

# Training loop with early stopping
epochs = 7
best_slot_f1 = 0
best_intent_acc = 0
patience = 4
early_stopping_counter = 0

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    slot_loss, intent_loss, intent_acc = train_epoch(model, train_loader, optimizer, scheduler, device, slot_loss_fn, intent_loss_fn)
    print(f"Slot Loss: {slot_loss}, Intent Loss: {intent_loss}, Intent Accuracy: {intent_acc}")

    val_metrics = eval_model(model, val_loader, device, id2label)
    print("Validation Slot Filling Metrics:")
    print(val_metrics['slot_report'])
    print(f"Validation Slot Metrics - Micro: Precision: {val_metrics['precision_micro']:.4f}, Recall: {val_metrics['recall_micro']:.4f}, F1: {val_metrics['f1_micro']:.4f}")
    print(f"Validation Slot Metrics - Macro: Precision: {val_metrics['precision_macro']:.4f}, Recall: {val_metrics['recall_macro']:.4f}, F1: {val_metrics['f1_macro']:.4f}")

    if val_metrics['f1_macro'] > best_slot_f1:
        best_slot_f1 = val_metrics['f1_macro']
        early_stopping_counter = 0
        torch.save(model.state_dict(), 'fa_best_joint_bert_model.pth')
        print("Saved best model.")
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print("Early stopping triggered.")
            break

# Evaluate on test set after training
test_loader = DataLoader(convert_to_tensor(test_encodings), batch_size=16)
test_metrics = eval_model(model, test_loader, device, id2label)
print("Test Slot Filling Metrics:")
print(test_metrics['slot_report'])
print(f"Test Slot Metrics - Micro: Precision: {test_metrics['precision_micro']:.4f}, Recall: {test_metrics['recall_micro']:.4f}, F1: {test_metrics['f1_micro']:.4f}")
print(f"Test Slot Metrics - Macro: Precision: {test_metrics['precision_macro']:.4f}, Recall: {test_metrics['recall_macro']:.4f}, F1: {test_metrics['f1_macro']:.4f}")

def predict(text, tokenizer, model, label2id, id2label, intent2id, id2intent):
    model.eval()
    encoded = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt').to(device)

    with torch.no_grad():
        outputs, intent_logits = model(**encoded)

    slot_predictions = torch.argmax(outputs.logits, dim=-1)
    slot_labels = [id2label[int(label)] for label in slot_predictions[0]]

    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    words = []
    last_word = ""
    for token in tokens:
        if token.startswith("##"):
            last_word += token[2:]
        else:
            if last_word:
                words.append(last_word)
            last_word = token
    if last_word:
        words.append(last_word)

    # Align predictions with words, ignoring special tokens
    slot_labels_aligned = []
    word_index = 0
    for i, token in enumerate(tokens):
        if token not in tokenizer.all_special_tokens:
            if word_index < len(words):
                slot_labels_aligned.append((words[word_index], slot_labels[i]))
                word_index += 1

    intent_prediction = id2intent[int(torch.argmax(intent_logits, dim=1)[0])]

    return slot_labels_aligned, intent_prediction

# Apply model to test set to save results in CSV
def predict_on_test_set(test_data, tokenizer, model, label2id, id2label, intent2id, id2intent):
    results = []

    for entry in test_data:
        utt = ' '.join(entry['utt'].split())  # Ensure single spaces between words
        slot_labels, predicted_intent = predict(utt, tokenizer, model, label2id, id2label, intent2id, id2intent)

        # Convert slot labels back to string format for CSV
        slot_labels_str = " ".join([f"{word}:{label}" for word, label in slot_labels])

        results.append({
            'id': entry['id'],
            'text': utt,
            'true_intent': entry['intent'],
            'predicted_intent': predicted_intent,
            'predicted_slots': slot_labels_str
        })

    return pd.DataFrame(results)

# Apply model to test set
test_results_df = predict_on_test_set(test_data, tokenizer, model, label2id, id2label, intent2id, id2intent)

# Save results to CSV
csv_file_path = 'persian_joint_bert_test_set_predictions.csv'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 1/7


100%|██████████| 720/720 [04:37<00:00,  2.60it/s]


Slot Loss: 0.0, Intent Loss: 1.584055765469869, Intent Accuracy: 0.6139482369289561
Validation Slot Filling Metrics:
              precision    recall  f1-score   support

           O       1.00      1.00      1.00     14907

    accuracy                           1.00     14907
   macro avg       1.00      1.00      1.00     14907
weighted avg       1.00      1.00      1.00     14907

Validation Intent Accuracy: 0.8199704869650762
Saved best model.
Epoch 2/7


100%|██████████| 720/720 [04:42<00:00,  2.55it/s]


Slot Loss: 0.0, Intent Loss: 0.6125061470497813, Intent Accuracy: 0.8454924439812402
Validation Slot Filling Metrics:
              precision    recall  f1-score   support

           O       1.00      1.00      1.00     14907

    accuracy                           1.00     14907
   macro avg       1.00      1.00      1.00     14907
weighted avg       1.00      1.00      1.00     14907

Validation Intent Accuracy: 0.8440727988194786
Epoch 3/7


100%|██████████| 720/720 [04:43<00:00,  2.54it/s]


Slot Loss: 0.0, Intent Loss: 0.40910950446656597, Intent Accuracy: 0.8931735278791036
Validation Slot Filling Metrics:
              precision    recall  f1-score   support

           O       1.00      1.00      1.00     14907

    accuracy                           1.00     14907
   macro avg       1.00      1.00      1.00     14907
weighted avg       1.00      1.00      1.00     14907

Validation Intent Accuracy: 0.8568617806197737
Epoch 4/7


100%|██████████| 720/720 [04:43<00:00,  2.54it/s]


Slot Loss: 0.0, Intent Loss: 0.2975295186236811, Intent Accuracy: 0.9206183776272364
Validation Slot Filling Metrics:
              precision    recall  f1-score   support

           O       1.00      1.00      1.00     14907

    accuracy                           1.00     14907
   macro avg       1.00      1.00      1.00     14907
weighted avg       1.00      1.00      1.00     14907

Validation Intent Accuracy: 0.8637481554353172
Epoch 5/7


100%|██████████| 720/720 [04:43<00:00,  2.54it/s]


Slot Loss: 0.0, Intent Loss: 0.20299717771639633, Intent Accuracy: 0.9446760465520236
Validation Slot Filling Metrics:
              precision    recall  f1-score   support

           O       1.00      1.00      1.00     14907

    accuracy                           1.00     14907
   macro avg       1.00      1.00      1.00     14907
weighted avg       1.00      1.00      1.00     14907

Validation Intent Accuracy: 0.8666994589276931
Early stopping triggered.
Test Slot Filling Metrics:
              precision    recall  f1-score   support

           O       1.00      1.00      1.00     21834

    accuracy                           1.00     21834
   macro avg       1.00      1.00      1.00     21834
weighted avg       1.00      1.00      1.00     21834

Test Intent Accuracy: 0.8651647612642905
Predictions saved to persian_joint_bert_test_set_predictions.csv


In [4]:
total_params_fa = sum(p.numel() for p in model.parameters())
print(f"Total parameters for joint bert used Persian dataset: {total_params_fa}")

Total parameters for joint bert used Persian dataset: 277499965


##Roberta model for english intent classification

```
# This is formatted as code
```



In [3]:
import json
import random
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score

massive_raw_en = []
with open('/content/1.0/data/en-US.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        massive_raw_en.append(json.loads(line))

(train_data, val_data, test_data, tag2id, intent2id) = Read_Massive_dataset(massive_raw_en)

# Tokenize the sentences
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

def encode_data(sentences, intents, tokenizer, intent2id):
    encodings = tokenizer(sentences, is_split_into_words=True, padding=True, truncation=True, max_length=128, return_tensors='pt')
    intent_labels = [intent2id[intent] for intent in intents]
    return encodings, torch.tensor(intent_labels)

# Encode data
train_encodings, train_intents = encode_data(train_data[0], train_data[2], tokenizer, intent2id)
val_encodings, val_intents = encode_data(val_data[0], val_data[2], tokenizer, intent2id)


def convert_to_tensor(encodings, labels):
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    labels = labels
    return TensorDataset(input_ids, attention_mask, labels)

train_dataset = convert_to_tensor(train_encodings, train_intents)
val_dataset = convert_to_tensor(val_encodings, val_intents)


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Model initialization
model_en = RobertaForSequenceClassification.from_pretrained('roberta-large',
                                                            num_labels=len(intent2id))

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_en = model_en.to(device)

optimizer = AdamW(model_en.parameters(), lr=1e-5, weight_decay=0.01) 

# Scheduler for learning rate
total_steps = len(train_loader) * 10 
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = CrossEntropyLoss()

def train_epoch(model, data_loader, optimizer, scheduler, device, loss_fn):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for data in tqdm(data_loader):
        input_ids, attention_mask, labels = [t.to(device) for t in data]

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

        loss.backward()
        optimizer.step()
        scheduler.step()  # Step the scheduler after each batch

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for data in data_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in data]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate both micro and macro metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(all_labels, all_preds, average='macro')

    return {
        'accuracy': accuracy,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro
    }

# Early stopping parameters
patience = 3
best_val_f1 = float('-inf')
early_stop_counter = 0

# Training loop
epochs = 10  
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_acc, train_loss = train_epoch(model_en, train_loader, optimizer, scheduler, device, loss_fn)
    print(f"Train loss: {train_loss}, accuracy: {train_acc}")

    val_metrics = eval_model(model_en, val_loader, device)
    print(f"Validation Metrics:")
    print(f"  Accuracy: {val_metrics['accuracy']}")
    print(f"  Micro - Precision: {val_metrics['precision_micro']}, Recall: {val_metrics['recall_micro']}, F1: {val_metrics['f1_micro']}")
    print(f"  Macro - Precision: {val_metrics['precision_macro']}, Recall: {val_metrics['recall_macro']}, F1: {val_metrics['f1_macro']}")

    # Early stopping check
    if val_metrics['f1_macro'] > best_val_f1:
        best_val_f1 = val_metrics['f1_macro']
        early_stop_counter = 0
        model_en.save_pretrained('en_roberta_large_best.bin')  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


100%|██████████| 360/360 [04:43<00:00,  1.27it/s]


Train loss: 1.980842079801692, accuracy: 0.5708702449192288


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8504672897196262
  Micro - Precision: 0.8504672897196262, Recall: 0.8504672897196262, F1: 0.8504672897196262
  Macro - Precision: 0.7587096216384056, Recall: 0.7494755821972547, F1: 0.7481035717842383
Epoch 2/10


100%|██████████| 360/360 [04:43<00:00,  1.27it/s]


Train loss: 0.604397638183501, accuracy: 0.8737189508424527


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8834235120511559
  Micro - Precision: 0.8834235120511559, Recall: 0.8834235120511559, F1: 0.8834235120511559
  Macro - Precision: 0.8231680229865462, Recall: 0.8244824159643258, F1: 0.8214460138461742
Epoch 3/10


100%|██████████| 360/360 [04:43<00:00,  1.27it/s]


Train loss: 0.3903548603463504, accuracy: 0.9142782699322564


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8878504672897196
  Micro - Precision: 0.8878504672897196, Recall: 0.8878504672897196, F1: 0.8878504672897196
  Macro - Precision: 0.832011350235342, Recall: 0.8331037932156097, F1: 0.8296259937148008
Epoch 4/10


100%|██████████| 360/360 [04:42<00:00,  1.27it/s]


Train loss: 0.27932294765487314, accuracy: 0.9375542817439638


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8996556812592228
  Micro - Precision: 0.8996556812592228, Recall: 0.8996556812592228, F1: 0.8996556812592228
  Macro - Precision: 0.8727984594788777, Recall: 0.8685618938686809, F1: 0.8618127093840361
Epoch 5/10


100%|██████████| 360/360 [04:42<00:00,  1.27it/s]


Train loss: 0.21499192483930124, accuracy: 0.9534479763765851


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8912936546974914
  Micro - Precision: 0.8912936546974914, Recall: 0.8912936546974914, F1: 0.8912936546974914
  Macro - Precision: 0.8742172000099838, Recall: 0.8810205962397796, F1: 0.8713752718780203
Epoch 6/10


100%|██████████| 360/360 [04:42<00:00,  1.27it/s]


Train loss: 0.16555898034324248, accuracy: 0.9649122807017544


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.9016232169208067
  Micro - Precision: 0.9016232169208067, Recall: 0.9016232169208067, F1: 0.9016232169208067
  Macro - Precision: 0.8796417172482971, Recall: 0.8812942223045661, F1: 0.8767955831772343
Epoch 7/10


100%|██████████| 360/360 [04:42<00:00,  1.28it/s]


Train loss: 0.1314190384414461, accuracy: 0.9729025534132361


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.9016232169208067
  Micro - Precision: 0.9016232169208067, Recall: 0.9016232169208067, F1: 0.9016232169208067
  Macro - Precision: 0.8821965623945655, Recall: 0.8872866927613574, F1: 0.8803328810369219
Epoch 8/10


100%|██████████| 360/360 [04:42<00:00,  1.27it/s]


Train loss: 0.10953676234413352, accuracy: 0.9781136008337676


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.8991637973438269
  Micro - Precision: 0.8991637973438269, Recall: 0.8991637973438269, F1: 0.8991637973438269
  Macro - Precision: 0.8760095905938026, Recall: 0.8810090302844413, F1: 0.8747637579049774
Epoch 9/10


100%|██████████| 360/360 [04:42<00:00,  1.27it/s]


Train loss: 0.09464016573296653, accuracy: 0.9822824387701928


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics:
  Accuracy: 0.9016232169208067
  Micro - Precision: 0.9016232169208067, Recall: 0.9016232169208067, F1: 0.9016232169208067
  Macro - Precision: 0.8833560652843553, Recall: 0.8903175536265076, F1: 0.882459663520674
Epoch 10/10


100%|██████████| 360/360 [04:42<00:00,  1.28it/s]


Train loss: 0.08668375316386422, accuracy: 0.9834983498349835
Validation Metrics:
  Accuracy: 0.9021151008362026
  Micro - Precision: 0.9021151008362026, Recall: 0.9021151008362026, F1: 0.9021151008362026
  Macro - Precision: 0.8827996102378552, Recall: 0.89066263470081, F1: 0.8823660523695194


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:

total_params = sum(p.numel() for p in model_en.parameters())
print(f"Total number of parameters for roberta-large (used for EN intent detection): {total_params}")

Total number of parameters for roberta-large (used for EN intent detection): 355421244


In [4]:
# Function to predict for entire test set
def predict_example(text, tokenizer, model, intent2id):
    encoding = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_intent = list(intent2id.keys())[list(intent2id.values()).index(predicted_class)]
    return predicted_intent


In [6]:
import pandas as pd
results = []

for idx, (sentence, true_intent, id) in enumerate(zip(test_data[0], test_data[2], [entry['id'] for entry in massive_raw_en if entry['partition'] == 'test'])):
    text = ' '.join(sentence)  # Join tokens into text
    predicted_intent = predict_example(text, tokenizer, model_en, intent2id)
    results.append({
        'id': id,
        'text': text,
        'true_intent': true_intent,
        'predicted_intent': predicted_intent
    })

df_results = pd.DataFrame(results)
df_results.to_csv('en_roberta_test_set_predictions.csv', index=False)

print("Predictions saved to 'test_set_predictions.csv'")

Predictions saved to 'test_set_predictions.csv'
