In [None]:
!gdown https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.0.tar.gz
!tar -xvf /content/amazon-massive-dataset-1.0.tar.gz


In [None]:
import json
import re
from collections import defaultdict
import random

def parse_line(item):
    utt = item['utt']
    annot_utt = item['annot_utt']
    intent = item['intent']

    tokens = utt.split()
    labels = ['O'] * len(tokens)

    if annot_utt:
        annotations = re.findall(r'\[(.*?)\s*:\s*(.*?)\]', annot_utt)
        for slot_type, slot_value in annotations:
            for i, token in enumerate(tokens):
                if token == slot_value.split()[0]:  # Assuming first word match
                    start = i
                    for j, sub_token in enumerate(slot_value.split()):
                        if i + j < len(tokens) and tokens[i + j] == sub_token:
                            labels[i + j] = f"{('B' if j == 0 else 'I')}-{slot_type}"

    return ' '.join([f"{token}:{label}" for token, label in zip(tokens, labels)]), intent

def split_data(data):
    random.shuffle(data)
    train_size = int(0.8 * len(data))
    val_size = int(0.1 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:train_size + val_size]
    test_data = data[train_size + val_size:]
    return train_data, val_data, test_data

def create_txt_files(train_data, val_data, test_data,lang):
    for filename, data_set in [(lang+'train.txt', train_data), (lang+'valid.txt', val_data), (lang+'test.txt', test_data)]:
        with open(filename, 'w', encoding='utf-8') as f:
            for item in data_set:
                parsed_line, intent = parse_line(item)
                f.write(f"{parsed_line} <=> {intent}\n")

def create_vocab_files(data, lang):
    intents = set()
    slots = set()

    for item in data:
        intents.add(item['intent'])
        if item['annot_utt']:
            annotations = re.findall(r'\[(.*?)\s*:\s*(.*?)\]', item['annot_utt'])
            for slot_type, _ in annotations:
                slots.add(f"B-{slot_type}")
                slots.add(f"I-{slot_type}")

    slots.add('O')  # Add 'O' for outside of any slot

    with open(lang+'vocab.intent', 'w', encoding='utf-8') as f:
        for intent in sorted(intents):
            f.write(f"{intent}\n")

    with open(lang+'vocab.slot', 'w', encoding='utf-8') as f:
        for slot in sorted(slots):
            f.write(f"{slot}\n")

if __name__ == "__main__":
    fa_data_path = "/content/1.0/data/fa-IR.jsonl"  # Replace with actual path

    with open(fa_data_path, 'r', encoding='utf-8') as f:
        fa_data = [json.loads(line) for line in f]

    # Filter out items without intent or annotation if necessary
    fa_data = [item for item in fa_data if item.get('intent') and item.get('annot_utt')]

    train_data, val_data, test_data = split_data(fa_data)

    create_txt_files(train_data, val_data, test_data,'fa_')
    create_vocab_files(fa_data,'fa_')

    print("Files created for persian lang : train.txt, valid.txt, test.txt, vocab.intent, vocab.slot")

    ##############################################
    en_data_path = "/content/1.0/data/en-US.jsonl"  # Replace with actual path

    with open(en_data_path, 'r', encoding='utf-8') as f:
        en_data = [json.loads(line) for line in f]

    # Filter out items without intent or annotation if necessary
    en_data = [item for item in en_data if item.get('intent') and item.get('annot_utt')]

    train_data, val_data, test_data = split_data(en_data)

    create_txt_files(train_data, val_data, test_data,'en_')
    create_vocab_files(en_data,'en_')

    print("Files created for eglish lang: train.txt, valid.txt, test.txt, vocab.intent, vocab.slot")

Files created for persian lang : train.txt, valid.txt, test.txt, vocab.intent, vocab.slot
Files created for eglish lang: train.txt, valid.txt, test.txt, vocab.intent, vocab.slot


In [25]:

import re
from pathlib import Path

def parse_line(line):
    utterance_data, intent_label = line.split(" <=> ")
    items = utterance_data.split()
    words = []
    labels = []

    for item in items:
        split_item = item.rsplit(":", 1)
        if len(split_item) == 2:  # If there is a colon in the item
            words.append(split_item[0])
            labels.append(split_item[1])
        else:
            # If no colon, treat as 'O'
            words.append(item)
            labels.append('O')

    return {
        "intent_label": intent_label.strip(),
        "words": " ".join(words),
        "word_labels": " ".join(labels),
        "length": len(words),
    }

lines_train = Path("/content/fa_train.txt").read_text("utf-8").strip().splitlines()
import pandas as pd

parsed = [parse_line(line) for line in lines_train]

df_train = pd.DataFrame([p for p in parsed if p is not None])


In [26]:
df_train.head(5)

Unnamed: 0,intent_label,words,word_labels,length
0,play_music,عوض کن و به لیست آهنگ های ورزشی من برو,O O O O B-playlist_name I-playlist_name I-play...,10
1,email_query,ایا ندا هیچ ایمیلی در مورد علی برایم فرستاده,O B-person O O O O B-person O O,9
2,play_music,فهرست نوار صوتی نرماندی را پخش کن,O O O B-playlist_name O O O,7
3,weather_query,لطفا وضع هوای سیزدهم این ماه را تایید کن,O O O B-date I-date I-date O O O,9
4,calendar_remove,برنامه های من برای هفته آینده را خالی کن,O O O O B-date I-date O O O,9


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

# Load vocabulary for slot labels
slot_labels_path = "/content/fa_vocab.slot"
with open(slot_labels_path, 'r') as f:
    SLOT_LABELS = [line.strip() for line in f.readlines()]

# Define label encoders
label_encoder = LabelEncoder()
label_encoder.fit(SLOT_LABELS)

def prepare_data(df, tokenizer):
    sentences = df['words'].tolist()
    labels = df['word_labels'].apply(lambda x: x.split()).tolist()

    # Tokenization and alignment
    tokenized_data = []
    label_data = []

    for sentence, label in zip(sentences, labels):
        tokens = tokenizer(sentence.split(), is_split_into_words=True, truncation=True, padding='max_length', max_length=128)
        word_ids = tokens.word_ids()

        # Align labels with subwords
        aligned_labels = []
        prev_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)
            elif word_idx != prev_word_idx:
                aligned_labels.append(label_encoder.transform([label[word_idx]])[0])
            else:
                aligned_labels.append(-100)  # Ignore subwords
            prev_word_idx = word_idx

        tokenized_data.append({key: torch.tensor(val) for key, val in tokens.items()})
        label_data.append(torch.tensor(aligned_labels))

    return tokenized_data, label_data

# Dataset class
class SlotFillingDataset(Dataset):
    def __init__(self, tokenized_data, label_data):
        self.tokenized_data = tokenized_data
        self.label_data = label_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        tokens = self.tokenized_data[idx]
        labels = self.label_data[idx]

        input_ids = tokens['input_ids']
        attention_mask = tokens['attention_mask']

        return input_ids, attention_mask, labels

# Define model
class SlotFillingModel(nn.Module):
    def __init__(self, model_name, num_labels):
        super(SlotFillingModel, self).__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return logits, loss

# Load data
def load_data(file_path):
    def parse_line(line):
        utterance_data, intent_label = line.split(" <=> ")
        items = utterance_data.split()
        words = []
        labels = []

        for item in items:
            split_item = item.rsplit(":", 1)
            if len(split_item) == 2:
                words.append(split_item[0])
                labels.append(split_item[1])
            else:
                words.append(item)
                labels.append('O')

        return {
            "intent_label": intent_label.strip(),
            "words": " ".join(words),
            "word_labels": " ".join(labels),
            "length": len(words),
        }

    lines = Path(file_path).read_text("utf-8").strip().splitlines()
    parsed = [parse_line(line) for line in lines]
    return pd.DataFrame([p for p in parsed if p is not None])

# Training loop
def train(model, dataloader, optimizer, scheduler):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()

        logits, loss = model(input_ids, attention_mask, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)

# Evaluation loop
def evaluate(model, dataloader):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            logits, _ = model(input_ids, attention_mask)
            predictions = torch.argmax(logits, dim=-1)

            for pred, label in zip(predictions, labels):
                all_preds.extend(pred.cpu().numpy())
                all_labels.extend(label.cpu().numpy())

    # Remove ignored indices (-100)
    valid_preds = [p for p, l in zip(all_preds, all_labels) if l != -100]
    valid_labels = [l for l in all_labels if l != -100]

    precision_mic = precision_score(valid_labels, valid_preds, average='micro')
    recall_mic = recall_score(valid_labels, valid_preds, average='micro')
    f1_mic = f1_score(valid_labels, valid_preds, average='micro')

    precision_mac = precision_score(valid_labels, valid_preds, average='micro')
    recall_mac = recall_score(valid_labels, valid_preds, average='micro')
    f1_mac = f1_score(valid_labels, valid_preds, average='micro')

    accuracy = accuracy_score(valid_labels, valid_preds)

    return precision_mic, recall_mic, f1_mic, accuracy , precision_mac , recall_mac , f1_mac

if __name__ == "__main__":
    # Load tokenizer
    MODEL_NAME = "xlm-roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

    # Load and prepare data
    train_file = "/content/fa_train.txt"
    valid_file = "/content/fa_valid.txt"
    test_file = "/content/fa_test.txt"

    df_train = load_data(train_file)
    df_valid = load_data(valid_file)
    df_test = load_data(test_file)

    train_tokens, train_labels = prepare_data(df_train, tokenizer)
    valid_tokens, valid_labels = prepare_data(df_valid, tokenizer)
    test_tokens, test_labels = prepare_data(df_test, tokenizer)

    train_dataset = SlotFillingDataset(train_tokens, train_labels)
    valid_dataset = SlotFillingDataset(valid_tokens, valid_labels)
    test_dataset = SlotFillingDataset(test_tokens, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Initialize model, optimizer, and scheduler
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SlotFillingModel(MODEL_NAME, len(SLOT_LABELS)).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=10)

    # Train model
    num_epochs = 5
    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, optimizer, scheduler)
        precision_mic, recall_mic, f1_mic, accuracy ,  precision_mac, recall_mac, f1_mac = evaluate(model, valid_loader)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Micro Validation report - Precision: {precision_mic:.4f}, Recall: {recall_mic:.4f}, F1-Score: {f1_mic:.4f}, Accuracy: {accuracy:.4f}")
        print(f"Macro Validation report - Precision: {precision_mac:.4f}, Recall: {recall_mac:.4f}, F1-Score: {f1_mac:.4f}, Accuracy: {accuracy:.4f}")

    # Evaluate on test set
    precision_mic, recall_mic, f1_mic, accuracy ,  precision_mac, recall_mac, f1_mac = evaluate(model, test_loader)
    print("\nTest Set Evaluation micro mode:")
    print(f"Precision: {precision_mic:.4f}, Recall: {recall_mic:.4f}, F1-Score: {f1_mic:.4f}, Accuracy: {accuracy:.4f}")

    print("\nTest Set Evaluation macro mode:")
    print(f"Precision: {precision_mac:.4f}, Recall: {recall_mac:.4f}, F1-Score: {f1_mac:.4f}, Accuracy: {accuracy:.4f}")



Epoch 1/5
Train Loss: 0.7060
Micro Validation report - Precision: 0.9034, Recall: 0.9034, F1-Score: 0.9034, Accuracy: 0.9034
Macro Validation report - Precision: 0.9034, Recall: 0.9034, F1-Score: 0.9034, Accuracy: 0.9034
Epoch 2/5
Train Loss: 0.3158
Micro Validation report - Precision: 0.9204, Recall: 0.9204, F1-Score: 0.9204, Accuracy: 0.9204
Macro Validation report - Precision: 0.9204, Recall: 0.9204, F1-Score: 0.9204, Accuracy: 0.9204
Epoch 3/5
Train Loss: 0.2401
Micro Validation report - Precision: 0.9200, Recall: 0.9200, F1-Score: 0.9200, Accuracy: 0.9200
Macro Validation report - Precision: 0.9200, Recall: 0.9200, F1-Score: 0.9200, Accuracy: 0.9200


In [22]:

def predict_on_input(model, tokenizer, label_encoder, input_text):
    model.eval()
    tokens = tokenizer(
        input_text.split(),
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors="pt",
    )
    input_ids = tokens["input_ids"].to(device)
    attention_mask = tokens["attention_mask"].to(device)

    with torch.no_grad():
        logits, _ = model(input_ids, attention_mask)
        predictions = torch.argmax(logits, dim=-1)

    input_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())
    predicted_labels = [
        label_encoder.inverse_transform([p])[0] if l != -100 else "O"
        for p, l in zip(predictions[0].cpu().numpy(), tokens["input_ids"][0].cpu().numpy())
    ]

    # Filter out special tokens (<s>, </s>, <pad>)
    result = {
        "tokens": [],
        "predicted_labels": [],
    }
    for token, label in zip(input_tokens, predicted_labels):
        if token not in tokenizer.all_special_tokens:
            result["tokens"].append(token)
            result["predicted_labels"].append(label)

    print("Input text result:")
    for token, label in zip(result["tokens"], result["predicted_labels"]):
        print(f"{token}: {label}")

    return result

if __name__ == "__main__":
    # Apply model on test dataset
    test_csv_path = "/content/test_predictions.csv"
    predict_on_test_set(model, test_loader, test_csv_path, tokenizer, label_encoder)

    # Apply model on an input string
    input_text = "I want to book a flight from Paris to Berlin"
    predict_on_input(model, tokenizer, label_encoder, input_text)


Test predictions saved to /content/test_predictions.csv
Input text result:
ĠI: O
Ġwant: O
Ġto: O
Ġbook: O
Ġa: O
Ġflight: O
Ġfrom: O
ĠParis: B-place_name
Ġto: O
ĠBerlin: B-place_name


In [24]:
import csv

def predict_on_input(model, dataloader, output_path, tokenizer, label_encoder):
    model.eval()
    results = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            logits, _ = model(input_ids, attention_mask)
            predictions = torch.argmax(logits, dim=-1)

            for idx in range(len(input_ids)):
                # Get tokens and predicted labels
                tokens = tokenizer.convert_ids_to_tokens(input_ids[idx].cpu().numpy())
                predicted_labels = [
                    label_encoder.inverse_transform([p])[0] if l != -100 else "O"
                    for p, l in zip(predictions[idx].cpu().numpy(), labels[idx].cpu().numpy())
                ]
                truth_labels = [
                    label_encoder.inverse_transform([l])[0] if l != -100 else "O"
                    for l in labels[idx].cpu().numpy()
                ]

                # Filter out <pad> tokens and corresponding labels
                filtered_tokens = []
                filtered_predicted_labels = []
                filtered_truth_labels = []

                for token, pred_label, truth_label in zip(tokens, predicted_labels, truth_labels):
                    if token not in tokenizer.all_special_tokens:  # Skip pad and other special tokens
                        filtered_tokens.append(token)
                        filtered_predicted_labels.append(pred_label)
                        filtered_truth_labels.append(truth_label)

                # Combine sentence and labels as strings
                sentence = tokenizer.convert_tokens_to_string(filtered_tokens).strip()
                predicted_labels_str = " ".join(filtered_predicted_labels)
                truth_labels_str = " ".join(filtered_truth_labels)

                results.append({
                    "sentence": sentence,
                    "predicted_labels": predicted_labels_str,
                    "truth_label": truth_labels_str
                })

    # Save results to CSV
    with open(output_path, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["sentence", "predicted_labels", "truth_label"])
        writer.writeheader()
        writer.writerows(results)
    print(f"Predictions saved to {output_path}")

if __name__ == "__main__":
    # Apply model on test dataset
    test_csv_path = "/content/final-slotfiling-roberta-test_predictions.csv"

    predict_on_input(model, test_loader, test_csv_path, tokenizer, label_encoder)


Predictions saved to /content/final-slotfiling-roberta-test_predictions.csv
