In [4]:
import json

def create_tokens_and_labels(id, sample):
    intent = sample['intent']
    utt = sample['utt']
    annot_utt = sample['annot_utt']
    tokens = utt.split()
    labels = []
    label = 'O'
    split_annot_utt = annot_utt.split()
    idx = 0
    BIO_SLOT = False
    while idx < len(split_annot_utt):
        if split_annot_utt[idx].startswith('['):
            label = split_annot_utt[idx].lstrip('[')
            idx += 2
            BIO_SLOT = True
        elif split_annot_utt[idx].endswith(']'):
            if split_annot_utt[idx-1] == ":":
                labels.append("B-" + label)
                label = 'O'
                idx += 1
            else:
                labels.append("I-" + label)
                label = 'O'
                idx += 1
            BIO_SLOT = False
        else:
            if split_annot_utt[idx-1] == ":":
                labels.append("B-" + label)
                idx += 1
            elif BIO_SLOT == True:
                labels.append("I-" + label)
                idx += 1
            else:
                labels.append("O")
                idx += 1

    if len(tokens) != len(labels):
        raise ValueError(f"Len of tokens, {tokens}, doesnt match len of labels, {labels}, "
                         f"for id {id} and annot utt: {annot_utt}")
    return tokens, labels, intent


def Read_Massive_dataset(massive_raw):
    sentences_tr, tags_tr, intent_tags_tr = [], [], []
    sentences_val, tags_val, intent_tags_val = [], [], []
    sentences_test, tags_test, intent_tags_test = [], [], []
    all_tags, all_intents = [], []

    for id, sample in enumerate(massive_raw):
        if sample['partition'] == 'train':
            tokens, labels, intent = create_tokens_and_labels(id, sample)
            sentences_tr.append(tokens)
            tags_tr.append(labels)
            intent_tags_tr.append(intent)
            all_tags.extend(labels)
            all_intents.append(intent)
        elif sample['partition'] == 'dev':
            tokens, labels, intent = create_tokens_and_labels(id, sample)
            sentences_val.append(tokens)
            tags_val.append(labels)
            intent_tags_val.append(intent)
        elif sample['partition'] == 'test':
            tokens, labels, intent = create_tokens_and_labels(id, sample)
            sentences_test.append(tokens)
            tags_test.append(labels)
            intent_tags_test.append(intent)

    # Unique labels and intents
    unique_tags = sorted(set(all_tags))
    unique_intents = sorted(set(all_intents))
    tag2id = {tag: i for i, tag in enumerate(unique_tags)}
    intent2id = {intent: i for i, intent in enumerate(unique_intents)}

    return (sentences_tr, tags_tr, intent_tags_tr), \
           (sentences_val, tags_val, intent_tags_val), \
           (sentences_test, tags_test, intent_tags_test), \
           tag2id, intent2id

#Read the dataset
massive_raw_fa = []
with open('/kaggle/input/english-massive/en-US.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        massive_raw_fa.append(json.loads(line))

(train_data, val_data, test_data, tag2id, intent2id) = Read_Massive_dataset(massive_raw_fa)

print('done')

done


In [5]:
import json
import random
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, AdamW
from torch.nn import CrossEntropyLoss
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Load T5-large
tokenizer = T5Tokenizer.from_pretrained("t5-base")

def encode_data(sentences, intents, tokenizer, intent2id):
    encodings = tokenizer(sentences, is_split_into_words=True, padding=True, truncation=True, max_length=128, return_tensors='pt')
    intent_labels = [intent2id[intent] for intent in intents]
    return encodings, torch.tensor(intent_labels)

###################
def encode_data(sentences, intents, tokenizer, intent2id):
    sentences = [" ".join(sentence) for sentence in sentences]
    encodings = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
    intent_labels = [intent2id[intent] for intent in intents]
    return encodings, torch.tensor(intent_labels)

# Encode data
train_encodings, train_intents = encode_data(train_data[0], train_data[2], tokenizer, intent2id)
val_encodings, val_intents = encode_data(val_data[0], val_data[2], tokenizer, intent2id)

# Convert encodings to tensors for DataLoader
def convert_to_tensor(encodings, labels):
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    labels = labels
    return TensorDataset(input_ids, attention_mask, labels)

train_dataset = convert_to_tensor(train_encodings, train_intents)
val_dataset = convert_to_tensor(val_encodings, val_intents)

# DataLoader for batch processing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.config.num_labels = len(intent2id)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Initialize loss function without class weights
loss_fn = torch.nn.CrossEntropyLoss()

def train_epoch(model, data_loader, optimizer, device, loss_fn):
    model.train()
    total_loss = 0
    correct_predictions = 0
    
    for data in tqdm(data_loader):
        input_ids, attention_mask, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        decoder_input_ids = torch.full((input_ids.shape[0], 1), model.config.decoder_start_token_id, dtype=torch.long, device=input_ids.device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)
        logits = outputs.logits[:, -1, :]  # Take the last token's logits for classification
        
        # Compute loss
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

        loss.backward()
        optimizer.step()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)
#####################

def eval_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for data in data_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in data]
            
            # Similar to training, we need to specify decoder_input_ids:
            decoder_input_ids = torch.full((input_ids.shape[0], 1), model.config.decoder_start_token_id, dtype=torch.long, device=input_ids.device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)
            logits = outputs.logits[:, -1, :]  # We look at the logits of the last token for classification

            _, preds = torch.max(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate both micro and macro metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(all_labels, all_preds, average='macro')

    return {
        'accuracy': accuracy,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro
    }


# Early stopping parameters
patience = 4
best_val_f1 = float('-inf')  
early_stop_counter = 0

# Training loop with early stopping
epochs = 7
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, loss_fn)
    print(f"Train loss: {train_loss}, accuracy: {train_acc}")

    val_metrics = eval_model(model, val_loader, device)
    print(f"Validation Metrics:")
    print(f"  Accuracy: {val_metrics['accuracy']}")
    print(f"  Micro - Precision: {val_metrics['precision_micro']}, Recall: {val_metrics['recall_micro']}, F1: {val_metrics['f1_micro']}")
    print(f"  Macro - Precision: {val_metrics['precision_macro']}, Recall: {val_metrics['recall_macro']}, F1: {val_metrics['f1_macro']}")

    # Early stopping check
    if val_metrics['f1_macro'] > best_val_f1:  
        best_val_f1 = val_metrics['f1_macro']
        early_stop_counter = 0
        model.save_pretrained('t5_base_intent-english-best.bin')  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Epoch 1/7


100%|██████████| 720/720 [01:42<00:00,  7.05it/s]


Train loss: 5.193860990802447, accuracy: 0.10613166579815876


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Metrics:
  Accuracy: 0.3782587309394983
  Micro - Precision: 0.3782587309394983, Recall: 0.3782587309394983, F1: 0.3782587309394983
  Macro - Precision: 0.1582080383108063, Recall: 0.14761322084532613, F1: 0.12353785557084085
Epoch 2/7


100%|██████████| 720/720 [01:41<00:00,  7.07it/s]


Train loss: 2.3190659527149466, accuracy: 0.47507382317179087


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Metrics:
  Accuracy: 0.719626168224299
  Micro - Precision: 0.719626168224299, Recall: 0.719626168224299, F1: 0.7196261682242989
  Macro - Precision: 0.5785149261768234, Recall: 0.5230503089613251, F1: 0.5246837516856118
Epoch 3/7


100%|██████████| 720/720 [01:41<00:00,  7.08it/s]


Train loss: 1.3211041220981214, accuracy: 0.6797811360083377


  _warn_prf(average, modifier, msg_start, len(result))


Validation Metrics:
  Accuracy: 0.7924249877029022
  Micro - Precision: 0.7924249877029022, Recall: 0.7924249877029022, F1: 0.7924249877029023
  Macro - Precision: 0.7244089362472615, Recall: 0.6709524062545935, F1: 0.6768279185575692
Epoch 4/7


100%|██████████| 720/720 [01:41<00:00,  7.10it/s]


Train loss: 0.9697164637554023, accuracy: 0.7603786694458919


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Metrics:
  Accuracy: 0.8332513526807673
  Micro - Precision: 0.8332513526807673, Recall: 0.8332513526807673, F1: 0.8332513526807673
  Macro - Precision: 0.7697409216458728, Recall: 0.7319692358332383, F1: 0.7363799317513581
Epoch 5/7


100%|██████████| 720/720 [01:43<00:00,  6.99it/s]


Train loss: 0.8062112050855326, accuracy: 0.7937293729372937


  _warn_prf(average, modifier, msg_start, len(result))


Validation Metrics:
  Accuracy: 0.8524348253812101
  Micro - Precision: 0.8524348253812101, Recall: 0.8524348253812101, F1: 0.85243482538121
  Macro - Precision: 0.8036408848492083, Recall: 0.781687523638494, F1: 0.7842911430802975
Epoch 6/7


100%|██████████| 720/720 [01:43<00:00,  6.96it/s]


Train loss: 0.6756602637107587, accuracy: 0.827340628799722


  _warn_prf(average, modifier, msg_start, len(result))


Validation Metrics:
  Accuracy: 0.8593212001967536
  Micro - Precision: 0.8593212001967536, Recall: 0.8593212001967536, F1: 0.8593212001967535
  Macro - Precision: 0.8182593861078354, Recall: 0.7978837020662012, F1: 0.7995672130569388
Epoch 7/7


100%|██████████| 720/720 [01:44<00:00,  6.90it/s]


Train loss: 0.5936669622547924, accuracy: 0.84627410109432


  _warn_prf(average, modifier, msg_start, len(result))


Validation Metrics:
  Accuracy: 0.8726020659124447
  Micro - Precision: 0.8726020659124447, Recall: 0.8726020659124447, F1: 0.8726020659124447
  Macro - Precision: 0.8318694417618933, Recall: 0.8221643446929787, F1: 0.8220569878556814


In [9]:
def predict_example(text, tokenizer, model, intent2id):
    encoding = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    decoder_input_ids = torch.full((input_ids.shape[0], 1), model.config.decoder_start_token_id, dtype=torch.long, device=device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)

    logits = outputs.logits[:, -1, :]  
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_intent = list(intent2id.keys())[list(intent2id.values()).index(predicted_class)]
    return predicted_intent
    
import random
num_examples = 15
random_indices = random.sample(range(len(test_data[0])), num_examples)

for idx in random_indices:
    text = ' '.join(test_data[0][idx]) 
    true_intent = test_data[2][idx]  

    predicted_intent = predict_example(text, tokenizer, model, intent2id)

    print(f"Text: {text}")
    print(f"True Intent: {true_intent}")
    print(f"Predicted Intent: {predicted_intent}")
    print("-" * 50)

Text: nice lyrics
True Intent: music_likeness
Predicted Intent: music_likeness
--------------------------------------------------
Text: please check my reminders
True Intent: calendar_query
Predicted Intent: calendar_query
--------------------------------------------------
Text: go on sweet talk me
True Intent: general_quirky
Predicted Intent: general_quirky
--------------------------------------------------
Text: add paav bhaji in menu card
True Intent: lists_createoradd
Predicted Intent: takeaway_order
--------------------------------------------------
Text: check recent to do list
True Intent: lists_query
Predicted Intent: lists_query
--------------------------------------------------
Text: delete list for groceries
True Intent: lists_remove
Predicted Intent: lists_remove
--------------------------------------------------
Text: change the music mode to rock
True Intent: music_settings
Predicted Intent: play_music
--------------------------------------------------
Text: what is the d

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

def predict_example(text, tokenizer, model, intent2id):
    encoding = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    decoder_input_ids = torch.full((input_ids.shape[0], 1), model.config.decoder_start_token_id, dtype=torch.long, device=device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)

    logits = outputs.logits[:, -1, :]  
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_intent = list(intent2id.keys())[list(intent2id.values()).index(predicted_class)]
    return predicted_intent

import pandas as pd
# Prepare data for prediction
results = []
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# Load the model
model =T5ForConditionalGeneration.from_pretrained('/kaggle/working/t5_base_intent-english-best.bin')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

for idx, (sentence, true_intent, id) in enumerate(zip(test_data[0], test_data[2], [entry['id'] for entry in massive_raw_fa if entry['partition'] == 'test'])):
    text = ' '.join(sentence)  # Join tokens into text
    predicted_intent = predict_example(text, tokenizer, model, intent2id)
    results.append({
        'id': id,
        'text': text,
        'true_intent': true_intent,
        'predicted_intent': predicted_intent
    })

df_results = pd.DataFrame(results)
df_results.to_csv('en-t5-base_test_set_predictions.csv', index=False)

print("Predictions saved to 'test_set_predictions-final.csv'")

Predictions saved to 'test_set_predictions-final.csv'
