In [None]:
!pip install evaluate
!pip install datasets

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

def create_tokens_and_labels(id, sample):
    intent = sample['intent']
    utt = sample['utt']
    annot_utt = sample['annot_utt']
    tokens = utt.split()
    labels = []
    label = 'O'
    split_annot_utt = annot_utt.split()
    idx = 0
    BIO_SLOT = False
    while idx < len(split_annot_utt):
        if split_annot_utt[idx].startswith('['):
            label = split_annot_utt[idx].lstrip('[')
            idx += 2
            BIO_SLOT = True
        elif split_annot_utt[idx].endswith(']'):
            if split_annot_utt[idx-1] == ":":
                labels.append("B-" + label)
                label = 'O'
                idx += 1
            else:
                labels.append("I-" + label)
                label = 'O'
                idx += 1
            BIO_SLOT = False
        else:
            if split_annot_utt[idx-1] == ":":
                labels.append("B-" + label)
                idx += 1
            elif BIO_SLOT == True:
                labels.append("I-" + label)
                idx += 1
            else:
                labels.append("O")
                idx += 1

    if len(tokens) != len(labels):
        raise ValueError(f"Len of tokens, {tokens}, doesnt match len of labels, {labels}, for id {id} and annot utt: {annot_utt}")
    return tokens, labels, intent

def Read_Massive_dataset(filepath):
    sentences_tr, tags_tr, intent_tags_tr = [], [], []
    sentences_val, tags_val, intent_tags_val = [], [], []
    sentences_test, tags_test, intent_tags_test = [], [], []
    all_tags, all_intents = [], []

    with open(filepath, 'r', encoding='utf-8') as f:
        for id, line in enumerate(f):
            sample = json.loads(line)
            tokens, labels, intent = create_tokens_and_labels(id, sample)

            if sample['partition'] == 'train':
                sentences_tr.append(tokens)
                tags_tr.append(labels)
                intent_tags_tr.append(intent)
                all_tags.extend(labels)
                all_intents.append(intent)
            elif sample['partition'] == 'dev':
                sentences_val.append(tokens)
                tags_val.append(labels)
                intent_tags_val.append(intent)
            elif sample['partition'] == 'test':
                sentences_test.append(tokens)
                tags_test.append(labels)
                intent_tags_test.append(intent)

    # Unique labels and intents
    unique_tags = sorted(set(all_tags))
    unique_intents = sorted(set(all_intents))
    tag2id = {tag: i for i, tag in enumerate(unique_tags)}
    intent2id = {intent: i for i, intent in enumerate(unique_intents)}

    return (sentences_tr, tags_tr, intent_tags_tr), \
           (sentences_val, tags_val, intent_tags_val), \
           (sentences_test, tags_test, intent_tags_test), \
           tag2id, intent2id

In [4]:
from huggingface_hub import login

token = "hf_"
login(token=token)

print("Logged in successfully!")


Logged in successfully!


In [7]:
def compute_metrics(pred):
    logits, labels = pred
    predictions = torch.argmax(torch.tensor(logits), dim=1).numpy()

    micro_metrics = precision_recall_fscore_support(labels, predictions, average="micro")
    macro_metrics = precision_recall_fscore_support(labels, predictions, average="macro")

    return {
        "accuracy": accuracy_score(labels, predictions),
        "micro_precision": micro_metrics[0],
        "micro_recall": micro_metrics[1],
        "micro_f1": micro_metrics[2],
        "macro_precision": macro_metrics[0],
        "macro_recall": macro_metrics[1],
        "macro_f1": macro_metrics[2],
    }

def tokenize_data(sentences, labels, tokenizer, max_length=128):
    encodings = tokenizer(
        [" ".join(sentence) for sentence in sentences],
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )
    return IntentDataset(encodings, labels, tokenizer)

class IntentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, tokenizer):
        self.encodings = encodings
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [13]:
import json

def load_massive_dataset(filepath):
    dataset = []
    with open(filepath, "r") as f:
        for line in f:
            dataset.append(json.loads(line))
    return dataset

def prepare_data(massive_raw):
    # Extract unique intents
    intents = sorted(set(sample["intent"] for sample in massive_raw))

    # Map intents to label indices
    intent_to_label = {intent: idx for idx, intent in enumerate(intents)}

    # Split data into training, validation, and testing
    train_sentences = [sample["utt"] for sample in massive_raw if sample["partition"] == "train"]
    train_labels = [intent_to_label[sample["intent"]] for sample in massive_raw if sample["partition"] == "train"]

    val_sentences = [sample["utt"] for sample in massive_raw if sample["partition"] == "dev"]
    val_labels = [intent_to_label[sample["intent"]] for sample in massive_raw if sample["partition"] == "dev"]

    test_sentences = [sample["utt"] for sample in massive_raw if sample["partition"] == "test"]
    test_labels = [intent_to_label[sample["intent"]] for sample in massive_raw if sample["partition"] == "test"]

    return train_sentences, train_labels, val_sentences, val_labels, test_sentences, test_labels, intents, intent_to_label

# Example usage:
data_path = "/content/1.0/data/en-US.jsonl"
massive_raw = load_massive_dataset(data_path)
(
    train_sentences, train_labels,
    val_sentences, val_labels,
    test_sentences, test_labels,
    intents, intent_to_label
) = prepare_data(massive_raw)

print(f"Number of intents: {len(intents)}")
print(f"Intents: {intents}")
print(f"Intent to label mapping: {intent_to_label}")


Number of intents: 60
Intents: ['alarm_query', 'alarm_remove', 'alarm_set', 'audio_volume_down', 'audio_volume_mute', 'audio_volume_other', 'audio_volume_up', 'calendar_query', 'calendar_remove', 'calendar_set', 'cooking_query', 'cooking_recipe', 'datetime_convert', 'datetime_query', 'email_addcontact', 'email_query', 'email_querycontact', 'email_sendemail', 'general_greet', 'general_joke', 'general_quirky', 'iot_cleaning', 'iot_coffee', 'iot_hue_lightchange', 'iot_hue_lightdim', 'iot_hue_lightoff', 'iot_hue_lighton', 'iot_hue_lightup', 'iot_wemo_off', 'iot_wemo_on', 'lists_createoradd', 'lists_query', 'lists_remove', 'music_dislikeness', 'music_likeness', 'music_query', 'music_settings', 'news_query', 'play_audiobook', 'play_game', 'play_music', 'play_podcasts', 'play_radio', 'qa_currency', 'qa_definition', 'qa_factoid', 'qa_maths', 'qa_stock', 'recommendation_events', 'recommendation_locations', 'recommendation_movies', 'social_post', 'social_query', 'takeaway_order', 'takeaway_query

In [14]:
import torch
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
import json

# Mapping intents to indices
INTENT_MAP = intent_to_label

LABEL_TO_INTENT = {v: k for k, v in INTENT_MAP.items()}

# Dataset class
class IntentDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.inputs = [tokenizer(item['utt'], truncation=True, padding='max_length', max_length=max_length, return_tensors='pt') for item in data]
        self.labels = [INTENT_MAP[item['intent']] for item in data]

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_ids = self.inputs[idx]['input_ids'].squeeze()
        attention_mask = self.inputs[idx]['attention_mask'].squeeze()
        label = torch.tensor(self.labels[idx])
        return input_ids, attention_mask, label

# Load data
def load_and_split_data(filename):
    with open(filename) as f:
        data = [json.loads(line) for line in f]
    train_data = [item for item in data if item['partition'] == 'train']
    val_data = [item for item in data if item['partition'] == 'dev']
    test_data = [item for item in data if item['partition'] == 'test']
    return train_data, val_data, test_data

# Evaluation metrics
def evaluate_model(model, dataloader, prefix="Validation"):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.cuda()
            attention_mask = attention_mask.cuda()
            labels = labels.cuda()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    unique_labels = sorted(set(all_labels))
    target_names = [LABEL_TO_INTENT[i] for i in unique_labels]

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(
        all_labels,
        all_preds,
        labels=unique_labels,
        target_names=target_names,
        zero_division=0
    )
    print(f"{prefix} Accuracy: {accuracy:.4f}\n")
    print(f"{prefix} Classification Report:")
    print(report)

# Training the model
def fine_tune_model(data_path, output_dir, batch_size=16, epochs=5, max_length=64, lr=1e-5):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    train_data, val_data, test_data = load_and_split_data(data_path)
    train_dataset = IntentDataset(train_data, tokenizer, max_length)
    val_dataset = IntentDataset(val_data, tokenizer, max_length)
    test_dataset = IntentDataset(test_data, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=len(INTENT_MAP))
    model.config.pad_token_id = tokenizer.pad_token_id
    model = model.cuda()

    optimizer = AdamW(model.parameters(), lr=lr)

        # Compute class weights
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np

    def compute_weights(labels, num_classes):
        class_weights = compute_class_weight('balanced', classes=np.arange(num_classes), y=labels)
        return torch.tensor(class_weights, dtype=torch.float).cuda()

    train_labels = [INTENT_MAP[item['intent']] for item in train_data]
    class_weights = compute_weights(train_labels, len(INTENT_MAP))

    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    # Training Loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for input_ids, attention_mask, labels in train_loader:
            input_ids = input_ids.cuda()
            attention_mask = attention_mask.cuda()
            labels = labels.cuda()

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

        # Validation evaluation
        print(f"\nEpoch {epoch + 1} Evaluation on Validation Set:")
        evaluate_model(model, val_loader, prefix="Validation")

    # Final Test Evaluation
    print("\nFinal Evaluation on Test Set:")
    evaluate_model(model, test_loader, prefix="Test")

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

if __name__ == "__main__":
    data_path = "/content/1.0/data/en-US.jsonl"
    output_dir = "./fine_tuned_gpt2_en"
    fine_tune_model(data_path, output_dir)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 2590.7619

Epoch 1 Evaluation on Validation Set:
Validation Accuracy: 0.5175

Validation Classification Report:
                          precision    recall  f1-score   support

             alarm_query       0.28      0.53      0.36        19
            alarm_remove       0.00      0.00      0.00        14
               alarm_set       0.55      0.71      0.62        31
       audio_volume_down       0.00      0.00      0.00         8
       audio_volume_mute       0.00      0.00      0.00        15
         audio_volume_up       0.00      0.00      0.00        12
          calendar_query       0.46      0.60      0.52       102
         calendar_remove       0.71      0.21      0.33        47
            calendar_set       0.49      0.77      0.60       131
           cooking_query       0.00      0.00      0.00         2
          cooking_recipe       0.76      0.61      0.68        41
        datetime_convert       0.00      0.00      0.00         9
          date

In [15]:
import pandas as pd

def evaluate_and_save_results(test_data, model, tokenizer, max_length=64, output_filename="test_predictions.csv"):
    model.eval()
    results = []

    for item in test_data:
        # Tokenize the test input
        inputs = tokenizer(item['utt'], return_tensors="pt", truncation=True, padding='max_length', max_length=max_length)
        inputs = {key: value.cuda() for key, value in inputs.items()}

        # Perform prediction
        with torch.no_grad():
            outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=-1).item()

        # Map prediction index to intent
        predicted_intent = LABEL_TO_INTENT[pred]

        # Collect results
        results.append({
            "id": item["id"],
            "locale": item["locale"],
            "utterance": item["utt"],
            "true_intent": item["intent"],
            "predicted_intent": predicted_intent
        })

    # Save results to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_filename, index=False)
    print(f"Results saved to {output_filename}")

if __name__ == "__main__":
    data_path = "/content/1.0/data/en-US.jsonl"
    output_dir = "./fine_tuned_gpt2_en"

    # Load test data

    # Load fine-tuned model and tokenizer
    model = GPT2ForSequenceClassification.from_pretrained(output_dir).cuda()
    tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
    tokenizer.pad_token = tokenizer.eos_token

    train_data, val_data, test_data = load_and_split_data(data_path)
    test_dataset = IntentDataset(test_data, tokenizer, max_length=64)

    # Evaluate and save test results
    evaluate_and_save_results(test_data, model, tokenizer, output_filename="test_predictions-final-en-gpt.csv")


Results saved to test_predictions-final-en-gpt.csv
