In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install sentencepiece transformers[torch] -q
!pip install accelerate -U -q

In [3]:
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments

In [4]:
workdir =  "/content/gdrive/MyDrive/Ultimate/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "xlm-roberta-large"
model_path = workdir + f"{model_name}-custom-trained" # Save the trained model if needed
cache_dir=workdir+"model_cache"

#training data
train_file_path = workdir+"train.tsv"
train_ext_path = workdir+"train_ext.tsv"

In [5]:
# Define training parameters
epochs = 6
batch_size = 4

In [6]:
# Load your training data from a TSV file
data = pd.read_csv(train_file_path, delimiter="\t", names = ["text", "label"])
print("Number of examples in training file:",len(data))

Number of examples in training file: 4634


In [7]:
aggregate_data = data.groupby('label').size().reset_index(name='count').sort_values(['count'], ascending=True)
aggregate_data.head()

Unnamed: 0,label,count
2,aircraft+flight+flight_no,1
18,ground_service+ground_fare,1
4,airfare+flight_time,1
9,cheapest,1
6,airline+flight_no,2


In [8]:
#split the multiple intent joined by + into multiple rows

In [9]:
# data['label'].str.contains('+')
df_split = data[data['label'].str.contains(r'\+', regex=True)].copy()
df_split['label'] = df_split['label'].str.split('+')
# Expand the list of intents into separate rows
df_split = df_split.explode('label')
# Remove the original rows with '+' sign
data = data[~data['label'].str.contains(r'\+', regex=True)]
# Concatenate the split DataFrame with the original DataFrame
data = pd.concat([data, df_split], ignore_index=True)
len(data)

4657

In [10]:
#count of different classes' examples.
aggregate_data = data.groupby('label').size().reset_index(name='count').sort_values(['count'], ascending=True)
aggregate_data.head()

Unnamed: 0,label,count
6,cheapest,1
16,restriction,5
14,meal,6
10,flight_no,15
5,capacity,16


In [11]:
# in few categories, the number of examples is low.
# we will add few artificially generated exaples using Chatgpt. These are stores in train_ext.tsv file
# we will not be training for the categories having multiple intents. Instead we will train for base intents and add multiple if confidence for multiple intent is high

In [12]:
data_ext = pd.read_csv(train_ext_path, delimiter="\t", names = ["text", "label"])
len(data_ext)

30

In [13]:
data = pd.concat([data, data_ext], ignore_index=True)

In [14]:
aggregate_data = data.groupby('label').size().reset_index(name='count').sort_values(['count'], ascending=True)
aggregate_data.head()

Unnamed: 0,label,count
6,cheapest,11
16,restriction,15
10,flight_no,15
14,meal,16
5,capacity,16


In [15]:
import torch
import numpy as np
import pickle
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Example data (replace with your own dataset)
texts = data.text.tolist()
labels = data.label.tolist()  # String labels

# Encode the string labels to integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Tokenize the texts
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Create DataLoader from the tokenized data
dataset = TensorDataset(
    encoded_texts["input_ids"],
    encoded_texts["attention_mask"],
    torch.tensor(encoded_labels, dtype=torch.long),
)

# Create a DataLoader for the entire dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(encoded_labels), y=encoded_labels)

# Convert class weights to a PyTorch tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Initialize the model
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_), cache_dir=cache_dir).to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
# loss_fn = torch.nn.CrossEntropyLoss()

loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    all_labels = []
    all_predictions = []

    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        _, predicted = torch.max(outputs.logits, 1)
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_predictions)

    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}, Training Accuracy: {accuracy:.4f}")

model.save_pretrained(model_path)

# Save label encoder for inference
with open(model_path + "/label_encoder.pkl", "wb") as le_file:
    pickle.dump(label_encoder, le_file)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6, Training Loss: 0.5878, Training Accuracy: 0.8594
Epoch 2/6, Training Loss: 0.1684, Training Accuracy: 0.9554
Epoch 3/6, Training Loss: 0.1059, Training Accuracy: 0.9729
Epoch 4/6, Training Loss: 0.0731, Training Accuracy: 0.9827
Epoch 5/6, Training Loss: 0.0652, Training Accuracy: 0.9838
Epoch 6/6, Training Loss: 0.0648, Training Accuracy: 0.9814


In [16]:
#inference

In [17]:
import torch
import pickle
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Load the tokenizer and model
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
model.to(device)

# Load the label encoder from the file
with open(model_path+"/label_encoder.pkl", "rb") as le_file:
    label_encoder = pickle.load(le_file)


def classify(input_text):

    # Tokenize the input text
    encoded_text = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
    encoded_text.to(device)

    # Perform inference
    with torch.no_grad():
        outputs = model(encoded_text.input_ids.to(device), attention_mask=encoded_text.attention_mask.to(device))

    predicted_probabilities = torch.softmax(outputs.logits, dim=1)

    # Get the top three predicted classes and their corresponding probabilities
    top_classes = torch.topk(predicted_probabilities, 3, dim=1)
    top_class_indices = top_classes.indices[0].tolist()
    top_class_probabilities = top_classes.values[0].tolist()

    # Map the class indices back to the original string labels using the label encoder
    top_class_labels = label_encoder.inverse_transform(top_class_indices)

    res = []

    for label, probability in zip(top_class_labels, top_class_probabilities):
        res.append({"label": label, "confidence": f"{probability:.4f}"})
    return res
input_text = "suggest cheapest flight tickets"
print(classify(input_text))

[{'label': 'cheapest', 'confidence': '0.9677'}, {'label': 'airfare', 'confidence': '0.0081'}, {'label': 'flight', 'confidence': '0.0060'}]


In [18]:
def evaluation(data):
    # Get the true labels for the test data
    true_labels = data.label

    # Make predictions using your inference function
    predicted_labels = data.text.apply(lambda x: classify(x)[0]["label"])

    # Calculate accuracy
    acc = accuracy_score(true_labels, predicted_labels)
    print("Accuracy:", acc)

    # Calculate precision, recall, and F1 score
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')

    print("Precision :", precision)
    print("Recall :", recall)
    print("F1 Score:", f1)

In [19]:
#train eval
data = pd.read_csv(workdir+"train.tsv", delimiter="\t", names = ["text", "label"])
evaluation(data)

Accuracy: 0.991799741044454
Precision : 0.9874185558363879
Recall : 0.991799741044454
F1 Score: 0.9894617421111547


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#test eval
test =  pd.read_csv(workdir+"test.tsv", delimiter="\t", names = ["text", "label"])
evaluation(test)

Accuracy: 0.9705882352941176
Precision : 0.9551627542402729
Recall : 0.9705882352941176
F1 Score: 0.9622531573499896


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
