In [61]:
import pandas as pd

# Read the intent-recognition CSV into a DataFrame.
# Point the path below at your own dataset file if it lives elsewhere.
data = pd.read_csv(
    filepath_or_buffer="data/Intent recognition/Preprocessed data/Data.csv",
)

# Preview the first five rows as a quick sanity check of the loaded columns.
print(data.head(n=5))

                                                text         intent
0   listen to westbam alumb allergic on google music      playmusic
1         add step to me to the 50 clásicos playlist  addtoplaylist
2  i give this current textbook a rating value of...       ratebook
3               play the song little robin redbreast      playmusic
4  please add iris dement to my playlist this is ...  addtoplaylist


In [62]:
import re

def clean_text(text):
    """Normalise a raw string for intent classification.

    String input is lowercased, stripped of every character that is neither
    a word character nor whitespace, and has each run of whitespace collapsed
    to a single space (leading/trailing whitespace is removed).

    Any non-string value (e.g. ``None`` or ``NaN``) is returned unchanged.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Drop punctuation / symbols; `\w` keeps letters, digits and underscores.
        without_symbols = re.sub(r'[^\w\s]', '', lowered)
        # Squash consecutive whitespace into one space, then trim the ends.
        single_spaced = re.sub(r'\s+', ' ', without_symbols)
        return single_spaced.strip()
    # Non-string input is passed straight through.
    return text

# Run the same normalisation over the utterances and over their intent labels.
for column_name in ('text', 'intent'):
    data[column_name] = data[column_name].apply(clean_text)

# Report how many missing values each column holds after cleaning.
print(data.isna().sum())

text      0
intent    0
dtype: int64


In [75]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Encode the intent names as integer class ids; the fitted encoder keeps the
# id <-> name mapping so predictions can be translated back later.
label_encoder = LabelEncoder()
data['intent_encoded'] = label_encoder.fit_transform(data['intent'])
print(data.head())


# Persist the fitted LabelEncoder next to the other model artefacts.
import os

label_encoder_path = 'models/Model training/Intent recognition/label_encoder.pkl'
# joblib.dump does not create missing folders, so make sure the target directory exists first.
os.makedirs(os.path.dirname(label_encoder_path), exist_ok=True)
joblib.dump(label_encoder, label_encoder_path)

                                                text         intent  \
0   listen to westbam alumb allergic on google music      playmusic   
1         add step to me to the 50 clásicos playlist  addtoplaylist   
2  i give this current textbook a rating value of...       ratebook   
3               play the song little robin redbreast      playmusic   
4  please add iris dement to my playlist this is ...  addtoplaylist   

   intent_encoded  
0              35  
1               0  
2              38  
3              35  
4               0  


['models/Model training/Intent recognition/label_encoder.pkl']

In [64]:
from sklearn.model_selection import train_test_split

# Hold out 20% of all rows as the test set, then take 10% of the remaining
# rows as the validation set. The fixed seed makes both splits repeatable.
train_val_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Report the size of every split.
print(f"Training set: {len(train_df)} examples")
print(f"Validation set: {len(val_df)} examples")
print(f"Test set: {len(test_df)} examples")

Training set: 33657 examples
Validation set: 3740 examples
Test set: 9350 examples


In [65]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer for the 'bert-base-uncased' checkpoint
# (the same checkpoint name the classification model is loaded from further down).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_data(df, tokenizer, max_length=64):
    """Tokenize the ``text`` column of ``df`` in one batched tokenizer call.

    Args:
        df: DataFrame that contains a ``text`` column.
        tokenizer: Callable accepting a list of strings plus keyword options.
        max_length: Value forwarded to the tokenizer as ``max_length``
            (padding and truncation are both switched on).

    Returns:
        Whatever ``tokenizer`` returns for the batch; PyTorch tensors are
        requested via ``return_tensors='pt'``.
    """
    texts = df['text'].tolist()
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenize each split separately, in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    tokenize_data(split_df, tokenizer)
    for split_df in (train_df, val_df, test_df)
)

In [66]:
import torch

class IntentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer intent labels.

    Args:
        encodings: Mapping from feature name to an indexable batch of values
            (tensors or nested lists), one row per example.
        labels: Sequence of class ids, aligned with the rows of ``encodings``.

    Each item is a dict holding one tensor per encoding key plus a
    ``'labels'`` tensor for that example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as an independent tensor.

        Existing tensors are copied with ``clone().detach()``; calling
        ``torch.tensor`` on a tensor triggers a copy-construct UserWarning.
        Anything else (ints, lists, ...) goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

# Wrap every split's encodings together with its encoded intent labels.
train_dataset, val_dataset, test_dataset = (
    IntentDataset(split_encodings, split_df['intent_encoded'].tolist())
    for split_encodings, split_df in (
        (train_encodings, train_df),
        (val_encodings, val_df),
        (test_encodings, test_df),
    )
)

In [67]:
from transformers import BertForSequenceClassification

# The classifier gets one label per distinct intent known to the label encoder.
num_intents = len(label_encoder.classes_)

# Load pre-trained BERT with a sequence-classification head sized for those intents.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_intents,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
from transformers import TrainingArguments

# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    # Output and logging locations.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,  # log every 10 steps
    # Schedule: three epochs, evaluating after each one.
    # NOTE(review): recent transformers releases call this option `eval_strategy`;
    # confirm the name against the installed version.
    num_train_epochs=3,
    evaluation_strategy="epoch",
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation settings: warmup length and weight-decay strength.
    warmup_steps=500,
    weight_decay=0.01,
)






In [70]:
from transformers import Trainer

# Build the Trainer from the configuration, the model, and the two datasets:
# the training split is used for fitting, the validation split for evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
expected string or bytes-like object, got 'NoneType'

In [71]:
# Train the model
# Runs training through the `trainer` object built in the previous cell; that
# cell must have completed successfully, otherwise `trainer` is undefined here.
trainer.train()

NameError: name 'trainer' is not defined

In [72]:
# Score the model on the held-out test split and show the returned results.
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

NameError: name 'trainer' is not defined

In [73]:
# Write the model and the tokenizer into the same folder so they can be reloaded together.
# NOTE: run this only after training has finished. The classifier head starts out
# newly initialised, so saving earlier stores an untrained head.
save_directory = './intent_recognition_bert'
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./intent_recognition_bert\\tokenizer_config.json',
 './intent_recognition_bert\\special_tokens_map.json',
 './intent_recognition_bert\\vocab.txt',
 './intent_recognition_bert\\added_tokens.json')

In [None]:
# Reload the saved tokenizer and model from the local export folder.
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('./intent_recognition_bert')
model = BertForSequenceClassification.from_pretrained('./intent_recognition_bert')

# Predict intent for a real-time query
def predict_intent(text):
    """Predict the intent name for a single user query.

    The query is first normalised with ``clean_text`` so that inference sees
    text prepared the same way as the training data. It is then tokenized with
    the notebook-level ``tokenizer`` and classified with ``model``.

    Args:
        text: Raw user query.

    Returns:
        The intent name obtained by mapping the highest-scoring class id back
        through ``label_encoder``.
    """
    normalized_text = clean_text(text)
    inputs = tokenizer(normalized_text, return_tensors='pt', padding=True, truncation=True, max_length=64)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_label])[0]

# Test with a real-time query
# Smoke test: classify one example utterance and print the predicted intent name.
user_query = "What's the weather like today?"
predicted_intent = predict_intent(user_query)
print(f"Predicted Intent: {predicted_intent}")