In [1]:
!pip install -q pandas torch scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Setup and Imports

In [25]:
import time
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import re
import pickle
import csv

print("Libraries imported successfully!")


Libraries imported successfully!


Load and Preprocess Data

In [10]:
# Load your dataset from the uploaded 'intent.csv' file.
try:
    df = pd.read_csv('intent.csv', skiprows=1, header=None, names=['raw_data'])
    df[['text', 'intent']] = df['raw_data'].str.split(',', n=1, expand=True)
    df['text'] = df['text'].str.strip('" ')
    df['intent'] = df['intent'].str.strip('" ')
    print("Successfully loaded and parsed 'intent.csv'.")

except FileNotFoundError:
    print("Error: 'intent.csv' not found.")
    print("Please make sure you have uploaded the file to your Colab session.")
    data = {'text': ['tell me a joke', 'what is the weather like', 'play some music',
                     'how are you', 'what is the temperature', 'I want to hear a song'],
            'intent': ['joke', 'weather', 'music', 'greeting', 'weather', 'music']}
    df = pd.DataFrame(data)
    print("\nUsing a dummy dataframe for demonstration.")

# --- Inspect the class distribution to see the problem ---
print("\n--- Examining Intent Counts ---")
print("This shows how many examples exist for each intent category:")
print(df['intent'].value_counts())
print("---------------------------------")


# Basic text cleaning function
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    text = text.strip()
    return text

df['text'] = df['text'].apply(clean_text)

# Encode the labels
label_encoder = LabelEncoder()
df['intent_encoded'] = label_encoder.fit_transform(df['intent'])
print("\nLabels encoded:")
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


# Split data into training and validation sets
# --- CORRECTED LINE: Removed 'stratify' parameter ---
X_train, X_val, y_train, y_val = train_test_split(
    df['text'],
    df['intent_encoded'],
    test_size=0.2,
    random_state=42
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")

print("\nFinal processed DataFrame head:")
print(df.head())

Successfully loaded and parsed 'intent.csv'.

--- Examining Intent Counts ---
This shows how many examples exist for each intent category:
intent
product info      20
get directions    20
repeat            20
Name: count, dtype: int64
---------------------------------

Labels encoded:
{'get directions': np.int64(0), 'product info': np.int64(1), 'repeat': np.int64(2)}

Training set size: 48
Validation set size: 12

Final processed DataFrame head:
                                            raw_data  \
0  "What's the price of the Model Z headphones?",...   
1  "Can you tell me the battery life of the X200 ...   
2  "Do you have information on the dimensions of ...   
3  "What's the availability of the Galaxy S21 in ...   
4  "Give me specs for the TurboMax laptop.","prod...   

                                                text        intent  \
0          whats the price of the model z headphones  product info   
1  can you tell me the battery life of the x smar...  product info   
2  

Build Vocabulary

In [11]:
# Build vocabulary from the training data
counter = Counter()
for text in X_train:
    counter.update(text.split())

# Create a word-to-index dictionary
# Start with special tokens
word_to_idx = {'<pad>': 0, '<unk>': 1}
# Start indexing from 2, since 0 and 1 are taken
idx = 2
for word, count in counter.most_common():
    word_to_idx[word] = idx
    idx += 1

vocab_size = len(word_to_idx)
print(f"Vocabulary size: {vocab_size}")

# Example: Get the index of a word
print(f"Index of 'the': {word_to_idx.get('the', 1)}") # Use .get() to handle unknown words


Vocabulary size: 187
Index of 'the': 2


Create PyTorch Dataset

In [12]:
class IntentDataset(Dataset):
    def __init__(self, texts, labels, vocabulary, max_len=20):
        self.texts = texts
        self.labels = labels
        self.vocabulary = vocabulary
        self.max_len = max_len
        self.unk_token_idx = vocabulary['<unk>']
        self.pad_token_idx = vocabulary['<pad>']

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        # Tokenize and numericalize using the dictionary
        tokens = text.split()
        # Use .get() to default to the <unk> token index if a word is not in the vocabulary
        token_indices = [self.vocabulary.get(token, self.unk_token_idx) for token in tokens]

        # Pad or truncate the sequence
        if len(token_indices) < self.max_len:
            # Pad with the index of <pad> token
            token_indices.extend([self.pad_token_idx] * (self.max_len - len(token_indices)))
        else:
            token_indices = token_indices[:self.max_len]

        return torch.tensor(token_indices, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# Define max sequence length
MAX_LEN = 25

# Create dataset instances
train_dataset = IntentDataset(X_train, y_train, word_to_idx, MAX_LEN)
val_dataset = IntentDataset(X_val, y_val, word_to_idx, MAX_LEN)

# Example of one item from the dataset
print("Example item from the dataset:")
print(train_dataset[0])


Example item from the dataset:
(tensor([43,  8, 10,  6, 23,  6, 44, 45, 46,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0]), tensor(0))


Define the Lightweight Model

In [13]:
class IntentClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(IntentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Using a simple averaging of embeddings instead of a complex RNN/LSTM
        # This is a form of a "Continuous Bag-of-Words" (CBOW) model.
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text):
        # text shape: (batch_size, seq_len)
        embedded = self.embedding(text)
        # embedded shape: (batch_size, seq_len, embed_dim)

        # Average the embeddings across the sequence length dimension
        # The mean operation needs to ignore padding. We can do this by creating a mask.
        mask = (text != 0).float().unsqueeze(2)
        embedded = embedded * mask
        summed = torch.sum(embedded, 1)
        non_pad_count = mask.sum(1)
        mean_embedded = summed / (non_pad_count + 1e-9) # Add epsilon to avoid division by zero
        # mean_embedded shape: (batch_size, embed_dim)

        return self.fc(mean_embedded)

# Hyperparameters
EMBED_DIM = 32 # Keep this small for a lightweight model
NUM_CLASSES = len(label_encoder.classes_)
BATCH_SIZE = 8

# Instantiate the model
model = IntentClassifier(vocab_size, EMBED_DIM, NUM_CLASSES)
print("Model architecture:")
print(model)

# Dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)


Model architecture:
IntentClassifier(
  (embedding): Embedding(187, 32, padding_idx=0)
  (fc): Linear(in_features=32, out_features=3, bias=True)
)


Train the Model

In [14]:
# Training parameters
LEARNING_RATE = 0.005
EPOCHS = 50 # Increase if needed, but watch for overfitting

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    total_acc = 0

    for texts, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_acc += (predicted == labels).sum().item()

    # Print training stats
    avg_loss = total_loss / len(train_dataloader)
    avg_acc = total_acc / len(train_dataset)
    print(f'Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_loss:.4f}, Train Acc: {avg_acc:.4f}')

    # Validation loop
    model.eval()
    val_loss = 0
    val_acc = 0
    with torch.no_grad():
        for texts, labels in val_dataloader:
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_acc += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_dataloader)
    avg_val_acc = val_acc / len(val_dataset)
    print(f'Epoch {epoch+1}/{EPOCHS}, Val Loss: {avg_val_loss:.4f},   Val Acc: {avg_val_acc:.4f}\n')

print("Training complete!")


Epoch 1/50, Train Loss: 1.1527, Train Acc: 0.2500
Epoch 1/50, Val Loss: 1.1137,   Val Acc: 0.3333

Epoch 2/50, Train Loss: 1.0584, Train Acc: 0.3958
Epoch 2/50, Val Loss: 1.0367,   Val Acc: 0.5000

Epoch 3/50, Train Loss: 0.9776, Train Acc: 0.5417
Epoch 3/50, Val Loss: 0.9643,   Val Acc: 0.6667

Epoch 4/50, Train Loss: 0.8985, Train Acc: 0.7292
Epoch 4/50, Val Loss: 0.9050,   Val Acc: 0.5833

Epoch 5/50, Train Loss: 0.8280, Train Acc: 0.8125
Epoch 5/50, Val Loss: 0.8402,   Val Acc: 0.6667

Epoch 6/50, Train Loss: 0.7517, Train Acc: 0.8542
Epoch 6/50, Val Loss: 0.7737,   Val Acc: 0.6667

Epoch 7/50, Train Loss: 0.6781, Train Acc: 0.8958
Epoch 7/50, Val Loss: 0.7146,   Val Acc: 0.6667

Epoch 8/50, Train Loss: 0.6039, Train Acc: 0.9167
Epoch 8/50, Val Loss: 0.6545,   Val Acc: 0.6667

Epoch 9/50, Train Loss: 0.5310, Train Acc: 0.9583
Epoch 9/50, Val Loss: 0.5929,   Val Acc: 0.7500

Epoch 10/50, Train Loss: 0.4627, Train Acc: 0.9792
Epoch 10/50, Val Loss: 0.5369,   Val Acc: 0.7500

Epoch 11

Save the Model for Mobile

In [23]:
# Ensure the model is in evaluation mode
model.eval()

# Create an example input tensor. This is needed for tracing.
# The shape should match a single input to the model.
example_input = torch.randint(0, vocab_size, (1, MAX_LEN), dtype=torch.long)

# Trace the model
traced_script_module = torch.jit.trace(model, example_input)

# Save the traced model
# This .ptl file is the one you will use in your Android app with PyTorch Mobile.
model_filename_mobile = "intent_model_mobile.ptl"
traced_script_module._save_for_lite_interpreter(model_filename_mobile)

print(f"Model saved for mobile deployment as '{model_filename_mobile}'")

# You can also save the regular state dict for later use in Python
model_filename_pytorch = "intent_model_pytorch.pth"
torch.save(model.state_dict(), model_filename_pytorch)
print(f"Standard PyTorch model state_dict saved as '{model_filename_pytorch}'")

# Also save the vocabulary and label encoder, you'll need them for inference
import pickle

with open('vocabulary.pkl', 'wb') as f:
    pickle.dump(word_to_idx, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Vocabulary and Label Encoder also saved.")

Model saved for mobile deployment as 'intent_model_mobile.ptl'
Standard PyTorch model state_dict saved as 'intent_model_pytorch.pth'
Vocabulary and Label Encoder also saved.


Evaluation, Inference, and Model Stats

In [26]:
# --- 1. Load the necessary artifacts ---
# You would typically do this in your application
# For this notebook, we can reuse the objects from previous cells,
# but we'll load them to demonstrate the full process.

# Load the vocabulary and label encoder
with open('vocabulary.pkl', 'rb') as f:
    word_to_idx = pickle.load(f)

with open('label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

# Re-instantiate the model structure
# Make sure the parameters match the ones used for training
vocab_size = len(word_to_idx)
EMBED_DIM = 32 # Should be the same as in Cell 5
NUM_CLASSES = len(label_encoder.classes_) # Should be the same as in Cell 5
model_eval = IntentClassifier(vocab_size, EMBED_DIM, NUM_CLASSES)

# Load the trained weights
# For evaluation in Python, it's easier to use the .pth file
model_eval.load_state_dict(torch.load("intent_model_pytorch.pth"))
model_eval.eval() # Set the model to evaluation mode

print("Model, vocabulary, and label encoder loaded successfully.\n")


# --- 2. Model Information ---
# Calculate the total number of parameters
total_params = sum(p.numel() for p in model_eval.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params:,}")

# Get the size of the saved mobile-optimized model file
try:
    model_size_bytes = os.path.getsize("intent_model_mobile.ptl")
    model_size_kb = model_size_bytes / 1024
    print(f"Size of the mobile model file ('intent_model_mobile.ptl'): {model_size_kb:.2f} KB")
except FileNotFoundError:
    print("Mobile model file not found.")

print("-" * 30)

# --- 3. Prediction Function and Inference Time ---
MAX_LEN = 25 # Should be the same as in Cell 4
unk_token_idx = word_to_idx['<unk>']
pad_token_idx = word_to_idx['<pad>']

def predict_intent(sentence):
    # Preprocess the sentence
    cleaned_sentence = clean_text(sentence)
    tokens = cleaned_sentence.split()

    # Numericalize
    token_indices = [word_to_idx.get(token, unk_token_idx) for token in tokens]

    # Pad/Truncate
    if len(token_indices) < MAX_LEN:
        token_indices.extend([pad_token_idx] * (MAX_LEN - len(token_indices)))
    else:
        token_indices = token_indices[:MAX_LEN]

    # Convert to tensor
    text_tensor = torch.tensor(token_indices, dtype=torch.long).unsqueeze(0) # Add batch dimension

    # Get prediction
    with torch.no_grad():
        output = model_eval(text_tensor)
        _, predicted_idx = torch.max(output.data, 1)

    # Decode the prediction
    predicted_intent = label_encoder.inverse_transform(predicted_idx.numpy())[0]
    return predicted_intent

# --- 4. Evaluate on Test Sentences ---
test_sentences = [
    "How big is this tomato sauce?",
    "Sorry repeat that?",
    "Where can I buy meat?",
    "Show me nutriments infos of this cereal box."
]

print("\nRunning predictions on test sentences:")
for sentence in test_sentences:
    prediction = predict_intent(sentence)
    print(f"Sentence: '{sentence}'")
    print(f"--> Predicted Intent: '{prediction}'\n")

# --- 5. Measure Inference Time ---
# Let's measure the time for one prediction.
# We run it once to warm up, then time it.
_ = predict_intent("this is a warm-up sentence")

start_time = time.perf_counter()
_ = predict_intent("what is the weather like today")
end_time = time.perf_counter()

inference_time_ms = (end_time - start_time) * 1000
print("-" * 30)
print(f"Single sentence inference time: {inference_time_ms:.4f} ms")


Model, vocabulary, and label encoder loaded successfully.

Total trainable parameters: 6,083
Size of the mobile model file ('intent_model_mobile.ptl'): 33.59 KB
------------------------------

Running predictions on test sentences:
Sentence: 'How big is this tomato sauce?'
--> Predicted Intent: 'get directions'

Sentence: 'Sorry repeat that?'
--> Predicted Intent: 'repeat'

Sentence: 'Where can I buy meat?'
--> Predicted Intent: 'get directions'

Sentence: 'Show me nutriments infos of this cereal box.'
--> Predicted Intent: 'product info'

------------------------------
Single sentence inference time: 0.6554 ms
