![servicedesk](servicedesk.png)

CleverSupport is a company at the forefront of AI innovation, specializing in the development of AI-driven solutions to enhance customer support services. Their latest endeavor is to engineer a text classification system that can automatically categorize customer complaints. 

Your role as a data scientist involves the creation of a sophisticated machine learning model that can accurately assign complaints to specific categories, such as mortgage, credit card, money transfers, debt collection, etc.

In [91]:
!pip install torchmetrics

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [92]:
from collections import Counter
import nltk, json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy, Precision, Recall

In [93]:
# Download the NLTK 'punkt' package into the local nltk_data directory.
# NOTE(review): no NLTK tokenizer call is visible in this notebook (the text is
# loaded from JSON) — confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [94]:
# Load the word list, the complaint texts and their labels from the working directory.
# The encoding is passed explicitly so the JSON files are decoded as UTF-8 on every
# platform instead of relying on the locale-dependent default of open().
with open("words.json", "r", encoding="utf-8") as words_file:
    words = json.load(words_file)
with open("text.json", "r", encoding="utf-8") as text_file:
    text = json.load(text_file)
labels = np.load('labels.npy')

In [95]:
# Lookup tables in both directions: word -> position in `words`, and position -> word
word2idx = {token: position for position, token in enumerate(words)}
idx2word = dict(enumerate(words))

# Encode every sentence in place as a list of vocabulary indices;
# words that are missing from the vocabulary are mapped to index 0
for i, sentence in enumerate(text):
    text[i] = [word2idx.get(token, 0) for token in sentence]
    
# Bring every encoded sentence to the same length so they can be stacked in one array
def pad_input(sentences, seq_len):
    """Return a ``(len(sentences), seq_len)`` integer array of padded sequences.

    Sentences shorter than ``seq_len`` are left-padded with zeros (the tokens end
    up right-aligned); longer sentences keep only their first ``seq_len`` tokens.
    Empty sentences become all-zero rows.

    Args:
        sentences: Sequence of sequences of integer token indices.
        seq_len: Fixed length of every output row.

    Returns:
        numpy.ndarray of dtype ``int`` with one row per sentence.
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, tokens in enumerate(sentences):
        if len(tokens) == 0:
            # Nothing to copy: the row stays all zeros
            continue
        kept = np.array(tokens)[:seq_len]
        # A negative start beyond the row width is clamped to the whole row,
        # which is what makes over-long sentences fit after truncation
        padded[row, -len(tokens):] = kept
    return padded

# Pad / truncate every encoded sentence to exactly 50 token indices
text = pad_input(text, seq_len=50)

In [96]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable
train_text, test_text, train_label, test_label = train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=42,
)

# Serve both splits in mini-batches of (token indices, integer label) pairs.
# Only the training batches are reshuffled each epoch; the test order stays fixed.
batch_size = 32
train_data = DataLoader(
    dataset=TensorDataset(
        torch.from_numpy(train_text),
        torch.from_numpy(train_label).to(torch.long),
    ),
    batch_size=batch_size,
    shuffle=True,
)
test_data = DataLoader(
    dataset=TensorDataset(
        torch.from_numpy(test_text),
        torch.from_numpy(test_label).to(torch.long),
    ),
    batch_size=batch_size,
    shuffle=False,
)

## Define a CNN classifier

In [97]:
class CNNClassifier(nn.Module):
    """1-D convolutional classifier over fixed-length sequences of token indices.

    Pipeline: embedding -> Conv1d (kernel 3, padding 1) -> ReLU -> max pooling
    (kernel 2) -> flatten -> linear layer producing one raw score per class.

    Args:
        vocab_size: Number of rows in the embedding table; every input index
            must lie in ``[0, vocab_size)``.
        embed_dim: Size of each embedding vector.
        num_classes: Number of scores produced per sample.
        seq_len: Length of the input sequences the linear layer is sized for.
        num_filters: Number of output channels of the convolution.
    """

    def __init__(self, vocab_size, embed_dim=128, num_classes=10, seq_len=50, num_filters=256):
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Convolutional layer; kernel_size=3 with padding=1 keeps the sequence length
        self.conv1d = nn.Conv1d(embed_dim, num_filters, kernel_size=3, padding=1)

        # Max pooling with kernel_size=2 halves the sequence length (rounding down),
        # so each sample is flattened to num_filters * (seq_len // 2) features.
        self.flatten_size = num_filters * (seq_len // 2)

        # Linear layer
        self.fc1 = nn.Linear(self.flatten_size, num_classes)

    def forward(self, x):
        """Compute class scores for a batch of index sequences.

        Args:
            x: Integer tensor of shape ``[batch_size, seq_len]``.

        Returns:
            Tensor of shape ``[batch_size, num_classes]`` with unnormalised scores.

        Raises:
            ValueError: If the sequence length of ``x`` does not match the
                ``seq_len`` the model was built for.
        """
        # Embedding layer: [batch_size, seq_len] -> [batch_size, seq_len, embed_dim]
        embedded = self.embedding(x)

        # Conv1d expects channels first: [batch_size, embed_dim, seq_len]
        embedded = embedded.transpose(1, 2)

        # Convolution + non-linearity
        conv_out = F.relu(self.conv1d(embedded))

        # Max pooling
        pooled = F.max_pool1d(conv_out, kernel_size=2)

        # Flatten per sample. Keeping the batch dimension explicit (instead of
        # view(-1, flatten_size)) prevents a wrong sequence length from being
        # silently folded into a larger batch.
        flattened = pooled.flatten(start_dim=1)
        if flattened.size(1) != self.flatten_size:
            raise ValueError(
                f"Expected {self.flatten_size} features per sample after pooling, "
                f"got {flattened.size(1)}; check the input sequence length."
            )

        # Linear layer
        return self.fc1(flattened)

## Train your classifier

In [98]:
# Seed PyTorch's global RNG so weight initialisation and the shuffle order of the
# training batches repeat across runs (GPU kernels may still add small variations).
torch.manual_seed(42)

# Pick the device and move the model there before building the optimizer, as the
# torch.optim documentation recommends.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNClassifier(vocab_size=len(word2idx)).to(device)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch_x, batch_y in train_data:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Zero out the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass
        output = model(batch_x)
        loss = criterion(output, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_data)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Epoch 1/3, Loss: 1.3741
Epoch 2/3, Loss: 0.4679
Epoch 3/3, Loss: 0.1640


## Test your classifier

In [99]:
# Evaluation: collect the predicted class and the true label of every test sample.
# Metric objects are not created here because nothing in this cell uses them.
model.eval()

# Predictions and true labels as plain Python ints
predicted = []
true_labels = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for batch_x, batch_y in test_data:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Predicted class = index of the largest score along the class dimension.
        # argmax avoids assigning to `_`, which IPython uses for the last cell output.
        batch_pred = model(batch_x).argmax(dim=1)

        # Store predictions and true labels
        predicted.extend(batch_pred.cpu().tolist())
        true_labels.extend(batch_y.cpu().tolist())

## Calculate the accuracy, per-class precision, and recall

In [100]:
# Convert the collected results to tensors for metric calculation.
# `predicted` is the list filled by the evaluation loop; the name `predictions`
# is not defined in any visible cell and fails on a fresh kernel.
predictions_tensor = torch.tensor(predicted).to(device)
labels_tensor = torch.tensor(true_labels).to(device)

# Accuracy: fraction of samples whose predicted class equals the true label
accuracy = (predictions_tensor == labels_tensor).float().mean().item()

# Per-class metrics (average=None returns one value per class).
# NOTE(review): the recorded output reports 0.0 for classes 5-9 — presumably the
# data only contains 5 classes; confirm that num_classes=10 is intended.
precision_metric = Precision(task="multiclass", num_classes=10, average=None).to(device)
recall_metric = Recall(task="multiclass", num_classes=10, average=None).to(device)

# Calculate precision and recall
precision = precision_metric(predictions_tensor, labels_tensor).cpu().tolist()
recall = recall_metric(predictions_tensor, labels_tensor).cpu().tolist()

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision per class: {precision}")
print(f"Recall per class: {recall}")

Accuracy: 0.7010
Precision per class: [0.559183657169342, 0.7310344576835632, 0.7058823704719543, 0.7134831547737122, 0.8402062058448792, 0.0, 0.0, 0.0, 0.0, 0.0]
Recall per class: [0.7135416865348816, 0.557894766330719, 0.7777777910232544, 0.6614583134651184, 0.776190459728241, 0.0, 0.0, 0.0, 0.0, 0.0]
