In [9]:
import torch

In [10]:
from transformers import DistilBertTokenizerFast

In [11]:
from transformers import DistilBertForSequenceClassification

In [12]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split

In [13]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 4070 Laptop GPU


In [14]:
# device = torch.device('cpu')

In [15]:
data_path = "categorized_phrases.csv"

In [16]:
df = pd.read_csv(data_path)

In [17]:
df.columns = ["prompt", "label"]

In [18]:
class_names = ["greetings", "thanks", "goodbye", "prompt"]

In [19]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["prompt"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)

In [20]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# encodings = tokenizer(texts, truncation=True, padding=True)

In [21]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
test_encodings  = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")

In [22]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [23]:
dataset_train = TextDataset(train_encodings, train_labels)
dataset_test = TextDataset(test_encodings, test_labels)

In [24]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)

Class weights: tensor([1.3119, 2.0456, 1.3196, 0.5022])


In [25]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
train_loader = DataLoader(dataset_train, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset_train, batch_size=8, shuffle=True)

optimizer = AdamW(model.parameters(), lr=5e-5)
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [27]:
epochs = 1

model.train()
for epoch in range(epochs):
    total_loss = 0

    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if i % 100 == 0:
            print(f"Batch {i}, Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Batch 0, Loss: 1.3727
Batch 100, Loss: 0.1848
Batch 200, Loss: 0.1061
Batch 300, Loss: 0.0215
Epoch 1/1, Loss: 0.2285


In [28]:
from sklearn.metrics import classification_report

model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Predictions for this batch
        batch_preds = torch.argmax(logits, dim=1)

        # Collect all predictions and labels
        all_preds.extend(batch_preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())




  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [29]:
print(classification_report(all_labels, all_preds, target_names=class_names))

              precision    recall  f1-score   support

   greetings       1.00      1.00      1.00       513
      thanks       0.98      1.00      0.99       329
     goodbye       1.00      0.98      0.99       510
      prompt       1.00      1.00      1.00      1340

    accuracy                           1.00      2692
   macro avg       0.99      1.00      1.00      2692
weighted avg       1.00      1.00      1.00      2692



In [30]:
def predict(prompt):
    prompt = [prompt]

    encodings = tokenizer(prompt, truncation=True, padding=True, return_tensors="pt")
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    model.to(device)
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_class_id = torch.argmax(outputs.logits, dim=1).item()

    class_names = ["greetings", "thanking", "goodbye", "prompt"]
    print(f"Predicted class: {class_names[predicted_class_id]}")

In [31]:
predict("wassup")

Predicted class: greetings


In [32]:
model.save_pretrained("model")
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [33]:
tokenizer = DistilBertTokenizerFast.from_pretrained("tokenizer")
model = DistilBertForSequenceClassification.from_pretrained("model")