In [1]:
import torch

In [2]:
from transformers import DistilBertTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import DistilBertForSequenceClassification

In [19]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 4070 Laptop GPU


In [6]:
device = torch.device('cpu')

In [7]:
data_path = "C:\\Users\\Alber\\Desktop\\Hackaton\\BHL_GPU_Enjoyers\\categorized_phrases.csv"

In [8]:
df = pd.read_csv(data_path)

In [9]:
df.columns = ["prompt", "label"]

In [10]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["prompt"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)

In [11]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
test_encodings  = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")

In [12]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [13]:
dataset_train = TextDataset(train_encodings, train_labels)
dataset_test = TextDataset(test_encodings, test_labels)

In [14]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)


Class weights: tensor([0.7879, 1.5758, 0.9811, 0.9286])


In [15]:
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=4
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
train_loader = DataLoader(dataset_train, batch_size=8, shuffle=True)

optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss(weight=class_weights.to(device))  # weight must match device
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [17]:
epochs = 3

model.train()
for epoch in range(epochs):
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/3, Loss: 1.2495
Epoch 2/3, Loss: 0.7003
Epoch 3/3, Loss: 0.2326


In [20]:
model.eval()
preds = []
with torch.no_grad():
    for batch in dataset_test:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).cpu().numpy())

print(classification_report(labels.cpu().numpy(), preds))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


RuntimeError: The size of tensor a (7) must match the size of tensor b (512) at non-singleton dimension 1

In [24]:
def make_prediction(prompt):
    sentence = [prompt]

    encodings = tokenizer(sentence, truncation=True, padding=True, return_tensors="pt")
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    model.to(device)
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_class_id = torch.argmax(outputs.logits, dim=1).item()

    class_names = ["greetings", "thanking", "bye", "prompt"]
    print(f"Predicted class: {class_names[predicted_class_id]}")

In [34]:
make_prediction("can you make me a cup of coffee?")

Predicted class: prompt
