In [45]:
! pip install transformers datasets torch scikit-learn


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: C:\Users\micko\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip




In [18]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [3]:
helpdesk_ticket_data = pd.read_csv("it_helpdesk_tickets_500.csv")
helpdesk_ticket_data = helpdesk_ticket_data.drop_duplicates(subset=["description"])

In [4]:
le = LabelEncoder()
helpdesk_ticket_data["priority_label"] = le.fit_transform(helpdesk_ticket_data["priority"])

In [5]:
helpdesk_ticket_data["category_label"] = le.fit_transform(helpdesk_ticket_data["category"])

In [7]:
train_helpdesk_ticket_data, test_helpdesk_ticket_data = train_test_split(helpdesk_ticket_data, test_size=0.2, random_state=42)

In [9]:
train_dataset = Dataset.from_pandas(train_helpdesk_ticket_data)
test_dataset = Dataset.from_pandas(test_helpdesk_ticket_data)

In [11]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

Downloading tokenizer_config.json: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48.0/48.0 [00:00<00:00, 10.7kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading vocab.txt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 232k/232k [00:00<00:00, 1.25MB/s]
Downloading tokenizer.json: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 466k/466k [00:00<00:00, 3.37MB/s]
Downloading config.json: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 483/483 [00:00<00:00, 121kB/s]


In [13]:
def tokenize(batch):
    return tokenizer(
        batch["description"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:00<00:00, 291.22 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:00<00:00, 234.69 examples/s]


In [14]:
train_dataset = train_dataset.rename_column("priority_label", "labels")
test_dataset = test_dataset.rename_column("priority_label", "labels")

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [16]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4        # low, medium, high, critical
)

Downloading model.safetensors: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 268M/268M [00:24<00:00, 11.1MB/s] 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir="./priority_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
)

In [22]:
from datasets import load_metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return metric.compute(predictions=preds, references=labels)

  metric = load_metric("accuracy")
Downloading builder script: 4.21kB [00:00, 324kB/s]                    


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [21]:
trainer.train()

                                             
 25%|â–ˆâ–ˆâ–Œ       | 1/4 [00:09<00:27,  9.25s/it]

{'eval_loss': 1.4096564054489136, 'eval_runtime': 0.6797, 'eval_samples_per_second': 5.885, 'eval_steps_per_second': 1.471, 'epoch': 1.0}


                                             
 50%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ     | 2/4 [00:21<00:21, 10.69s/it]

{'eval_loss': 1.3963689804077148, 'eval_runtime': 0.7809, 'eval_samples_per_second': 5.123, 'eval_steps_per_second': 1.281, 'epoch': 2.0}


                                             
 75%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ  | 3/4 [00:32<00:10, 10.60s/it]

{'eval_loss': 1.400559902191162, 'eval_runtime': 0.7631, 'eval_samples_per_second': 5.242, 'eval_steps_per_second': 1.31, 'epoch': 3.0}


                                             
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:42<00:00, 10.67s/it]

{'eval_loss': 1.4013772010803223, 'eval_runtime': 0.6664, 'eval_samples_per_second': 6.002, 'eval_steps_per_second': 1.501, 'epoch': 4.0}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:44<00:00, 11.03s/it]

{'train_runtime': 44.1161, 'train_samples_per_second': 1.451, 'train_steps_per_second': 0.091, 'train_loss': 1.3291938304901123, 'epoch': 4.0}





TrainOutput(global_step=4, training_loss=1.3291938304901123, metrics={'train_runtime': 44.1161, 'train_samples_per_second': 1.451, 'train_steps_per_second': 0.091, 'train_loss': 1.3291938304901123, 'epoch': 4.0})