# Importsand preparations

In [1]:
import pandas as pd
import torch
import os
import numpy as np
import datasets
import transformers
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !watch -n 0.5 nvidia-smi

In [3]:
print(f'PyTorch version: {torch.__version__}')  # 1.9.1+cu111
print(f'CUDA version: {torch.version.cuda}')  # 11.1
print(f'cuDNN version: {torch.backends.cudnn.version()}')  # 8005
print(f'Current device: {torch.cuda.current_device()}')  # 0
print(f'Is cuda available: {torch.cuda.is_available()}')  # TRUE

PyTorch version: 2.1.0+cu121
CUDA version: 12.1
cuDNN version: 8902
Current device: 0
Is cuda available: True


In [4]:
print(f'Transformers version: {transformers.__version__}')
print(f'Datasets version: {datasets.__version__}')

Transformers version: 4.35.0
Datasets version: 2.14.6


In [5]:
# Prevent a warning related to the tokenization process in the transformers library. 
os.environ["TOKENIZERS_PARALLELISM"] = "False"
# Makes CUDA operations synchronous
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
# Find the GPU with the least memory usage.
!nvidia-smi

Mon Mar 18 11:07:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:41:00.0 Off |                  N/A |
| 30%   32C    P8              27W / 350W |      6MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        On  | 00000000:61:00.0 Off |  

In [7]:
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    # free unreferenced tensors from the GPU memory.
    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
|  1 |  0% |  2% |
|  2 |  0% |  0% |
|  3 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
|  1 |  0% |  2% |
|  2 |  0% |  0% |
|  3 |  3% |  1% |


In [8]:
# Smaller and faster than bert.
model_ckpt = "distilbert-base-uncased"

epochs = 5 #Number of full cycles through the training set.
num_labels = 2 #Number of labels, high, med, low priority.
learning_rate = 5e-5 # Rate the model updates based on the data its trained on.
train_batch_size = 16 # Number of training examples in one iteration.
eval_batch_size = 32 # Number evaluation examples in on iteration.
save_strategy = "no" # Should the model be saved automatically during training.
save_steps = 500 # How often to save the model during training. No effect since no over.
logging_steps = 100
model_dir = "./model" #Where to save model

# Use early stopping to prevent overfitting
load_best_model_at_end=True
#metric_for_best_model="eval_loss"
#greater_is_better=False

Load dataset from huggingface

In [9]:
dataset = load_dataset("kristmh/highest_vs_rest_balanced_jira")
dataset

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 11073
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 88572
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 11071
    })
})

In [10]:
mongoDB = load_dataset("kristmh/clean_MongoDB_balanced_1")
mongoDB

Downloading readme: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 487/487 [00:00<00:00, 4.52MB/s]
Downloading data files:   0%|                                                                                                                                                                                                        | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|                                                                                                                                                                                                         | 0.00/498k [00:00<?, ?B/s][A
Downloading data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 498k/498k [00:00<00:00,

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 1816
    })
})

In [11]:
jira = load_dataset("kristmh/clean_Jira_balanced")
jira

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 9332
    })
})

In [12]:
qt = load_dataset("kristmh/clean_QT_balanced")
qt

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 3882
    })
})

In [13]:
rust = load_dataset("kristmh/rust_testset_with_med_low_1")
rust

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 1572
    })
})

In [14]:
flutter = load_dataset("kristmh/flutter_testset_with_med_low_1")
flutter

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 2370
    })
})

In [15]:
mypy = load_dataset("kristmh/mypy_testset_with_med_low_1")
mypy

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 734
    })
})

In [16]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

## Tokenization

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    Tokenizing the whole dataset

In [20]:
#Tokenize the dataset to the correct input for the transformer model.
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)

In [21]:
mongoDB_tokenized = mongoDB.map(tokenize, batched=True)
mongoDB_testset = mongoDB_tokenized["test"]
mongoDB_testset


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1816/1816 [00:00<00:00, 1876.91 examples/s]


Dataset({
    features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1816
})

In [22]:
jira_tokenized = jira.map(tokenize, batched=True)
jira_testset = jira_tokenized["test"]
jira_testset

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9332/9332 [00:05<00:00, 1746.27 examples/s]


Dataset({
    features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
    num_rows: 9332
})

In [23]:
qt_tokenized = qt.map(tokenize, batched=True)
qt_testset = qt_tokenized["test"]
qt_testset

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3882/3882 [00:02<00:00, 1313.68 examples/s]


Dataset({
    features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3882
})

In [24]:
rust_tokenized = rust.map(tokenize, batched=True)
rust_testset = rust_tokenized["test"]
rust_testset

Dataset({
    features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1572
})

In [25]:
flutter_tokenized = flutter.map(tokenize, batched=True)
flutter_testset = flutter_tokenized["test"]
flutter_testset

Dataset({
    features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2370
})

In [26]:
mypy_tokenized = mypy.map(tokenize, batched=True)
mypy_testset = mypy_tokenized["test"]
mypy_testset

Dataset({
    features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
    num_rows: 734
})

In [27]:
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
        num_rows: 11073
    })
    train: Dataset({
        features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
        num_rows: 88572
    })
    validate: Dataset({
        features: ['text_clean', 'label', 'input_ids', 'attention_mask'],
        num_rows: 11071
    })
})

In [28]:
# List the columns of the dataset.
# Should be: ["attention_mask", "input_ids", "labels", "token_type_ids"]
# Remove unnecessary columns that the model does not expect.
tokenized_dataset = tokenized_dataset.remove_columns(["text_clean"])

train_dataset = tokenized_dataset["train"]
print(train_dataset)
validation_dataset = tokenized_dataset["validate"]
print(validation_dataset)
test_dataset = tokenized_dataset["test"]
test_dataset

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 88572
})
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 11071
})


Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 11073
})

## Training a classifier

In [29]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [31]:
trainer.train() 

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5565,0.593619,0.693433,[0.69472927 0.69212627],[0.6969861 0.68987342],[0.692487 0.69439388]
2,0.5333,0.57639,0.696775,[0.70992828 0.68237298],[0.68512342 0.71054187],[0.73659674 0.65635238]
3,0.4509,0.635447,0.695511,[0.71288647 0.67589655],[0.67894225 0.71632362],[0.75040344 0.63978886]
4,0.3253,0.822817,0.68702,[0.67613796 0.69719479],[0.70616947 0.67053286],[0.64855657 0.7260648 ]
5,0.2223,1.095049,0.684852,[0.68313505 0.68655107],[0.69212367 0.67784282],[0.67437691 0.69548598]


TrainOutput(global_step=27680, training_loss=0.42714544796530224, metrics={'train_runtime': 6056.6215, 'train_samples_per_second': 73.12, 'train_steps_per_second': 4.57, 'total_flos': 5.866451216879616e+16, 'train_loss': 0.42714544796530224, 'epoch': 5.0})

* Training loss: Difference between the predictons made by the model on the training dataset vs on the actual data.
* Validation loss: how well the model functions on unseen data.
* Accuracy: How much the model gets correct. number of correct Prediction / total number of predictions.
* F1: consider both precision and recall. 
* Precision: Accuracy of positive predictions. Percison TP = TP + FP. How often the model is correct.
* Recall: True positive rate. how many items the model gets correct from the total amount.

### Training loss decreases, validation loss increases = Overfitting

In [32]:
# Evaluate validation set
eval_result = trainer.evaluate(eval_dataset=validation_dataset)

In [33]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.6848523168638786

eval_f1 = [0.68313505 0.68655107]

eval_loss = 1.0950487852096558

eval_precision = [0.69212367 0.67784282]

eval_recall = [0.67437691 0.69548598]

eval_runtime = 67.1282

eval_samples_per_second = 164.923

eval_steps_per_second = 5.154



In [34]:
# Evaluate test data set
test_results = trainer.evaluate(eval_dataset=test_dataset)

In [35]:
for key, value in sorted(test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.6877088413257473

eval_f1 = [0.68660504 0.6888049 ]

eval_loss = 1.080896258354187

eval_precision = [0.68985613 0.68559656]

eval_recall = [0.68338445 0.6920434 ]

eval_runtime = 67.5045

eval_samples_per_second = 164.033

eval_steps_per_second = 5.14



In [36]:
mongoDB_test_results = trainer.evaluate(eval_dataset=mongoDB_testset)
for key, value in sorted(mongoDB_test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.5919603524229075

eval_f1 = [0.62366684 0.55441972]

eval_loss = 1.5074597597122192

eval_precision = [0.57869934 0.61059603]

eval_recall = [0.67621145 0.50770925]

eval_runtime = 11.7058

eval_samples_per_second = 155.136

eval_steps_per_second = 4.869



In [37]:
jira_test_results = trainer.evaluate(eval_dataset=jira_testset)
for key, value in sorted(jira_test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.6236605229318474

eval_f1 = [0.6142355  0.63263598]

eval_loss = 1.2975651025772095

eval_precision = [0.63001352 0.61789947]

eval_recall = [0.59922846 0.64809258]

eval_runtime = 67.2661

eval_samples_per_second = 138.732

eval_steps_per_second = 4.341



In [38]:
qt_test_results = trainer.evaluate(eval_dataset=qt_testset)
for key, value in sorted(qt_test_results.items()):
    print(f"{key} = {value}\n")
    

epoch = 5.0

eval_accuracy = 0.6545595054095827

eval_f1 = [0.66264151 0.64608076]

eval_loss = 1.2137494087219238

eval_precision = [0.64749263 0.66233766]

eval_recall = [0.67851623 0.63060278]

eval_runtime = 30.7627

eval_samples_per_second = 126.192

eval_steps_per_second = 3.966



In [39]:
rust_test_results = trainer.evaluate(eval_dataset=rust_testset)
for key, value in sorted(rust_test_results.items()):
    print(f"{key} = {value}\n")    

epoch = 5.0

eval_accuracy = 0.6265903307888041

eval_f1 = [0.56029963 0.67551133]

eval_loss = 1.4532843828201294

eval_precision = [0.68123862 0.59726295]

eval_recall = [0.47582697 0.77735369]

eval_runtime = 12.6508

eval_samples_per_second = 124.261

eval_steps_per_second = 3.952



In [40]:
flutter_test_results = trainer.evaluate(eval_dataset=flutter_testset)
for key, value in sorted(flutter_test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.5358649789029536

eval_f1 = [0.57298137 0.49168207]

eval_loss = 1.7696045637130737

eval_precision = [0.53055356 0.54341164]

eval_recall = [0.62278481 0.44894515]

eval_runtime = 18.9597

eval_samples_per_second = 125.002

eval_steps_per_second = 3.956



In [41]:
mypy_test_results = trainer.evaluate(eval_dataset=mypy_testset)
for key, value in sorted(mypy_test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.5681198910081744

eval_f1 = [0.61853189 0.50235479]

eval_loss = 1.5857497453689575

eval_precision = [0.55387931 0.59259259]

eval_recall = [0.70027248 0.4359673 ]

eval_runtime = 5.6111

eval_samples_per_second = 130.813

eval_steps_per_second = 4.099



In [42]:
trainer.save_model(model_dir + "_local") 

In [43]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local")

In [44]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [45]:
classifier("this does not need to be done fast")

[{'label': 'LABEL_0', 'score': 0.9729120135307312}]

In [46]:
classifier("this is super important")

[{'label': 'LABEL_1', 'score': 0.9777431488037109}]

In [47]:
classifier("this bug has super high impact on the project")

[{'label': 'LABEL_0', 'score': 0.9660075902938843}]

## Important to delete large objects to free memory 
del train_dataset

In [48]:
del validation_dataset 

In [49]:
del model

In [50]:
# Free cache
torch.cuda.empty_cache()

###### 

In [51]:
!nvidia-smi

Mon Mar 18 12:53:55 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:41:00.0 Off |                  N/A |
| 33%   52C    P2             124W / 350W |   7955MiB / 24576MiB |      3%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        On  | 00000000:61:00.0 Off |  