# Importsand preparations

In [1]:
import pandas as pd
import torch
import os
import numpy as np
import datasets
import transformers
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset, Dataset, DatasetDict

In [2]:
# !watch -n 0.5 nvidia-smi

In [3]:
print(f'PyTorch version: {torch.__version__}')  # 1.9.1+cu111
print(f'CUDA version: {torch.version.cuda}')  # 11.1
print(f'cuDNN version: {torch.backends.cudnn.version()}')  # 8005
print(f'Current device: {torch.cuda.current_device()}')  # 0
print(f'Is cuda available: {torch.cuda.is_available()}')  # TRUE

PyTorch version: 2.0.1+cu117
CUDA version: 11.7
cuDNN version: 8500
Current device: 0
Is cuda available: True


In [4]:
print(f'Transformers version: {transformers.__version__}')
print(f'Datasets version: {datasets.__version__}')

Transformers version: 4.32.1
Datasets version: 2.14.4


In [5]:
# Prevent a warning related to the tokenization process in the transformers library. 
os.environ["TOKENIZERS_PARALLELISM"] = "False"
# Makes CUDA operations synchronous
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
# Find the GPU with the least memory usage.
!nvidia-smi

Tue Nov 14 09:05:13 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti      On | 00000000:01:00.0 Off |                  N/A |
| 24%   44C    P2               63W / 250W|    949MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti      On | 00000000:23:00.0 Off |  

In [7]:
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    # free unreferenced tensors from the GPU memory.
    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU  | MEM |
-------------------
|  0 |   0% |  8% |
|  1 | 100% | 93% |
|  2 | 100% | 93% |
|  3 | 100% | 93% |
|  4 |   0% | 12% |
|  5 |   0% |  0% |
|  6 |   0% |  0% |
|  7 | 100% | 93% |
GPU Usage after emptying the cache
| ID | GPU  | MEM |
-------------------
|  0 |   0% |  8% |
|  1 | 100% | 93% |
|  2 | 100% | 93% |
|  3 | 100% | 93% |
|  4 |   0% | 12% |
|  5 |   0% |  0% |
|  6 |   0% |  1% |
|  7 | 100% | 93% |


In [8]:
data = pd.read_csv("csv/priority_high_med_low_clean.csv" , index_col = 0)
data

Unnamed: 0,text_clean,label
0,issue url ads httpswwwgultecompaparazzipics764...,0
1,browser firefox mobile 1170 uaheader mozilla50...,0
2,consider the file frontendsbenchmarksverificat...,0
3,in a repository with many files it appears tha...,0
4,look at other webdrivers and make sure that ex...,0
...,...,...
226982,create a personal page for yourself and link i...,2
226983,when you run the trial it will set areas on fi...,2
226984,the navigator of the wiki should be edited and...,2
226985,javascript code or see if possible without tha...,2


In [9]:
# Smaller and faster than bert.
base_model_id = "distilbert-base-uncased"

epochs = 5 #Number of full cyles through the training set.
num_labels = 3 #Number of labels, high, med, low priority.
learning_rate = 5e-5 # Rate the model updates based on the data its trained on.
train_batch_size = 16 # Number of training examples in one iteration.
eval_batch_size = 32 # Number evalutaion examples in on iteratoion.
save_strategy = "no" # Should the model be saved automatically during training.
save_steps = 500 # How often to save the model during training. No effect since no over.
logging_steps = 100
model_dir = "./model" #Where to save model

# Use early stopping to prevent overfitting
#load_best_model_at_end=True
#metric_for_best_model="eval_loss"
#greater_is_better=False

In [10]:

# Split dataframe into three parts: training, validation and testing.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    np.random.seed(seed)
    # Shuffle index of dataframe
    perm = np.random.permutation(df.index)
    
    df_length = len(df.index)
    
    # Number of row in training set
    train_end = int(train_percent * df_length)
    # Number of rows in validate set
    validate_end = int(validate_percent * df_length) + train_end
    
    # From start to train end
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate to the last row in dataframe.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [11]:
# Drops rows with missing values
data.dropna(inplace=True)

In [12]:
# Resets the index after dropping rows
data.reset_index(inplace=True)
data

Unnamed: 0,index,text_clean,label
0,0,issue url ads httpswwwgultecompaparazzipics764...,0
1,1,browser firefox mobile 1170 uaheader mozilla50...,0
2,2,consider the file frontendsbenchmarksverificat...,0
3,3,in a repository with many files it appears tha...,0
4,4,look at other webdrivers and make sure that ex...,0
...,...,...,...
226982,226982,create a personal page for yourself and link i...,2
226983,226983,when you run the trial it will set areas on fi...,2
226984,226984,the navigator of the wiki should be edited and...,2
226985,226985,javascript code or see if possible without tha...,2


In [13]:
#Drops the index col, better for managint the data.
data.drop(columns= ["index"], inplace = True)

In [14]:
data

Unnamed: 0,text_clean,label
0,issue url ads httpswwwgultecompaparazzipics764...,0
1,browser firefox mobile 1170 uaheader mozilla50...,0
2,consider the file frontendsbenchmarksverificat...,0
3,in a repository with many files it appears tha...,0
4,look at other webdrivers and make sure that ex...,0
...,...,...
226982,create a personal page for yourself and link i...,2
226983,when you run the trial it will set areas on fi...,2
226984,the navigator of the wiki should be edited and...,2
226985,javascript code or see if possible without tha...,2


In [15]:
# 60% trainig, 20% validate, 20% test. Seed None.
# Test 80-10-10 and 70-15-15
train , validate , test = train_validate_test_split(data)

In [16]:
train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [17]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
1,todo x decide on maximum acceptable wind and r...
2,the current apaeval conda environment contains...
0,hallo beim letzten update 130 alpha 4 habe ich...
1,while scanning the cmsweb weekly report i almo...
1,error message is ascii codec cant encode chara...
...,...
1,is your feature request related to a problem p...
1,imagehttpsuserimagesgithubusercontentcom353649...
1,whats the problem i entered all my blogs and t...
1,describe the bug custom user text styles not a...


In [18]:
# Convert from Pandas DataFrame to Hugging Face datasets
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 22700
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 181589
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 22698
    })
})

In [19]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]
test_ds = ds["test"]

In [20]:
ds["train"][0]

{'text_clean': 'a hrefhttpsgithubcomgooglecodeexporterimg srchttpsavatarsgithubusercontentcomu9614759v3 alignleft width96 height96 hspace10imga issue by googlecodeexporterhttpsgithubcomgooglecodeexporter monday jul 27 2015 at 0320 gmt originally opened as httpsgithubcomadamsmjyawlissues29 in the attached example when logged in as user stephan deallocating the only work item leads to the message the attempt to deallocate the selected workitem was unsuccessful please check the log files for details original issue reported on codegooglecom by arthurtegmailcom on 21 jul 2008 at 833 attachments newtest20xmlhttpsstoragegoogleapiscomgooglecodeattachmentsyawlissue29comment0newtest20xml newtest20backuphttpsstoragegoogleapiscomgooglecodeattachmentsyawlissue29comment0newtest20backup',
 'label': 0}

In [21]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)
#tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization

In [23]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    Tokenizing the whole dataset

In [24]:
#Tokenize the dataset to the correct input for the transformer model.
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))
test_dataset = test_ds.map(tokenize, batched=True, batch_size=len(test_ds))

Map:   0%|          | 0/181589 [00:00<?, ? examples/s]

Map:   0%|          | 0/22698 [00:00<?, ? examples/s]

Map:   0%|          | 0/22700 [00:00<?, ? examples/s]

## Training a classifier

In [25]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [26]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [27]:
trainer.train() 

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.818,0.822433,0.610054,[0.65928391 0.60354945 0.55523796],[0.6512867 0.65288744 0.52372607],[0.66747996 0.56114438 0.59078467]
2,0.7508,0.813862,0.616706,[0.65451163 0.61747255 0.57420591],[0.7019154 0.63332401 0.52216135],[0.61310561 0.60239521 0.63777372]
3,0.6074,0.907922,0.618557,[0.66541837 0.6211878 0.55563876],[0.67028174 0.62444534 0.5473591 ],[0.66062507 0.61796407 0.56417275]
4,0.464,1.031681,0.61957,[0.66571019 0.62349096 0.55882998],[0.68562985 0.62503343 0.53824482],[0.6469153 0.62195609 0.58105231]
5,0.3052,1.315833,0.613138,[0.66256143 0.6210771 0.53958989],[0.67556146 0.60331201 0.5450737 ],[0.65005228 0.63992016 0.53421533]


TrainOutput(global_step=56750, training_loss=0.6061593800011186, metrics={'train_runtime': 17963.1601, 'train_samples_per_second': 50.545, 'train_steps_per_second': 3.159, 'total_flos': 1.2027525717293568e+17, 'train_loss': 0.6061593800011186, 'epoch': 5.0})

* Training loss: Difference between the predictons made by the model on the training dataset vs on the actual data.
* Validation loss: how well the model functions on unseen data.
* Accuracy: How much the model gets correct. number of correct Prediction / total number of predictions.
* F1: consider both precision and recall. 
* Precision: Accuracy of positive predictions. Percison TP = TP + FP. How often the model is correct.
* Recall: True positive rate. how many items the model gets correct from the total amount.

### Training loss decreases, valdiation loss increases = Overfitting

In [28]:
# Evaluate valdiation set
eval_result = trainer.evaluate(eval_dataset=valid_dataset)

In [29]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.6131377213851441

eval_f1 = [0.66256143 0.6210771  0.53958989]

eval_loss = 1.3158326148986816

eval_precision = [0.67556146 0.60331201 0.5450737 ]

eval_recall = [0.65005228 0.63992016 0.53421533]

eval_runtime = 144.9896

eval_samples_per_second = 156.549

eval_steps_per_second = 4.897



In [30]:
# Evaluate test data set
test_results = trainer.evaluate(eval_dataset=test_dataset)

In [31]:
for key, value in sorted(test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.6056387665198237

eval_f1 = [0.65838143 0.61134924 0.52865296]

eval_loss = 1.3424071073532104

eval_precision = [0.66894075 0.59053269 0.54061401]

eval_recall = [0.6481503  0.633687   0.51720973]

eval_runtime = 145.3142

eval_samples_per_second = 156.213

eval_steps_per_second = 4.886



In [32]:
trainer.save_model(model_dir + "_local") 

In [33]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local")

In [34]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [35]:
classifier("this does not need to be done fast")

[{'label': 'LABEL_2', 'score': 0.9661545753479004}]

In [36]:
classifier("this is super important")

[{'label': 'LABEL_0', 'score': 0.9975225329399109}]

In [37]:
classifier("this bug has super high impact on the project")

[{'label': 'LABEL_0', 'score': 0.9940073490142822}]

## Important to delete large objects to free memory 
del train_dataset

In [38]:
del valid_dataset

In [39]:
del model

In [40]:
# Free cache
torch.cuda.empty_cache()

In [41]:
!nvidia-smi

Tue Nov 14 14:12:24 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti      On | 00000000:01:00.0 Off |                  N/A |
| 24%   44C    P2               63W / 250W|    949MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti      On | 00000000:23:00.0 Off |  