# Test with a 80-10-10 split


In [1]:
import pandas as pd

In [2]:
import torch

In [3]:
# !watch -n 0.5 nvidia-smi

In [4]:
print(torch.__version__)  # 1.9.1+cu111
print(torch.version.cuda)  # 11.1
print(torch.backends.cudnn.version())  # 8005
print(torch.cuda.current_device())  # 0
print(torch.cuda.is_available())  # TRUE

2.0.1+cu117
11.7
8500
0
True


In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "False"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
!nvidia-smi

Thu Sep 21 10:53:46 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB           On | 00000000:01:00.0 Off |                    0 |
| N/A   31C    P0               57W / 250W|  24660MiB / 40960MiB |     36%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCIE-40GB           On | 00000000:25:00.0 Off |  

In [7]:
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 28% | 60% |
|  1 | 32% | 64% |
|  2 |  0% |  9% |
|  3 | 34% | 49% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 28% | 60% |
|  1 | 32% | 64% |
|  2 |  4% | 10% |
|  3 | 30% | 49% |


In [8]:
data = pd.read_csv("../priority_dataset_clean.csv" , index_col = 0)

In [9]:
data

Unnamed: 0,text_clean,label
0,describe the bug current documentationhttpsdoc...,1
1,airport name dandong langtou country cn improv...,1
2,even though we might seed the values with defa...,1
3,describe the bug i dont know what change could...,1
4,describe the issue the buttons at the top of t...,1
...,...,...
196284,やること タイトル画面を作る 詳細 良い感じのタイトル画面を作る,0
196285,create a wiki page to incorporate the must imp...,0
196286,hi ive got exception when i try to import zone...,0
196287,github is reserved for bug reports and feature...,0


In [10]:
import datasets
import transformers

print(transformers.__version__)
print(datasets.__version__)

4.32.1
2.14.4


In [11]:
from datasets import load_dataset, Dataset, DatasetDict

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [13]:
base_model_id = "distilbert-base-uncased"

epochs = 5
num_labels = 2 
learning_rate = 5e-5
train_batch_size = 16
eval_batch_size = 32
save_strategy = "no"
save_steps = 500
logging_steps = 100
model_dir = "./model"

# Use early stopping to prevent overfitting
load_best_model_at_end=True
metric_for_best_model="eval_loss"
greater_is_better=False

In [14]:
import numpy as np

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

## Load Datasets

In [15]:
# Drops rows with missing values
data.dropna(inplace=True)

In [16]:
# Resets the index after dropping rows
data.reset_index(inplace=True)

In [17]:
#Drops the index col, better for managint the data.
data.drop(columns= ["index"], inplace = True)

In [18]:
data

Unnamed: 0,text_clean,label
0,describe the bug current documentationhttpsdoc...,1
1,airport name dandong langtou country cn improv...,1
2,even though we might seed the values with defa...,1
3,describe the bug i dont know what change could...,1
4,describe the issue the buttons at the top of t...,1
...,...,...
196259,やること タイトル画面を作る 詳細 良い感じのタイトル画面を作る,0
196260,create a wiki page to incorporate the must imp...,0
196261,hi ive got exception when i try to import zone...,0
196262,github is reserved for bug reports and feature...,0


In [19]:
# 80% trainig, 10% validate, 10% test. Seed=42.
seed = torch.cuda.manual_seed_all(42)
training_percentage = .8
validation_percentage = .1
train , validate , test = train_validate_test_split(data, training_percentage, validation_percentage, seed)

In [20]:
train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [21]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
1,see httpscodereviewqtprojectorgcqtqtwebengine3...
1,now that weve been stably on solr 8 for a whil...
1,description in a situation where an api is mis...
0,id be nice if mocha supported a dark color sch...
1,automatic dependency update failed for the cur...
...,...
1,describe the bug during game play not seen in ...
1,f´ version affected component feature descript...
0,is your feature request related to a problem p...
1,yarn create modularreactapp xyz should create ...


In [22]:
# Convert from Pandas DataFrame to Hugging Face datasets
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 19627
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 157011
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 19626
    })
})

In [23]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]
test_ds = ds["test"]

In [24]:
ds["train"][0]

{'text_clean': 'description title steps to reproduce ballerina type foo int foo2 type bar int foo affected versions slbeta2rc3',
 'label': 1}

In [25]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization

In [27]:
import torch.nn.functional as F

In [28]:
# Tokenization need to be done in the same exact way as the pretrained model. 
model_ckpt = "distilbert-base-uncased"
# Fetch data from Hugging Face for the given model
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    Tokenizing the whole dataset

In [29]:
#Tokenize the dataset to the correct input for the transformer model.
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))

Map:   0%|          | 0/157011 [00:00<?, ? examples/s]

Map:   0%|          | 0/19626 [00:00<?, ? examples/s]

## Training a classifier

In [30]:
#Set the trainings args for the model.
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [31]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
trainer.train() 

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4287,0.436296,0.777948,[0.63893952 0.83967331],[0.77820383 0.77786109],[0.54195362 0.9121573 ]


### Training loss decreases, valdiation loss increases = Overfitting

In [None]:
# Evaluate valdiation set
eval_result = trainer.evaluate(eval_dataset=valid_dataset)

In [None]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

In [None]:
# Evaluate test data set
test_results = trainer.evaluate(eval_dataset=test_ds)

In [None]:
for key, value in sorted(test_results.items()):
    print(f"{key} = {value}\n")

In [None]:
trainer.save_model(model_dir + "_local") 

In [None]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local")

In [None]:
classifier.model

In [None]:
classifier("Woo hoo almost done")

In [None]:
# Important to delete large objects to free memory 
del train_dataset

In [None]:
del valid_dataset

In [None]:
del model

In [None]:
# Free cache
import torch
torch.cuda.empty_cache()

In [None]:
!nvidia-smi