In [1]:
import pandas as pd

In [2]:
import torch

In [3]:
# !watch -n 0.5 nvidia-smi

In [4]:
print(torch.__version__)  # 1.9.1+cu111
print(torch.version.cuda)  # 11.1
print(torch.backends.cudnn.version())  # 8005
print(torch.cuda.current_device())  # 0
print(torch.cuda.is_available())  # TRUE

2.0.1+cu117
11.7
8500
0
True


In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "False"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
!nvidia-smi

Thu Sep 21 08:19:47 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB           On | 00000000:01:00.0 Off |                    0 |
| N/A   27C    P0               35W / 250W|  15378MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCIE-40GB           On | 00000000:25:00.0 Off |  

In [7]:
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% | 38% |
|  1 | 34% | 64% |
|  2 |  0% |  8% |
|  3 | 31% | 49% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  0% | 38% |
|  1 | 44% | 64% |
|  2 |  9% |  9% |
|  3 | 29% | 49% |


In [8]:
data = pd.read_csv("priority_dataset_clean.csv" , index_col = 0)

In [9]:
data

Unnamed: 0,text_clean,label
0,describe the bug current documentationhttpsdoc...,1
1,airport name dandong langtou country cn improv...,1
2,even though we might seed the values with defa...,1
3,describe the bug i dont know what change could...,1
4,describe the issue the buttons at the top of t...,1
...,...,...
196284,やること タイトル画面を作る 詳細 良い感じのタイトル画面を作る,0
196285,create a wiki page to incorporate the must imp...,0
196286,hi ive got exception when i try to import zone...,0
196287,github is reserved for bug reports and feature...,0


In [10]:
import datasets
import transformers

print(transformers.__version__)
print(datasets.__version__)

4.32.1
2.14.4


In [11]:
from datasets import load_dataset, Dataset, DatasetDict

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [13]:
base_model_id = "distilbert-base-uncased"

epochs = 5
num_labels = 2 
learning_rate = 5e-5
train_batch_size = 16
eval_batch_size = 32
save_strategy = "no"
save_steps = 500
logging_steps = 100
model_dir = "./model"

In [14]:
import numpy as np

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

## Load Datasets

In [15]:
data.dropna(inplace=True)

In [16]:
data.reset_index(inplace=True)

In [17]:
data.drop(columns= ["index"], inplace = True)

In [18]:
data

Unnamed: 0,text_clean,label
0,describe the bug current documentationhttpsdoc...,1
1,airport name dandong langtou country cn improv...,1
2,even though we might seed the values with defa...,1
3,describe the bug i dont know what change could...,1
4,describe the issue the buttons at the top of t...,1
...,...,...
196259,やること タイトル画面を作る 詳細 良い感じのタイトル画面を作る,0
196260,create a wiki page to incorporate the must imp...,0
196261,hi ive got exception when i try to import zone...,0
196262,github is reserved for bug reports and feature...,0


In [19]:
train , validate , test = train_validate_test_split(data)

In [20]:

train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [21]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
1,design main backpack fragments please define r...
0,hello i have 24 errors like this when i want t...
0,・防衛大 一次試験 ・大学の話のとき 漢数字と数字を統一
1,see httpsappzenhubcomworkspacesdevxbacklogboar...
0,document on wiki page every sub component of game
...,...
1,airport name lawson aaf country usa improvemen...
1,issue while downloading normal quality doesnt ...
0,httpsgithubcomhoangnam1201webmeetingonlineblob...
1,this is mostly copy paste from the discord ser...


In [22]:
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 39254
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 117758
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 39252
    })
})

In [23]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]

In [24]:
ds["train"][0]


{'text_clean': 'so that my saved schedules can be loaded whenever i load the application',
 'label': 1}

In [25]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# optim = torch.optim.Adam(model.parameters(), lr=5e-5)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization

In [28]:
import torch.nn.functional as F

In [29]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    Tokenizing the whole dataset

In [30]:
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))

Map:   0%|          | 0/117758 [00:00<?, ? examples/s]

Map:   0%|          | 0/39252 [00:00<?, ? examples/s]

## Training a classifier

In [42]:
base_model_id = "distilbert-base-uncased"

epochs = 5
num_labels = 2 
learning_rate = 5e-5
train_batch_size = 16
eval_batch_size = 32
save_strategy = "no"
save_steps = 500
logging_steps = 100
model_dir = "./model"

In [43]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [44]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [45]:
trainer.train() 

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4666,0.463769,0.765693,[0.62609261 0.82939136],[0.74583495 0.7727807 ],[0.53948014 0.89495176]
2,0.3717,0.475399,0.775553,[0.64064285 0.83681559],[0.76666992 0.77868937],[0.55019968 0.90431963]
3,0.3376,0.48315,0.775502,[0.69417644 0.8226605 ],[0.68777938 0.82712152],[0.70069362 0.81824733]
4,0.2424,0.648061,0.775298,[0.68939287 0.8239802 ],[0.69305388 0.82152095],[0.68577034 0.82645422]
5,0.1742,0.905256,0.774661,[0.68582389 0.82433318],[0.69553314 0.81794892],[0.67638198 0.83081789]


TrainOutput(global_step=36800, training_loss=0.3276964659276216, metrics={'train_runtime': 8822.4811, 'train_samples_per_second': 66.737, 'train_steps_per_second': 4.171, 'total_flos': 7.799547965466624e+16, 'train_loss': 0.3276964659276216, 'epoch': 5.0})

In [46]:
eval_result = trainer.evaluate(eval_dataset=valid_dataset)

In [47]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.774661163762356

eval_f1 = [0.68582389 0.82433318]

eval_loss = 0.905255913734436

eval_precision = [0.69553314 0.81794892]

eval_recall = [0.67638198 0.83081789]

eval_runtime = 172.9977

eval_samples_per_second = 226.893

eval_steps_per_second = 7.093



In [48]:
trainer.save_model(model_dir + "_local") 

In [49]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local")

In [50]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [51]:
classifier("Woo hoo almost done")

[{'label': 'LABEL_1', 'score': 0.8336262106895447}]

In [52]:
del train_dataset

In [53]:
del valid_dataset

In [54]:
del model

In [55]:
import torch
torch.cuda.empty_cache()

In [56]:
!nvidia-smi

Tue Sep 19 10:46:56 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB           On | 00000000:01:00.0 Off |                    0 |
| N/A   27C    P0               35W / 250W|   4231MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCIE-40GB           On | 00000000:25:00.0 Off |  