# Imports and preparations

In [1]:
import pandas as pd
import torch
import os
import numpy as np
import datasets
import transformers
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset, Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !watch -n 0.5 nvidia-smi

In [3]:
# Report the framework and toolkit versions in use; the actual values appear in the cell output.
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA version: {torch.version.cuda}')
print(f'cuDNN version: {torch.backends.cudnn.version()}')
# Query availability first: torch.cuda.current_device() raises when no usable GPU is present,
# which would abort the cell before the availability line is ever printed.
cuda_available = torch.cuda.is_available()
current_device = torch.cuda.current_device() if cuda_available else None
print(f'Current device: {current_device}')
print(f'Is cuda available: {cuda_available}')

PyTorch version: 2.1.0+cu121
CUDA version: 12.1
cuDNN version: 8902
Current device: 0
Is cuda available: True


In [4]:
# Report the installed transformers and datasets versions, one per line.
print(
    f'Transformers version: {transformers.__version__}',
    f'Datasets version: {datasets.__version__}',
    sep="\n",
)

Transformers version: 4.35.0
Datasets version: 2.14.6


In [5]:
# Both settings are applied in one update of the process environment:
#   TOKENIZERS_PARALLELISM - prevents a warning related to the tokenization
#                            process in the transformers library.
#   CUDA_LAUNCH_BLOCKING   - makes CUDA operations synchronous.
# NOTE(review): an earlier cell already queries torch.cuda; confirm that
# CUDA_LAUNCH_BLOCKING still takes effect when set at this point.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "False",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

In [6]:
# Find the GPU with the least memory usage.
# Shell escape: runs `nvidia-smi` and shows its report in the cell output.
!nvidia-smi

Tue Nov 28 10:23:28 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti      On | 00000000:01:00.0 Off |                  N/A |
| 33%   56C    P2              243W / 250W|   7973MiB / 11264MiB |     91%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti      On | 00000000:23:00.0 Off |  

In [7]:
def free_gpu_cache(device_id=0):
    """Print GPU utilisation, release cached GPU memory, and print utilisation again.

    Args:
        device_id: Index of the GPU whose numba CUDA context is closed and
            re-selected. Defaults to 0, the value that was previously
            hard-coded, so existing zero-argument calls behave as before.
    """
    print("Initial GPU Usage")
    gpu_usage()

    # Free unreferenced tensors from PyTorch's GPU memory cache.
    torch.cuda.empty_cache()

    # Close the numba CUDA context on the chosen device and select it again.
    # NOTE(review): closing the context may invalidate CUDA state held by other
    # libraries in this process; confirm this is only run before any model or
    # tensor has been moved to the GPU.
    cuda.select_device(device_id)
    cuda.close()
    cuda.select_device(device_id)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 91% | 71% |
|  1 |  0% |  0% |
|  2 | 91% | 72% |
|  3 |  0% | 88% |
|  4 |  0% | 10% |
|  5 |  0% |  9% |
|  6 |  0% | 64% |
|  7 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 90% | 71% |
|  1 |  0% |  1% |
|  2 | 90% | 72% |
|  3 |  0% | 88% |
|  4 |  0% | 10% |
|  5 |  0% |  9% |
|  6 |  0% | 64% |
|  7 |  0% |  0% |


In [8]:
# Read the CSV into a DataFrame, using its first column as the row index.
data = pd.read_csv(
    "csv/clean_priority_high_or_not_high.csv",
    index_col=0,
)
data

Unnamed: 0,text_clean,label
0,autolog out after some time frame is your feat...,0
1,image picker for sourceimplement an android im...,0
2,fix video page listitem hovering behaviour whe...,0
3,escape shuttle reaches ludicrous speed descrip...,0
4,binder doesnt load notebooks outside of the un...,0
...,...,...
398144,running monomerbased executables on windowshi ...,1
398145,shamir 1ofx groups scheme where someeach group...,1
398146,setting fishhistory to empty in configfish cra...,1
398147,augment diverse datasetcreate script to augmen...,1


In [9]:
# Read the separate test-set CSV, using its first column as the row index.
data_test_set = pd.read_csv(
    "csv/testset_priority_high_or_not_high_clean.csv",
    index_col=0,
)
data_test_set

Unnamed: 0,text_clean,label
0,cardano indonesia meeting 3 24 january 2022 un...,1
1,expose active services rest api card the micro...,1
2,querycachememoryleakteststressuserlistenerremo...,1
3,fix linter errors for applicationnow that lint...,0
4,order show updated dateadd date updated to sho...,1
...,...,...
44234,comprehensive themingin the comprehensive them...,1
44235,post title is not appearing when a post is sha...,0
44236,checkeredflag emoji is missing from reactions...,0
44237,mapbox crashes randomly normally in low gps ar...,0


In [10]:

# Split dataframe into three parts: training, validation and testing.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    """Shuffle ``df`` and split it into training, validation and test frames.

    Args:
        df: DataFrame to split. Any index is accepted; rows are selected by
            position, not by index label.
        train_percent: Fraction of rows placed in the training frame.
        validate_percent: Fraction of rows placed in the validation frame.
        seed: Value passed to ``np.random.seed`` before shuffling, so the same
            seed always yields the same split. This reseeds NumPy's global
            random state.

    Returns:
        A ``(train, validate, test)`` tuple of DataFrames. ``test`` receives
        every row left over after the training and validation rows are taken.
    """
    np.random.seed(seed)

    df_length = len(df.index)

    # Shuffle row *positions* rather than index labels. The result is fed to
    # ``.iloc``, so permuting labels is only correct when the index happens to
    # be exactly 0..n-1 (it breaks, for example, after ``dropna`` without a reset).
    perm = np.random.permutation(df_length)

    # Position where the training rows end.
    train_end = int(train_percent * df_length)
    # Position where the validation rows end.
    validate_end = int(validate_percent * df_length) + train_end

    # From the start to train_end.
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end.
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate_end to the last row of the dataframe.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [11]:
# Remove, in place, every row that contains at least one missing value.
data.dropna(axis=0, how="any", inplace=True)

In [12]:
# Rebuild the index in place after the row drops; the previous index is kept
# as a regular "index" column.
data.reset_index(drop=False, inplace=True)
data

Unnamed: 0,index,text_clean,label
0,0,autolog out after some time frame is your feat...,0
1,1,image picker for sourceimplement an android im...,0
2,2,fix video page listitem hovering behaviour whe...,0
3,3,escape shuttle reaches ludicrous speed descrip...,0
4,4,binder doesnt load notebooks outside of the un...,0
...,...,...,...
398144,398144,running monomerbased executables on windowshi ...,1
398145,398145,shamir 1ofx groups scheme where someeach group...,1
398146,398146,setting fishhistory to empty in configfish cra...,1
398147,398147,augment diverse datasetcreate script to augmen...,1


In [13]:
# Drop the leftover "index" column, which is better for managing the data.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
data.drop(columns=["index"], inplace=True, errors="ignore")

In [14]:
# Display the frame after the index clean-up.
data

Unnamed: 0,text_clean,label
0,autolog out after some time frame is your feat...,0
1,image picker for sourceimplement an android im...,0
2,fix video page listitem hovering behaviour whe...,0
3,escape shuttle reaches ludicrous speed descrip...,0
4,binder doesnt load notebooks outside of the un...,0
...,...,...
398144,running monomerbased executables on windowshi ...,1
398145,shamir 1ofx groups scheme where someeach group...,1
398146,setting fishhistory to empty in configfish cra...,1
398147,augment diverse datasetcreate script to augmen...,1


In [15]:
# 80% training, 10% validate, 10% test. Seed 42.
# Test 80-10-10 and 70-15-15
train, validate, test = train_validate_test_split(
    data,
    train_percent=0.8,
    validate_percent=0.1,
    seed=42,
)

In [16]:
# Make "label" the index of each split, in place.
for split_frame in (train, validate, test):
    split_frame.set_index("label", inplace=True)

In [17]:
# Preview the test split, which is now indexed by label.
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
0,drop appbotx in favor of ios 103 skstorereview...
0,selection support isgroupselected and passing ...
1,twitter report app shows begegnungen an 2 tage...
0,ui nodeview enhancementsthis is a meta thread ...
1,change login action to use public key descript...
...,...
1,65f15e2d8573446b8677summary test id tes1016050...
1,default buttonsbuttons should be pressed by de...
0,statistics add new filter on kw and timethis f...
0,compiler plugin crashes when using a newexpr w...


In [18]:
# Wrap each pandas split as a Hugging Face Dataset.
tds, vds, testds = (
    Dataset.from_pandas(frame) for frame in (train, validate, test)
)

# The separately loaded test CSV is converted too, but not added to `ds` below.
separate_test_set = Dataset.from_pandas(data_test_set)

# Collect the splits in one DatasetDict (key order: test, train, validate).
ds = DatasetDict({"test": testds, "train": tds, "validate": vds})
#ds["separate_test_set"] = separate_test_set

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 39816
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 318519
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 39814
    })
})

In [19]:
# Short names for the three splits held in `ds`.
train_dataset, valid_dataset, test_ds = ds["train"], ds["validate"], ds["test"]
#separate_test_set_dataset = ds["separate_test_set"]

In [20]:
# Inspect the first example of the training split.
ds["train"][0]

{'text_clean': 'prevent deletions and insertions from duplication when movingcopying strandscreate two strands bound on a helix and add a deletion imagehttpsuserimagesgithubusercontentcom1927436592946017f2e27880f40a11ea9f7c07a8eef2b581png create a new strand elsewhere imagehttpsuserimagesgithubusercontentcom192743659294608005f54880f40b11ea8f92d58997f07bbbpng move the bottom strand to be bound to the new strand note that one has a deletion and the other does not imagehttpsuserimagesgithubusercontentcom1927436592946138160d2800f40b11ea99066103ddd7de3epng add a deletion at the same offset by clicking on the strand lacking the deletion imagehttpsuserimagesgithubusercontentcom192743659294618626250780f40b11ea9075ddc6a17115f6png now the bottom strand has two deletions stored in the scadnano file json version 0120 grid square helices gridposition 0 0 maxoffset 64 gridposition 0 1 maxoffset 64 strands color cc0000 domains helix 0 forward true start 0 end 8 deletions 3 color 32b86c domains helix 

In [21]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
)

# Name of the pretrained checkpoint; the tokenizer is loaded from it.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Run the module-level ``tokenizer`` over ``example["text_clean"]`` with truncation enabled."""
    texts = example["text_clean"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Apply the tokenizer to every split, passing examples to it in batches.
tokenized_datasets = ds.map(function=tokenize_function, batched=True)

# Collator built around the same tokenizer; it is used later as the DataLoader collate_fn.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading tokenizer_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 113kB/s]
Downloading config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 3.45MB/s]
Downloading vocab.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 24.6MB/s]
Downloading tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 390MB/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39816/39816 [00:43<00:00, 914.10 examples/s]
Map: 100%|██████████

In [22]:
# Show the features and row counts of each split after tokenization.
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 39816
    })
    train: Dataset({
        features: ['text_clean', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 318519
    })
    validate: Dataset({
        features: ['text_clean', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 39814
    })
})

In [23]:
# Display the collator's configuration.
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [24]:
# Drop the raw text column the model does not expect, and rename "label" to
# "labels" because that is the name the model expects.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text_clean"])
    .rename_column("label", "labels")
)
# Return PyTorch tensors instead of lists.
tokenized_datasets.set_format("torch")
# List the remaining columns.
# Should be: ["attention_mask", "input_ids", "labels", "token_type_ids"]
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [25]:
from torch.utils.data import DataLoader

# Training batches are shuffled; validation batches keep dataset order.
# Both loaders use batches of 8 and the padding collator.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validate"],
    batch_size=8,
    collate_fn=data_collator,
)

In [26]:
# Pull a single batch from the training loader and show the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 512]),
 'token_type_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512])}

In [27]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Downloading model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 440M/440M [00:01<00:00, 389MB/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 5e-5
# (noted by the original author as the same optimizer "Trainer" uses).
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [29]:
from transformers import get_scheduler

# Total optimizer steps = batches per epoch x number of epochs.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# Linear learning-rate schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

119445


In [30]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
from tqdm.auto import tqdm

# How often (in batches within an epoch) the loss shown on the progress bar is refreshed.
LOSS_LOG_EVERY = 100

model.train()
# The context manager closes the progress bar even if training is interrupted.
with tqdm(range(num_training_steps)) as progress_bar:
    for epoch in range(num_epochs):
        for step, batch in enumerate(train_dataloader, start=1):
            # Move every tensor of the batch to the device the model is on.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            # Surface the training loss periodically so a diverging run can be
            # spotted early; reading it on every step would add overhead.
            if step % LOSS_LOG_EVERY == 0:
                progress_bar.set_postfix(epoch=epoch + 1, loss=f"{loss.item():.4f}")

 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 100951/119445 [7:43:43<1:33:35,  3.29it/s]