# **Install Related Libraries**

In [1]:
!pip install transformers==4.17

Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses (from transformers==4.17)
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed sacremoses-0.1.1 transformers-4.17.0


In [2]:
from transformers import Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import pandas as pd
import nltk
import re
import ast
import numpy as np
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support,accuracy_score
from sklearn.model_selection import train_test_split

# **Import Datasets**

In [3]:
trainingData=pd.read_csv("issues_train.csv")
testingData=pd.read_csv("issues_test.csv")

# **Text Preprocessing**

In [4]:
#Concatenation of title and body into a summary column
trainingData["summary"]=trainingData["title"]+" "+trainingData['body']
testingData["summary"]=testingData["title"]+" "+trainingData['body']

#Keeping only the first 250 characters
trainingData["summary"]=trainingData["summary"].apply(lambda x: x[:256] if len(str(x))>256 else x)
testingData["summary"]=testingData["summary"].apply(lambda x: x[:256] if len(str(x))>256 else x)

In [5]:
def checkLink(text):
    link_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(link_pattern, "LINK" ,str(text))

#Removing Links
trainingData['summary'] = trainingData['summary'].apply(checkLink)
testingData['summary'] = testingData['summary'].apply(checkLink)

In [6]:
#Converting data to lowercase
trainingData = trainingData.applymap(lambda x: x.lower() if isinstance(x, str) else str(x))
testingData = testingData.applymap(lambda x: x.lower() if isinstance(x, str) else str(x))

In [7]:
def removeSpecialCharacters(text):
    pattern = r'[^a-zA-Z]'
    cleanedText = re.sub(pattern, ' ', text)
    cleanedText = ' '.join(cleanedText.split())
    return cleanedText

#Removing special Characters and spaces
trainingData['summary'] = trainingData['summary'].apply(lambda x :removeSpecialCharacters(str(x)))
testingData['summary'] = testingData['summary'].apply(lambda x :removeSpecialCharacters(str(x)))

In [8]:
def dropNan(x):
  if len(str(x))<5:
    return False
  else: return True

#Droping duplicates and Nan values
mask=trainingData["summary"].apply(dropNan)
trainingData=trainingData[mask]
mask=testingData["summary"].apply(dropNan)
testingData=testingData[mask]
trainingData = trainingData.drop_duplicates(subset='summary')
testingData = testingData.drop_duplicates(subset='summary')

trainingData.head()

Unnamed: 0,repo,created_at,label,title,body,summary
0,facebook/react,2023-08-26 06:33:37+00:00,bug,"[devtools bug] cannot add node ""1"" because a n...",### website or app\n\nprivate repo cannot give...,devtools bug cannot add node because a node wi...
1,facebook/react,2023-07-28 05:16:12+00:00,bug,[devtools bug]: devtools extension build faili...,### website or app\n\nn/a\n\n### repro steps\n...,devtools bug devtools extension build failing ...
2,facebook/react,2023-07-13 21:58:31+00:00,bug,[devtools bug]: deprecated __react_devtools_gl...,### website or app\n\nhttps://github.com/open-...,devtools bug deprecated react devtools global ...
3,facebook/react,2023-06-14 02:31:20+00:00,bug,"[devtools bug] cannot remove node ""0"" because ...",### website or app\n\nlocal\n\n### repro steps...,devtools bug cannot remove node because no mat...
4,facebook/react,2023-06-03 11:29:44+00:00,bug,"[devtools bug] cannot remove node ""103"" becaus...",### website or app\n\nlocalhost\n\n### repro s...,devtools bug cannot remove node because no mat...


In [9]:
#Saving preprocessed data to CSV
trainingData=trainingData[['repo','summary','label']]
testingData=testingData[['repo','summary','label']]
trainingData.to_csv("training_Dataset.csv", index=False)
testingData.to_csv("testing_Dataset.csv", index=False)

# **Dataset Preparation**

In [10]:
#load train data
data = pd.read_csv("training_Dataset.csv")

label=[]
for index, row in data.iterrows():
  if row['label']=="bug":
     label.append(0)
  elif row['label']=="feature":
      label.append(1)
  else: label.append(2)
data['label']=label

X = list(data["summary"])
y = list(data["label"])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

In [11]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaForSequenceClassification.from_pretrained('roberta-large',num_labels=3)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifie

In [12]:
df_valid = pd.DataFrame({"summary":X_val,"label":y_val})
train_text = df_valid.summary.values
train_label = df_valid.label.values
val_text = df_valid.summary.values
val_label = df_valid.label.values

In [13]:
# define custom dataset
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        encoded_text = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=256,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoded_text['input_ids'].squeeze()
        attention_mask = encoded_text['attention_mask'].squeeze()
        label = torch.tensor(label)

        return {
            'input_ids': input_ids.cpu(),
            'attention_mask': attention_mask.cpu(),
            'labels': label.cpu()
        }

# create datasets
train_dataset = TextClassificationDataset(train_text, train_label, tokenizer)
eval_dataset = TextClassificationDataset(val_text, val_label, tokenizer)

In [14]:
# Define custom metrics for validation to avoid error
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# **Setting Hyperparameters**

In [16]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=32,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=5,
    weight_decay=32,
    fp16=True,
    evaluation_strategy='epoch',
    learning_rate=7e-6,
    greater_is_better=True,
    gradient_accumulation_steps=2,
    eval_steps=10,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# **Model Training**

In [None]:
trainer.train()

***** Running training *****
  Num examples = 1046
  Num Epochs = 32
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 4192


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.8217,0.632517,0.665212,0.632517,0.630802
2,No log,0.686496,0.728285,0.728155,0.728285,0.72324
3,No log,0.776306,0.737194,0.756264,0.737194,0.738284
4,0.705800,0.850085,0.721604,0.72793,0.721604,0.716674
5,0.705800,1.172151,0.723831,0.736772,0.723831,0.726261
6,0.705800,1.270803,0.721604,0.7476,0.721604,0.725102
7,0.705800,1.505421,0.697105,0.706989,0.697105,0.68986
8,0.255400,1.732754,0.685969,0.715911,0.685969,0.682241
9,0.255400,1.559906,0.739421,0.751553,0.739421,0.741999
10,0.255400,1.741879,0.723831,0.739305,0.723831,0.712826


***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8
***** Running Evalua

TrainOutput(global_step=4192, training_loss=0.13724931437195143, metrics={'train_runtime': 2862.679, 'train_samples_per_second': 11.693, 'train_steps_per_second': 1.464, 'total_flos': 1.5596855994974208e+16, 'train_loss': 0.13724931437195143, 'epoch': 32.0})

# **Model Evaluation**

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 449
  Batch size = 8


{'eval_loss': 1.998992681503296,
 'eval_accuracy': 0.7282850779510023,
 'eval_precision': 0.7302405960209626,
 'eval_recall': 0.7282850779510023,
 'eval_f1': 0.7289225172170626,
 'eval_runtime': 6.9294,
 'eval_samples_per_second': 64.797,
 'eval_steps_per_second': 8.226,
 'epoch': 32.0}

# **Model Testing**

In [None]:
test_data = pd.read_csv("testing_Dataset.csv")
test_data.dropna(inplace=True)
X_test = list(test_data["summary"])
label=[]
for index, row in test_data.iterrows():
  if row['label']=="bug":
     label.append(0)
  elif row['label']=="feature":
      label.append(1)
  else: label.append(2)
test_data['label']=label


In [None]:
test_dataset = TextClassificationDataset(test_data['summary'],test_data['label'],tokenizer)
predictions=trainer.predict(test_dataset=test_dataset).predictions

***** Running Prediction *****
  Num examples = 1498
  Batch size = 8


In [None]:
labels=["bug","feature","question"]
predicted_labels=[]
trueFalse=[]
i=0
for prediction in predictions:
  index_of_max = np.argmax(prediction)
  predicted_labels.append(index_of_max)

test_data["predicted_label"]=predicted_labels

for index, row in test_data.iterrows():
  if row['label']==row['predicted_label']:
     trueFalse.append("True")
  else: trueFalse.append("False")

test_data["True/False"]=trueFalse

In [None]:
actual = test_data['label']
predicted = test_data['predicted_label']
accuracy = accuracy_score(actual, predicted)
precision = precision_score(actual, predicted, average='weighted')
recall = recall_score(actual, predicted ,average='weighted')
f1 = f1_score(actual, predicted, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.8324432576769025
Precision: 0.8327808705737002
Recall: 0.8324432576769025
F1-score: 0.8322107403502624


# **Results**

In [None]:
repos = list(set(test_data["repo"].unique()))
grouped = test_data.groupby("repo")
smaller_dataframes = {}
for group_name, group_data in grouped:
    smaller_dataframes[group_name] = group_data.copy()

for repo in repos:
  precision, recall, f1, support = precision_recall_fscore_support(smaller_dataframes[repo]['label'],smaller_dataframes[repo]["predicted_label"])
  print("---------",repo,"------------")
  for label in range(len(precision)):
    print(f"Label {labels[label]}:")
    print(f"Precision: {precision[label]}")
    print(f"Recall: {recall[label]}")
    print(f"F1 Score: {f1[label]}")
    print(f"support: {support[label]}")
  print(f"Average Precision: {np.average(precision)}")
  print(f"Average Recall: {np.average(recall)}")
  print(f"Average F1 Score: {np.average(f1)}")
  print(f"Average Support: {np.average(support)}")
  print("---------------------")
  print("_______________________________________")
  i=i+1



--------- microsoft/vscode ------------
Label bug:
Precision: 0.780952380952381
Recall: 0.82
F1 Score: 0.8
support: 100
Label feature:
Precision: 0.8383838383838383
Recall: 0.83
F1 Score: 0.8341708542713568
support: 100
Label question:
Precision: 0.84375
Recall: 0.81
F1 Score: 0.826530612244898
support: 100
Average Precision: 0.8210287397787397
Average Recall: 0.82
Average F1 Score: 0.820233822172085
Average Support: 100.0
---------------------
_______________________________________
--------- opencv/opencv ------------
Label bug:
Precision: 0.7454545454545455
Recall: 0.82
F1 Score: 0.780952380952381
support: 100
Label feature:
Precision: 0.84375
Recall: 0.81
F1 Score: 0.826530612244898
support: 100
Label question:
Precision: 0.8191489361702128
Recall: 0.77
F1 Score: 0.7938144329896908
support: 100
Average Precision: 0.8027844938749195
Average Recall: 0.7999999999999999
Average F1 Score: 0.8004324753956565
Average Support: 100.0
---------------------
___________________________________

In [None]:
from sklearn.metrics import classification_report
report = classification_report(test_data['label'], test_data['predicted_label'], output_dict=True)
for class_label, metrics in report.items():
    if class_label.isdigit():
        print(f"Class {class_label}:")
        print(f"F1 Score: {metrics['f1-score']}")
        print(f"precision: {metrics['precision']}")
        print(f"Recall: {metrics['recall']}")
        print("============================")

Class 0:
F1 Score: 0.8365758754863815
precision: 0.8143939393939394
Recall: 0.86
Class 1:
F1 Score: 0.8514056224899598
precision: 0.8531187122736419
Recall: 0.8496993987975952
Class 2:
F1 Score: 0.808641975308642
precision: 0.8308668076109936
Recall: 0.7875751503006012
