# Initialization

In [None]:
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.7/dist-packages (1.1.0)


In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import json
import numpy as np
import pandas as pd

In [None]:
# CSV handed to get_issues() in the Main section.
ISSUES_FILE = 'drive/MyDrive/bugs_data/ooall.csv'
# Saved-model directory.
# NOTE(review): no visible cell reads this constant — the Main section loads a different path
# literal ('.../bert-base-custom-ooall-10000'). Confirm which model is intended.
CUSTOM_MODEL_PATH = 'drive/MyDrive/bugs_data/models/stsb-distilbert-base-ooall-10000'

# Issues Helper Methods

In [None]:
def get_issues(issues_file):
  """Load the issues CSV and add a combined `full_description` text column.

  Args:
    issues_file: Path (or anything `pd.read_csv` accepts) of a CSV that has
      `short_desc` and `description` columns.

  Returns:
    The loaded DataFrame with an extra `full_description` column made of
    `short_desc`, a newline, then `description`. A missing value in either
    column contributes an empty string.
  """
  issues = pd.read_csv(issues_file)

  def as_text(column):
    # astype(str) alone renders missing cells as the literal text 'nan', which would end up
    # inside the text that is later embedded; blank those cells out instead.
    return column.astype(str).where(column.notna(), '')

  issues['full_description'] = as_text(issues['short_desc']) + '\n' + as_text(issues['description'])
  return issues

# Model Helper Methods

In [None]:
def get_base_model():
  """Build a SentenceTransformer from the name 'paraphrase-distilroberta-base-v1'."""
  base = SentenceTransformer('paraphrase-distilroberta-base-v1')
  return base

In [None]:
def get_custom_model(name):
  """Build a SentenceTransformer from `name`.

  Args:
    name: Passed unchanged to the SentenceTransformer constructor; the Main
      section passes a saved-model directory path.

  Returns:
    The constructed SentenceTransformer.
  """
  custom = SentenceTransformer(name)
  return custom

# Main

In [None]:
# Read the issues CSV (adds the combined full_description column).
issues = get_issues(issues_file=ISSUES_FILE)

In [None]:
# Keep only the last 10000 rows and renumber them from 0.
# `issues` is re-bound here, so from now on it refers to the trimmed frame.
issues = issues.tail(10000).reset_index(drop=True)

In [None]:
# Row count after trimming.
issues.shape[0]

10000

In [None]:
# Load the model that is evaluated below.
# NOTE(review): the directory is spelled out here instead of using CUSTOM_MODEL_PATH, which
# names a different directory — confirm which one is intended.
model = get_custom_model(name='drive/MyDrive/bugs_data/models/bert-base-custom-ooall-10000')

Gather the duplicate reports among the 1000 newest issues (rows that have a `dup_id`); these are the queries used to evaluate retrieval.

In [None]:
# From the last 1000 rows, keep those that have a dup_id (i.e. are marked as duplicates).
# dropna(subset=...) states the intent directly; the previous `dup_id == dup_id` self-comparison
# only worked because NaN != NaN, and its mask was built from the full frame rather than the slice.
issues_new_duplicates = issues.iloc[-1000:].dropna(subset=['dup_id']).reset_index(drop=True)

Ensure that the newest duplicates gathered are not in the set of issues from which we will try to retrieve top-k similar issues

In [None]:
# Retrieval pool: every issue whose bug_id is not one of the query duplicates.
is_query_issue = issues['bug_id'].isin(issues_new_duplicates['bug_id'])
issues_pool = issues.loc[~is_query_issue].reset_index(drop=True)

Calculate embeddings for the issues pool

In [None]:
# Encode every pool description once. Row i of the result corresponds to issues_pool.iloc[i],
# because issues_pool was re-indexed from 0 and is encoded in order.
pool_texts = issues_pool['full_description'].to_numpy()
embeddings = model.encode(pool_texts, convert_to_tensor=True)

# Top-K Retrieval Methods

In [None]:
def get_top_k_similar_issues(query_embedding, embeddings, top_k):
  """Return the `top_k` search hits for a single query embedding.

  Args:
    query_embedding: Embedding of the query text.
    embeddings: Embeddings of the corpus to search.
    top_k: Number of hits requested from `util.semantic_search`.

  Returns:
    The first element of the `util.semantic_search` result, i.e. the hit list
    for the one query. Callers in this notebook read a 'corpus_id' key from
    each hit.
  """
  hits_per_query = util.semantic_search(query_embedding, embeddings, top_k=top_k)
  return hits_per_query[0]

In [None]:
def evaluate_recall_at_top_k(model, query_issues, pool_issues, embeddings, top_k, verbose=True):
  """Measure how often a query's master issue shows up among its top-k retrieved issues.

  For every row of `query_issues` the `full_description` is encoded with
  `model`, the `top_k` nearest entries of `embeddings` are retrieved, and the
  query counts as correct when at least one retrieved pool row has the same
  `master_id` as the query.

  Args:
    model: Object whose `encode(text, convert_to_tensor=True)` yields the query embedding.
    query_issues: DataFrame with `full_description` and `master_id` columns.
    pool_issues: DataFrame with a `master_id` column. Hits are looked up with
      `pool_issues.iloc[corpus_id]`, so its row order must match `embeddings`.
    embeddings: Embeddings of the pool, searched for each query.
    top_k: Number of hits retrieved per query.
    verbose: When True (the default) the running recall is printed after each query.

  Returns:
    The fraction of queries with at least one matching hit, or 0.0 when
    `query_issues` has no rows.
  """
  count = 0
  correct = 0
  for _, row in query_issues.iterrows():
    count += 1
    query_embedding = model.encode(row['full_description'], convert_to_tensor=True)
    results = get_top_k_similar_issues(query_embedding, embeddings, top_k)
    # any() stops at the first matching hit instead of scanning the remaining ones.
    if any(pool_issues.iloc[result['corpus_id']]['master_id'] == row['master_id'] for result in results):
      correct += 1
    if verbose:
      print(correct / count)
  # An empty query set used to fail with ZeroDivisionError.
  return correct / count if count else 0.0
    


In [None]:
# Recall@5 of the loaded model on the held-out duplicates.
evaluate_recall_at_top_k(
  model=model,
  query_issues=issues_new_duplicates,
  pool_issues=issues_pool,
  embeddings=embeddings,
  top_k=5,
)

1.0
0.5
0.3333333333333333
0.25
0.4
0.3333333333333333
0.42857142857142855
0.375
0.4444444444444444
0.4
0.36363636363636365
0.4166666666666667
0.38461538461538464
0.42857142857142855
0.4
0.375
0.4117647058823529
0.4444444444444444
0.42105263157894735
0.45
0.42857142857142855
0.4090909090909091
0.43478260869565216
0.4583333333333333
0.48
0.5
0.5185185185185185
0.5357142857142857
0.5172413793103449
0.5
0.5161290322580645
0.53125
0.5151515151515151
0.5294117647058824
0.5142857142857142
0.5
0.4864864864864865
0.47368421052631576
0.46153846153846156
0.45
0.43902439024390244
0.42857142857142855
0.4186046511627907
0.4090909090909091
0.4
0.41304347826086957
0.40425531914893614
0.3958333333333333
0.3877551020408163
0.38
0.37254901960784315
0.36538461538461536
0.3584905660377358
0.35185185185185186
0.34545454545454546
0.3392857142857143
0.3508771929824561
0.3448275862068966
0.3389830508474576
0.35
0.3442622950819672
0.3387096774193548
0.3333333333333333
0.34375
0.3384615384615385
0.3484848484848

0.33980582524271846

# Base Model Fine-Tuning

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [None]:
# CSV of description pairs; read with pd.read_csv in the next cell.
PAIRS_FILE = 'drive/MyDrive/bugs_data/ooall_pairs_10000.csv'
# Output directory used by base_model.fit(output_path=...) and base_model.save(...) further down.
MODEL_OUTPUT_PATH = 'drive/MyDrive/bugs_data/models/paraphrase-distilroberta-base-v1-ooall-10000'

In [None]:
pairs = pd.read_csv(PAIRS_FILE)
# Fixed seed: without it the 90/10 split, and with it the evaluator's held-out pairs, changes on
# every run, so scores from different runs cannot be compared.
pairs_train, pairs_test = train_test_split(pairs, test_size=0.1, random_state=42)

In [None]:
# One InputExample per training pair: both descriptions as texts, the pair's label as a float.
train_data = [
  InputExample(texts=[pair['description_1'], pair['description_2']], label=float(pair['label']))
  for _, pair in pairs_train.iterrows()
]

In [None]:
# Held-out pairs as three parallel lists for the evaluator.
descriptions_1, descriptions_2, scores = (
  pairs_test[column].to_list() for column in ('description_1', 'description_2', 'label')
)

In [None]:
# Pretrained model that gets fine-tuned in this section.
base_model_name = 'paraphrase-distilroberta-base-v1'
base_model = SentenceTransformer(base_model_name)

In [None]:
# Settings handed to losses.OnlineContrastiveLoss in a later cell.
# Distance function: the library's COSINE_DISTANCE option.
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
# NOTE(review): presumably the distance that non-matching pairs are pushed beyond — confirm
# against the loss documentation.
margin = 0.5

In [None]:
# Evaluator over the held-out pairs; fit() calls it during training.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
  sentences1=descriptions_1,
  sentences2=descriptions_2,
  scores=scores,
  write_csv=True,
)

In [None]:
# Shuffled batches of 32 training pairs, plus the contrastive loss bound to base_model.
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
train_loss = losses.OnlineContrastiveLoss(
  model=base_model,
  margin=margin,
  distance_metric=distance_metric,
)

In [None]:
# Fine-tune for 5 epochs, evaluating every 50 steps and writing to MODEL_OUTPUT_PATH.
# NOTE(review): the captured output of this cell is "RuntimeError: ignored"; the cause is not
# visible in the notebook.
base_model.fit(
  train_objectives=[(train_dataloader, train_loss)],
  evaluator=evaluator,
  epochs=5,
  evaluation_steps=50,
  warmup_steps=100,
  output_path=MODEL_OUTPUT_PATH,
  save_best_model=True,
)

RuntimeError: ignored

In [None]:
# fit() above was given output_path=MODEL_OUTPUT_PATH with save_best_model=True, so that
# directory is where the best-scoring checkpoint is written. Saving the end-of-training weights
# to the same directory would overwrite it, so they go to a sibling '-final' directory instead.
base_model.save(MODEL_OUTPUT_PATH + '-final')

# Custom Model Training

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, models
from torch.utils.data import DataLoader
import torch
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [None]:
# Same pairs CSV as in the Base Model Fine-Tuning section.
PAIRS_FILE = 'drive/MyDrive/bugs_data/ooall_pairs_10000.csv'
# Re-binds MODEL_OUTPUT_PATH (the fine-tuning section set it to a different directory).
# This is the directory the Main section loads its model from.
MODEL_OUTPUT_PATH = 'drive/MyDrive/bugs_data/models/bert-base-custom-ooall-10000'

In [None]:
pairs = pd.read_csv(PAIRS_FILE)
# Fixed seed so the 90/10 split, and therefore the evaluator's held-out pairs, is identical on
# every run instead of being re-drawn at random.
pairs_train, pairs_test = train_test_split(pairs, test_size=0.1, random_state=42)

In [None]:
# Turn each training pair into an InputExample (two texts and a float label).
train_data = [
  InputExample(
    texts=[pair['description_1'], pair['description_2']],
    label=float(pair['label']),
  )
  for _, pair in pairs_train.iterrows()
]

In [None]:
# Parallel lists built from the held-out pairs; consumed by the evaluator below.
descriptions_1, descriptions_2, scores = (
  pairs_test[column].to_list() for column in ('description_1', 'description_2', 'label')
)

In [None]:
# Transformer module from 'bert-base-uncased' with max_seq_length=96.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=96)
# Pooling module sized to the transformer's word-embedding dimension.
embedding_dimension = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dimension)

In [None]:
# Assemble the custom model: transformer first, pooling second.
# This re-binds `model`, which the Main section used for the loaded model.
custom_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=custom_modules)

In [None]:
# Loss settings for the custom model; same values as in the Base Model Fine-Tuning section.
# Both are passed to losses.OnlineContrastiveLoss in a later cell.
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
margin = 0.5

In [None]:
# Evaluator over the held-out pairs; passed to model.fit() below.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
  sentences1=descriptions_1,
  sentences2=descriptions_2,
  scores=scores,
  write_csv=True,
)

In [None]:
# Shuffled batches of 16 training pairs, plus the contrastive loss bound to the custom model.
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
train_loss = losses.OnlineContrastiveLoss(
  model=model,
  margin=margin,
  distance_metric=distance_metric,
)

In [None]:
# Train for 5 epochs, evaluating every 50 steps and writing to MODEL_OUTPUT_PATH.
model.fit(
  train_objectives=[(train_dataloader, train_loss)],
  evaluator=evaluator,
  epochs=5,
  evaluation_steps=50,
  warmup_steps=100,
  output_path=MODEL_OUTPUT_PATH,
  save_best_model=True,
)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…





In [None]:
# fit() above already writes the best-scoring checkpoint to MODEL_OUTPUT_PATH
# (output_path=MODEL_OUTPUT_PATH, save_best_model=True). Saving the end-of-training weights to
# the same directory would replace it, so they are stored in a sibling '-final' directory.
model.save(MODEL_OUTPUT_PATH + '-final')

In [None]:
# Ask PyTorch to release its cached CUDA memory.
# NOTE(review): memory held by objects that are still referenced (e.g. `model`) is presumably not
# released — the nvidia-smi output captured below still reports 7563MiB of 7611MiB in use.
torch.cuda.empty_cache()

In [None]:
# Print the GPU status table (memory use, utilisation) after the cache release above.
!nvidia-smi

Sun Apr 25 13:32:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    24W /  75W |   7563MiB /  7611MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces