# **1. Prepare the environment**

## **1.1 Fetch the "Label-Bot" repo and modify it so that it can be used in Colab**

In [None]:
# Clone the Label-Bot repository into the current working directory.
!git clone https://github.com/GiorgosKarantonis/Label-Bot

# Rename the checkout: a hyphen is not valid in a Python package name,
# and the notebook later does `import Label_Bot.preprocessing`.
!mv Label-Bot Label_Bot
# Add an (empty) __init__.py so the directory can be imported as a package.
!touch Label_Bot/__init__.py

## **1.2 Mount Google Drive to access the preprocessed dataset**

In [None]:
# Mount the user's Google Drive at /content/drive (Colab-only API).
# The dataset is later read from './drive/My Drive/Label Bot/data/github.pkl',
# i.e. relative to the working directory — assumes that directory is /content.
from google.colab import drive
drive.mount('/content/drive')

# **2. Prepare the Label Bot**

## **2.1 Import all the required libraries**

In [None]:
import time
import itertools

import numpy as np
import pandas as pd
import tensorflow as tf

import Label_Bot.preprocessing as pp

try:
    from transformers import BertTokenizer, DistilBertTokenizer, DistilBertTokenizerFast
    from transformers import TFBertModel, TFDistilBertModel
    from transformers import TFBertForSequenceClassification, TFDistilBertForSequenceClassification
except:
    !pip install transformers==3.0.0

    from transformers import BertTokenizer, DistilBertTokenizer, DistilBertTokenizerFast
    from transformers import TFBertModel, TFDistilBertModel
    from transformers import TFBertForSequenceClassification, TFDistilBertForSequenceClassification

## **2.2 Define the hyperparameters and the global variables**

In [None]:
# Forwarded to pp.load_data as `memory_limit` when the dataset is loaded.
MEMORY_LIMIT = 100  # set to None if you don't want to apply any limit

# a dummy set of labels used for the proof of concept
LABELS = ['bug', 'enhancement', 'question', 'undefined']

# an actual set of labels that the bot should be able to predict:
# the general-purpose labels first, then the 'status: *' and 'priority: *' families
EXAMPLE_LABELS = [
    'bug',
    'enhancement',
    'documentation',
    'duplicate',
    'maintenance',
    'good first issue',
    'help wanted',
    'invalid',
    'question',
    "won't fix",
    *(f'status: {state}' for state in (
        'proposal', 'available', 'in progress', 'on hold',
        'blocked', 'abandoned', 'review needed',
    )),
    *(f'priority: {level}' for level in ('low', 'medium', 'high', 'critical')),
]

## **2.3 Load the preprocessed dataset**

In [None]:
# Read the preprocessed issues from Drive, honouring the configured memory limit.
df = pp.load_data(file='./drive/My Drive/Label Bot/data/github.pkl', memory_limit=MEMORY_LIMIT)

# Discard every column whose name starts with 'label_'; the label columns are
# rebuilt later in the notebook from the 'labels' column.
stale_label_columns = [name for name in df.columns if name.startswith('label_')]
df = df.drop(stale_label_columns, axis=1)

df

Unnamed: 0,url,repo,title,body,labels,user,repo_name,issue_number
0,https://github.com/F5Networks/f5-openstack-lba...,F5Networks/f5-openstack-lbaasv2-driver,test_l7policies_and_rules.py:testl7basicupdate...,title: test_l7policies_and_rules.py:testl7basi...,"[s3, p3, test-bug]",F5Networks,f5-openstack-lbaasv2-driver,835
1,https://github.com/aspnet/Mvc/issues/6339,aspnet/Mvc,testing all controllers dependency injection,i'm writing integration tests for my applicati...,[question],aspnet,Mvc,6339
2,https://github.com/ionic-team/ionic-cli/issues...,ionic-team/ionic-cli,testing ionic4 - serve shows two displays,\r description: \r ionic serve shows two di...,[bug],ionic-team,ionic-cli,3044
3,https://github.com/thefarwind/chip-8/issues/21,thefarwind/chip-8,tests are all broken,"when switching chip8 such that the audio, disp...","[bug, tests]",thefarwind,chip-8,21
4,https://github.com/n-sokolov/CoffeeShop/issues/1,n-sokolov/CoffeeShop,tests for paging,_ context _: paging mechanism must be tested...,[enhancement],n-sokolov,CoffeeShop,1
...,...,...,...,...,...,...,...,...
95,https://github.com/CHEF-KOCH/NoScript-Whitelis...,CHEF-KOCH/NoScript-Whitelist,ublock sync issue,on a huge filter-list like we have ub refuse...,"[extension-bug, status: stale]",CHEF-KOCH,NoScript-Whitelist,5
96,https://github.com/Aluxian/Whatsie/issues/63,Aluxian/Whatsie,ubuntu linux: package operation failed,"dear @aluxian,\r \r thanks for your latest rel...","[bug, done]",Aluxian,Whatsie,63
97,https://github.com/rosevalfbj/CommSys/issues/5,rosevalfbj/CommSys,ufrmcomissoes - auto-calculo do valor da comissão,realizar automaticamente do valor da comissão.,[enhancement],rosevalfbj,CommSys,5
98,https://github.com/TeXworks/texworks/issues/742,TeXworks/texworks,"ugly magnifying glass in preview window, weird...",_steps to reproduce the problem_: 1. compile a...,"[bug, pdf previewer]",TeXworks,texworks,742


## **2.4 Disambiguate the Labels**

In [None]:
def check_paraphrase(inputs):
    """Score text pairs with the 'bert-base-cased-finetuned-mrpc' classifier.

    The tokenizer and the model are loaded from the pretrained checkpoint on
    every call.

    Parameters
    ----------
    inputs
        Passed straight to the tokenizer; the callers in this notebook pass a
        list of two-element ``[text_a, text_b]`` lists.

    Returns
    -------
    tuple
        ``(not_paraphrase_likelihood, paraphrase_likelihood)``: columns 0 and 1
        of the softmax taken over axis 1 of the model's first output, as NumPy
        arrays.
    """
    checkpoint = 'bert-base-cased-finetuned-mrpc'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = TFBertForSequenceClassification.from_pretrained(checkpoint)

    encoded = tokenizer(inputs, padding=True, truncation=True, return_tensors='tf')

    # The first element of the model output holds the classification logits.
    probabilities = tf.nn.softmax(model(encoded)[0], axis=1).numpy()

    return probabilities[:, 0], probabilities[:, 1]

### **2.4.1 Find the similarity between all the labels that are present in the dataset**

In [None]:
# Every distinct label found in the dataset, and every unordered pair of them.
unique_labels = pp.get_unique_values(df, 'labels').keys().values
label_pairs = itertools.combinations(unique_labels, 2)
combinations = np.array(list(label_pairs)).tolist()

_, paraphrase_likelihood = check_paraphrase(combinations)

# Report only the pairs the model is fairly confident are paraphrases.
for (first, second), likelihood in zip(combinations, paraphrase_likelihood):
    if likelihood > .8:
        print(f'{first}\t{second}\t\t{likelihood}')

### **2.4.2 Find the similarity between the predefined labels and the labels that are present in the dataset**

In [None]:
# Pair every predefined label with every label present in the dataset
# (predefined label first), then score all pairs in a single call.
paraphrase_list = [
    [target_l, real_l]
    for target_l in LABELS
    for real_l in unique_labels
]

_, paraphrase_likelihood = check_paraphrase(paraphrase_list)

In [None]:
def disambiguate_labels(labels_dict, disambiguate='keep_most_probable'):
    """Resolve entries that were matched to more than one target label.

    Parameters
    ----------
    labels_dict : dict
        Maps a key to a list of ``(label, likelihood)`` pairs. The dict is
        modified in place.
    disambiguate : {'keep_most_probable', 'drop_all'}
        What to do with a key that has more than one candidate:
        ``'keep_most_probable'`` keeps only the candidate with the highest
        likelihood (the first one on ties), ``'drop_all'`` replaces the
        candidates with ``[(None, None)]``.

    Returns
    -------
    dict
        The same ``labels_dict`` object, in which every key that had several
        candidates now has a single-element list. Keys with zero or one
        candidate are left untouched.

    Raises
    ------
    AssertionError
        If ``disambiguate`` is not one of the supported strategies.
    """
    assert disambiguate in ['keep_most_probable', 'drop_all'], \
        f'unknown disambiguation strategy: {disambiguate!r}'

    # Only values of existing keys are replaced, so iterating while assigning is safe.
    for key, candidates in labels_dict.items():
        if len(candidates) <= 1:
            continue

        if disambiguate == 'drop_all':
            labels_dict[key] = [(None, None)]
        else:
            # Keep the winner together with its OWN likelihood; the previous
            # implementation stored the likelihood of the last candidate seen.
            best_match, max_likelihood = max(candidates, key=lambda candidate: candidate[1])
            labels_dict[key] = [(best_match, max_likelihood)]

    return labels_dict

### **2.4.3 Find how the labels from the dataset relate to the predefined ones**

In [None]:
# For every dataset label, collect the predefined labels the paraphrase model
# considers more likely than not to be a paraphrase of it, together with the
# corresponding likelihood.
label_mapping = {}
for i, (target_l, real_l) in enumerate(paraphrase_list):
    if paraphrase_likelihood[i] > .5:
        # setdefault creates the list on first use; no exception handling is
        # needed, so genuine errors are no longer swallowed by a bare except.
        label_mapping.setdefault(real_l, []).append((target_l, paraphrase_likelihood[i]))

label_mapping

{'bug': [('bug', 0.9162983)],
 'bug report': [('bug', 0.6632633)],
 'doc-enhancement': [('enhancement', 0.8313373)],
 'enhancement': [('enhancement', 0.9370464)],
 'extension-bug': [('bug', 0.62284034)],
 'kind/bug': [('bug', 0.84865934)],
 'kind/enhancement': [('enhancement', 0.90538484)],
 'minor bug': [('bug', 0.5306739)],
 'question': [('question', 0.9274783)],
 'type-bug': [('bug', 0.799179)],
 'type: bug': [('bug', 0.82417744)],
 'type:visual bug': [('bug', 0.7715802)]}

In [None]:
# Reduce every dataset label that matched several predefined labels to a single
# candidate, using the default 'keep_most_probable' strategy.
label_mapping = disambiguate_labels(label_mapping)

label_mapping

{'bug': [('bug', 0.9162983)],
 'bug report': [('bug', 0.6632633)],
 'doc-enhancement': [('enhancement', 0.8313373)],
 'enhancement': [('enhancement', 0.9370464)],
 'extension-bug': [('bug', 0.62284034)],
 'kind/bug': [('bug', 0.84865934)],
 'kind/enhancement': [('enhancement', 0.90538484)],
 'minor bug': [('bug', 0.5306739)],
 'question': [('question', 0.9274783)],
 'type-bug': [('bug', 0.799179)],
 'type: bug': [('bug', 0.82417744)],
 'type:visual bug': [('bug', 0.7715802)]}

In [None]:
# clean the dict
for key in label_mapping:
    label_mapping[key] = label_mapping[key][0][0]

label_mapping

{'bug': 'bug',
 'bug report': 'bug',
 'doc-enhancement': 'enhancement',
 'enhancement': 'enhancement',
 'extension-bug': 'bug',
 'kind/bug': 'bug',
 'kind/enhancement': 'enhancement',
 'minor bug': 'bug',
 'question': 'question',
 'type-bug': 'bug',
 'type: bug': 'bug',
 'type:visual bug': 'bug'}

In [None]:
# Show the per-issue label lists as they are before being mapped onto the predefined labels.
df['labels']

0                 [s3, p3, test-bug]
1                         [question]
2                              [bug]
3                       [bug, tests]
4                      [enhancement]
                   ...              
95    [extension-bug, status: stale]
96                       [bug, done]
97                     [enhancement]
98              [bug, pdf previewer]
99                             [bug]
Name: labels, Length: 100, dtype: object

In [None]:
# Translate each issue's labels into the predefined ones via label_mapping.
# Labels that have no entry in the mapping are dropped; an issue left with no
# label at all is assigned ['undefined'].
mapped_labels = []
for label_list in df['labels']:
    # Explicit membership test instead of a bare `except: pass`, so unrelated
    # errors are no longer hidden.
    known_labels = [label_mapping[l] for l in label_list if l in label_mapping]
    mapped_labels.append(known_labels or ['undefined'])

# Build the Series on df's own index: column assignment aligns on index labels,
# so a default 0..n-1 index would yield NaN for any row of df indexed differently.
df['labels'] = pd.Series(mapped_labels, index=df.index)
df['labels']

0       [undefined]
1        [question]
2             [bug]
3             [bug]
4     [enhancement]
          ...      
95            [bug]
96            [bug]
97    [enhancement]
98            [bug]
99            [bug]
Name: labels, Length: 100, dtype: object

In [None]:
# Encode each issue's label list against LABELS, then hand the result to
# pp.transform to be added to the frame.
# NOTE(review): judging by the displayed output, this adds one numeric column
# per entry of LABELS — confirm against pp.vectorize / pp.transform.
labels_vectorized = pp.vectorize(df['labels'], LABELS)
df = pp.transform(df, to_add=[labels_vectorized])

df

Vectorizing Features...

1 / 1002 / 1003 / 1004 / 1005 / 1006 / 1007 / 1008 / 1009 / 10010 / 10011 / 10012 / 10013 / 10014 / 10015 / 10016 / 10017 / 10018 / 10019 / 10020 / 10021 / 10022 / 10023 / 10024 / 10025 / 10026 / 10027 / 10028 / 10029 / 10030 / 10031 / 10032 / 10033 / 10034 / 10035 / 10036 / 10037 / 10038 / 10039 / 10040 / 10041 / 10042 / 10043 / 10044 / 10045 / 10046 / 10047 / 10048 / 10049 / 10050 / 10051 / 10052 / 10053 / 10054 / 10055 / 10056 / 10057 / 10058 / 10059 / 10060 / 10061 / 10062 / 10063 / 10064 / 10065 / 10066 / 10067 / 10068 / 10069 / 10070 / 10071 / 10072 / 10073 / 10074 / 10075 / 10076 / 10077 / 10078 / 10079 / 10080 / 10081 / 10082 / 10083 / 10084 / 10085 / 10086 / 10087 / 10088 / 10089 / 10090 / 10091 / 10092 / 10093 / 10094 / 10095 / 10096 / 10097 / 10098 / 10099 / 100100 / 100Transforming DataFrame...



Unnamed: 0,url,repo,title,body,labels,user,repo_name,issue_number,bug,enhancement,question,undefined
0,https://github.com/F5Networks/f5-openstack-lba...,F5Networks/f5-openstack-lbaasv2-driver,test_l7policies_and_rules.py:testl7basicupdate...,title: test_l7policies_and_rules.py:testl7basi...,[undefined],F5Networks,f5-openstack-lbaasv2-driver,835,0.0,0.0,0.0,1.0
1,https://github.com/aspnet/Mvc/issues/6339,aspnet/Mvc,testing all controllers dependency injection,i'm writing integration tests for my applicati...,[question],aspnet,Mvc,6339,0.0,0.0,1.0,0.0
2,https://github.com/ionic-team/ionic-cli/issues...,ionic-team/ionic-cli,testing ionic4 - serve shows two displays,\r description: \r ionic serve shows two di...,[bug],ionic-team,ionic-cli,3044,1.0,0.0,0.0,0.0
3,https://github.com/thefarwind/chip-8/issues/21,thefarwind/chip-8,tests are all broken,"when switching chip8 such that the audio, disp...",[bug],thefarwind,chip-8,21,1.0,0.0,0.0,0.0
4,https://github.com/n-sokolov/CoffeeShop/issues/1,n-sokolov/CoffeeShop,tests for paging,_ context _: paging mechanism must be tested...,[enhancement],n-sokolov,CoffeeShop,1,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,https://github.com/CHEF-KOCH/NoScript-Whitelis...,CHEF-KOCH/NoScript-Whitelist,ublock sync issue,on a huge filter-list like we have ub refuse...,[bug],CHEF-KOCH,NoScript-Whitelist,5,1.0,0.0,0.0,0.0
96,https://github.com/Aluxian/Whatsie/issues/63,Aluxian/Whatsie,ubuntu linux: package operation failed,"dear @aluxian,\r \r thanks for your latest rel...",[bug],Aluxian,Whatsie,63,1.0,0.0,0.0,0.0
97,https://github.com/rosevalfbj/CommSys/issues/5,rosevalfbj/CommSys,ufrmcomissoes - auto-calculo do valor da comissão,realizar automaticamente do valor da comissão.,[enhancement],rosevalfbj,CommSys,5,0.0,1.0,0.0,0.0
98,https://github.com/TeXworks/texworks/issues/742,TeXworks/texworks,"ugly magnifying glass in preview window, weird...",_steps to reproduce the problem_: 1. compile a...,[bug],TeXworks,texworks,742,1.0,0.0,0.0,0.0


# **3. Use the Pre-Trained Models**

In [None]:
# Tokenizers: cased BERT, plus the regular and "fast" uncased DistilBERT tokenizers.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer_dstl = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer_dstl_fast = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Base models, used to obtain embeddings.
model = TFBertModel.from_pretrained('bert-base-cased')
model_dstl = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Sequence-classification variants of the same checkpoints.
# Of everything loaded in this cell, the functions defined below only use
# tokenizer_dstl, model_dstl and model_cls_dstl as defaults.
model_cls = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
model_cls_dstl = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

## **3.1 Get the Contextual Word Embeddings**

In [None]:
def get_embeddings(data, tokenizer=tokenizer_dstl, model=model_dstl):
    """Tokenize ``data`` and return the first output of ``model`` for it.

    Parameters
    ----------
    data
        Passed straight to ``tokenizer``; the callers in this notebook pass a
        list of strings.
    tokenizer
        Callable applied with padding, truncation and TensorFlow tensors as
        output. Defaults to the DistilBERT tokenizer loaded above.
    model
        Model called on the tokenizer output. Defaults to the base DistilBERT
        model loaded above.

    Returns
    -------
    The first element of the model output.
    NOTE(review): for the default base model this is presumably the per-token
    hidden state rather than classification logits — confirm.
    """
    encoded = tokenizer(data, padding=True, truncation=True, return_tensors='tf')

    return model(encoded)[0]

In [None]:
# Embed the issue titles and bodies with the default (DistilBERT) tokenizer and model.
# NOTE(review): each column is sent through the model as one single batch — with a
# larger MEMORY_LIMIT this may need batching; confirm memory use.
embeddings_title = get_embeddings(df['title'].values.tolist())  # n_examples x n_words x 768
embeddings_body = get_embeddings(df['body'].values.tolist())  # n_examples x n_words x 768

## **3.2 Run the Pre-Trained Sequence Classification Model**

Do not actually run this section: it will fail.

The pretrained model cannot handle multi-label classification tasks. Since the architecture won't rely solely on this model, it can still be used to boost the training by either: 

- Randomly selecting one target class when two or more are present. 
- Using all target classes when two or more are present, but splitting them into separate examples and adding random noise. 

In [None]:
def seq_classification(data, labels, tokenizer=tokenizer_dstl, model=model_cls_dstl):
    """Run a sequence-classification model on ``data`` with ``labels`` attached.

    Parameters
    ----------
    data
        Passed straight to ``tokenizer``; the callers in this notebook pass a
        list of strings.
    labels
        Array-like of labels; reshaped into a single column before being added
        to the model inputs under the ``'labels'`` key.
    tokenizer
        Callable applied with padding, truncation and TensorFlow tensors as
        output. Defaults to the DistilBERT tokenizer loaded above.
    model
        Model called on the encoded inputs. Defaults to the DistilBERT
        sequence-classification model loaded above.

    Returns
    -------
    tuple
        ``(loss, logits)``: the first two elements of the model output.
    """
    encoded = tokenizer(data, padding=True, truncation=True, return_tensors='tf')

    label_column = np.array(labels).reshape(-1, 1)
    encoded['labels'] = tf.convert_to_tensor(label_column)

    loss, logits = model(encoded)[:2]

    return loss, logits

In [None]:
# `labels` was never defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel. Take the one-hot label columns that were added
# to `df` above (one column per entry of LABELS, in that order).
labels = df[LABELS].values

# Collapse each one-hot row into a single number.
labels_num = []  # 0: no label , i+1: for any other defined in LABELS
for row in labels:
    # NOTE(review): for a row with several labels set, the (i + 1) values are
    # summed, which is ambiguous (see the multi-label caveat in the text above).
    labels_num.append(sum((j + 1) * column_value for j, column_value in enumerate(row)))

# NOTE(review): labels_num can take the values 0..len(LABELS), while model_cls_dstl
# was loaded without specifying the number of classes — confirm before running.
loss_cls_title, logits_cls_title = seq_classification(df['title'].values.tolist(), labels_num)  # loss: n_examples x 1 , logits: n_examples x n_labels
loss_cls_body, logits_cls_body = seq_classification(df['body'].values.tolist(), labels_num)  # loss: n_examples x 1 , logits: n_examples x n_labels