# **1. Prepare the environment**

## **1.1 Fetch the "Label-Bot" repo and modify it so that it can be used in Colab**

In [None]:
!git clone https://github.com/GiorgosKarantonis/Label-Bot

!mv Label-Bot Label_Bot
!touch Label_Bot/__init__.py

## **1.2 Mount Google Drive in order to be able to access the preprocessed dataset**

In [None]:
# Colab-only module; this cell is not expected to work outside a Colab runtime.
from google.colab import drive
# Mount point is /content/drive. The dataset path used when loading the data
# ('./drive/My Drive/...') is relative, so it presumably assumes the working
# directory is /content — TODO confirm.
drive.mount('/content/drive')

# **2. Prepare the Label Bot**

## **2.1 Import all the required libraries**

In [2]:
import time
import itertools

import numpy as np
import pandas as pd
import tensorflow as tf

import Label_Bot.preprocessing as pp

try:
    from transformers import BertTokenizer, DistilBertTokenizer, DistilBertTokenizerFast
    from transformers import TFBertModel, TFDistilBertModel
    from transformers import TFBertForSequenceClassification, TFDistilBertForSequenceClassification
except:
    !pip install transformers==3.0.0

    from transformers import BertTokenizer, DistilBertTokenizer, DistilBertTokenizerFast
    from transformers import TFBertModel, TFDistilBertModel
    from transformers import TFBertForSequenceClassification, TFDistilBertForSequenceClassification

## **2.2 Define the hyperparameters and the global variables**

In [3]:
# Limit handed to the data loader; set to None if you don't want to apply any limit.
MEMORY_LIMIT = 100

# Dummy label set used for the proof of concept.
LABELS = ['bug', 'enhancement', 'question']

# A realistic label set the bot should eventually be able to predict.
EXAMPLE_LABELS = [
    # general issue labels
    'bug',
    'enhancement',
    'documentation',
    'duplicate',
    'maintenance',
    'good first issue',
    'help wanted',
    'invalid',
    'question',
    "won't fix",
    # workflow status
    'status: proposal',
    'status: available',
    'status: in progress',
    'status: on hold',
    'status: blocked',
    'status: abandoned',
    'status: review needed',
    # priority
    'priority: low',
    'priority: medium',
    'priority: high',
    'priority: critical',
]

## **2.3 Load the preprocessed dataset**

In [4]:
# Read the preprocessed dataset from the mounted drive.
df = pp.load_data(
    memory_limit=MEMORY_LIMIT,
    file='./drive/My Drive/Label Bot/data/github.pkl',
)

# Stack every `label_*` column, then transpose so that each row is one example.
# NOTE(review): this snapshot is taken BEFORE the filter below, so `labels` still
# spans every `label_*` column, not only the ones named in LABELS — confirm intended.
label_columns = [name for name in df if name.startswith('label_')]
labels = np.transpose([df[name] for name in label_columns])

# Keep only the label columns whose suffix is one of the target LABELS.
for name in label_columns:
    if name.replace('label_', '') not in LABELS:
        df = df.drop(name, axis=1)

print(pp.get_unique_values(df, 'labels').head())

bug            56
enhancement    18
question        9
fixed           3
kind/bug        3
Name: labels, dtype: int64


# **3. Use the Pre-Trained Models**

In [None]:
# Checkpoint names shared by the tokenizers and models loaded in this cell.
bert_checkpoint = 'bert-base-cased'
distilbert_checkpoint = 'distilbert-base-uncased'

# Tokenizers: BERT, plus the regular and "fast" DistilBERT variants.
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
tokenizer_dstl = DistilBertTokenizer.from_pretrained(distilbert_checkpoint)
tokenizer_dstl_fast = DistilBertTokenizerFast.from_pretrained(distilbert_checkpoint)

# Base models (used as defaults by the embedding helper later in the notebook).
model = TFBertModel.from_pretrained(bert_checkpoint)
model_dstl = TFDistilBertModel.from_pretrained(distilbert_checkpoint)

# Models with a sequence-classification head.
model_cls = TFBertForSequenceClassification.from_pretrained(bert_checkpoint)
model_cls_dstl = TFDistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint)

## **3.1 Disambiguate the Labels**

In [5]:
# Checkpoint used by `check_paraphrase`.
_PARAPHRASE_CHECKPOINT = 'bert-base-cased-finetuned-mrpc'

# Filled on first use so the checkpoint is loaded once per kernel instead of on
# every call to `check_paraphrase`.
_paraphrase_cache = {}


def _get_paraphrase_model():
    """Return the cached (tokenizer, model) pair, loading both on first use."""
    if 'model' not in _paraphrase_cache:
        _paraphrase_cache['tokenizer'] = BertTokenizer.from_pretrained(_PARAPHRASE_CHECKPOINT)
        _paraphrase_cache['model'] = TFBertForSequenceClassification.from_pretrained(_PARAPHRASE_CHECKPOINT)

    return _paraphrase_cache['tokenizer'], _paraphrase_cache['model']


def check_paraphrase(inputs, batch_size=None):
    """Score inputs with the `bert-base-cased-finetuned-mrpc` classifier.

    Args:
        inputs: whatever the tokenizer accepts; the notebook passes a list of
            `[text_a, text_b]` pairs.
        batch_size: optional positive int. When given and `inputs` is a list or
            tuple, the inputs are scored in chunks of this size so that a large
            number of pairs does not have to fit in a single forward pass. Padding
            is then applied per chunk. `None` (default) scores everything in one
            batch, as before.

    Returns:
        `(not_paraphrase_likelihood, paraphrase_likelihood)`: columns 0 and 1 of
        the softmax over the model's logits, one value per input. Two empty
        arrays are returned for an empty list/tuple.

    Raises:
        ValueError: if `batch_size` is given but smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer or None, got {batch_size!r}')

    # Nothing to score: skip loading the checkpoint and running the model.
    if isinstance(inputs, (list, tuple)) and len(inputs) == 0:
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.float32)

    tokenizer, model = _get_paraphrase_model()

    # Only sequences of examples can be chunked; anything else goes through as-is.
    if batch_size is None or not isinstance(inputs, (list, tuple)):
        batches = [inputs]
    else:
        batches = [inputs[start:start + batch_size] for start in range(0, len(inputs), batch_size)]

    probabilities = []
    for batch in batches:
        encoded = tokenizer(batch,
                            padding=True,
                            truncation=True,
                            return_tensors='tf')

        logits = model(encoded)[0]
        probabilities.append(tf.nn.softmax(logits, axis=1).numpy())

    outputs = np.concatenate(probabilities, axis=0)

    not_paraphrase_likelihood = outputs[:, 0]
    paraphrase_likelihood = outputs[:, 1]

    return not_paraphrase_likelihood, paraphrase_likelihood

### **3.1.1 Test the similarity between all the labels that are present in the dataset**

In [6]:
# Index values of the Series returned by pp.get_unique_values (presumably the
# distinct label names found in the dataset — confirm against the helper).
unique_labels = pp.get_unique_values(df, 'labels').keys().values

# Every unordered pair of those values, converted to plain nested lists.
pair_tuples = list(itertools.combinations(unique_labels, 2))
combinations = np.array(pair_tuples).tolist()

_, paraphrase_likelihood = check_paraphrase(combinations)

# Print only the pairs scored above 0.8.
for (first, second), score in zip(combinations, paraphrase_likelihood):
    if score > .8:
        print(f'{first}\t{second}\t\t{score}')

Some weights of the model checkpoint at bert-base-cased-finetuned-mrpc were not used when initializing TFBertForSequenceClassification: ['dropout_183']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased-finetuned-mrpc and are newly initialized: ['dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bug	kind/bug		0.848659336566925
bug	type: bug		0.8241774439811707
enhancement	doc-enhancement		0.8313372731208801
enhancement	kind/enhancement		0.9053848385810852
kind/bug	type: bug		0.9100732803344727
kind/bug	bug report		0.8221029043197632
kind/bug	type:visual bug		0.8794167041778564
kind/bug	kind/enhancement		0.8732624650001526
kind/bug	type-bug		0.9201246500015259
kind/bug	extension-bug		0.8677968978881836
kind/bug	kind/bug/p0		0.9146767854690552
in progress	in-progress		0.9327884912490845
testing	tests		0.8916923403739929
type: bug	type:feature		0.9392443299293518
type: bug	bug report		0.8869650959968567
type: bug	minor bug		0.8080185651779175
type: bug	type:visual bug		0.9418724775314331
type: bug	possible-bug		0.8296520113945007
type: bug	type-bug		0.9411540627479553
type: bug	browser: chrome		0.9087804555892944
type: bug	test-bug		0.8598078489303589
type: bug	extension-bug		0.9007737040519714
hard problem	easy		0.8164892196655273
type:feature	feature		0.8799180388450623
type:fe

### **3.1.2 Test the similarity between the predefined labels and the labels that are present in the dataset**

In [29]:
# Pair every predefined target label with every label found in the dataset,
# iterating targets in the outer position.
test_list = [
    [target_l, real_l]
    for target_l in LABELS
    for real_l in unique_labels
]

_, paraphrase_likelihood = check_paraphrase(test_list)

# Print only the pairs scored above 0.5.
for (target, candidate), score in zip(test_list, paraphrase_likelihood):
    if score > .5:
        print(f'{target}\t{candidate}\t\t{score}')

Some weights of the model checkpoint at bert-base-cased-finetuned-mrpc were not used when initializing TFBertForSequenceClassification: ['dropout_183']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased-finetuned-mrpc and are newly initialized: ['dropout_341']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bug	bug		0.9162982702255249
bug	kind/bug		0.848659336566925
bug	type: bug		0.8241774439811707
bug	bug report		0.6632633209228516
bug	minor bug		0.5306739211082458
bug	type:visual bug		0.7715802192687988
bug	type-bug		0.7991790175437927
bug	extension-bug		0.6228403449058533
enhancement	enhancement		0.9370464086532593
enhancement	doc-enhancement		0.8313372731208801
enhancement	kind/enhancement		0.9053848385810852
question	question		0.9274783134460449


## **3.2 Get the Contextual Word Embeddings**

In [None]:
def get_embeddings(data, tokenizer=tokenizer_dstl, model=model_dstl):
    """Tokenize `data` as one padded, truncated batch and run it through `model`.

    Args:
        data: input passed straight to `tokenizer` (the notebook passes a list of strings).
        tokenizer: callable tokenizer; defaults to `tokenizer_dstl`.
        model: model called on the encoded batch; defaults to `model_dstl`.

    Returns:
        The first element of the model's output.
        NOTE(review): for the default base model this is presumably the last
        hidden state rather than classification logits — confirm.
    """
    encoded = tokenizer(data, padding=True, truncation=True, return_tensors='tf')

    return model(encoded)[0]

In [None]:
# Embed titles and bodies separately, each as a single batch.
# NOTE(review): the original comments gave the result shape as n_examples x 768;
# if the model returns per-token hidden states there is an extra sequence-length
# axis (n_examples x seq_len x 768) — confirm before relying on the shape.
title_texts = df['title'].values.tolist()
body_texts = df['body'].values.tolist()

embedding_title = get_embeddings(title_texts)
embedding_body = get_embeddings(body_texts)

## **3.3 Run the Pre-Trained Sequence Classification Model**

Note: do not actually run this section as-is — it will fail.

The pretrained model cannot handle multi-label classification tasks. It can still be used in another way, though: since the architecture won't rely solely on this model, it can be used to boost the training with one of the following strategies:

- Randomly select one target class when two or more are present.
- Use all target classes when two or more are present, but split them into separate examples and add random noise.

In [8]:
def seq_classification(data, labels, tokenizer=tokenizer_dstl, model=model_cls_dstl):
    """Run the sequence-classification model on `data` with `labels` attached.

    Args:
        data: input passed straight to `tokenizer` (the notebook passes a list of strings).
        labels: one target per example; reshaped to a column vector before use.
        tokenizer: callable tokenizer; defaults to `tokenizer_dstl`.
        model: classification model; defaults to `model_cls_dstl`.

    Returns:
        `(loss, logits)`: the first two elements of the model's output.
    """
    encoded = tokenizer(data, padding=True, truncation=True, return_tensors='tf')

    # The targets travel with the encoded batch under the 'labels' key.
    label_column = np.array(labels).reshape(-1, 1)
    encoded['labels'] = tf.convert_to_tensor(label_column)

    loss, logits = model(encoded)[:2]

    return loss, logits

In [None]:
# Collapse each label vector into one integer: 0 means no label, and a single
# active column j maps to j + 1.
# NOTE(review): the value is the SUM of (j + 1) * entry over the row, so a row with
# several non-zero entries yields the sum of their indices, which can coincide
# with another single label's index (e.g. columns 0 and 1 together give 3).
labels_num = [
    sum((position + 1) * flag for position, flag in enumerate(row))
    for row in labels
]

# Shapes per the original author's notes (not verified here):
# loss: n_examples x 1 , logits: n_examples x n_labels
title_texts = df['title'].values.tolist()
body_texts = df['body'].values.tolist()

loss_cls_title, logits_cls_title = seq_classification(title_texts, labels_num)
loss_cls_body, logits_cls_body = seq_classification(body_texts, labels_num)