# **1. Prepare the environment**

## **Fetch the "Label-Bot" repo and make it importable in Colab**

In [None]:
!git clone https://github.com/GiorgosKarantonis/Label-Bot

!mv Label-Bot Label_Bot
!touch Label_Bot/__init__.py


## **Mount Google Drive to access the preprocessed dataset**

In [None]:
# Colab-only helper: mounts the user's Google Drive into the runtime's
# filesystem at /content/drive (this triggers an interactive auth prompt).
from google.colab import drive
drive.mount('/content/drive')

# **2. Prepare the Label Bot**

## **Import all the required libraries**

In [1]:
import time

import numpy as np
import pandas as pd

import tensorflow as tf

try:
    from transformers import BertTokenizer, TFBertForSequenceClassification
except:
    !pip install transformers==3.0.0
    from transformers import BertTokenizer, TFBertForSequenceClassification

import Label_Bot.preprocessing as pp

## **Define the hyperparameters**

In [2]:
# Forwarded to pp.load_data as `memory_limit`.
# NOTE(review): presumably caps how many rows are loaded (the frame displayed
# further down has 100 rows) -- confirm against Label_Bot.preprocessing.
MEMORY_LIMIT = 100

# Target classes the raw GitHub labels are mapped onto. This list is extended
# in place with 'undefined' later in the notebook, so it must stay a mutable list.
LABELS = ['bug', 'question', 'enhancement']

## **Load the dataset**

In [3]:
# Read the preprocessed issues from the mounted Drive folder. The path is
# relative, so it resolves only while the working directory is the one that
# contains the `drive` mount point.
df = pp.load_data(
    memory_limit=MEMORY_LIMIT,
    file='./drive/My Drive/Label Bot/data/github_no_merging.pkl',
)

# Discard any pre-existing `label_*` columns in a single call; fresh ones are
# generated from the disambiguated labels later in the notebook.
stale_label_columns = [name for name in df.columns if name.startswith('label_')]
df = df.drop(stale_label_columns, axis=1)

In [6]:
# Preview the first rows of the loaded frame (after the `label_*` columns were dropped).
df.head()

Unnamed: 0,url,repo,title,body,labels,user,repo_name,issue_number
0,https://github.com/F5Networks/f5-openstack-lba...,F5Networks/f5-openstack-lbaasv2-driver,test_l7policies_and_rules.py:testl7basicupdate...,title: test_l7policies_and_rules.py:testl7basi...,"[s3, p3, test-bug]",F5Networks,f5-openstack-lbaasv2-driver,835
1,https://github.com/aspnet/Mvc/issues/6339,aspnet/Mvc,testing all controllers dependency injection,i'm writing integration tests for my applicati...,[question],aspnet,Mvc,6339
2,https://github.com/ionic-team/ionic-cli/issues...,ionic-team/ionic-cli,testing ionic4 - serve shows two displays,\r description: \r ionic serve shows two di...,[bug],ionic-team,ionic-cli,3044
3,https://github.com/thefarwind/chip-8/issues/21,thefarwind/chip-8,tests are all broken,"when switching chip8 such that the audio, disp...","[bug, tests]",thefarwind,chip-8,21
4,https://github.com/n-sokolov/CoffeeShop/issues/1,n-sokolov/CoffeeShop,tests for paging,_ context _: paging mechanism must be tested...,[enhancement],n-sokolov,CoffeeShop,1


# **3. Disambiguate the Labels**

In [None]:
# Collect the label strings present in the dataset.
# NOTE(review): `.keys().values` suggests get_unique_values returns a pandas
# Series indexed by label -- confirm against Label_Bot.preprocessing.
unique_labels = pp.get_unique_values(df, 'labels').keys().values

# Pair each target label with each raw label and score every pair; the scores
# are looked up by the pair's position in `paraphrase_candidates`.
paraphrase_candidates = pp.make_combinations(LABELS, unique_labels)
paraphrase_likelihood = pp.check_paraphrase(paraphrase_candidates)

# Minimum score for a raw label to be considered a paraphrase of a target label.
PARAPHRASE_THRESHOLD = .5

# raw label -> list of (target label, likelihood) candidates above the threshold.
label_mapping = {}
for i, pair in enumerate(paraphrase_candidates):
    likelihood = paraphrase_likelihood[i]
    if likelihood > PARAPHRASE_THRESHOLD:
        target_l, real_l = pair[0], pair[1]
        # setdefault creates the list on first sight of a raw label; unlike the
        # previous bare `except:`, it cannot mask unrelated errors.
        label_mapping.setdefault(real_l, []).append((target_l, likelihood))

In [8]:
# Show the candidates: raw label -> [(target label, likelihood), ...].
pd.Series(label_mapping)

bug                         [(bug, 0.9162983894348145)]
kind/bug                    [(bug, 0.8486595153808594)]
extension-bug               [(bug, 0.6228393912315369)]
type-bug                    [(bug, 0.7991788387298584)]
type: bug                   [(bug, 0.8241775035858154)]
minor bug                   [(bug, 0.5306734442710876)]
bug report                  [(bug, 0.6632622480392456)]
type:visual bug             [(bug, 0.7715801000595093)]
question               [(question, 0.9274781942367554)]
enhancement         [(enhancement, 0.9370462894439697)]
kind/enhancement    [(enhancement, 0.9053848385810852)]
doc-enhancement     [(enhancement, 0.8313373923301697)]
dtype: object

In [None]:
# Collapse each candidate list to a single target label (the displayed result
# maps every raw label to one plain string).
# NOTE(review): this rebinds `label_mapping`, so re-running the cell feeds the
# already-collapsed mapping back in -- re-run the cell that builds the
# candidates first.
label_mapping = pp.disambiguate_labels(label_mapping)

In [10]:
# Show the resolved mapping: raw label -> target label.
pd.Series(label_mapping)

bug                         bug
kind/bug                    bug
extension-bug               bug
type-bug                    bug
type: bug                   bug
minor bug                   bug
bug report                  bug
type:visual bug             bug
question               question
enhancement         enhancement
kind/enhancement    enhancement
doc-enhancement     enhancement
dtype: object

In [11]:
# Raw label lists per issue, before the mapping is applied.
df['labels']

0                 [s3, p3, test-bug]
1                         [question]
2                              [bug]
3                       [bug, tests]
4                      [enhancement]
                   ...              
95    [extension-bug, status: stale]
96                       [bug, done]
97                     [enhancement]
98              [bug, pdf previewer]
99                             [bug]
Name: labels, Length: 100, dtype: object

In [None]:
# Rewrite every issue's label list through the mapping. Judging by the output
# below, rows with no mapped label become ['undefined'] and unmapped extras are
# dropped -- confirm against pp.map_labels.
# NOTE(review): overwrites the raw 'labels' column in place, so the cell is not
# safe to run twice without reloading `df`.
df['labels'] = pp.map_labels(df['labels'], label_mapping)

In [13]:
# Label lists per issue after mapping onto the target labels.
df['labels']

0       [undefined]
1        [question]
2             [bug]
3             [bug]
4     [enhancement]
          ...      
95            [bug]
96            [bug]
97    [enhancement]
98            [bug]
99            [bug]
Name: labels, Length: 100, dtype: object

# **4. Finalize and vectorize the Labels**

In [None]:
# Add the catch-all class to the target labels. The membership guard keeps
# re-runs of this cell from appending 'undefined' more than once.
if 'undefined' not in LABELS:
    LABELS.append('undefined')

# Encode the label lists as one `label_<name>` column per entry of LABELS
# (0.0 / 1.0 values in the output shown below).
labels_vectorized = pp.vectorize(df['labels'], LABELS, prefix='label')
# NOTE(review): presumably appends the encoded columns to `df` -- confirm
# against pp.transform.
df = pp.transform(df, to_add=[labels_vectorized])

In [15]:
# Inspect the encoded label columns.
labels_vectorized

Unnamed: 0,label_bug,label_question,label_enhancement,label_undefined
0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
95,1.0,0.0,0.0,0.0
96,1.0,0.0,0.0,0.0
97,0.0,0.0,1.0,0.0
98,1.0,0.0,0.0,0.0
