# **1. Prepare the environment**

## **1.1 Fetch the "Label-Bot" repo and rename it so that it can be imported as a package in Colab**

In [None]:
!git clone https://github.com/GiorgosKarantonis/Label-Bot

!mv Label-Bot Label_Bot
!touch Label_Bot/__init__.py

## **1.2 Mount Google Drive in order to be able to access the preprocessed dataset**

In [None]:
# Mount Google Drive at /content/drive; the preprocessed dataset is read from
# a path under this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

# **2. Prepare the Label Bot**

## **2.1 Import all the required libraries**

In [None]:
import pandas as pd

try:
    from transformers import BertTokenizer, TFBertForSequenceClassification
except:
    !pip install transformers==3.0.0
    from transformers import BertTokenizer, TFBertForSequenceClassification

import Label_Bot.preprocessing as pp

In [14]:
# Forwarded to pp.load_data as `memory_limit`.
# NOTE(review): presumably caps the number of rows loaded (the frame shown
# further down has 100 rows) -- confirm against pp.load_data.
MEMORY_LIMIT = 100

# Placeholder target labels, used only for the proof of concept.
LABELS = ['bug', 'enhancement', 'question']

In [15]:
# Load the preprocessed issues from the mounted Google Drive.
# NOTE(review): the file is a .pkl, so pp.load_data presumably unpickles it --
# only point this at files you trust.
df = pp.load_data(memory_limit=MEMORY_LIMIT, file='./drive/My Drive/Label Bot/data/github.pkl')

# Discard any label_* columns stored with the dataset; they are rebuilt from
# LABELS later in the notebook. All of them are dropped in a single call
# instead of re-copying the frame once per column, and non-string column
# names are skipped rather than failing on `.startswith`.
stale_label_columns = [
    c for c in df.columns
    if isinstance(c, str) and c.startswith('label_')
]
df = df.drop(columns=stale_label_columns)

df.head()

Unnamed: 0,url,repo,title,body,labels,user,repo_name,issue_number
0,https://github.com/F5Networks/f5-openstack-lba...,F5Networks/f5-openstack-lbaasv2-driver,test_l7policies_and_rules.py:testl7basicupdate...,title: test_l7policies_and_rules.py:testl7basi...,"[s3, p3, test-bug]",F5Networks,f5-openstack-lbaasv2-driver,835
1,https://github.com/aspnet/Mvc/issues/6339,aspnet/Mvc,testing all controllers dependency injection,i'm writing integration tests for my applicati...,[question],aspnet,Mvc,6339
2,https://github.com/ionic-team/ionic-cli/issues...,ionic-team/ionic-cli,testing ionic4 - serve shows two displays,\r description: \r ionic serve shows two di...,[bug],ionic-team,ionic-cli,3044
3,https://github.com/thefarwind/chip-8/issues/21,thefarwind/chip-8,tests are all broken,"when switching chip8 such that the audio, disp...","[bug, tests]",thefarwind,chip-8,21
4,https://github.com/n-sokolov/CoffeeShop/issues/1,n-sokolov/CoffeeShop,tests for paging,_ context _: paging mechanism must be tested...,[enhancement],n-sokolov,CoffeeShop,1


In [16]:
# Clean the free-text 'title' and 'body' columns; the result replaces df.
# NOTE(review): the exact cleaning steps live in pp.clean_text_data. In the
# preview below the only visible change is that the '\r' sequences are gone
# from the body of row 2.
df = pp.clean_text_data(df, 'title', 'body')

# Preview the first rows to inspect the effect of the cleaning.
df.head()

Unnamed: 0,url,repo,title,body,labels,user,repo_name,issue_number
0,https://github.com/F5Networks/f5-openstack-lba...,F5Networks/f5-openstack-lbaasv2-driver,test_l7policies_and_rules.py:testl7basicupdate...,title: test_l7policies_and_rules.py:testl7basi...,"[s3, p3, test-bug]",F5Networks,f5-openstack-lbaasv2-driver,835
1,https://github.com/aspnet/Mvc/issues/6339,aspnet/Mvc,testing all controllers dependency injection,i'm writing integration tests for my applicati...,[question],aspnet,Mvc,6339
2,https://github.com/ionic-team/ionic-cli/issues...,ionic-team/ionic-cli,testing ionic4 - serve shows two displays,description: ionic serve shows two displa...,[bug],ionic-team,ionic-cli,3044
3,https://github.com/thefarwind/chip-8/issues/21,thefarwind/chip-8,tests are all broken,"when switching chip8 such that the audio, disp...","[bug, tests]",thefarwind,chip-8,21
4,https://github.com/n-sokolov/CoffeeShop/issues/1,n-sokolov/CoffeeShop,tests for paging,_ context _: paging mechanism must be tested...,[enhancement],n-sokolov,CoffeeShop,1


In [17]:
# Collect the distinct label names that occur in the 'labels' column.
# NOTE(review): presumably pp.get_unique_values returns a pandas Series keyed
# by label name, so .keys().values yields the names as an array -- confirm.
unique_labels = pp.get_unique_values(df, 'labels').keys().values

# Wrapped in a Series only to get a compact, truncated display.
pd.Series(unique_labels)

0                               bug
1                       enhancement
2                          question
3                          kind/bug
4                             fixed
                  ...              
80                             done
81                       discussion
82                      kind/bug/p0
83                      sig/network
84    role-based-access-control/svc
Length: 85, dtype: object

In [None]:
# Build the candidate pairs between the target LABELS and the labels found in
# the dataset. The following cell reads each pair as
# (target label, dataset label).
paraphrase_list = pp.make_combinations(LABELS, unique_labels)

# Score every pair; only the second return value is kept. It is indexed in
# parallel with paraphrase_list and compared against a 0.5 threshold below.
# NOTE(review): the discarded first return value is not described here --
# see pp.check_paraphrase.
_, paraphrase_likelihood = pp.check_paraphrase(paraphrase_list)

In [19]:
# Minimum paraphrase likelihood for a (target, dataset) label pair to be
# accepted as a match.
PARAPHRASE_THRESHOLD = .5

# Maps every dataset label to the list of (target label, likelihood)
# candidates that cleared the threshold. Kept as a plain dict of lists.
label_mapping = {}
for i, pair in enumerate(paraphrase_list):
    likelihood = paraphrase_likelihood[i]
    if likelihood > PARAPHRASE_THRESHOLD:
        target_l, real_l = pair[0], pair[1]
        # setdefault replaces the former try / bare-except initialisation,
        # which would also have swallowed unrelated errors and interrupts.
        label_mapping.setdefault(real_l, []).append((target_l, likelihood))

# One row per dataset label, one column per candidate. orient='index' pads
# shorter candidate lists, whereas pd.DataFrame(label_mapping).T raises a
# ValueError as soon as one label matches more than one target.
pd.DataFrame.from_dict(label_mapping, orient='index')

Unnamed: 0,0
bug,"(bug, 0.9162983)"
kind/bug,"(bug, 0.84865934)"
extension-bug,"(bug, 0.62284034)"
type:visual bug,"(bug, 0.7715802)"
minor bug,"(bug, 0.5306739)"
bug report,"(bug, 0.6632633)"
type: bug,"(bug, 0.82417744)"
type-bug,"(bug, 0.799179)"
enhancement,"(enhancement, 0.9370464)"
kind/enhancement,"(enhancement, 0.90538484)"


In [21]:
# Show the raw label lists before they are mapped onto the target labels.
df['labels']

0                 [s3, p3, test-bug]
1                         [question]
2                              [bug]
3                       [bug, tests]
4                      [enhancement]
                   ...              
95    [extension-bug, status: stale]
96                       [bug, done]
97                     [enhancement]
98              [bug, pdf previewer]
99                             [bug]
Name: labels, Length: 100, dtype: object

In [22]:
# Resolve the candidate mapping, then rewrite each issue's labels with it.
# NOTE(review): both assignments overwrite their own inputs (label_mapping and
# df['labels']), so re-running this cell feeds already-processed values back
# into pp -- confirm that is safe, or re-run from the cell that builds
# label_mapping.
label_mapping = pp.disambiguate_labels(label_mapping)
df['labels'] = pp.map_labels(df['labels'], label_mapping)

# In the output, an issue with no mapped label shows up as ['undefined']
# (row 0), and unmapped extras are dropped (row 3: [bug, tests] -> [bug]).
df['labels']

0       [undefined]
1        [question]
2             [bug]
3             [bug]
4     [enhancement]
          ...      
95            [bug]
96            [bug]
97    [enhancement]
98            [bug]
99            [bug]
Name: labels, Length: 100, dtype: object

In [23]:
# 'undefined' occurs in the mapped labels, so it needs its own slot in LABELS.
# The membership check keeps the in-place append from duplicating the entry
# when the cell is run again.
if 'undefined' not in LABELS:
    LABELS.append('undefined')

# Encode each issue's labels against LABELS and attach the result to df. In
# the output this appears as the label_bug / label_enhancement /
# label_question / label_undefined columns holding 0.0 or 1.0.
# NOTE(review): df is overwritten here; whether re-running adds the label_*
# columns a second time depends on pp.transform -- confirm.
labels_vectorized = pp.vectorize(df['labels'], LABELS, prefix='label')
df = pp.transform(df, to_add=[labels_vectorized])

df.head()

Vectorizing Features...

1 / 1002 / 1003 / 1004 / 1005 / 1006 / 1007 / 1008 / 1009 / 10010 / 10011 / 10012 / 10013 / 10014 / 10015 / 10016 / 10017 / 10018 / 10019 / 10020 / 10021 / 10022 / 10023 / 10024 / 10025 / 10026 / 10027 / 10028 / 10029 / 10030 / 10031 / 10032 / 10033 / 10034 / 10035 / 10036 / 10037 / 10038 / 10039 / 10040 / 10041 / 10042 / 10043 / 10044 / 10045 / 10046 / 10047 / 10048 / 10049 / 10050 / 10051 / 10052 / 10053 / 10054 / 10055 / 10056 / 10057 / 10058 / 10059 / 10060 / 10061 / 10062 / 10063 / 10064 / 10065 / 10066 / 10067 / 10068 / 10069 / 10070 / 10071 / 10072 / 10073 / 10074 / 10075 / 10076 / 10077 / 10078 / 10079 / 10080 / 10081 / 10082 / 10083 / 10084 / 10085 / 10086 / 10087 / 10088 / 10089 / 10090 / 10091 / 10092 / 10093 / 10094 / 10095 / 10096 / 10097 / 10098 / 10099 / 100100 / 100Transforming DataFrame...



Unnamed: 0,url,repo,title,body,labels,user,repo_name,issue_number,label_bug,label_enhancement,label_question,label_undefined
0,https://github.com/F5Networks/f5-openstack-lba...,F5Networks/f5-openstack-lbaasv2-driver,test_l7policies_and_rules.py:testl7basicupdate...,title: test_l7policies_and_rules.py:testl7basi...,[undefined],F5Networks,f5-openstack-lbaasv2-driver,835,0.0,0.0,0.0,1.0
1,https://github.com/aspnet/Mvc/issues/6339,aspnet/Mvc,testing all controllers dependency injection,i'm writing integration tests for my applicati...,[question],aspnet,Mvc,6339,0.0,0.0,1.0,0.0
2,https://github.com/ionic-team/ionic-cli/issues...,ionic-team/ionic-cli,testing ionic4 - serve shows two displays,description: ionic serve shows two displa...,[bug],ionic-team,ionic-cli,3044,1.0,0.0,0.0,0.0
3,https://github.com/thefarwind/chip-8/issues/21,thefarwind/chip-8,tests are all broken,"when switching chip8 such that the audio, disp...",[bug],thefarwind,chip-8,21,1.0,0.0,0.0,0.0
4,https://github.com/n-sokolov/CoffeeShop/issues/1,n-sokolov/CoffeeShop,tests for paging,_ context _: paging mechanism must be tested...,[enhancement],n-sokolov,CoffeeShop,1,0.0,1.0,0.0,0.0
