# **1. Prepare the environment**

## **Fetch the "Label-Bot" repo and modify it so that it can be imported in Colab**

In [None]:
!git clone https://github.com/GiorgosKarantonis/Label-Bot

!mv Label-Bot Label_Bot
!touch Label_Bot/__init__.py

## **Mount Google Drive to access the preprocessed dataset**

In [None]:
from google.colab import drive
# Mount Drive at /content/drive so the dataset pickle stored under
# 'My Drive/Label Bot/data' (loaded further down) is readable from this runtime.
drive.mount('/content/drive')

# **2. Prepare the Label Bot**

## **Import all the required libraries**

In [None]:
import time

import numpy as np
import pandas as pd

import tensorflow as tf

try:
    from transformers import BertTokenizer
    from transformers import TFBertModel, TFDistilBertModel
    from transformers import TFBertForSequenceClassification
    from transformers import T5Tokenizer, TFT5ForConditionalGeneration
    from transformers import pipeline
except:
    !pip install transformers==3.0.0
    from transformers import BertTokenizer, DistilBertTokenizer, DistilBertTokenizerFast
    from transformers import TFBertModel, TFDistilBertModel
    from transformers import TFBertForSequenceClassification
    from transformers import T5Tokenizer, TFT5ForConditionalGeneration
    from transformers import pipeline

import Label_Bot.preprocessing as pp
import Label_Bot.language_modeling as lm

## **Define the hyperparameters**

In [4]:
# Passed to pp.load_data as `memory_limit`. With the value 100 the loaded frame
# has 100 rows (see the `count` row of df.describe() below).
# NOTE(review): presumably a cap on the number of rows loaded; confirm in
# Label_Bot.preprocessing.
MEMORY_LIMIT = 100

## **Load the dataset**

In [7]:
# Load the preprocessed GitHub-issues pickle from the mounted Drive.
# The absolute path matches the '/content/drive' mount point, so this cell does
# not depend on the kernel's working directory being /content.
df = pp.load_data(
    memory_limit=MEMORY_LIMIT,
    file='/content/drive/My Drive/Label Bot/data/github.pkl',
)

In [8]:
# Summary statistics; the output below covers the four one-hot label_* columns,
# whose means give the share of each label in the loaded sample.
df.describe()

Unnamed: 0,label_bug,label_enhancement,label_question,label_undefined
count,100.0,100.0,100.0,100.0
mean,0.65,0.2,0.09,0.06
std,0.479372,0.402015,0.287623,0.238683
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0


In [9]:
# Preview the first rows: issue metadata (url, repo, title, body, labels, user)
# followed by the one-hot label_* columns.
df.head()

Unnamed: 0,url,repo,title,body,labels,user,repo_name,issue_number,label_bug,label_enhancement,label_question,label_undefined
0,https://github.com/F5Networks/f5-openstack-lba...,F5Networks/f5-openstack-lbaasv2-driver,test_l7policies_and_rules.py:testl7basicupdate...,title: test_l7policies_and_rules.py:testl7basi...,[undefined],F5Networks,f5-openstack-lbaasv2-driver,835,0.0,0.0,0.0,1.0
1,https://github.com/aspnet/Mvc/issues/6339,aspnet/Mvc,testing all controllers dependency injection,i'm writing integration tests for my applicati...,[question],aspnet,Mvc,6339,0.0,0.0,1.0,0.0
2,https://github.com/ionic-team/ionic-cli/issues...,ionic-team/ionic-cli,testing ionic4 - serve shows two displays,description: ionic serve shows two displa...,[bug],ionic-team,ionic-cli,3044,1.0,0.0,0.0,0.0
3,https://github.com/thefarwind/chip-8/issues/21,thefarwind/chip-8,tests are all broken,"when switching chip8 such that the audio, disp...",[bug],thefarwind,chip-8,21,1.0,0.0,0.0,0.0
4,https://github.com/n-sokolov/CoffeeShop/issues/1,n-sokolov/CoffeeShop,tests for paging,_ context _: paging mechanism must be tested...,[enhancement],n-sokolov,CoffeeShop,1,0.0,1.0,0.0,0.0


In [10]:
# Target matrix: one row per issue, one column per one-hot `label_*` column,
# in the order those columns appear in the frame.
labels = np.stack(
    [df[name] for name in df.columns if name.startswith('label_')],
    axis=1,
)

# Issue text fields as plain Python lists.
title = df['title'].tolist()
body = df['body'].tolist()

# **3. Summarize**

In [None]:
# Build a summarization pipeline. No model is named here, so whatever default
# the installed transformers release selects for this task is used.
summarizer = pipeline("summarization")

In [None]:
# Trial run on the first issue body only; the text is read back below via
# summary[0]['summary_text'].
summary = summarizer(body[0])

In [14]:
# Raw body of the first issue (the summarizer input), shown for comparison
# with the generated summary.
body[0]

'title: test_l7policies_and_rules.py:testl7basicupdate.test_policy_deployment_operand_match fails in setup error was encountered instead of active/onine   attachments:    details:         suggested issue type test bug        traceback       0 traceback  most recent call last :   1   file \\ /home-local/jenkins/workspace/openstack/driver/newton/11.5.4-undercloud-vxlan/f5lbaasdriver/test/tempest/tests/scenario/test_l7policies_and_rules.py\\ , line 305, in setup   2     super testl7basicupdate, self .setup     3   file \\ /home-local/jenkins/workspace/openstack/driver/newton/11.5.4-undercloud-vxlan/f5lbaasdriver/test/tempest/tests/scenario/test_l7policies_and_rules.py\\ , line 26, in setup   4     super testl7basic, self .setup     5   file \\ /home-local/jenkins/workspace/openstack/driver/newton/11.5.4-undercloud-vxlan/f5lbaasdriver/test/tempest/tests/scenario/f5_base.py\\ , line 33, in setup   6     self._create_load_balancer     7   file \\ /home-local/jenkins/workspace/openstack/drive

In [17]:
# Cost of summarizing the full dataset (author's measurements):
#   - 3 titles need summarization
#   - 973902 bodies need summarization
#   - each summary takes ~20 sec
# At that rate the bodies alone need 973902 * 20 s ~= 19.5 million seconds,
# i.e. roughly 225 days of sequential compute.
# NOTE(review): "need summarization" presumably means the text exceeds the
# model's maximum input length; confirm the criterion used.

summary[0]['summary_text']

' A bug was encountered with provisioning status: active and operating status: onli . It was encountered instead of active/onine . The bug is low-level but low-several, with a low-calient low bug severity . The issue was reported to openstack_driver_newton_11.5.4-undercloud-vxlan .'

# **4. Get the contextual embeddings**

In [18]:
# Embed the titles first, then the bodies, each from a freshly extracted list
# of the corresponding DataFrame column.
embeddings_title, embeddings_body = [
    lm.get_embeddings(df[column].values.tolist())
    for column in ('title', 'body')
]

In [25]:
# Output below: (100, 47, 768).
# NOTE(review): presumably (issues, tokens per title, hidden size); confirm
# against lm.get_embeddings.
embeddings_title.shape

TensorShape([100, 47, 768])

In [21]:
# Output below: (100, 512, 768).
# NOTE(review): the second dimension is presumably the token length after
# padding/truncation; confirm against lm.get_embeddings.
embeddings_body.shape

TensorShape([100, 512, 768])