In [1]:
# !pip3 install scikit-learn
# !pip3 install torch torchvision torchaudio
# !pip3 install simpletransformers

In [2]:
import pandas as pd
import sklearn 
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from functools import partial
import unicodedata as ud
import re
import os
import logging
pd.set_option('display.max_colwidth', None)
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
torch.multiprocessing.set_sharing_strategy('file_system')
cuda_available = torch.cuda.is_available()
print("Is cuda available?", cuda_available)

Is cuda available? True


# download data [optional]

In [3]:
# if not os.path.isfile("github-labels-top3-803k-train.csv"):
#     print('downloading data...')
#     !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-train.tar.gz" | tar -xz

# if not os.path.isfile("github-labels-top3-803k-test.csv"):
#     print('downloading data...')
#     !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-test.tar.gz" | tar -xz

# print('loading data...')
# train = pd.read_csv('github-labels-top3-803k-train.csv')
# test = pd.read_csv('github-labels-top3-803k-test.csv')
# print(train.shape, test.shape)

# load data

In [4]:
train = pd.read_csv('data/github-labels-top3-803k-train.csv')
test = pd.read_csv('data/github-labels-top3-803k-test.csv')
print(train.shape, test.shape)

(722899, 8) (80518, 8)


In [5]:
label= 'issue_label'
time = 'issue_created_at'
repo = 'repository_url'
title  = 'issue_title'
body = 'issue_body'
author = 'issue_author_association'
url = 'issue_url'
label_col = 'labels'
text_col = 'text'
max_title = 30
max_body = 170
punctuations = '!"$%&\()*,/:;<=>[\\]^`{|}~+#@-`'
issue_regex = re.compile(r'#[0-9]+')
function_regex = re.compile(r'[a-zA-Z][a-zA-Z0-9_.]*\([a-zA-Z0-9_, ]*\)')
ascii_regex = re.compile(r'[^\x00-\x7f]')

# first deduplicate the TRAINING dataset based on issue URls¶

In [6]:
dedup_train = train.sort_values(url).drop_duplicates(subset=[url]).copy()
print('Number of dropped issue duplications: ' , train.shape[0] - dedup_train.shape[0])

dedup_train[title] = dedup_train[title].astype(str)
dedup_train[body] = dedup_train[body].astype(str)
dedup_train[author] = dedup_train[author].astype(str)
dedup_train[time] = dedup_train[time].astype(str)
dedup_train[repo] = dedup_train[repo].astype(str)

test[title] = test[title].astype(str)
test[body] = test[body].astype(str)
test[author] = test[author].astype(str)
test[time] = test[time].astype(str)
test[repo] = test[repo].astype(str)

Number of dropped issue duplications:  26220


# normalize text

In [7]:
print('Replacing functions...')
dedup_train[body] = dedup_train[body].apply(lambda x:function_regex.sub(" function ",x))
test[body] = test[body].apply(lambda x:function_regex.sub(" function ",x))
 
print('Replacing issue numbers...')
dedup_train[title] = dedup_train[title].apply(lambda x:issue_regex.sub(" issue ",x))
dedup_train[body] = dedup_train[body].apply(lambda x:issue_regex.sub(" issue ",x))
test[title] = test[title].apply(lambda x:issue_regex.sub(" issue ",x))
test[body] = test[body].apply(lambda x:issue_regex.sub(" issue ",x))

print('Converting to lower case...')
dedup_train[title] = dedup_train[title].str.lower()
dedup_train[body] = dedup_train[body].str.lower()
test[title] = test[title].str.lower()
test[body] = test[body].str.lower()

Replacing functions...
Replacing issue numbers...
Converting to lower case...


# remove extra information

In [8]:
print('Removing punctuations...')
replace_string = ' '*len(punctuations)
dedup_train[title] = dedup_train[title].str.translate(str.maketrans(punctuations, replace_string))
dedup_train[body] = dedup_train[body].str.translate(str.maketrans(punctuations, replace_string))
test[title] = test[title].str.translate(str.maketrans(punctuations, replace_string))
test[body] = test[body].str.translate(str.maketrans(punctuations, replace_string))

print('Removing non-ascii charachters...')
dedup_train[title] = dedup_train[title].apply(lambda x:re.sub(ascii_regex, '', x))
dedup_train[title] = dedup_train[title].apply(lambda x:ud.normalize('NFD', x))
dedup_train[body] = dedup_train[body].apply(lambda x:re.sub(ascii_regex, '', x))
dedup_train[body] = dedup_train[body].apply(lambda x:ud.normalize('NFD', x))

test[title] = test[title].apply(lambda x:re.sub(ascii_regex, '', x))
test[title] = test[title].apply(lambda x:ud.normalize('NFD', x))
test[body] = test[body].apply(lambda x:re.sub(ascii_regex, '', x))
test[body] = test[body].apply(lambda x:ud.normalize('NFD', x))

print('Replacing fixed part of repo URl column...')
dedup_train[repo] = dedup_train[repo].apply(lambda x: x.replace('https://api.github.com/repos/', ''))
test[repo] = test[repo].apply(lambda x: x.replace('https://api.github.com/repos/', ''))

print('Replacing white spaces...')
dedup_train[title] = dedup_train[title].apply(lambda x:" ".join(x.split()))
dedup_train[body] = dedup_train[body].apply(lambda x:" ".join(x.split()))
test[title] = test[title].apply(lambda x:" ".join(x.split()))
test[body] = test[body].apply(lambda x:" ".join(x.split()))

Removing punctuations...
Removing non-ascii charachters...
Replacing fixed part of repo URl column...
Replacing white spaces...


# truncate columns

In [9]:
dedup_train[title] = dedup_train[title].apply(lambda x: ' '.join(x.split(maxsplit=max_title)[:max_title]))
dedup_train[body] = dedup_train[body].apply(lambda x: ' '.join(x.split(maxsplit=max_body)[:max_body]))
test[title] = test[title].apply(lambda x: ' '.join(x.split(maxsplit=max_title)[:max_title]))
test[body] = test[body].apply(lambda x: ' '.join(x.split(maxsplit=max_body)[:max_body]))

# prepare label column for the model

In [10]:
dedup_train[label] = pd.Categorical(dedup_train[label])
test[label] = pd.Categorical(test[label])
dedup_train[label_col] = dedup_train[label].cat.codes
test[label_col] = test[label].cat.codes

# concat issue columns in one "text" column to feed the model

In [11]:
dedup_train[text_col] = 'time ' + dedup_train[time] + ' author ' + dedup_train[author] +' repo ' + dedup_train[repo] + ' title ' + dedup_train[title] + ' body ' + dedup_train[body]
test[text_col] = 'time ' + test[time] + ' author ' + test[author] +' repo ' + test[repo] + ' title ' + test[title] + ' body ' + test[body]

# save the data [optional]

In [12]:
dedup_train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

dedup_train[[text_col, label_col]].to_csv(f'train_clean_concat_{max_title + max_body}.csv', index = False)
test[[text_col, label_col]].to_csv(f'test_clean_concat_{max_title + max_body}.csv', index = False)
del train, dedup_train, test

# load data [optional]

In [13]:
train = pd.read_csv(f'train_clean_concat_{max_title + max_body}.csv')
test = pd.read_csv(f'test_clean_concat_{max_title + max_body}.csv')
print('number of issues: ', train.shape,  test.shape)

number of issues:  (696679, 2) (80518, 2)


# define model parameters and function

In [14]:
lr = 3e-5
epochs = 4
batch = 100
max_seq = 200
name = 'roberta'
ver = 'roberta-base'
output_name = 'output/' + name

def create_model(name, ver, lr, epochs, batch, max_seq):
    
    model_args = ClassificationArgs()
    model_name = name
    model_version = ver
    model_args.learning_rate = lr
    model_args.num_train_epochs = epochs
    model_args.eval_batch_size = batch
    model_args.train_batch_size = batch
    model_args.max_seq_length = max_seq
    model_args.n_gpu = 2
    model_args.no_cache = True
    model_args.output_dir = output_name +'/'
    model_args.overwrite_output_dir = True
    model_args.reprocess_input_data = True
    model_args.preprocess_inputs = True
    model_args.save_steps = -1
    model_args.save_model_every_epoch = False
    
    model = ClassificationModel(model_name, 
                                model_version, 
                                args = model_args, 
                                num_labels = 3, 
                                use_cuda=cuda_available)
 
    return model

# define evaluation setup

In [15]:
def calc(p1, p2, func, **kwargs):
    return func(p1, p2, **kwargs)

metrics_recom = {
    "accuracy": partial(calc,func=sklearn.metrics.accuracy_score) ,
    "p_micro": partial(calc,func=sklearn.metrics.precision_score,average='micro'),
    "r_micro": partial(calc,func=sklearn.metrics.recall_score,average='micro'),
    "f_micro": partial(calc,func=sklearn.metrics.f1_score,average='micro'),
    "classificationReport": partial(calc,func=sklearn.metrics.classification_report, output_dict=True)}

# train the model

In [17]:
model = create_model(name, ver, lr, epochs, batch, max_seq)
model.train_model(train_df = train, **metrics_recom)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/696679 [00:00<?, ?it/s]



Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/6967 [00:00<?, ?it/s]



Running Epoch 1 of 4:   0%|          | 0/6967 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/6967 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/6967 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to output/roberta/.


(27868, 0.32264242434850915)

# evaluate the model

In [18]:
results, model_outputs, wrong_pred = model.eval_model(test, verbose=True, **metrics_recom)
results

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/80518 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/806 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.77534492998308, 'accuracy': 0.8719292580540997, 'p_micro': 0.8719292580540997, 'r_micro': 0.8719292580540997, 'f_micro': 0.8719292580540997, 'classificationReport': {'0.0': {'precision': 0.8925303938250598, 'recall': 0.8983568308181096, 'f1-score': 0.8954341345142814, 'support': 40288}, '1.0': {'precision': 0.877667865707434, 'recall': 0.8818179080203596, 'f1-score': 0.8797379925784595, 'support': 33203}, '2.0': {'precision': 0.7165127894657182, 'recall': 0.6736872064892557, 'f1-score': 0.6944403696640751, 'support': 7027}, 'accuracy': 0.8719292580540997, 'macro avg': {'precision': 0.8289036829994041, 'recall': 0.8179539817759083, 'f1-score': 0.8232041655856053, 'support': 80518}, 'weighted avg': {'precision': 0.8710400907012537, 'recall': 0.8719292580540997, 'f1-score': 0.8714203464631934, 'support': 80518}}, 'eval_loss': 0.3615677583565191}


{'mcc': 0.77534492998308,
 'accuracy': 0.8719292580540997,
 'p_micro': 0.8719292580540997,
 'r_micro': 0.8719292580540997,
 'f_micro': 0.8719292580540997,
 'classificationReport': {'0.0': {'precision': 0.8925303938250598,
   'recall': 0.8983568308181096,
   'f1-score': 0.8954341345142814,
   'support': 40288},
  '1.0': {'precision': 0.877667865707434,
   'recall': 0.8818179080203596,
   'f1-score': 0.8797379925784595,
   'support': 33203},
  '2.0': {'precision': 0.7165127894657182,
   'recall': 0.6736872064892557,
   'f1-score': 0.6944403696640751,
   'support': 7027},
  'accuracy': 0.8719292580540997,
  'macro avg': {'precision': 0.8289036829994041,
   'recall': 0.8179539817759083,
   'f1-score': 0.8232041655856053,
   'support': 80518},
  'weighted avg': {'precision': 0.8710400907012537,
   'recall': 0.8719292580540997,
   'f1-score': 0.8714203464631934,
   'support': 80518}},
 'eval_loss': 0.3615677583565191}

# save the results

In [20]:
with open(output_name + '/fullreport.txt','w') as f:
        f.write(str(results))