In [1]:
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from multiprocessing.pool import ThreadPool as Pool
import pandas as pd
import nltk

nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mansmooth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## (1) Bug prediction (pg 20-28)


In [2]:
def initialize_worker(_stopwords, ps):
    global stemmer
    stemmer = ps
    global stopword_set
    stopword_set = _stopwords

def preprocess(text):
    cleaned_text = text.translate(str.maketrans('', '', '!"#$%&\'()*+,.<=>?@[]^`{|}~' + u'\xa0'))
    cleaned_text = cleaned_text.lower()
    cleaned_text = cleaned_text.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace), ''))
    cleaned_text = ' '.join(['_variable_with_underscore' if '_' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_variable_with_dash' if '-' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_long_variable_name' if len(t) > 15 and t[0] != '#' else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_weburl' if t.startswith('http') and '/' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number' if re.sub('[\\/;:_-]', '', t).isdigit() else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_variable_with_address' if re.match('.*0x[0-9a-f].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_name_with_number' if re.match('.*[a-f]*:[0-9]*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number_starts_with_one_character' if re.match('[a-f][0-9].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number_starts_with_three_characters' if re.match('[a-f]{3}[0-9].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_version' if any(i.isdigit() for i in t) and t.startswith('v') else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_localpath' if ('\\' in t or '/' in t) and ':' not in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_image_size' if t.endswith('px') else t for t in cleaned_text.split()])
    tokenized_text = word_tokenize(cleaned_text)

    sw_removed_text = [word for word in tokenized_text if word not in stopword_set]
    sw_removed_text = [word for word in sw_removed_text if len(word) > 2]
    stemmed_text = ' '.join([stemmer.stem(w) for w in sw_removed_text])

    return stemmed_text


In [3]:
dataset = pd.read_json('resource/embold_train.json')
dataset.loc[dataset['label'] > 0, 'label'] = -1
dataset.loc[dataset['label'] == 0, 'label'] = 1
dataset.loc[dataset['label'] == -1, 'label'] = 0
stopwords_set = set(stopwords.words('english'))
ps = PorterStemmer()
pool = Pool(8, initializer=initialize_worker, initargs=(stopwords_set, ps, ))

cleaned_title = pool.map(preprocess, dataset.title)
cleaned_body = pool.map(preprocess, dataset.body)


In [4]:
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer


In [5]:
data_texts = pd.DataFrame([cleaned_title, cleaned_body], index=['title','body']).T
y = dataset['label']

data_fit, data_blindtest, y_fit, y_blindtest = model_selection.train_test_split(data_texts.apply(lambda x: ' '.join([x["title"], x["body"]]), axis=1), y, test_size=0.1, stratify=y)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
tfidf_vectorizer.fit(cleaned_title + cleaned_body)
X_tfidf_fit = tfidf_vectorizer.transform(data_fit)
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest)


In [8]:
import lightgbm as lgb


In [9]:
gbm_model = lgb.LGBMClassifier(boosting_type="dart", num_leaves=63, verbosity=1)
scoring = {
    'precision': 'precision_macro',
    'recall': 'recall_macro',
    'f1': 'f1_macro'
}
scores = model_selection.cross_validate(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=1, scoring=scoring, return_train_score=True)

print('CV: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(scores["train_precision"].mean(), scores["train_recall"].mean(), scores["train_f1"].mean()))


[LightGBM] [Info] Number of positive: 48116, number of negative: 59884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.976653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 372932
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 5323
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445519 -> initscore=-0.218795
[LightGBM] [Info] Start training from score -0.218795
[LightGBM] [Info] Number of positive: 48115, number of negative: 59885
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.012230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 372733
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 5306
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445509 -> initscore=-0.218832
[LightGBM] [Info] Start training from score -0.218832
[L

In [9]:
from sklearn import metrics


In [11]:
data_fit_train, data_fit_test, y_fit_train, y_fit_test = model_selection.train_test_split(data_fit, y_fit, test_size=0.3, stratify=y_fit)

X_tfidf_fit_train = tfidf_vectorizer.transform(data_fit_train)
X_tfidf_fit_test = tfidf_vectorizer.transform(data_fit_test)
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest)

gbm_model.fit(X_tfidf_fit_train, y_fit_train, eval_set=[(X_tfidf_fit_test, y_fit_test)], eval_metric='AUC')

X_blindtest = gbm_model.predict(X_tfidf_blindtest)
precision_test_score = metrics.precision_score(X_blindtest, y_blindtest, average='macro')
recall_test_score = metrics.recall_score(X_blindtest, y_blindtest, average='macro')
f1_test_score = metrics.f1_score(X_blindtest, y_blindtest, average='macro')

print('test: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, f1_test_score))


[LightGBM] [Info] Number of positive: 42101, number of negative: 52399
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.823405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 345174
[LightGBM] [Info] Number of data points in the train set: 94500, number of used features: 4939
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445513 -> initscore=-0.218816
[LightGBM] [Info] Start training from score -0.218816
test: p:0.7807 r:0.7821 f:0.7814


In [10]:
import pickle


In [28]:
pickle.dump(tfidf_vectorizer, open('resource/github_bug_prediction_tfidf_vectorizer.pkl', 'wb'))
pickle.dump(gbm_model, open('resource/github_bug_prediction_basic_model.pkl', 'wb'))


In [29]:
import requests
import json
res = requests.get("http://localhost:5000/predict_basic?title=download fail fail fail fail&body=download failed failed download").text
json.loads(res)


{'bug_prob': 0.7695167191797154, 'predict_as': 'bug', 'status': 'success'}

In [30]:
res = requests.get("http://localhost:5000/predict_basic?title=downloading failed&body=failed 404").text
json.loads(res)


{'bug_prob': 0.7770326760033179, 'predict_as': 'bug', 'status': 'success'}

## (2) Topic modelling (pg 39-44)


In [11]:
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD


In [12]:
lsa = TruncatedSVD(n_components=100, n_iter=10, random_state=0)
X_lsa_fit = lsa.fit_transform(X_tfidf_fit)
gbm_model_with_lsa = lgb.LGBMClassifier(boosting_type="dart", num_leaves=63, verbosity=1)


In [13]:
scoring = {
    'precision': 'precision_macro',
    'recall': 'recall_macro',
    'f1': 'f1_macro'
}
scores = model_selection.cross_validate(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=1, scoring=scoring, return_train_score=True)

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(scores["train_precision"].mean(), scores["train_recall"].mean(), scores["train_f1"].mean()))


[LightGBM] [Info] Number of positive: 48116, number of negative: 59884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445519 -> initscore=-0.218795
[LightGBM] [Info] Start training from score -0.218795
[LightGBM] [Info] Number of positive: 48115, number of negative: 59885
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445509 -> initscore=-0.218832
[LightGBM] [Info] Start training from score -0.218832
[Light

In [14]:
X_fit_with_lsa = hstack([X_tfidf_fit, X_lsa_fit]).tocsr()
scores = model_selection.cross_validate(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=1, scoring=scoring, return_train_score=True)

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(scores["train_precision"].mean(), scores["train_recall"].mean(), scores["train_f1"].mean()))



[LightGBM] [Info] Number of positive: 48116, number of negative: 59884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.213167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 397386
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 5394
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445519 -> initscore=-0.218795
[LightGBM] [Info] Start training from score -0.218795
[LightGBM] [Info] Number of positive: 48115, number of negative: 59885
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.238721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 398096
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 5403
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445509 -> initscore=-0.218832
[LightGBM] [Info] Start training from score -0.218832
[L

In [17]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


In [18]:
count_vectorizer = CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(cleaned_title + cleaned_body)
X_tf_fit = count_vectorizer.transform(data_fit)
X_tf_blindtest = count_vectorizer.transform(data_blindtest)
lda = LatentDirichletAllocation(n_components=100, random_state=0)

X_lda_fit = lda.fit_transform(X_tf_fit)
gbm_model_with_lda = lgb.LGBMClassifier(boosting_type="dart", num_leaves=63, verbosity=1)


In [19]:
scoring = {
    'precision': 'precision_macro',
    'recall': 'recall_macro',
    'f1': 'f1_macro'
}
scores = model_selection.cross_validate(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=1, scoring=scoring, return_train_score=True)

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(scores["train_precision"].mean(), scores["train_recall"].mean(), scores["train_f1"].mean()))


[LightGBM] [Info] Number of positive: 48116, number of negative: 59884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445519 -> initscore=-0.218795
[LightGBM] [Info] Start training from score -0.218795
[LightGBM] [Info] Number of positive: 48115, number of negative: 59885
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445509 -> initscore=-0.218832
[LightGBM] [Info] Start training from score -0.218832
[Light

In [20]:
X_fit_with_lda = hstack([X_tfidf_fit, X_lda_fit]).tocsr()
scores = model_selection.cross_validate(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=1, scoring=scoring, return_train_score=True)

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(scores["train_precision"].mean(), scores["train_recall"].mean(), scores["train_f1"].mean()))


[LightGBM] [Info] Number of positive: 48116, number of negative: 59884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.175229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 397386
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 5394
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445519 -> initscore=-0.218795
[LightGBM] [Info] Start training from score -0.218795
[LightGBM] [Info] Number of positive: 48115, number of negative: 59885
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.111096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 398096
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 5403
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445509 -> initscore=-0.218832
[LightGBM] [Info] Start training from score -0.218832
[L