In [2]:
import pandas as pd
from sklearn import model_selection, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import optuna
import lightgbm as lgb
import numpy as np
import joblib
from scipy.sparse import hstack
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from pandarallel import pandarallel
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from multiprocessing.pool import ThreadPool as Pool
import nltk
import optuna

pandarallel.initialize(nb_workers=8)

nltk.download("stopwords")
nltk.download("punkt")


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package stopwords to /home/amogus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/amogus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def initialize_worker(_stopwords, ps):
    global stemmer
    stemmer = ps
    global stopword_set
    stopword_set = _stopwords


def preprocess(text):
    cleaned_text = text.translate(
        str.maketrans(
            "!\#$%&'()*+,<=>?@[]^`{|}~\xa0",
            " " * 26,
        )
    )
    cleaned_text = cleaned_text.lower()
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    cleaned_text = re.sub(r"\b\w*_\w+\b", "_term_with_underscore", cleaned_text)
    cleaned_text = re.sub(r"\b\w*-\w+\b", "_term_with_dash", cleaned_text)
    cleaned_text = re.sub(
        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)",
        "_weburl",
        cleaned_text,
    )
    cleaned_text = re.sub(r"\b[\w\\/]+?[\\/][\w\d_-]+?\b", "_path", cleaned_text)
    cleaned_text = re.sub(r"\b0[xb]?[a-f0-9]+\b", "_non_decimal_number", cleaned_text)
    cleaned_text = re.sub(
        r"\b\d+(px)?\s?[x*]\s?\d+(px)?\b", "_resolution", cleaned_text
    )
    cleaned_text = re.sub(r"\bv\d+(?:\.\d+)?\b", "_version", cleaned_text)
    cleaned_text = re.sub(
        r"\b\d+(?:\.\d+)?\b",
        "_number",
        cleaned_text,
    )
    cleaned_text = re.sub(r"\b(?!_)\w{15,}\b", "_very_long_term", cleaned_text)

    tokenized_text = word_tokenize(cleaned_text)

    sw_removed_text = [word for word in tokenized_text if word not in stopword_set]
    sw_removed_text = [word for word in sw_removed_text if len(word) > 1]
    stemmed_text = " ".join([stemmer.stem(w) for w in sw_removed_text])
    return stemmed_text


In [102]:
dataset = pd.read_json('resources/embold_train.json')
dataset.loc[dataset['label'] > 0, 'label'] = -1
dataset.loc[dataset['label'] == 0, 'label'] = 1
dataset.loc[dataset['label'] == -1, 'label'] = 0
stopwords_set = set(stopwords.words('english'))
ps = PorterStemmer()
pool = Pool(8, initializer=initialize_worker, initargs=(stopwords_set, ps, ))

cleaned_title = pool.map(preprocess, dataset.title)
cleaned_body = pool.map(preprocess, dataset.body)


In [109]:
y = dataset['label']
data_texts = pd.concat([pd.DataFrame([cleaned_title, cleaned_body], index=['title','body']).T, y.to_frame("label")], axis=1)
data_texts


Unnamed: 0,title,body,label
0,_term_with_dash piano roll,_term_with_dash piano roll would use,0
1,buggi behavior select,screenshot _term_with_dash_term_with_dash _num...,1
2,auto updat featur,hi great job far saenzramiro auto updat featur...,0
3,filter noisi endpoint log,think stop log request _term_with_underscor _t...,0
4,enabl pid pid alarm action _path,expect behavior alarm action pid pid enabl dis...,1
...,...,...,...
149995,suggest _very_long_term return random option,-- note anyth within bracket hidden preview is...,0
149996,decod display neurovault imag incorrectli,note nicholst neurovault imag display properli...,1
149997,parser return error except,raml yaml raml _number titl test baseuri name ...,1
149998,errorexcept array string convers php artisan m...,see occur branch rout,1


In [3]:
data_texts = pd.read_parquet("resources/07/data_texts.parquet")
texts = data_texts[["title","body"]].agg(" ".join, axis=1)

data_fit, data_blindtest, y_fit, y_blindtest = model_selection.train_test_split(texts, data_texts["label"], test_size=0.2, stratify=data_texts["label"], random_state=0)


In [4]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
try:
    tfidf_vectorizer = joblib.load("resources/07/tfidf_model.joblib")
except FileNotFoundError:
    tfidf_vectorizer.fit(texts)
    joblib.dump(tfidf_vectorizer, "resources/07/tfidf_model.joblib")
X_tfidf = tfidf_vectorizer.transform(texts)
X_tfidf_fit = tfidf_vectorizer.transform(data_fit)
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest)


In [5]:
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
try:
    count_vectorizer = joblib.load("resources/07/count_vectorizer.joblib")
except FileNotFoundError:
    count_vectorizer.fit(texts)
    joblib.dump(count_vectorizer, "resources/07/count_vectorizer.joblib")
X_count = count_vectorizer.transform(texts)
X_count_fit = count_vectorizer.transform(data_fit)
X_count_blindtest = count_vectorizer.transform(data_blindtest)


In [5]:
data_texts.to_parquet("resources/07/data_texts.parquet")


In [6]:
lsa = TruncatedSVD(n_components=1000, random_state=0, n_iter=50)
try:
    lsa = joblib.load("resources/07/lsa_model.joblib")
except FileNotFoundError:
    lsa.fit(X_tfidf)
    joblib.dump(lsa, "resources/07/lsa_model.joblib")


In [7]:
X_lsa_fit = lsa.transform(X_tfidf_fit)
X_lsa_blindtest = lsa.transform(X_tfidf_blindtest)


In [8]:
lda = LatentDirichletAllocation(n_components=1000, max_iter=10, random_state=0, verbose=1)
try:
    lda = joblib.load("resources/07/lda_model.joblib")
except FileNotFoundError:
    lda.fit(X_tfidf)
    joblib.dump(lda, "resources/07/lda_model.joblib")


In [9]:
X_lda_fit = lsa.transform(X_count_fit)
X_lda_blindtest = lsa.transform(X_count_blindtest)


In [10]:
X_fit_with_lsa_lda = hstack([X_tfidf_fit, X_lsa_fit, X_lda_fit]).tocsr()
X_blindtest_with_lsa_lda = hstack(
    [X_tfidf_blindtest, X_lsa_blindtest, X_lda_blindtest]
).tocsr()


In [13]:
del count_vectorizer
del tfidf_vectorizer
del lda
del lsa
del data_texts
del texts
del X_tfidf_fit, X_lsa_fit, X_lda_fit
del X_tfidf_blindtest, X_lsa_blindtest, X_lda_blindtest


In [15]:
def objective_lda(trial):
    param = {
        "objective": "binary",
        "boosting_type": trial.suggest_categorical(
            "boosting_type", ["gbdt", "dart"]
        ),
        "reg_alpha ": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda ": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "random_state": 0,
        "verbose": -1,
    }
    model = lgb.LGBMClassifier(**param)
    model.fit(X_fit_with_lsa_lda, y_fit)
    preds = model.predict(X_blindtest_with_lsa_lda)
    pred_labels = np.rint(preds)
    accuracy = metrics.roc_auc_score(y_blindtest, pred_labels)
    return accuracy


study_lda = optuna.create_study(direction="maximize")
study_lda.optimize(objective_lda, n_trials= 16)
joblib.dump(study_lda, "resources/07/study_lsa_lda.pkl")


[I 2024-03-10 08:18:50,335] A new study created in memory with name: no-name-9a14f0b2-ae55-49b5-9685-d15510e931a1


[I 2024-03-10 08:19:52,576] Trial 0 finished with value: 0.5 and parameters: {'boosting_type': 'dart', 'reg_alpha': 0.02470166041773878, 'reg_lambda': 9.679007242827921e-08, 'num_leaves': 26, 'learning_rate': 1.3336632801709667e-05, 'n_estimators': 104, 'min_child_samples': 57}. Best is trial 0 with value: 0.5.
[I 2024-03-10 08:28:08,881] Trial 1 finished with value: 0.8013786126299902 and parameters: {'boosting_type': 'gbdt', 'reg_alpha': 0.000148994657042437, 'reg_lambda': 4.266339225586519e-06, 'num_leaves': 126, 'learning_rate': 0.2616043238826958, 'n_estimators': 498, 'min_child_samples': 49}. Best is trial 1 with value: 0.8013786126299902.
[I 2024-03-10 08:32:05,222] Trial 2 finished with value: 0.8046624951043345 and parameters: {'boosting_type': 'gbdt', 'reg_alpha': 1.0966971432764787e-06, 'reg_lambda': 2.8840338865928714e-05, 'num_leaves': 133, 'learning_rate': 0.11111486817720409, 'n_estimators': 203, 'min_child_samples': 36}. Best is trial 2 with value: 0.8046624951043345.
[

['resources/07/study_lsa_lda.pkl']

In [16]:
study_lda.optimize(objective_lda, n_trials= 16)


[I 2024-03-10 10:20:59,970] Trial 16 finished with value: 0.8081622085329129 and parameters: {'boosting_type': 'gbdt', 'reg_alpha': 8.282486528834159e-06, 'reg_lambda': 3.953884755541324, 'num_leaves': 64, 'learning_rate': 0.023005216828739027, 'n_estimators': 871, 'min_child_samples': 73}. Best is trial 15 with value: 0.8087579195083453.
[I 2024-03-10 10:30:46,831] Trial 17 finished with value: 0.7478342520823233 and parameters: {'boosting_type': 'gbdt', 'reg_alpha': 8.906977080104934e-06, 'reg_lambda': 3.209539542862226, 'num_leaves': 50, 'learning_rate': 0.0012202742785845336, 'n_estimators': 975, 'min_child_samples': 70}. Best is trial 15 with value: 0.8087579195083453.
[I 2024-03-10 10:38:25,394] Trial 18 finished with value: 0.7845265600600737 and parameters: {'boosting_type': 'gbdt', 'reg_alpha': 0.0081972685581827, 'reg_lambda': 0.3439467909583704, 'num_leaves': 54, 'learning_rate': 0.9938281401412226, 'n_estimators': 936, 'min_child_samples': 100}. Best is trial 15 with value:

In [11]:
study_lda = joblib.load("resources/07/study_lsa_lda.pkl")
study_lda


<optuna.study.study.Study at 0x7fe85c62eef0>

In [12]:
gbm_model = lgb.LGBMClassifier(**study_lda.best_params)
gbm_model.fit(
    X_fit_with_lsa_lda,
    y_fit,
    eval_set=[(X_blindtest_with_lsa_lda, y_blindtest)],
    eval_metric="AUC",
)
joblib.dump(gbm_model, "resources/07/gbm_model.joblib")


[LightGBM] [Info] Number of positive: 53462, number of negative: 66538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.537915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 924173
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 5595
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445517 -> initscore=-0.218802
[LightGBM] [Info] Start training from score -0.218802


['resources/07/gbm_model.joblib']

## Final Accuracy

In [13]:
preds = gbm_model.predict(X_blindtest_with_lsa_lda)
score = metrics.roc_auc_score(y_blindtest, preds)
score


0.8105475824942813

## Flask Application Sample

In [16]:
import requests

data_texts = pd.read_parquet("resources/07/data_texts.parquet").sample(50)


In [40]:
def process_predict(s):
    predict = pd.json_normalize(requests.get(rf"http://127.0.0.1:5000/predict?title={s.title}&body={s.body}").json())[["predict_as", "bug_prob"]].iloc[0]
    s["predict_as"] = predict.predict_as
    s["bug_prob"] = predict.bug_prob
    return s


data_texts.apply(
    process_predict,
    axis=1,
)


Unnamed: 0,title,body,label,predict_as,bug_prob
95140,implement new styleguid,new styleguid found _weburl one creat account ...,0,not bug,0.071387
142649,add _very_long_term gift event,new gifte creat,0,not bug,0.104231
92559,nativ proxi enabl use node -- _term_with_dash ...,environ instal apk _term_with_dash.apk _term_w...,0,bug,0.783775
30484,report invalid version,modul version meta file report meta error per ...,0,bug,0.852607
119538,code coverag report ui,would help develop could see test coverag repo...,0,bug,0.508291
114208,endless hang get job detail,seem happen job _term_with_dash mayb render ws...,1,bug,0.697286
88977,reset graph data updat handl,present remov contest rate graph data need del...,0,bug,0.516365
42211,exist signal metadata file format,issu dedic referenc alreadi exist metadata fil...,0,not bug,0.258212
107949,treemenu indic expand beyond scroll view,_number go _weburl _number open treemenu _numb...,0,not bug,0.367572
96552,download modal warn larg download,download give warn gener file take certain amo...,0,bug,0.520923
