## App prediction server
I use a notebook (ipynb) because I run on a remote kernel and it would be a pain otherwise: the LDA model alone is 1.9 GB.

In [25]:
%pip install flask nltk


Note: you may need to restart the kernel to use updated packages.


In [26]:
from flask import Flask, request
from scipy.sparse import hstack
import pickle
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import joblib

# NLTK data needed at request time: the stopword lists read by
# stopwords.words() and the Punkt tokenizer data used by word_tokenize.
nltk.download("stopwords")
# Newer NLTK releases load Punkt from the "punkt_tab" package; the legacy
# "punkt" package is still fetched below for older releases.
nltk.download("punkt_tab")
nltk.download("punkt")


[nltk_data] Downloading package stopwords to /home/amogus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/amogus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
def preprocess(text, stopword_set, stemmer):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: strip a fixed punctuation set (plus the non-breaking
    space), lowercase, turn every whitespace character into a plain space,
    substitute placeholder words for "noisy" tokens, tokenize with
    ``word_tokenize``, drop stopwords and tokens of two characters or fewer,
    and finally stem what is left.

    The placeholder rules are applied one after another over the whole text,
    each rule seeing the output of the previous one, so a token (or an
    earlier placeholder) can be rewritten more than once.

    Args:
        text: Raw input string.
        stopword_set: Container of tokens to discard (membership-tested).
        stemmer: Object exposing ``stem(word)``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    strip_table = str.maketrans('', '', '!"#$%&\'()*+,.<=>?@[]^`{|}~' + u'\xa0')
    blank_table = str.maketrans(string.whitespace, ' ' * len(string.whitespace), '')
    normalized = text.translate(strip_table).lower().translate(blank_table)

    # (predicate, placeholder) pairs; order matters.
    placeholder_rules = (
        (lambda tok: '_' in tok, '_variable_with_underscore'),
        (lambda tok: '-' in tok, '_variable_with_dash'),
        # NOTE(review): both placeholders above are longer than 15 characters,
        # so this rule rewrites them to '_long_variable_name' straight away.
        # '#' has already been stripped, so the first-character check never
        # excludes anything. Kept as-is: presumably the fitted models expect
        # exactly this output -- confirm against the training pipeline.
        (lambda tok: len(tok) > 15 and tok[0] != '#', '_long_variable_name'),
        (lambda tok: tok.startswith('http') and '/' in tok, '_weburl'),
        (lambda tok: re.sub('[\\/;:_-]', '', tok).isdigit(), '_number'),
        # NOTE(review): this placeholder contains spaces, so the next split
        # turns it into two tokens ('_variable_' and 'with_address'). Looks
        # like a typo, but changing it would alter the features -- confirm.
        (lambda tok: re.match('.*0x[0-9a-f].*', tok), '_variable_    with_address'),
        (lambda tok: re.match('.*[a-f]*:[0-9]*', tok), '_name_with_number'),
        (lambda tok: re.match('[a-f][0-9].*', tok), '_number_starts_with_one_character'),
        (lambda tok: re.match('[a-f]{3}[0-9].*', tok), '_number_starts_with_three_characters'),
        (lambda tok: any(ch.isdigit() for ch in tok) and tok.startswith('v'), '_version'),
        (lambda tok: ('\\' in tok or '/' in tok) and ':' not in tok, '_localpath'),
        (lambda tok: tok.endswith('px'), '_image_size'),
    )
    for is_match, placeholder in placeholder_rules:
        normalized = ' '.join(
            placeholder if is_match(tok) else tok for tok in normalized.split()
        )

    # Stopword and minimum-length filtering happen before stemming.
    kept_tokens = [
        tok for tok in word_tokenize(normalized)
        if tok not in stopword_set and len(tok) > 2
    ]
    return ' '.join(stemmer.stem(tok) for tok in kept_tokens)


app = Flask(__name__)

# Serialized artefacts, keyed by the attribute name they are attached to on
# ``app``; they are loaded once at start-up, in this order.
_ARTIFACT_PATHS = {
    'tfidf_vectorizer': 'resources/tfidf_model.joblib',
    'count_vectorizer': 'resources/count_model.joblib',
    'lda': 'resources/lda_model.joblib',
    'lgbm': 'resources/gbm_model_lda.joblib',
}
for _attr_name, _artifact_path in _ARTIFACT_PATHS.items():
    setattr(app, _attr_name, joblib.load(_artifact_path))

# Text-preprocessing helpers shared by every request.
# NOTE(review): stopwords.words() is called without a language, so the set is
# presumably not limited to English -- confirm this matches training.
app.stopword_set = set(stopwords.words())
app.stemmer = PorterStemmer()

@app.route('/predict', methods=['GET'])
def predict_basic():
    """Classify the submitted title/body text as 'bug' or 'not bug'.

    Query parameters:
        title: Text; required (may be empty).
        body: Text; required (may be empty).

    Returns:
        On success, a JSON object with ``status`` ('success'), ``predict_as``
        ('bug' when the second-column probability from ``predict_proba``
        exceeds 0.5, otherwise 'not bug') and ``bug_prob`` (that probability).
        When a required parameter is absent, a JSON object with ``status``
        'error' and a ``message``, with HTTP status 400.
    """
    title = request.args.get('title')
    body = request.args.get('body')

    # Reject incomplete requests with a 400 instead of failing on a missing
    # key, which would surface as an opaque 500.
    missing = [name for name, value in (('title', title), ('body', body)) if value is None]
    if missing:
        return {
            'status': 'error',
            'message': 'missing required query parameter(s): ' + ', '.join(missing),
        }, 400

    # Preprocess once: both vectorizers consume the same cleaned text.
    cleaned = preprocess(' '.join([title, body]), app.stopword_set, app.stemmer)
    count = app.count_vectorizer.transform([cleaned])
    tf_idf = app.tfidf_vectorizer.transform([cleaned])

    # The classifier takes the TF-IDF features with the LDA output appended.
    lda = app.lda.transform(count)
    predict = app.lgbm.predict_proba(hstack([tf_idf, lda]))
    print(f"{predict=}")

    # Plain float so JSON serialisation does not depend on the numpy scalar type.
    bug_prob = float(predict[0][1])
    return {
        'status': 'success',
        'predict_as': 'bug' if bug_prob > 0.5 else 'not bug',
        'bug_prob': bug_prob,
    }


In [30]:
# Start Flask's built-in server with debug mode off; this call keeps the cell
# running (serving requests) until it is interrupted.
app.run(debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [09/Mar/2024 10:04:54] "GET /predict?title=cannot%20download%20file%20error%20404&body=can't%20download%20cant%20download%20error%20error HTTP/1.1" 200 -


predict=array([[0.35926334, 0.64073666]])


127.0.0.1 - - [09/Mar/2024 10:04:58] "GET /predict?title=hello%20students&body=my%20name%20is%20katpark HTTP/1.1" 200 -


predict=array([[0.7255712, 0.2744288]])


127.0.0.1 - - [09/Mar/2024 10:05:11] "GET /predict?title=cannot%20download%20file%20error%20404&body=can't%20download%20cant%20download%20error%20error%20bug HTTP/1.1" 200 -


predict=array([[0.12632423, 0.87367577]])


127.0.0.1 - - [09/Mar/2024 10:05:16] "GET /predict?title=cannot%20download%20file%20error%20404&body=can't%20download%20cant%20download%20error%20error%20bug%20bug%20bug%20bug HTTP/1.1" 200 -


predict=array([[0.1105239, 0.8894761]])
