In [1]:
import string
import requests
import re
import nltk
import os
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import smote_variants as sv
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob
import spacy

In [3]:
# TODO: Preparing pre-process, 
# Load spaCy English language model
nlp = spacy.load("en_core_web_sm")

def pos_tagger_for_spacy(tag):
    # Mapping NLTK POS tags to spaCy POS tags
    tag_dict = {'N': 'NOUN', 'V': 'VERB', 'R': 'ADV', 'J': 'ADJ'}
    return tag_dict.get(tag, 'n')

def pre_process_spacy(s):
    doc = nlp(s)
    s = " ".join([token.lemma_ if token.pos_ in ['NOUN', 'VERB'] else token.text for token in doc if token.pos_ in ['NOUN', 'VERB']])
    return s


def pre_process_textblob(s):
    blob = TextBlob(s)
    # Remove stopwords
    s = [word for word in blob.words if word not in nltk.corpus.stopwords.words('english')]
    s = " ".join(s)
    return s


def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def pre_process_porterstemmer(s):
    ps = PorterStemmer()
    s = word_tokenize(s)
    stopwords_set = set(stopwords.words('english'))
    stop_dict = {s: 1 for s in stopwords_set}
    s = [w for w in s if w not in stop_dict]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s


def pre_process_lemmatizer(s):
    s = word_tokenize(s)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stopwords_set = set(stopwords.words('english'))
    stop_dict = {s: 1 for s in stopwords_set}
    tags = nltk.pos_tag(s)
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), tags))
    s = [lemmatizer.lemmatize(word, tag) if tag == 'n' or tag == 'v' else None for word, tag in wordnet_tagged]
    s = list(filter(None, s))
    s = [w for w in s if w not in stop_dict]
    s = ' '.join(s)
    return s


In [4]:
# TODO: Prepare X, Y as Same length
filepath = Path(os.path.abspath('../resources/clean_demo.pkl'))
x = pd.read_pickle(filepath)
word_counts = x.str.count(' ') + 1

# Check X
data = pd.read_pickle(Path(os.path.abspath('../resources/hive_use_for_run_pre_process.pkl')))
data = data[data['title_n_body'].notnull()]
data.rename(columns={'title_n_body': 'title_n_body_not_clean'}, inplace=True)
data = pd.concat([data, x.dropna()], axis=1)

y1 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_test_semantic_smell.csv')))
y2 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_issue_in_test_step.csv')))
y3 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_code_related.csv')))
y4 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_dependencies.csv')))
y5 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_test_execution.csv')))


def compare_y_to_x(dfx, dfy):
    return dfy.loc[dfy['url'].isin(dfx['url'])]

y1_to_x = compare_y_to_x(data, y1)
y2_to_x = compare_y_to_x(data, y2)
y3_to_x = compare_y_to_x(data, y3)
y4_to_x = compare_y_to_x(data, y4)
y5_to_x = compare_y_to_x(data, y5)

In [6]:
y1_to_x.info
y_test_semantic_smell = y1_to_x['y']
y_issue_in_test_step = y2_to_x['y']
y_code_related = y3_to_x['y']
y_dependencies = y4_to_x['y']
y_test_execution = y5_to_x['y']
y_test_semantic_smell.info

<bound method Series.info of 0       0
1       0
2       0
3       1
4       0
       ..
1179    1
1180    0
1181    0
1182    0
1183    1
Name: y, Length: 1147, dtype: int64>

In [210]:
# TODO: Prepare X: TF-IDF, Ngram 1, Normalization MinMax(0,1)
# vectorizer_porter_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_porterstemmer)
tfidf_vectorizer_porter_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_porterstemmer, ngram_range=(1, 2, 3))
tfidf_vectorizer_lemm_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_lemmatizer, ngram_range=(1, 2, 3))
tfidf_vectorizer_textblob_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_textblob, ngram_range=(1, 2, 3))
tfidf_vectorizer_spacy_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_spacy, ngram_range=(1, 2, 3))


In [211]:
# TODO: Prepare X,Y: Split80:20
x_fit, x_test = model_selection.train_test_split(x, test_size=0.2)

y_for_train_test_semantic_smell, y_for_test_test_semantic_smell = model_selection.train_test_split(y_test_semantic_smell, test_size=0.2)
y_for_train_issue_in_test_step, y_for_test_issue_in_test_step = model_selection.train_test_split(y_issue_in_test_step, test_size=0.2)
y_for_train_code_related, y_for_test_code_related = model_selection.train_test_split(y_code_related, test_size=0.2)
y_for_train_dependencies, y_for_test_dependencies= model_selection.train_test_split(y_dependencies, test_size=0.2)
y_for_train_test_execution, y_for_test_test_execution= model_selection.train_test_split(y_test_execution, test_size=0.2)

X_tfidf_vector = tfidf_vectorizer_lemm_pre.fit(x_fit)

X_tfidf_train = X_tfidf_vector.transform(x_fit)
X_tfidf_test = X_tfidf_vector.transform(x_test)

In [212]:
print(X_tfidf_train.shape)
print(y_train.shape)

(917, 20938)
(917,)


In [213]:
# TODO: Prepare X,Y: Set SMOTE **Problem with lib**
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state=0)
# # X_for_res = np.array(X_tfidf_train.todense())
# X_for_res = pd.DataFrame(X_tfidf_train.toarray(), columns=tfidf_vectorizer_lemm_pre.get_feature_names_out())
# X_res, y_res = sm.fit_resample(X_tfidf_train, np.array(y_train))
# X_res_test, y_res_test = sm.fit_resample(X_tfidf_test, y_test)

In [214]:
# TODO: Prepare X: Normalization MinMax(0,1) **Problem**
# scaler = MinMaxScaler() 
# standardized_tfidf_matrix = scaler.fit_transform(X_tfidf_train)
# standardized_tfidf_matrix

In [215]:
# TODO: ML Model: GBM
gbm_model = GradientBoostingClassifier()
# what we should set on setting?
model = gbm_model.fit(X_tfidf_train, y_train)


In [216]:
# TODO: ML Model: Cross_validation, Metric
precision = model_selection.cross_val_score(model, X_tfidf_train, y_train, cv=5,
n_jobs=-2, scoring='precision_macro')
recall = model_selection.cross_val_score(model, X_tfidf_train, y_train, cv=5,
n_jobs=-2, scoring='recall_macro')
f1_cv_score = model_selection.cross_val_score(model, X_tfidf_train, y_train, cv=5,
n_jobs=-2, scoring='f1_macro')

In [217]:
precision

array([0.43902439, 0.62007011, 0.5491453 , 0.52752682, 0.62814465])

In [218]:
recall

array([0.48189522, 0.53762198, 0.53865579, 0.49747475, 0.52447552])

In [219]:
f1_cv_score

array([0.43246753, 0.52583139, 0.51004995, 0.45601842, 0.52502781])

In [220]:
# TODO: ML Model: GBM
gbm_model = GradientBoostingClassifier()
# what we should set on setting?
# res_model = gbm_model.fit(X_res, y_res)
