In [20]:
import nltk
import os
import numpy as np
import pandas as pd
import spacy
import joblib

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn import model_selection
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
from textblob import TextBlob

In [21]:
# TODO: Preparing pre-process, 
# Load spaCy English language model
nlp = spacy.load("en_core_web_sm")

def pos_tagger_for_spacy(tag):
    # Mapping NLTK POS tags to spaCy POS tags
    tag_dict = {'N': 'NOUN', 'V': 'VERB', 'R': 'ADV', 'J': 'ADJ'}
    return tag_dict.get(tag, 'n')

def pre_process_spacy(s):
    doc = nlp(s)
    s = " ".join([token.lemma_ if token.pos_ in ['NOUN', 'VERB'] else token.text for token in doc if token.pos_ in ['NOUN', 'VERB']])
    return s


def pre_process_textblob(s):
    blob = TextBlob(s)
    # Remove stopwords
    s = [word for word in blob.words if word not in nltk.corpus.stopwords.words('english')]
    s = " ".join(s)
    return s


def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def pre_process_porterstemmer(s):
    ps = PorterStemmer()
    s = word_tokenize(s)
    stopwords_set = set(stopwords.words('english'))
    stop_dict = {s: 1 for s in stopwords_set}
    s = [w for w in s if w not in stop_dict]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s


def pre_process_lemmatizer(s):
    s = word_tokenize(s)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stopwords_set = set(stopwords.words('english'))
    stop_dict = {s: 1 for s in stopwords_set}
    tags = nltk.pos_tag(s)
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), tags))
    s = [lemmatizer.lemmatize(word, tag) if tag == 'n' or tag == 'v' else None for word, tag in wordnet_tagged]
    s = list(filter(None, s))
    s = [w for w in s if w not in stop_dict]
    s = ' '.join(s)
    return s


In [22]:
# TODO: Prepare X, Y as Same length
filepath = Path(os.path.abspath('../resources/clean_demo.pkl'))
x = pd.read_pickle(filepath)
word_counts = x.str.count(' ') + 1

# Check X
data = pd.read_pickle(Path(os.path.abspath('../resources/hive_use_for_run_pre_process.pkl')))
data = data[data['title_n_body'].notnull()]
data.rename(columns={'title_n_body': 'title_n_body_not_clean'}, inplace=True)
data = pd.concat([data, x.dropna()], axis=1)

y1 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_test_semantic_smell.csv')))
y2 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_issue_in_test_step.csv')))
y3 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_code_related.csv')))
y4 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_dependencies.csv')))
y5 = pd.read_csv(Path(os.path.abspath('../resources/tsdetect/all_test_smell/df_test_execution.csv')))


def compare_y_to_x(dfx, dfy):
    return dfy.loc[dfy['url'].isin(dfx['url'])]

y1_to_x = compare_y_to_x(data, y1)
y2_to_x = compare_y_to_x(data, y2)
y3_to_x = compare_y_to_x(data, y3)
y4_to_x = compare_y_to_x(data, y4)
y5_to_x = compare_y_to_x(data, y5)

In [23]:
y_test_semantic_smell = y1_to_x['y']
y_issue_in_test_step = y2_to_x['y']
y_code_related = y3_to_x['y']
y_dependencies = y4_to_x['y']
y_test_execution = y5_to_x['y']

In [24]:
# TODO: Prepare X: TF, TF-IDF, Ngram 1-3,
# vectorizer_porter_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_porterstemmer)
tf_vectorizer_porter_pre = CountVectorizer(preprocessor=pre_process_porterstemmer, ngram_range=(1, 3))
tf_vectorizer_lemma_pre = CountVectorizer(preprocessor=pre_process_lemmatizer, ngram_range=(1, 3))
tf_vectorizer_textblob_pre = CountVectorizer(preprocessor=pre_process_textblob, ngram_range=(1, 3))
tf_vectorizer_spacy_pre = CountVectorizer(preprocessor=pre_process_spacy, ngram_range=(1, 3))


tfidf_vectorizer_porter_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_porterstemmer, ngram_range=(1, 3))
tfidf_vectorizer_lemma_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_lemmatizer, ngram_range=(1, 3))
tfidf_vectorizer_textblob_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_textblob, ngram_range=(1, 3))
tfidf_vectorizer_spacy_pre = TfidfVectorizer(use_idf=True, preprocessor=pre_process_spacy, ngram_range=(1, 3))


In [25]:
# TODO: Method to apply 1,0 and Log(1+X) normalization

def scale_sparse_matrix(tfidf_matrix):
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(tfidf_matrix.toarray())
    return csr_matrix(x_scaled)

def log_transform_tfidf(tfidf_matrix):
    return np.log1p(tfidf_matrix)

In [26]:
# TODO: Prepare X,Y: Split80:20
x_fit, x_test = model_selection.train_test_split(x, test_size=0.2)

y_for_train_test_semantic_smell, y_for_test_test_semantic_smell = model_selection.train_test_split(y_test_semantic_smell, test_size=0.2)
y_for_train_issue_in_test_step, y_for_test_issue_in_test_step = model_selection.train_test_split(y_issue_in_test_step, test_size=0.2)
y_for_train_code_related, y_for_test_code_related = model_selection.train_test_split(y_code_related, test_size=0.2)
y_for_train_dependencies, y_for_test_dependencies = model_selection.train_test_split(y_dependencies, test_size=0.2)
y_for_train_test_execution, y_for_test_test_execution = model_selection.train_test_split(y_test_execution, test_size=0.2)


In [27]:
x_fit.info

<bound method Series.info of 1164    hive23509 mapjoin assertionerror capacity must...
1115    hive23797 throw exception when no metastore fo...
593     hive25372 advance write id for all the ddls ad...
115     hive26683 sum windowing function returns wrong...
825     revert hive24624 repl load should detect the c...
                              ...                        
1181                     hive16787 fix itests in branch22
1111                        hive23483 remove dynamicserde
276     hive26280 copy more data into completedcompact...
1122    hive23444 concurrent acid direct inserts may f...
282     hive22670 arrayindexoutofboundsexception when ...
Name: title_n_body, Length: 917, dtype: object>

In [28]:
# TODO: Fit each X 

X_tf_train_porter = tf_vectorizer_porter_pre.fit_transform(x_fit)
X_tf_test_porter = tf_vectorizer_porter_pre.transform(x_test)

X_tfidf_train_porter = tfidf_vectorizer_porter_pre.fit_transform(x_fit)
X_tfidf_test_porter = tfidf_vectorizer_porter_pre.transform(x_test)

X_tf_train_lemma = tf_vectorizer_lemma_pre.fit_transform(x_fit)
X_tf_test_lemma = tf_vectorizer_lemma_pre.transform(x_test)

X_tfidf_train_lemma = tfidf_vectorizer_lemma_pre.fit_transform(x_fit)
X_tfidf_test_lemma = tfidf_vectorizer_lemma_pre.transform(x_test)

X_tf_train_spacy = tf_vectorizer_spacy_pre.fit_transform(x_fit)
X_tf_test_spacy = tf_vectorizer_lemma_pre.transform(x_test)

X_tfidf_train_spacy = tfidf_vectorizer_spacy_pre.fit_transform(x_fit)
X_tfidf_test_spacy = tfidf_vectorizer_spacy_pre.transform(x_test)

X_tf_train_textblob = tf_vectorizer_textblob_pre.fit_transform(x_fit)
X_tf_test_textblob = tf_vectorizer_textblob_pre.transform(x_test)

X_tfidf_train_textblob = tfidf_vectorizer_textblob_pre.fit_transform(x_fit)
X_tfidf_test_textblob = tfidf_vectorizer_textblob_pre.transform(x_test)


In [29]:
print(X_tfidf_train_porter.shape)
print(X_tfidf_test_porter.shape)
print(y_for_train_test_semantic_smell.shape)

(917, 51005)
(230, 51005)
(917,)


In [33]:
# TODO: Prepare X: Normalization (0-1) and Log(1+x) only TFIDF
# print(X_tf_train_porter.toarray())
# print(X_tfidf_test_porter.toarray())
# X_tfidf_train_porter_01, X_tfidf_test_porter_01 = scale_sparse_matrix(X_tfidf_train_porter, X_tfidf_test_porter)
X_tfidf_train_porter_01 = scale_sparse_matrix(X_tfidf_train_porter)
X_tfidf_train_porter_log = log_transform_tfidf(X_tfidf_train_porter)
X_tfidf_test_porter_01 = scale_sparse_matrix(X_tfidf_test_porter)
X_tfidf_test_porter_log = log_transform_tfidf(X_tfidf_test_porter)
# 
X_tfidf_train_lemma_01 = scale_sparse_matrix(X_tfidf_train_lemma)
X_tfidf_train_lemma_log = log_transform_tfidf(X_tfidf_train_lemma)
X_tfidf_test_lemma_01 = scale_sparse_matrix(X_tfidf_test_lemma)
X_tfidf_test_lemma_log = log_transform_tfidf(X_tfidf_test_lemma)
# 
X_tfidf_train_spacy_01 = scale_sparse_matrix(X_tfidf_train_spacy)
X_tfidf_train_spacy_log = log_transform_tfidf(X_tfidf_train_spacy)
X_tfidf_test_spacy_01 = scale_sparse_matrix(X_tfidf_test_spacy)
X_tfidf_test_spacy_log = log_transform_tfidf(X_tfidf_test_spacy)
# 
X_tfidf_train_textblob_01 = scale_sparse_matrix(X_tfidf_train_textblob)
X_tfidf_train_textblob_log = log_transform_tfidf(X_tfidf_train_textblob)
X_tfidf_test_textblob_01 = scale_sparse_matrix(X_tfidf_test_textblob)
X_tfidf_test_textblob_log = log_transform_tfidf(X_tfidf_test_textblob)

X_train_list = [X_tfidf_train_porter_01, X_tfidf_train_lemma_01, X_tfidf_train_spacy_01, X_tfidf_train_textblob_01, X_tfidf_train_porter_log, X_tfidf_train_lemma_log, X_tfidf_train_spacy_log, X_tfidf_train_textblob_log]
X_test_list = [X_tfidf_test_porter_01, X_tfidf_test_porter_log, X_tfidf_test_lemma_01, X_tfidf_test_lemma_log, X_tfidf_test_spacy_01, X_tfidf_test_spacy_log, X_tfidf_test_textblob_01, X_tfidf_test_textblob_log]
Y_train_list = [y_for_train_test_semantic_smell, y_for_train_code_related, y_for_train_dependencies, y_for_train_test_execution, y_for_train_issue_in_test_step]
Y_test_list = [y_for_test_test_semantic_smell, y_for_test_code_related, y_for_test_dependencies, y_for_test_test_execution, y_for_test_issue_in_test_step]
# print(X_tfidf_train_porter_01.shape)
# print('-------------')

X_train_list


In [None]:
dirname = os.path.dirname('/home/jiramed_withun')
filename = os.path.join(dirname, '/x_y_for_train')
print(dirname)
print(filename)

In [13]:
# TODO: Prepare X,Y: Set SMOTE **Problem with lib** Change to imbalance sklearn
# def oversampling(x,y):
#     oversampler = SMOTE(sampling_strategy='auto', random_state=42)
#     x_resampled, y_resampled = oversampler.fit_resample(x, y)
#     return x_resampled, y_resampled



for i in range(len(X_train_list)):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    Y_train = Y_train_list[i]
    Y_test = Y_test_list[i]
    X_train_var_name = [name for name, value in locals().items() if value is X_train][0]
    X_test_var_name = [name for name, value in locals().items() if value is X_test][0]
    Y_train_var_name = [name for name, value in locals().items() if value is Y_train][0]
    Y_test_var_name = [name for name, value in locals().items() if value is Y_test][0]

    # Apply SMOTE
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, Y_resampled = smote.fit_resample(X_train, Y_train)

    # Normalize the data
    scaler = MinMaxScaler()
    X_resampled = scaler.fit_transform(X_resampled)

    # Save the datasets as files
    joblib.dump(X_resampled, f'{X_train_var_name}_smote.pkl')
    joblib.dump(X_test, f'{X_test_var_name}.pkl')
    joblib.dump(Y_resampled, f'{X_train_var_name}_smote.pkl')
    joblib.dump(Y_test, f'{Y_test_var_name}.pkl')


In [14]:
# TODO: Prepare X,Y: Set SMOTE need to split train test?


In [15]:
# TODO: ML Model: GBM
gbm_model = GradientBoostingClassifier()
# what we should set on setting?
model = gbm_model.fit(X_tfidf_train, y_train)


NameError: name 'X_tfidf_train' is not defined

In [None]:
# TODO: ML Model: Cross_validation, Metric
precision = model_selection.cross_val_score(model, X_tfidf_train, y_train, cv=5,
n_jobs=-2, scoring='precision_macro')
recall = model_selection.cross_val_score(model, X_tfidf_train, y_train, cv=5,
n_jobs=-2, scoring='recall_macro')
f1_cv_score = model_selection.cross_val_score(model, X_tfidf_train, y_train, cv=5,
n_jobs=-2, scoring='f1_macro')

In [None]:
precision

In [None]:
recall

In [None]:
f1_cv_score

In [None]:
# TODO: ML Model: GBM
gbm_model = GradientBoostingClassifier()
# what we should set on setting?
# res_model = gbm_model.fit(X_res, y_res)
