# Using Naive Bayes Classifaction Model

In [130]:
import pandas as pd
import re
import numpy as np
import utilities as util
import nlp as nlp
import importlib
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score,train_test_split, StratifiedKFold
from sklearn.metrics import f1_score,precision_score,recall_score,confusion_matrix,classification_report
from textblob import TextBlob
import matplotlib.pyplot as plt

from nltk.util import ngrams

In [131]:
importlib.reload(util)
importlib.reload(nlp)

<module 'nlp' from 'C:\\Users\\teddy\\Documents\\Research Project\\SreYantra_Data\\BugZ\\TKnegsampleGeneration\\nlp.py'>

### Establishing global variables

In [104]:
D_pairs = "Teddy_Data/AllDependentPairs.csv"
I_pairs = "Teddy_Data/AllIndependentPairs.csv"

### Loading the Data

In [105]:
df_dp = pd.read_csv(D_pairs, low_memory = False)
df_ip = pd.read_csv(I_pairs, low_memory = False)

In [106]:
unnamed_columns = ["Unnamed: 0", "Unnamed: 0.1"]
df_dp = df_dp.drop(columns = unnamed_columns)
df_ip = df_ip.drop(columns = unnamed_columns)

In [6]:
print("The amount of independent pairs is {}".format(len(df_ip)))
print("The amount of dependent pairs is {}".format(len(df_dp)))

The amount of independent pairs is 740230
The amount of dependent pairs is 62011


# Testing the model for future uses

In [117]:
tfidf_transformer = TfidfTransformer()
count_vect = util.create_vectorizor()

### 1000 for training, 500 for testing

In [142]:
train_size = 1000
test_size = 500

In [143]:
df = df_dp[df_dp["req1Product"] == "Firefox"].sample(int(train_size/2))
df = df.append(df_ip[df_ip["req1Product"] == "Firefox"].sample(int(train_size/2)))
# randomize the dataframe
df = df.sample(frac = 1)
# get the test series'
binary_class = np.array(df["BinaryClass"])
multi_class = df["MultiClass"]
## drop unimportant columns from training
train_df = df.drop(columns = ['BinaryClass', 'MultiClass',"req1Product","req2Product"])

In [144]:
train_df = nlp.generate_ngrams_df(train_df, 2)

### Split up the train and test sets

In [145]:
X_train_counts = count_vect.fit_transform(np.array(train_df))
X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)

### Training the model (Binary Class)

In [146]:
clf_model = MultinomialNB().fit(X_train_tfidf,binary_class.astype('int'))

### Testing the model (Binary Class)

In [147]:
for i in range(10):
    test_df = df_dp[df_dp["req1Product"] == "Core"].sample(int(test_size/2))
    test_df = test_df.append(df_ip[df_ip["req1Product"] == "Core"].sample(int(test_size/2)))
    test_df = test_df.sample(frac = 1)
    test_binary = np.array(test_df["BinaryClass"])
    test_df = test_df.drop(columns = ['BinaryClass', 'MultiClass',"req1Product","req2Product"])
    
    test_df = nlp.generate_ngrams_df(test_df,2)
    
    X_test_counts = count_vect.transform(np.array(test_df))
    X_test_tfidf= tfidf_transformer.fit_transform(X_test_counts)

    predict_labels = clf_model.predict(X_test_tfidf)
    actualLabels = np.array(test_binary).astype('int')

    confusion_matrix(actualLabels, predict_labels)

    clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
    print("\n"+100*"-")
    print(" Classifier Test Score : "+str(clf_test_score))

    precision = round(precision_score(actualLabels, predict_labels,average='macro'),2)
    recall = round(recall_score(actualLabels, predict_labels,average='macro'),2)
    f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
    print(" f1score : "+str(f1))



----------------------------------------------------------------------------------------------------
 Classifier Test Score : 0.81
 f1score : 0.81

----------------------------------------------------------------------------------------------------
 Classifier Test Score : 0.804
 f1score : 0.8

----------------------------------------------------------------------------------------------------
 Classifier Test Score : 0.798
 f1score : 0.8

----------------------------------------------------------------------------------------------------
 Classifier Test Score : 0.786
 f1score : 0.79

----------------------------------------------------------------------------------------------------
 Classifier Test Score : 0.808
 f1score : 0.81

----------------------------------------------------------------------------------------------------
 Classifier Test Score : 0.782
 f1score : 0.78

----------------------------------------------------------------------------------------------------
 Classi

### Choose what is test and what is train (Multi Class)

In [None]:
train_x, train_y, test_x, test_y = util.train_test_multi_class(train,test)

### Train the model (Multi Class)

In [None]:
X_train_tfidf, X_test_tfidf = util.create_classified_sets(train_x, test_x)

In [None]:
clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))

### Test the model (Multi Class)

In [None]:
predict_labels = clf_model.predict(X_test_tfidf)
actualLabels = np.array(test_y).astype('int')

In [None]:
clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
print("\n"+100*"-")
print(" Classifier Test Score : "+str(clf_test_score))


f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
print(" f1score : "+str(f1))

# Brute Force!!! (Binary Class)

### Initilize Verfication sizes

In [None]:
total_projects = len(df["req1Product"].value_counts()) - 1
df_unique = df_dp["req1Product"].unique()
df_unique = np.intersect1d(df_unique, df_ip["req1Product"].unique())
df_scores = pd.DataFrame(columns = ["Train Project", "Test Project", "Prediction Score"])

In [None]:
df_unique

### Actual Model Training

### New Model

In [150]:
### important global variables for model to use
tfidf_transformer = TfidfTransformer()
count_vect = util.create_vectorizor()
ngrams = 2
important_projects = ["Core", "Firefox", "Thunderbird" ,"Bugzilla", "SeaMonkey"]
clf = MultinomialNB()
skf = StratifiedKFold(10)
new_results = pd.DataFrame(columns = ["Train Project", "Train Size", "Test Project", "Test Size", "Average Validation Score (15*15 fold)","Average f1 score (10 Tests)"])

In [151]:
project_num = 0
for df_name in important_projects:
    for i in range (5):
        if (i == 0):
            train_size = 100
            test_size = train_size/2
        else:
            train_size = train_size * 2
            test_size = train_size/2
        ### Training and Verfication Phase ####
        project_num = project_num + 1
        print("Training Project {}: {}".format(project_num, df_name))
        ## get the pairs from both dataset with the same req1product
        d_pairs = df_dp[df_dp["req1Product"] == df_name]
        i_pairs = df_ip[df_ip["req1Product"] == df_name]
        train_df = d_pairs.sample(int(train_size/2))
        train_df = train_df.append(i_pairs.sample(int(train_size/2)))
        ### randomize the data frame
        train_df = train_df.sample(frac = 1)
        train_binary_class = np.array(train_df["BinaryClass"])
        train_df = train_df.drop(columns = ['BinaryClass', 'MultiClass',"req1Product","req2Product"])
        train_df = nlp.generate_ngrams_df(train_df,ngrams)
        ## condense the data
        X_train_counts = count_vect.fit_transform(np.array(train_df))
        X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)
        ## k-fold validation
        scores = cross_val_score(clf, X_train_tfidf, train_binary_class, cv = skf)
        avgValidation = np.average(scores)
        ## checking for a high enough validation score
        if (avgValidation < 0.9):
            continue
        clf_model = MultinomialNB().fit(X_train_tfidf,train_binary_class.astype('int'))
        for df_name2 in important_projects:
            if df_name == df_name2:
                continue
            d_pairs2 = df_dp[df_dp["req1Product"] == df_name2]
            i_pairs2 = df_ip[df_ip["req1Product"] == df_name2]
            ## we want to run this 10 times and take 10 different random samples
            f1_scores = []
            for y in range(10):
                test_df = d_pairs2.sample(int(test_size/2))
                test_df = test_df.append(i_pairs2.sample(int(test_size/2)))
                test_df = test_df.sample(frac = 1)
                test_binary = np.array(test_df["BinaryClass"])
                test_df = test_df.drop(columns = ['BinaryClass', 'MultiClass',"req1Product","req2Product"])
                test_df = nlp.generate_ngrams_df(test_df,ngrams)
                ## condense the data 
                X_test_counts = count_vect.transform(np.array(test_df))
                X_test_tfidf= tfidf_transformer.fit_transform(X_test_counts)
                ## seperate prediction array from actual value array
                predict_labels = clf_model.predict(X_test_tfidf)
                actualLabels = np.array(test_binary).astype('int')
                ## create the confusion matrix
                cm = confusion_matrix(actualLabels, predict_labels)
                precision = round(precision_score(actualLabels, predict_labels,average='macro'),2)
                recall = round(recall_score(actualLabels, predict_labels,average='macro'),2)
                f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
                f1_scores.append(f1)

            avgf1score = np.average(f1_scores)
            result = {"Train Project" : df_name,
                      "Train Size" : train_size,
                      "Test Project": df_name2,
                      "Test Size": test_size,
                      "Average Validation Score (10*10 fold)" : avgValidation,
                      "Average f1 score (10 Tests)" : avgf1score}
            new_results = new_results.append(result, ignore_index = True)
    
            
            
        
    
    
    
    
    
    

Training Project 1: Core
Training Project 2: Core
Training Project 3: Core
Training Project 4: Core
Training Project 5: Core
Training Project 6: Firefox
Training Project 7: Firefox
Training Project 8: Firefox
Training Project 9: Firefox
Training Project 10: Firefox
Training Project 11: Thunderbird
Training Project 12: Thunderbird
Training Project 13: Thunderbird
Training Project 14: Thunderbird
Training Project 15: Thunderbird
Training Project 16: Bugzilla
Training Project 17: Bugzilla
Training Project 18: Bugzilla
Training Project 19: Bugzilla
Training Project 20: Bugzilla
Training Project 21: SeaMonkey
Training Project 22: SeaMonkey
Training Project 23: SeaMonkey
Training Project 24: SeaMonkey
Training Project 25: SeaMonkey


In [152]:
new_results

Unnamed: 0,Train Project,Train Size,Test Project,Test Size,Average Validation Score (15*15 fold),Average f1 score (10 Tests),Average Validation Score (10*10 fold)
0,Core,400,Firefox,200.0,,0.720,0.91750
1,Core,400,Thunderbird,200.0,,0.613,0.91750
2,Core,400,Bugzilla,200.0,,0.527,0.91750
3,Core,400,SeaMonkey,200.0,,0.595,0.91750
4,Core,800,Firefox,400.0,,0.737,0.95375
5,Core,800,Thunderbird,400.0,,0.649,0.95375
6,Core,800,Bugzilla,400.0,,0.523,0.95375
7,Core,800,SeaMonkey,400.0,,0.624,0.95375
8,Core,1600,Firefox,800.0,,0.756,0.95750
9,Core,1600,Thunderbird,800.0,,0.642,0.95750


In [None]:
new_results.to_csv("New_Results.csv")

### Old model

In [None]:
project_num = 0
## every project will be used as a training model
for df_name in df_unique:
    project_num = project_num + 1
    train = df[df["req1Product"] == df_name]
    independent = len(train[train["BinaryClass"] == 0])
    dependent = len(train[train["BinaryClass"] == 1])
    ## if a data set only has less than 5 pairs of each, don't even bother training
    if ((independent < 5) or (dependent < 5)):
        print("Not using {}".format(df_name))
        continue
    ## if a data set's independent pairs are greater than dependent pairs, balance to match dependent pair count
    else:
        train = util.balance_train(train)
    train_x, train_y = util.x_y_split(train)
    ## classify and condense the model
    X_train_counts = count_vect.fit_transform(np.array(train_x))
    X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)
    ## train and fit the model
    clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))
    ## prompt the user which model is currently being used to test results
    print("Currently Testing using {} model, this is project number {} out of {}".format(df_name, project_num, len(df_unique)))
    # every training model will test all other models
    for df_name2 in df_unique:
        if df_name != df_name2:
            test = df[df["req1Product"] == df_name2]
            test_x, test_y = util.x_y_split(test)
            X_test_counts = count_vect.transform(np.array(test_x))
            X_test_tfidf= tfidf_transformer.fit_transform(X_test_counts)
            predict_labels = clf_model.predict(X_test_tfidf)
            actualLabels = np.array(test_y).astype('int')
            ## will use f1 scores to see how well the modelsdo
            clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
            ## add results to the dataframe
            result = {"Train Project": df_name, "Test Project": df_name2, "Prediction Score": "{:.2f}".format(clf_test_score)}
            df_scores = df_scores.append(result, ignore_index = True)

            
    
    

### Output the Results

In [None]:
df_scores.to_csv("PredictionScores_BinaryClass.csv")

## Brute Force!!! (MultiClass)

In [None]:
df_scores = pd.DataFrame(columns = ["Train Project", "Test Project", "Prediction Score"])
project_num = 0
## every project will be used as a training model
for df_name in df_unique:
    project_num = project_num + 1
    train = df[df["req1Product"] == df_name]
    independent = len(df[(df["req1Product"] == df_name) & (df["BinaryClass"] == 0)])
    dependent = len(df[(df["req1Product"] == df_name) & (df["BinaryClass"] == 1)])
    ## if a data set only has less than 5 pairs of each, don't even bother training
    if ((independent < 5) or (dependent < 5)):
        print("Not using {}".format(df_name))
        continue
    ## if a data set's independent pairs are greater than dependent pairs, balance to match dependent pair count
    else:
        train = util.balance_train(train)
    train_x, train_y = util.x_y_multiclass_split(train)
    ## classify and condense the model
    X_train_counts = count_vect.fit_transform(np.array(train_x))
    X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)
    ## train and fit the model
    clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))
    ## prompt the user which model is currently being used to test results
    print("Currently Testing using {} model, this is project number {}".format(df_name, project_num))
    # every training model will test all other models
    for df_name2 in df_unique:
        if df_name != df_name2:
            test = df[df["req1Product"] == df_name2]
            test_x, test_y = util.x_y_multiclass_split(test)
            X_test_counts = count_vect.transform(np.array(test_x))
            X_test_tfidf= tfidf_transformer.fit_transform(X_test_counts)
            predict_labels = clf_model.predict(X_test_tfidf)
            actualLabels = np.array(test_y).astype('int')
            ## will use f1 scores to see how well the modelsdo
            clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
            ## add results to the dataframe
            result = {"Train Project": df_name, "Test Project": df_name2, "Prediction Score": "{:.2f}".format(clf_test_score)}
            df_scores = df_scores.append(result, ignore_index = True)

In [None]:
df_scores.to_csv("PredictionScores_MultiClass.csv")