# Using Naive Bayes Classifaction Model

In [1]:
import pandas as pd
import numpy as np
import utilities as util
import importlib
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import f1_score,precision_score,recall_score,confusion_matrix,classification_report
from textblob import TextBlob
import matplotlib.pyplot as plt

In [2]:
importlib.reload(util)

<module 'utilities' from 'C:\\Users\\teddy\\Documents\\Research Project\\SreYantra_Data\\BugZ\\TKnegsampleGeneration\\utilities.py'>

### Establishing global variables

In [3]:
D_pairs = "Teddy_Data/AllDependentPairs.csv"
I_pairs = "Teddy_Data/AllIndependentPairs.csv"

### Loading the Data

In [4]:
df_dp = pd.read_csv(D_pairs, low_memory = False)
df_ip = pd.read_csv(I_pairs, low_memory = False)

In [5]:
unnamed_columns = ["Unnamed: 0", "Unnamed: 0.1"]
df_dp = df_dp.drop(columns = unnamed_columns)
df_ip = df_ip.drop(columns = unnamed_columns)

In [6]:
print("The amount of independent pairs is {}".format(len(df_dp)))
print("The amount of dependent pairs is {}".format(len(df_ip)))

The amount of independent pairs is 62011
The amount of dependent pairs is 740230


# Testing the model for future uses

In [7]:
tfidf_transformer = TfidfTransformer()
count_vect = util.create_vectorizor()

### Set 80-20 split

In [121]:
total = 1000
split = int(total/2)

train = int(0.8*total)
test = int(0.2*total)

In [122]:
df = df_dp[df_dp["req1Product"] == "Firefox"].head(split)
df = df.append(df_ip[df_ip["req1Product"] == "Firefox"].head(split))
# randomize the dataframe
df = df.sample(frac = 1)
# get the test series'
binary_class = df["BinaryClass"]
multi_class = df["MultiClass"]
## drop unimportant columns from training
sub_df = df.drop(columns = ['BinaryClass', 'MultiClass',"req1Product","req2Product"])

### Split up the train and test sets

In [130]:
train_x = sub_df.head(train)
train_y = binary_class.head(train)
test_x = sub_df.tail(test)
test_y = binary_class.tail(test)

In [131]:
X_train_counts = count_vect.fit_transform(np.array(train_x))
X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)

X_test_counts = count_vect.transform(np.array(test_x))
X_test_tfidf= tfidf_transformer.fit_transform(X_test_counts)

### Training the model (Binary Class)

In [132]:
clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))

### Testing the model (Binary Class)

In [133]:
predict_labels = clf_model.predict(X_test_tfidf)
actualLabels = np.array(test_y).astype('int')

In [146]:
confusion_matrix(actualLabels, predict_labels)

array([[104,   0],
       [  3,  93]], dtype=int64)

In [134]:
clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
print("\n"+100*"-")
print(" Classifier Test Score : "+str(clf_test_score))

precision = round(precision_score(actualLabels, predict_labels,average='macro'),2)
recall = round(recall_score(actualLabels, predict_labels,average='macro'),2)
f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
print(" f1score : "+str(f1))



----------------------------------------------------------------------------------------------------
 Classifier Test Score : 0.985
 f1score : 0.98


### Choose what is test and what is train (Multi Class)

In [None]:
train_x, train_y, test_x, test_y = util.train_test_multi_class(train,test)

### Train the model (Multi Class)

In [None]:
X_train_tfidf, X_test_tfidf = util.create_classified_sets(train_x, test_x)

In [None]:
clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))

### Test the model (Multi Class)

In [None]:
predict_labels = clf_model.predict(X_test_tfidf)
actualLabels = np.array(test_y).astype('int')

In [None]:
clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
print("\n"+100*"-")
print(" Classifier Test Score : "+str(clf_test_score))


f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
print(" f1score : "+str(f1))

# Brute Force!!! (Binary Class)

### Initilize Verfication sizes

In [197]:
total = 100
split = int(total/2)
train = int(0.8*total)
test = int(0.2*total)

threshold = 0.8

In [198]:
tfidf_transformer = TfidfTransformer()
count_vect = util.create_vectorizor()

In [199]:
total_projects = len(df["req1Product"].value_counts()) - 1
df_unique = df_dp["req1Product"].unique()
df_unique = np.intersect1d(df_unique, df_ip["req1Product"].unique())
df_scores = pd.DataFrame(columns = ["Train Project", "Test Project", "Prediction Score"])

### Actual Model Training

In [200]:
project_num = 0
for df_name in df_unique:
    ### Training and Verfication Phase ####
    project_num = project_num + 1
    print("Training Project {}: {}".format(project_num, df_name))
    ## get the pairs from both dataset with the same req1product
    d_pairs = df_dp[df_dp["req1Product"] == df_name]
    i_pairs = df_ip[df_ip["req1Product"] == df_name]
    ## if the sample size is too big, let the user know, so that they can change it
    if (split > len(d_pairs) or split > len(i_pairs)):
        print("Sample size too big, please reduce")
        continue
    df = d_pairs.head(split)
    df = df.append(i_pairs.head(split))
    df = df.sample(frac = 1)
    train_x, train_y, test_x, test_y = util.train_test_split(df, train, test, "Binary")
    ## condense the x values for the train and test models
    X_train_counts = count_vect.fit_transform(np.array(train_x))
    X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)
    X_test_counts = count_vect.transform(np.array(test_x))
    X_test_tfidf= tfidf_transformer.fit_transform(X_test_counts)
    ## train the model
    clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))
    predict_labels = clf_model.predict(X_test_tfidf)
    actualLabels = np.array(test_y).astype('int')
    
    precision = round(precision_score(actualLabels, predict_labels,average='macro'),2)
    recall = round(recall_score(actualLabels, predict_labels,average='macro'),2)
    f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
    
    if (threshold > f1):
        print("Training model failed Verification")
        continue
    
    print("Training model passed Verfication")
    #### Training and Testing Phase ####
    train_x, train_y = util.x_y_split(df, "Binary")
    X_train_counts = count_vect.fit_transform(np.array(train_x))
    X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)
    ## train and fit the model
    clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))
    
    print("Model has been Trained")
    train_project = df_name
    
    for df_name2 in df_unique:
        ### find all test projects, see how it works
        if df_name == df_name2:
            continue
        d_pairs2 = df_dp[df_dp["req1Product"] == df_name2]
        i_pairs2 = df_ip[df_ip["req1Product"] == df_name2]
        if 
            
            
        
    
    
    
    
    
    

Training Project 1: Bugzilla
Training model passed Verfication
Model has been Trained
Training Project 2: Calendar
Training model passed Verfication
Model has been Trained
Training Project 3: Core
Training model passed Verfication
Model has been Trained
Training Project 4: Firefox
Training model passed Verfication
Model has been Trained
Training Project 5: Firefox Build System
Training model passed Verfication
Model has been Trained
Training Project 6: MailNews Core
Training model passed Verfication
Model has been Trained
Training Project 7: NSPR
Sample size too big, please reduce
Training Project 8: NSS
Training model passed Verfication
Model has been Trained
Training Project 9: Other Applications
Training model passed Verfication
Model has been Trained
Training Project 10: SeaMonkey
Training model passed Verfication
Model has been Trained
Training Project 11: Testing
Sample size too big, please reduce
Training Project 12: Thunderbird
Training model passed Verfication
Model has been T

In [None]:
project_num = 0
## every project will be used as a training model
for df_name in df_unique:
    project_num = project_num + 1
    train = df[df["req1Product"] == df_name]
    independent = len(train[train["BinaryClass"] == 0])
    dependent = len(train[train["BinaryClass"] == 1])
    ## if a data set only has less than 5 pairs of each, don't even bother training
    if ((independent < 5) or (dependent < 5)):
        print("Not using {}".format(df_name))
        continue
    ## if a data set's independent pairs are greater than dependent pairs, balance to match dependent pair count
    else:
        train = util.balance_train(train)
    train_x, train_y = util.x_y_split(train)
    ## classify and condense the model
    X_train_counts = count_vect.fit_transform(np.array(train_x))
    X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)
    ## train and fit the model
    clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))
    ## prompt the user which model is currently being used to test results
    print("Currently Testing using {} model, this is project number {} out of {}".format(df_name, project_num, len(df_unique)))
    # every training model will test all other models
    for df_name2 in df_unique:
        if df_name != df_name2:
            test = df[df["req1Product"] == df_name2]
            test_x, test_y = util.x_y_split(test)
            X_test_counts = count_vect.transform(np.array(test_x))
            X_test_tfidf= tfidf_transformer.fit_transform(X_test_counts)
            predict_labels = clf_model.predict(X_test_tfidf)
            actualLabels = np.array(test_y).astype('int')
            ## will use f1 scores to see how well the modelsdo
            clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
            ## add results to the dataframe
            result = {"Train Project": df_name, "Test Project": df_name2, "Prediction Score": "{:.2f}".format(clf_test_score)}
            df_scores = df_scores.append(result, ignore_index = True)

            
    
    

### Output the Results

In [None]:
df_scores.to_csv("PredictionScores_BinaryClass.csv")

## Brute Force!!! (MultiClass)

In [None]:
df_scores = pd.DataFrame(columns = ["Train Project", "Test Project", "Prediction Score"])
project_num = 0
## every project will be used as a training model
for df_name in df_unique:
    project_num = project_num + 1
    train = df[df["req1Product"] == df_name]
    independent = len(df[(df["req1Product"] == df_name) & (df["BinaryClass"] == 0)])
    dependent = len(df[(df["req1Product"] == df_name) & (df["BinaryClass"] == 1)])
    ## if a data set only has less than 5 pairs of each, don't even bother training
    if ((independent < 5) or (dependent < 5)):
        print("Not using {}".format(df_name))
        continue
    ## if a data set's independent pairs are greater than dependent pairs, balance to match dependent pair count
    else:
        train = util.balance_train(train)
    train_x, train_y = util.x_y_multiclass_split(train)
    ## classify and condense the model
    X_train_counts = count_vect.fit_transform(np.array(train_x))
    X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)
    ## train and fit the model
    clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))
    ## prompt the user which model is currently being used to test results
    print("Currently Testing using {} model, this is project number {}".format(df_name, project_num))
    # every training model will test all other models
    for df_name2 in df_unique:
        if df_name != df_name2:
            test = df[df["req1Product"] == df_name2]
            test_x, test_y = util.x_y_multiclass_split(test)
            X_test_counts = count_vect.transform(np.array(test_x))
            X_test_tfidf= tfidf_transformer.fit_transform(X_test_counts)
            predict_labels = clf_model.predict(X_test_tfidf)
            actualLabels = np.array(test_y).astype('int')
            ## will use f1 scores to see how well the modelsdo
            clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
            ## add results to the dataframe
            result = {"Train Project": df_name, "Test Project": df_name2, "Prediction Score": "{:.2f}".format(clf_test_score)}
            df_scores = df_scores.append(result, ignore_index = True)

In [None]:
df_scores.to_csv("PredictionScores_MultiClass.csv")