# Using Naive Bayes Classification Model

In [None]:
import pandas as pd
import numpy as np
import utilities as util
import importlib
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import f1_score,precision_score,recall_score,confusion_matrix,classification_report
from textblob import TextBlob
import matplotlib.pyplot as plt

In [None]:
# Re-import the local `utilities` module so that edits made to it are picked up
# by this session without restarting the interpreter.
importlib.reload(util)

### Establishing global variables

In [None]:
# File name of the CSV that is read into the main DataFrame `df`.
PAIRS = "ModelPairs.csv"

### Loading the Data

In [None]:
# Read the pair data, then discard the two "Unnamed" columns
# (presumably index columns left behind by earlier CSV exports - confirm).
df = pd.read_csv(PAIRS, low_memory=False)
df = df.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)

# Tally each class once and reuse the totals for the report and the balance check.
independent_total = len(df[df["BinaryClass"] == 0])
dependent_total = len(df[df["BinaryClass"] == 1])

print("The amount of independent pairs is {}".format(independent_total))
print("The amount of dependent pairs is {}".format(dependent_total))

if independent_total == dependent_total:
    print("Equally Split data")

In [None]:
# Remove the per-requirement metadata columns for both sides of each pair.
metadata_fields = ("Id", "Priority", "Release", "Severity", "Type", "Ver")
unused_columns = [
    "req{}{}".format(side, field)
    for side in (1, 2)
    for field in metadata_fields
]
df = df.drop(columns=unused_columns)

### Getting the value counts of product

In [None]:
# Narrow to the Core product once, then count each class inside that subset.
core_rows = df[df["req1Product"] == "Core"]
core_independent = len(core_rows[core_rows["BinaryClass"] == 0])
core_dependent = len(core_rows[core_rows["BinaryClass"] == 1])
print("Independent pairs for Core is {}".format(core_independent))
print("dependent pairs for Core is {}".format(core_dependent))

In [None]:
# Class counts for the SeaMonkey product. The messages name the product that is
# actually being filtered (they previously said "Core").
print("Independent pairs for SeaMonkey is {}".format(len(df[(df["req1Product"] == "SeaMonkey") & (df["BinaryClass"] == 0) ])))
print("dependent pairs for SeaMonkey is {}".format(len(df[(df["req1Product"] == "SeaMonkey") & (df["BinaryClass"] == 1) ])))

### Creating specific dfs

In [None]:
dfs = []

# One subset of the full frame per product of interest, selected by boolean mask.
product_column = df["req1Product"]
df_core = df.loc[product_column == "Core"]
df_firefox = df.loc[product_column == "Firefox"]
df_thunderbird = df.loc[product_column == "Thunderbird"]
df_bugzilla = df.loc[product_column == "Bugzilla"]
df_seamonkey = df.loc[product_column == "SeaMonkey"]
df_least = df.loc[product_column == "Web Compatibility"]

In [None]:
# Balance the SeaMonkey subset with the project helper.
# NOTE(review): the result is stored as `train_x`, yet the check that follows
# indexes it by "req1Product" and "BinaryClass" like a full DataFrame - the
# name is misleading; confirm what balance_train returns.
train_x = util.balance_train("SeaMonkey", df_seamonkey , True)

### Testing the balancing

In [None]:
# Class counts of the balanced SeaMonkey frame. The messages name the product
# that is actually being filtered (they previously said "Core").
print("Independent pairs for SeaMonkey is {}".format(len(train_x[(train_x["req1Product"] == "SeaMonkey") & (train_x["BinaryClass"] == 0) ])))
print("dependent pairs for SeaMonkey is {}".format(len(train_x[(train_x["req1Product"] == "SeaMonkey") & (train_x["BinaryClass"] == 1) ])))

# Testing the model for future uses

### Choose what is test and what is train (Binary Class)

In [None]:
# Cross-project setup: the Firefox pairs are the training data and the Core
# pairs are the test data.
train = df_firefox
test = df_core

### Condense the data

In [None]:
# Split each frame into model inputs and labels with the project helper
# (presumably text features vs. BinaryClass - confirm in utilities.x_y_split).
train_x, train_y = util.x_y_split(train)
test_x, test_y = util.x_y_split(test)

In [None]:
# Turn the train/test inputs into the feature matrices consumed by the classifier.
# NOTE(review): presumably fits the vectoriser on train_x only and applies it to
# test_x - confirm in utilities.create_classified_sets.
X_train_tfidf, X_test_tfidf = util.create_classified_sets(train_x, test_x)

### Training the model (Binary Class)

In [None]:
# Fit a multinomial Naive Bayes classifier; labels are cast to int first.
clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))

### Testing the model (Binary Class)

In [None]:
# Predict on the test features and keep the true labels as an int array for scoring.
predict_labels = clf_model.predict(X_test_tfidf)
actualLabels = np.array(test_y).astype('int')

In [None]:
# Score of the fitted classifier on the test project, printed under a divider.
clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
print("\n"+100*"-")
print(" Classifier Test Score : "+str(clf_test_score))

# Macro-averaged metrics, rounded to two decimals. Precision and recall were
# previously computed and then discarded; they are now reported next to F1.
precision = round(precision_score(actualLabels, predict_labels,average='macro'),2)
recall = round(recall_score(actualLabels, predict_labels,average='macro'),2)
f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
print(" precision : "+str(precision))
print(" recall : "+str(recall))
print(" f1score : "+str(f1))


### Choose what is test and what is train (Multi Class)

In [None]:
# Re-split the same train/test frames for the multi-class task with the project
# helper (label column used is defined in utilities.train_test_multi_class).
train_x, train_y, test_x, test_y = util.train_test_multi_class(train,test)

### Train the model (Multi Class)

In [None]:
# Turn the multi-class train/test inputs into feature matrices.
# NOTE(review): presumably fits on train_x only - confirm in
# utilities.create_classified_sets.
X_train_tfidf, X_test_tfidf = util.create_classified_sets(train_x, test_x)

In [None]:
# Fit a fresh multinomial Naive Bayes classifier on the multi-class labels (cast to int).
clf_model = MultinomialNB().fit(X_train_tfidf,np.array(train_y).astype('int'))

### Test the model (Multi Class)

In [None]:
# Predict on the test features and keep the true labels as an int array for scoring.
predict_labels = clf_model.predict(X_test_tfidf)
actualLabels = np.array(test_y).astype('int')

In [None]:
# Report the classifier's score on the test project under a divider line.
divider = "-" * 100
clf_test_score = clf_model.score(X_test_tfidf, actualLabels)
print("\n" + divider)
print(" Classifier Test Score : " + str(clf_test_score))

# Macro-averaged F1 across the classes, rounded to two decimals.
macro_f1 = f1_score(actualLabels, predict_labels, average='macro')
f1 = round(macro_f1, 2)
print(" f1score : " + str(f1))

## Brute Force!!! (Binary Class)

In [None]:
# Shared feature extractors for the brute-force runs; both are re-fitted for
# every training project inside the loops that use them.
tfidf_transformer = TfidfTransformer()
count_vect = util.create_vectorizor()

In [None]:
# Distinct products drive both the training and the testing side of the sweep.
product_column = df["req1Product"]
total_projects = product_column.nunique() - 1
df_unique = product_column.unique()

# Empty result table: one row per (train project, test project) pair.
score_columns = ["Train Project", "Test Project", "Prediction Score"]
df_scores = pd.DataFrame(columns=score_columns)

### Actual Model Training

In [None]:
# Train one binary Naive Bayes model per project and score it against every
# other project. Result rows are collected in a plain list and turned into a
# DataFrame once at the end (DataFrame.append was removed in pandas 2.0 and
# copies the whole frame on every call).
score_rows = []
project_num = 0
for df_name in df_unique:
    project_num = project_num + 1
    train = df[df["req1Product"] == df_name]
    independent = len(train[train["BinaryClass"] == 0])
    dependent = len(train[train["BinaryClass"] == 1])
    # A project that is missing either class cannot train a binary classifier.
    if independent == 0 or dependent == 0:
        print("Not using {}".format(df_name))
        continue
    # Balance the classes when they differ; the flag tells the helper whether
    # the independent pairs are the larger class.
    if independent != dependent:
        train = util.balance_train(df_name, train, independent > dependent)
    train_x, train_y = util.x_y_split(train)
    # Fit the vocabulary and the IDF weights on the training project only.
    X_train_counts = count_vect.fit_transform(np.array(train_x))
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    clf_model = MultinomialNB().fit(X_train_tfidf, np.array(train_y).astype('int'))
    # Progress message naming the project whose model is being evaluated.
    print("Currently Testing using {} model, this is project number {} out of {}".format(df_name, project_num, len(df_unique)))
    # Evaluate this model on every other project.
    for df_name2 in df_unique:
        if df_name == df_name2:
            continue
        test = df[df["req1Product"] == df_name2]
        # Nothing to score, e.g. when the product value is NaN and matches no row.
        if test.empty:
            continue
        test_x, test_y = util.x_y_split(test)
        X_test_counts = count_vect.transform(np.array(test_x))
        # Apply the IDF weights learned from the training project. Calling
        # fit_transform here would re-fit them on the test data, so the features
        # would no longer match what the classifier was trained on.
        X_test_tfidf = tfidf_transformer.transform(X_test_counts)
        predict_labels = clf_model.predict(X_test_tfidf)
        actualLabels = np.array(test_y).astype('int')
        # Classifier score (accuracy, not F1) on the test project.
        clf_test_score = clf_model.score(X_test_tfidf, actualLabels)
        score_rows.append({"Train Project": df_name, "Test Project": df_name2, "Prediction Score": "{:.2f}".format(clf_test_score)})

# Add the new rows to the existing score table in one step.
if score_rows:
    new_scores = pd.DataFrame(score_rows)
    if df_scores.empty:
        df_scores = new_scores
    else:
        df_scores = pd.concat([df_scores, new_scores], ignore_index=True)

            
    
    

### Output the Results

In [None]:
# Write the binary-class score table to disk. The row index is written too, as
# no `index` argument is passed.
df_scores.to_csv("PredictionScores_BinaryClass_Retry.csv")

## Brute Force!!! (MultiClass)

In [None]:
# Train one multi-class Naive Bayes model per project and score it against
# every other project. Result rows are collected in a plain list and turned
# into a DataFrame once at the end (DataFrame.append was removed in pandas 2.0
# and copies the whole frame on every call).
score_rows = []
project_num = 0
for df_name in df_unique:
    project_num = project_num + 1
    train = df[df["req1Product"] == df_name]
    # Eligibility and balancing are still decided on the BinaryClass counts.
    independent = len(train[train["BinaryClass"] == 0])
    dependent = len(train[train["BinaryClass"] == 1])
    # Skip projects that are missing either binary class.
    if independent == 0 or dependent == 0:
        print("Not using {}".format(df_name))
        continue
    # Balance the classes when they differ; the flag tells the helper whether
    # the independent pairs are the larger class.
    if independent != dependent:
        train = util.balance_train(df_name, train, independent > dependent)
    train_x, train_y = util.x_y_multiclass_split(train)
    # Fit the vocabulary and the IDF weights on the training project only.
    X_train_counts = count_vect.fit_transform(np.array(train_x))
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    clf_model = MultinomialNB().fit(X_train_tfidf, np.array(train_y).astype('int'))
    # Progress message naming the project whose model is being evaluated.
    print("Currently Testing using {} model, this is project number {}".format(df_name, project_num))
    # Evaluate this model on every other project.
    for df_name2 in df_unique:
        if df_name == df_name2:
            continue
        test = df[df["req1Product"] == df_name2]
        # Nothing to score, e.g. when the product value is NaN and matches no row.
        if test.empty:
            continue
        test_x, test_y = util.x_y_multiclass_split(test)
        X_test_counts = count_vect.transform(np.array(test_x))
        # Apply the IDF weights learned from the training project. Calling
        # fit_transform here would re-fit them on the test data, so the features
        # would no longer match what the classifier was trained on.
        X_test_tfidf = tfidf_transformer.transform(X_test_counts)
        predict_labels = clf_model.predict(X_test_tfidf)
        actualLabels = np.array(test_y).astype('int')
        # Classifier score (accuracy, not F1) on the test project.
        clf_test_score = clf_model.score(X_test_tfidf, actualLabels)
        score_rows.append({"Train Project": df_name, "Test Project": df_name2, "Prediction Score": "{:.2f}".format(clf_test_score)})

# Build the result table in one step; the explicit columns keep the schema
# even when no project qualified.
df_scores = pd.DataFrame(score_rows, columns=["Train Project", "Test Project", "Prediction Score"])

In [None]:
# Write the multi-class score table to disk. The row index is written too, as
# no `index` argument is passed.
df_scores.to_csv("PredictionScores_MultiClass_Retry.csv")