# Importing required packages:

In [None]:
import nltk
import numpy as np
import scipy
import javalang
import re
import sys
import pyparsing
import json
import random
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from pandas import DataFrame

import os
from os import listdir
from os.path import isfile, join, splitext,split
import csv
import math
import time
import warnings

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

from sklearn.feature_selection import RFE, VarianceThreshold, SelectKBest, f_classif, mutual_info_classif, chi2, f_regression, SelectFpr, SelectFdr
from sklearn.calibration import CalibratedClassifierCV

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVR

In [None]:
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
from anytree import Node, RenderTree, PreOrderIter, PostOrderIter, LevelOrderIter
from anytree.exporter import DotExporter
import graphviz

In [None]:
# Silence every warning raised from this point on (including any emitted by
# the libraries imported above).
warnings.filterwarnings("ignore")

In [None]:
# Default font size for all matplotlib figures drawn in this notebook.
plt.rcParams.update({'font.size':12})

In [None]:
# Probability cut-off read as a global by the *_3_2 classifier functions when
# they turn predicted positive-class probabilities into 0/1 labels.
threshold = 0.5

# Defining a method to calculate metrics:

In [None]:
def compute_metrics(Y_test,predicted_result):
    """Print and return binary-classification metrics for a set of predictions.

    The confusion matrix of ``Y_test`` against ``predicted_result`` is printed,
    then precision, recall, accuracy, F1 score and false-positive rate are
    derived from its top-left 2x2 cells and printed as well. A ratio whose
    denominator is zero is reported as the string ``'not_defined'``.

    Returns:
        tuple: ``(precision, recall, accuracy, f1_score, fp_rate)``.
    """
    matrix = confusion_matrix(Y_test, predicted_result)
    print('confusion matrix:')
    print(matrix)

    # Rows are the true class, columns the predicted class.
    true_neg, false_pos = matrix[0, 0], matrix[0, 1]
    false_neg, true_pos = matrix[1, 0], matrix[1, 1]

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    actual_neg = false_pos + true_neg

    precision = true_pos / predicted_pos if predicted_pos != 0 else 'not_defined'
    recall = true_pos / actual_pos if actual_pos != 0 else 'not_defined'
    accuracy = (true_pos + true_neg) / (true_neg + false_pos + false_neg + true_pos)

    # F1 needs both ratios to be defined and their sum to be non-zero.
    if predicted_pos != 0 and actual_pos != 0 and (precision + recall) != 0:
        f1_score = (2 * precision * recall) / (precision + recall)
    else:
        f1_score = 'not_defined'

    fp_rate = false_pos / actual_neg if actual_neg != 0 else 'not_defined'

    print('precision:', precision)
    print('recall:', recall)
    print('accuracy:', accuracy)
    print('f1 score:', f1_score)
    print('FP rate:', fp_rate)
    return (precision, recall, accuracy, f1_score, fp_rate)

# Plotting the training accuracy and loss:

In [None]:
def plot_variables(history):
    """Plot training/validation accuracy and loss against the epoch number.

    Two figures are shown one after the other: accuracy curves first, then
    loss curves.

    Args:
        history: Object exposing a ``history`` mapping with the keys ``'acc'``,
            ``'val_acc'``, ``'loss'`` and ``'val_loss'``, each holding one
            value per epoch (presumably a Keras ``History`` object — confirm
            against the caller).
    """
    records = history.history
    training_accuracy = records['acc']
    validation_accuracy = records['val_acc']
    training_loss = records['loss']
    validation_loss = records['val_loss']

    # Derive the x axis from the recorded history rather than from a
    # module-level ``epochs_num``: the function no longer depends on a global
    # and still works when fewer epochs were recorded than planned.
    epochs = range(1, len(training_accuracy) + 1)

    plt.rcParams["figure.figsize"] = (10,6)
    plt.plot(epochs,training_accuracy,color='darkblue',label='training accuracy')
    plt.plot(epochs,validation_accuracy,color='red',label='validation accuracy')
    plt.title('Diagram of training accuracy and epochs',fontsize=16)
    plt.xlabel('epochs',fontsize=14)
    plt.ylabel('training accuracy',fontsize=14)
    plt.legend(fontsize=12)
    plt.show()

    plt.plot(epochs,training_loss,color='darkblue',label='training loss')
    plt.plot(epochs,validation_loss,color='red',label='validation loss')
    plt.title('Diagram of training loss and epochs',fontsize=16)
    plt.xlabel('epochs',fontsize=14)
    plt.ylabel('training loss',fontsize=14)
    plt.legend(fontsize=12)
    plt.show()

# Concatenating two datasets:

In [None]:
def c_1(data_1,data_2):
    """Join the feature columns of ``data_1`` with all columns of ``data_2``.

    The last column of ``data_1`` is dropped before ``data_2`` is appended
    column-wise, so both inputs must be 2-D with the same number of rows.

    Returns:
        numpy.ndarray: ``data_1`` without its last column, followed by ``data_2``.
    """
    without_last_column = data_1[:, :-1]
    return np.concatenate((without_last_column, data_2), axis=1)

# Plot ROC

In [None]:
def plot_roc(fp_r,tp_r):
    """Show a small ROC plot of true-positive rate against false-positive rate.

    Args:
        fp_r: Values for the x axis (false-positive rates).
        tp_r: Values for the y axis (true-positive rates).
    """
    # Note: this changes the global default figure size for later plots too.
    plt.rcParams.update({"figure.figsize": (3,3)})
    plt.plot(fp_r, tp_r)
    plt.ylabel('TP rate')
    plt.xlabel('FP rate')
    plt.show()

# K-Fold splitting the data:

In [None]:
def kf_data_split(data):
    """Return the ``[train, test]`` pairs of a 10-fold split of ``data``.

    The folds come from ``KFold(n_splits=10)`` with its default settings;
    ``data`` is indexed with the index arrays that the splitter yields.

    Returns:
        list: One ``[train, test]`` pair per fold, in fold order.
    """
    splitter = KFold(n_splits=10)
    return [
        [data[train_rows], data[test_rows]]
        for train_rows, test_rows in splitter.split(data)
    ]

# Converting to binary array:

In [None]:
def convert_to_binary(array_1):
    """Map every strictly positive entry to 1 and every other entry to 0.

    The input is not modified; a new integer array of the same shape is
    returned.
    """
    is_positive = array_1 > 0
    return np.where(is_positive, 1, 0)

# Plot ROC-AUC and number of features:

In [None]:
def compute_metrics_2(Y_test,predicted_result):
    """Return binary-classification metrics without printing anything.

    Precision, recall, accuracy, F1 score and false-positive rate are derived
    from the top-left 2x2 cells of the confusion matrix of ``Y_test`` against
    ``predicted_result``. A ratio whose denominator is zero is reported as the
    string ``'not_defined'``.

    Returns:
        tuple: ``(precision, recall, accuracy, f1_score, fp_rate)``.
    """
    cells = confusion_matrix(Y_test, predicted_result)

    # Rows are the true class, columns the predicted class.
    tn, fp = cells[0, 0], cells[0, 1]
    fn, tp = cells[1, 0], cells[1, 1]

    has_precision = (tp + fp) != 0
    has_recall = (tp + fn) != 0

    precision = tp / (tp + fp) if has_precision else 'not_defined'
    recall = tp / (tp + fn) if has_recall else 'not_defined'
    accuracy = (tp + tn) / (tn + fp + fn + tp)

    # F1 needs both ratios to be defined and their sum to be non-zero.
    if has_precision and has_recall and (precision + recall) != 0:
        f1_score = (2 * precision * recall) / (precision + recall)
    else:
        f1_score = 'not_defined'

    fp_rate = fp / (fp + tn) if (fp + tn) != 0 else 'not_defined'

    return (precision, recall, accuracy, f1_score, fp_rate)

# Classification methods:

In [None]:
def logistic_regression_3_2(train_1,test_1):
    """Fit logistic regression on ``train_1`` and evaluate it on ``test_1``.

    Both inputs are 2-D arrays whose last column (as counted on ``train_1``)
    holds the label and whose preceding columns hold the features.
    Positive-class probabilities are compared with the module-level
    ``threshold`` to obtain hard 0/1 predictions.

    Returns:
        tuple: ``(metrics, auc)`` where ``metrics`` is the 5-tuple returned by
        ``compute_metrics_2`` and ``auc`` is the ROC AUC of the predicted
        probabilities.
    """
    print('classification method: LR')

    n_features = train_1.shape[1] - 1
    X_train, Y_train = train_1[:, :n_features], train_1[:, n_features]
    X_test, Y_test = test_1[:, :n_features], test_1[:, n_features]

    # NOTE(review): an unused local ``max_iter = 1000000`` used to be declared
    # here but was never passed to the estimator, so the library default has
    # always applied. The dead local is removed; the fit itself is unchanged.
    model = LogisticRegression(random_state=0).fit(X_train, Y_train)
    positive_proba = model.predict_proba(X_test)[:, 1]

    auc_1 = roc_auc_score(Y_test, positive_proba)

    # Vectorised thresholding; yields float 0./1. labels like the former loop.
    predicted_result = (positive_proba >= threshold).astype(float)

    computed_metrics = compute_metrics_2(Y_test.astype(int), predicted_result)
    return computed_metrics, auc_1

In [None]:
def RF_classifier_3_2(train_1,test_1):
    """Fit a random forest on ``train_1`` and evaluate it on ``test_1``.

    Both inputs are 2-D arrays whose last column (as counted on ``train_1``)
    holds the label and whose preceding columns hold the features. The tree
    depth comes from the module-level ``m_depth`` and hard predictions from
    comparing positive-class probabilities with the module-level ``threshold``.

    Returns:
        tuple: ``(metrics, auc)`` where ``metrics`` is the 5-tuple returned by
        ``compute_metrics_2`` and ``auc`` is the ROC AUC of the predicted
        probabilities.
    """
    print('classification method: RF')

    n_features = train_1.shape[1] - 1
    X_train, Y_train = train_1[:, :n_features], train_1[:, n_features]
    X_test, Y_test = test_1[:, :n_features], test_1[:, n_features]

    clf = RandomForestClassifier(max_depth=m_depth,random_state=0)
    clf.fit(X_train, Y_train)
    positive_proba = clf.predict_proba(X_test)[:, 1]

    # The ROC curve points were computed and discarded before; only the AUC
    # is part of the result, so only the AUC is computed.
    auc_1 = roc_auc_score(Y_test, positive_proba)

    # Vectorised thresholding; yields float 0./1. labels like the former loop.
    predicted_result = (positive_proba >= threshold).astype(float)

    computed_metrics = compute_metrics_2(Y_test.astype(int), predicted_result)
    return computed_metrics, auc_1

In [None]:
def KN_classifier_3_2(train_1,test_1):
    """Fit a k-nearest-neighbours classifier on ``train_1`` and evaluate it on ``test_1``.

    Both inputs are 2-D arrays whose last column (as counted on ``train_1``)
    holds the label and whose preceding columns hold the features. ``k`` is
    ``floor(sqrt(number of training rows))``, raised by one when that value is
    even. Hard predictions come from comparing positive-class probabilities
    with the module-level ``threshold``.

    Returns:
        tuple: ``(metrics, auc)`` where ``metrics`` is the 5-tuple returned by
        ``compute_metrics_2`` and ``auc`` is the ROC AUC of the predicted
        probabilities.
    """
    print('classification method: KNN')

    n_features = train_1.shape[1] - 1
    X_train, Y_train = train_1[:, :n_features], train_1[:, n_features]
    X_test, Y_test = test_1[:, :n_features], test_1[:, n_features]

    # Force an odd neighbour count (presumably to avoid tied votes between
    # two classes).
    k_value = math.floor(math.sqrt(train_1.shape[0]))
    if k_value % 2 == 0:
        k_value += 1

    model = KNeighborsClassifier(n_neighbors=k_value)
    model.fit(X_train, Y_train)
    positive_proba = model.predict_proba(X_test)[:, 1]

    # The ROC curve points were computed and discarded before; only the AUC
    # is part of the result, so only the AUC is computed.
    auc_1 = roc_auc_score(Y_test, positive_proba)

    # Vectorised thresholding; yields float 0./1. labels like the former loop.
    predicted_result = (positive_proba >= threshold).astype(float)

    computed_metrics = compute_metrics_2(Y_test.astype(int), predicted_result)
    return computed_metrics, auc_1

In [None]:
def G_NB_3_2(train_1,test_1):
    """Fit Gaussian naive Bayes on ``train_1`` and evaluate it on ``test_1``.

    Both inputs are 2-D arrays whose last column (as counted on ``train_1``)
    holds the label and whose preceding columns hold the features. Hard
    predictions come from comparing positive-class probabilities with the
    module-level ``threshold``.

    Returns:
        tuple: ``(metrics, auc)`` where ``metrics`` is the 5-tuple returned by
        ``compute_metrics_2`` and ``auc`` is the ROC AUC of the predicted
        probabilities.
    """
    print('classification method: GNB')

    n_features = train_1.shape[1] - 1
    X_train, Y_train = train_1[:, :n_features], train_1[:, n_features]
    X_test, Y_test = test_1[:, :n_features], test_1[:, n_features]

    clf = GaussianNB()
    clf.fit(X_train, Y_train)
    positive_proba = clf.predict_proba(X_test)[:, 1]

    # The ROC curve points were computed and discarded before; only the AUC
    # is part of the result, so only the AUC is computed.
    auc_1 = roc_auc_score(Y_test, positive_proba)

    # Vectorised thresholding; yields float 0./1. labels like the former loop.
    predicted_result = (positive_proba >= threshold).astype(float)

    computed_metrics = compute_metrics_2(Y_test.astype(int), predicted_result)
    return computed_metrics, auc_1

In [None]:
def Gradient_B_C_3_2(train_1,test_1):
    """Fit a gradient boosting classifier on ``train_1`` and evaluate it on ``test_1``.

    Both inputs are 2-D arrays whose last column (as counted on ``train_1``)
    holds the label and whose preceding columns hold the features. The model
    uses ``learning_rate=1.0`` and the module-level ``m_depth`` as tree depth.
    Hard predictions come from comparing positive-class probabilities with
    the module-level ``threshold``.

    Returns:
        tuple: ``(metrics, auc)`` where ``metrics`` is the 5-tuple returned by
        ``compute_metrics_2`` and ``auc`` is the ROC AUC of the predicted
        probabilities.
    """
    print('classification method: Gradient Boosting Classifier')

    n_features = train_1.shape[1] - 1
    X_train, Y_train = train_1[:, :n_features], train_1[:, n_features]
    X_test, Y_test = test_1[:, :n_features], test_1[:, n_features]

    clf = GradientBoostingClassifier(learning_rate=1.0,max_depth=m_depth,random_state=0)
    clf.fit(X_train, Y_train)
    positive_proba = clf.predict_proba(X_test)[:, 1]

    # The ROC curve points were computed and discarded before; only the AUC
    # is part of the result, so only the AUC is computed.
    auc_1 = roc_auc_score(Y_test, positive_proba)

    # Vectorised thresholding; yields float 0./1. labels like the former loop.
    predicted_result = (positive_proba >= threshold).astype(float)

    computed_metrics = compute_metrics_2(Y_test.astype(int), predicted_result)
    return computed_metrics, auc_1

In [None]:
def Ada_B_3_2(train_1,test_1):
    """Fit an AdaBoost classifier (100 estimators) on ``train_1`` and evaluate it on ``test_1``.

    Both inputs are 2-D arrays whose last column (as counted on ``train_1``)
    holds the label and whose preceding columns hold the features. Hard
    predictions come from comparing positive-class probabilities with the
    module-level ``threshold``.

    Returns:
        tuple: ``(metrics, auc)`` where ``metrics`` is the 5-tuple returned by
        ``compute_metrics_2`` and ``auc`` is the ROC AUC of the predicted
        probabilities.
    """
    print('classification method: Ada B')

    n_features = train_1.shape[1] - 1
    X_train, Y_train = train_1[:, :n_features], train_1[:, n_features]
    X_test, Y_test = test_1[:, :n_features], test_1[:, n_features]

    clf = AdaBoostClassifier(n_estimators = 100, random_state = 0)
    clf.fit(X_train, Y_train)
    positive_proba = clf.predict_proba(X_test)[:, 1]

    # The ROC curve points were computed and discarded before; only the AUC
    # is part of the result, so only the AUC is computed.
    auc_1 = roc_auc_score(Y_test, positive_proba)

    # Vectorised thresholding; yields float 0./1. labels like the former loop.
    predicted_result = (positive_proba >= threshold).astype(float)

    computed_metrics = compute_metrics_2(Y_test.astype(int), predicted_result)
    return computed_metrics, auc_1

In [None]:
def SVM_3_2(train_1,test_1):
    """Fit a hinge-loss SGD classifier on ``train_1`` and score it on ``test_1``.

    Both inputs are 2-D arrays whose last column (as counted on ``train_1``)
    holds the label and whose preceding columns hold the features.

    Returns:
        tuple: The 5-tuple produced by ``compute_metrics_2`` for the predicted
        labels.
    """
    # NOTE(review): unlike the other *_3_2 classifiers this returns only the
    # metrics tuple, not ``(metrics, auc)``; callers that index the result
    # with ``[1]`` receive the recall here, not an AUC — confirm intent.
    print('classification method: linear SVM')

    label_col = train_1.shape[1] - 1
    features_train = train_1[:, 0:label_col]
    labels_train = train_1[:, label_col]
    features_test = test_1[:, 0:label_col]
    labels_test = test_1[:, label_col]

    svm = SGDClassifier(loss='hinge',max_iter=1000,random_state = 0)
    svm.fit(features_train, labels_train)

    predictions = svm.predict(features_test)
    return compute_metrics_2(labels_test, predictions)

# Selecting subset of data:

In [None]:
def select_data_subset(data_1,c):
    """Reduce ``data_1`` to 400 features and return the reduced array.

    The work is delegated to ``select_features_2`` with ``k_value=400``; the
    elapsed wall-clock time in seconds is printed.

    Args:
        data_1: Array passed unchanged to ``select_features_2``.
        c: Classifier code selecting the ``score_f`` value handed to
            ``select_features_2``: 1 -> 7, 2 -> 8, 4 -> 10.

    Returns:
        The first element of the tuple returned by ``select_features_2``.

    Raises:
        ValueError: If ``c`` is not one of the supported codes. An unsupported
            code used to fall through every branch and fail later with an
            ``UnboundLocalError``.
    """
    score_f_by_code = {1: 7, 2: 8, 4: 10}
    if c not in score_f_by_code:
        raise ValueError(
            'Unsupported classifier code {!r}; expected one of {}'.format(
                c, sorted(score_f_by_code)))

    start_time = time.time()
    new_d = select_features_2(data_1, 400, score_f_by_code[c])[0]
    elapsed_time = time.time() - start_time
    print(elapsed_time)
    return new_d

In [None]:
# Classifier code passed to select_data_subset: 1 = LR, 2 = RF, 4 = GNB.
# NOTE(review): this comment previously listed "9:KNN, 10:GNB", which does not
# match the codes select_data_subset checks (1, 2, 4) — confirm the mapping.
c1 = 1

In [None]:
# Run feature selection on all five dataset pairs (d_N_p / d_N_c) with the
# classifier code stored in c1. step_value is a global that select_features_2
# reads as the RFE step, so it is (re)assigned before each pair.
step_value = 25
d_1_p_2 = select_data_subset(d_1_p,c1)
d_1_c_2 = select_data_subset(d_1_c,c1)

step_value = 25
d_2_p_2 = select_data_subset(d_2_p,c1)
d_2_c_2 = select_data_subset(d_2_c,c1)

step_value = 25
d_3_p_2 = select_data_subset(d_3_p,c1)
d_3_c_2 = select_data_subset(d_3_c,c1)

step_value = 25
d_4_p_2 = select_data_subset(d_4_p,c1)
d_4_c_2 = select_data_subset(d_4_c,c1)

step_value = 25
d_5_p_2 = select_data_subset(d_5_p,c1)
d_5_c_2 = select_data_subset(d_5_c,c1)

In [None]:
# The following five cells repeat the same selection one dataset pair at a
# time, so a single pair can be (re)computed on its own.
step_value = 25
d_1_p_2 = select_data_subset(d_1_p,c1)
d_1_c_2 = select_data_subset(d_1_c,c1)

In [None]:
# Dataset pair 2.
step_value = 25
d_2_p_2 = select_data_subset(d_2_p,c1)
d_2_c_2 = select_data_subset(d_2_c,c1)

In [None]:
# Dataset pair 3.
step_value = 25
d_3_p_2 = select_data_subset(d_3_p,c1)
d_3_c_2 = select_data_subset(d_3_c,c1)

In [None]:
# Dataset pair 4.
step_value = 25
d_4_p_2 = select_data_subset(d_4_p,c1)
d_4_c_2 = select_data_subset(d_4_c,c1)

In [None]:
# Dataset pair 5.
step_value = 25
d_5_p_2 = select_data_subset(d_5_p,c1)
d_5_c_2 = select_data_subset(d_5_c,c1)

In [None]:
# Persist the ten reduced arrays as .npy files under ./data_2/ (pairs 1-4 are
# stored under AST1-AST4 names, pair 5 under the "word" name).
np.save('./data_2/dense_trigram_data_AST1_LR.npy',d_1_p_2)
np.save('./data_2/dense_trigram_data_c_AST1_LR.npy',d_1_c_2)

np.save('./data_2/dense_trigram_data_AST2_LR.npy',d_2_p_2)
np.save('./data_2/dense_trigram_data_c_AST2_LR.npy',d_2_c_2)

np.save('./data_2/dense_trigram_data_AST3_LR.npy',d_3_p_2)
np.save('./data_2/dense_trigram_data_c_AST3_LR.npy',d_3_c_2)

np.save('./data_2/dense_trigram_data_AST4_LR.npy',d_4_p_2)
np.save('./data_2/dense_trigram_data_c_AST4_LR.npy',d_4_c_2)

np.save('./data_2/dense_trigram_data_word_LR.npy',d_5_p_2)
np.save('./data_2/dense_trigram_data_c_word_LR.npy',d_5_c_2)

In [None]:
# Reload the ten reduced arrays from the .npy files under ./data_2/, so the
# feature selection does not have to be re-run in a fresh session.
d_1_p_2 = np.load('./data_2/dense_trigram_data_AST1_LR.npy')
d_1_c_2 = np.load('./data_2/dense_trigram_data_c_AST1_LR.npy')

d_2_p_2 = np.load('./data_2/dense_trigram_data_AST2_LR.npy')
d_2_c_2 = np.load('./data_2/dense_trigram_data_c_AST2_LR.npy')

d_3_p_2 = np.load('./data_2/dense_trigram_data_AST3_LR.npy')
d_3_c_2 = np.load('./data_2/dense_trigram_data_c_AST3_LR.npy')

d_4_p_2 = np.load('./data_2/dense_trigram_data_AST4_LR.npy')
d_4_c_2 = np.load('./data_2/dense_trigram_data_c_AST4_LR.npy')

d_5_p_2 = np.load('./data_2/dense_trigram_data_word_LR.npy')
d_5_c_2 = np.load('./data_2/dense_trigram_data_c_word_LR.npy')

# Dividing the data into fixed sets of test and train:

In [None]:
def split_data_2(data):
    """Split ``data`` row-wise into a fixed train set and test set.

    ``ceil(n_rows / 5)`` row indices are drawn with ``random.sample`` right
    after seeding the global ``random`` generator with 10, so the same rows
    are chosen on every call for a given number of rows. Row order is
    preserved in both outputs.

    Note:
        Re-seeding the global ``random`` generator is a deliberate side effect
        kept for reproducibility; it also affects later ``random`` calls.

    Args:
        data: Array with a ``shape`` attribute, indexed along its first axis.

    Returns:
        tuple: ``(train_data, test_data)`` as new numpy arrays.
    """
    n_rows = data.shape[0]
    n_test = int(np.ceil(n_rows / 5))
    random.seed(10)
    test_rows = random.sample(range(n_rows), n_test)

    # A boolean mask replaces the former ``i in index_list`` test on a list,
    # which made the split quadratic in the number of rows.
    is_test = np.zeros(n_rows, dtype=bool)
    is_test[test_rows] = True

    rows = np.asarray(data)
    return rows[~is_test], rows[is_test]

In [None]:
# Fixed train/test split of the d_2_p and d_2_c arrays. split_data_2 re-seeds
# the random generator on every call, so both arrays are split at the same row
# indices when they have the same number of rows.
train_2_p, test_2_p = split_data_2(d_2_p)
train_2_c, test_2_c = split_data_2(d_2_c)

# Feature selection:

In [None]:
def select_features_2(data,k_value,score_f):
    """Select features from ``data`` with a univariate filter or with RFE.

    Args:
        data: 2-D array whose last column is the label and whose preceding
            columns are the features.
        k_value: Number of features to keep. Not used by codes 4 and 5, which
            select by significance level instead.
        score_f: Method code.
            1: ``SelectKBest`` with ``f_classif``;
            2: ``SelectKBest`` with ``chi2``;
            3: ``SelectKBest`` with ``mutual_info_classif``;
            4: ``SelectFpr`` with ``chi2``, ``alpha=0.01``;
            5: ``SelectFdr`` with ``chi2``, ``alpha=0.01``;
            6: RFE around a linear ``SVR``;
            7: RFE around ``LogisticRegression``;
            8: RFE around ``RandomForestClassifier`` (depth from the global
               ``m_depth``);
            10: RFE around ``GaussianNB``;
            12: RFE around ``AdaBoostClassifier`` (100 estimators).
            The RFE variants read their step from the global ``step_value``.

    Returns:
        tuple: ``(new_data, columns, elapsed_time)`` — the selected feature
        columns with the label column appended, the indices of the selected
        columns in ``data``, and the fit time in seconds.

    Raises:
        ValueError: If ``score_f`` is not one of the codes above. Such codes
            (e.g. 9 or 11) used to leave the selector undefined and fail with
            an ``UnboundLocalError``.
    """
    filter_codes = (1, 2, 3, 4, 5)
    rfe_codes = (6, 7, 8, 10, 12)
    if score_f not in filter_codes and score_f not in rfe_codes:
        raise ValueError(
            'Unsupported score_f {!r}; expected one of {}'.format(
                score_f, filter_codes + rfe_codes))

    length = data.shape[1] - 1
    print('Primary number of features:',length)
    X = data[:,0:length]
    # Kept as a column so it can be stacked back next to the selected features.
    y = data[:,length].reshape(-1,1)

    if score_f in filter_codes:
        if score_f == 1:
            s = SelectKBest(score_func=f_classif, k=k_value)
        elif score_f == 2:
            s = SelectKBest(score_func=chi2, k=k_value)
        elif score_f == 3:
            s = SelectKBest(score_func=mutual_info_classif, k=k_value)
        elif score_f == 4:
            s = SelectFpr(score_func=chi2, alpha=0.01)
        else:
            s = SelectFdr(score_func=chi2, alpha=0.01)
    else:
        if score_f == 6:
            estimator = SVR(kernel = "linear")
        elif score_f == 7:
            estimator = LogisticRegression(random_state = 0)
            print('LR','step_value:',step_value)
        elif score_f == 8:
            estimator = RandomForestClassifier(max_depth=m_depth,random_state=0)
            print('RF','step_value:',step_value)
        elif score_f == 10:
            # NOTE(review): RFE ranks features through the estimator's
            # ``coef_`` / ``feature_importances_``; confirm that GaussianNB
            # exposes one of them in the installed scikit-learn.
            estimator = GaussianNB()
            print('GNB','step_value:',step_value)
        else:
            estimator = AdaBoostClassifier(n_estimators = 100, random_state = 0)
            print('Ada_B','step_value:',step_value)

        s = RFE(estimator,n_features_to_select=k_value,step=step_value)

    start_time = time.time()
    # Estimators expect a 1-D target; passing the column vector only worked
    # through an implicit conversion that raises a warning.
    new_X = s.fit_transform(X, y.ravel())
    elapsed_time = time.time() - start_time

    columns = s.get_support(indices=True)
    new_data = np.hstack((new_X,y))

    return new_data,columns,elapsed_time


# Evaluation:

In [None]:
def classification_result(train_2,test_2,c):
    """Train and evaluate the classifier selected by code ``c``.

    Args:
        train_2: Training array handed to the classifier function.
        test_2: Test array handed to the classifier function.
        c: Classifier code: 1 logistic regression, 2 random forest,
            4 Gaussian naive Bayes, 5 gradient boosting, 6 AdaBoost,
            7 linear SVM.

    Returns:
        Whatever the selected ``*_3_2`` function returns: ``(metrics, auc)``
        for codes 1, 2, 4, 5 and 6, and the bare metrics tuple for code 7.

    Raises:
        ValueError: If ``c`` is not a supported code. An unsupported code used
            to fall through every branch and fail with an
            ``UnboundLocalError``.
    """
    # NOTE(review): KN_classifier_3_2 is defined in this file but has no code
    # here — confirm whether it should be reachable.
    if c not in (1, 2, 4, 5, 6, 7):
        raise ValueError(
            'Unsupported classifier code {!r}; expected one of '
            '[1, 2, 4, 5, 6, 7]'.format(c))

    classifiers = {
        1: logistic_regression_3_2,
        2: RF_classifier_3_2,
        4: G_NB_3_2,
        5: Gradient_B_C_3_2,
        6: Ada_B_3_2,
        7: SVM_3_2,
    }
    return classifiers[c](train_2, test_2)

In [None]:
def Evaluation_RFE(data_1,data_2,data_3,data_4,data_5,data_6,data_7,data_8,data_9,data_10,c_method):
    """Track test AUC while recursively eliminating features on five dataset pairs.

    The ten inputs are consumed as five (probability, count) pairs:
    ``(data_1, data_2)`` -> AST1, ``(data_3, data_4)`` -> AST2,
    ``(data_5, data_6)`` -> AST3, ``(data_7, data_8)`` -> AST4 and
    ``(data_9, data_10)`` -> word. Every input is copied and split once with
    ``split_data_2``.

    For ``k = 500, 475, ..., 25`` each training set is reduced to ``k``
    features with ``select_features_2``, the same columns are kept in the
    matching test set, and the AUC from ``classification_result`` is recorded.
    The reduction is cumulative: each round starts from the arrays left by the
    previous round. A third "binary" AUC is computed per pair from the reduced
    count arrays after ``convert_to_binary``. In the first round the selection
    time of every dataset is printed.

    Args:
        data_1 ... data_10: 2-D arrays with the label in the last column.
        c_method: Classifier code; 1, 2, 4 and 6 are supported and map to the
            ``score_f`` values 7, 8, 10 and 12 of ``select_features_2``.

    Returns:
        tuple: 16 lists — the feature counts, followed by the probability,
        count and binary AUC series for AST1, AST2, AST3, AST4 and word, in
        that order.

    Raises:
        ValueError: If ``c_method`` is not supported. This used to surface as
            an ``UnboundLocalError`` only after all the data had been split.
    """
    score_f_by_method = {1: 7, 2: 8, 4: 10, 6: 12}
    if c_method not in score_f_by_method:
        raise ValueError(
            'Unsupported c_method {!r}; expected one of {}'.format(
                c_method, sorted(score_f_by_method)))
    score_f_type = score_f_by_method[c_method]

    def reduce_and_score(train, test, k):
        """Keep ``k`` features in both arrays; return them with the AUC and fit time."""
        new_train, kept_columns, elapsed = select_features_2(train, k, score_f_type)
        # The label is the last column of ``test``; keep it next to the
        # selected feature columns.
        kept_columns = np.append(kept_columns, test.shape[1] - 1)
        new_test = test[:, kept_columns]
        auc = classification_result(new_train, new_test, c_method)[1]
        return new_train, new_test, auc, elapsed

    inputs = (data_1, data_2, data_3, data_4, data_5,
              data_6, data_7, data_8, data_9, data_10)
    # Each entry is a mutable [train, test] pair that shrinks round by round.
    splits = [list(split_data_2(d.copy())) for d in inputs]

    # (name, probability split, count split, probability label, count label).
    # The labels are the exact strings printed in the first round.
    groups = [
        ('AST1', splits[0], splits[1], 'AST1_p:', 'AST1_c:'),
        ('AST2', splits[2], splits[3], 'AST2_p:', 'AST2_c:'),
        ('AST3', splits[4], splits[5], 'AST3_p:', 'AST3_c'),
        ('AST4', splits[6], splits[7], 'AST4_p:', 'AST4_c:'),
        ('word', splits[8], splits[9], 'Word_p:', 'Word_c:'),
    ]

    features_n = []
    auc_p = {group[0]: [] for group in groups}
    auc_c = {group[0]: [] for group in groups}
    auc_b = {group[0]: [] for group in groups}

    i_max = 500
    for i in range(i_max,24,-25):
        print(i)
        features_n.append(i)

        for name, p_split, c_split, p_label, c_label in groups:
            p_split[0], p_split[1], auc_value, elapsed = reduce_and_score(
                p_split[0], p_split[1], i)
            auc_p[name].append(auc_value)
            if i == i_max:
                print(p_label)
                print(round(elapsed,4),'\n')

            c_split[0], c_split[1], auc_value, elapsed = reduce_and_score(
                c_split[0], c_split[1], i)
            auc_c[name].append(auc_value)
            if i == i_max:
                print(c_label)
                print(round(elapsed,4),'\n')

            # convert_to_binary returns new arrays, so the count arrays carried
            # into the next round are left untouched.
            b_train_data = convert_to_binary(c_split[0])
            b_test_data = convert_to_binary(c_split[1])
            auc_b[name].append(
                classification_result(b_train_data,b_test_data,c_method)[1])

    result = [features_n]
    for group in groups:
        name = group[0]
        result.extend([auc_p[name], auc_c[name], auc_b[name]])
    return tuple(result)
        

In [None]:
# AST4
def Evaluation_RFE_3(data_1,data_2,data_3,data_4,data_5,data_6,data_7,data_8,data_9,data_10,c_method):
    """Track test AUC during recursive feature elimination on the AST4 pair only.

    Only ``data_7`` (AST4 probability) and ``data_8`` (AST4 count) are used;
    the other data arguments are accepted so the call signature matches
    ``Evaluation_RFE`` but are ignored. Both arrays are copied and split once
    with ``split_data_2``.

    For ``k = 500, 475, ..., 25`` each training set is reduced to ``k``
    features with ``select_features_2``, the same columns are kept in the
    matching test set, and the AUC from ``classification_result`` is recorded.
    The reduction is cumulative across rounds. A "binary" AUC is computed from
    the reduced count arrays after ``convert_to_binary``. In the first round
    the selection times are printed.

    Args:
        data_7, data_8: 2-D arrays with the label in the last column.
        c_method: Classifier code; 1, 2, 4 and 6 are supported and map to the
            ``score_f`` values 7, 8, 10 and 12 of ``select_features_2``.

    Returns:
        tuple: 16 lists in the same layout as ``Evaluation_RFE`` — feature
        counts, then probability/count/binary AUC series for AST1, AST2, AST3,
        AST4 and word. Every series except the AST4 ones is empty.

    Raises:
        ValueError: If ``c_method`` is not supported. This used to surface as
            an ``UnboundLocalError`` only after the data had been split.
    """
    score_f_by_method = {1: 7, 2: 8, 4: 10, 6: 12}
    if c_method not in score_f_by_method:
        raise ValueError(
            'Unsupported c_method {!r}; expected one of {}'.format(
                c_method, sorted(score_f_by_method)))
    score_f_type = score_f_by_method[c_method]

    def reduce_and_score(train, test, k):
        """Keep ``k`` features in both arrays; return them with the AUC and fit time."""
        new_train, kept_columns, elapsed = select_features_2(train, k, score_f_type)
        # The label is the last column of ``test``; keep it next to the
        # selected feature columns.
        kept_columns = np.append(kept_columns, test.shape[1] - 1)
        new_test = test[:, kept_columns]
        auc = classification_result(new_train, new_test, c_method)[1]
        return new_train, new_test, auc, elapsed

    # Only the AST4 pair is evaluated, so only it is copied and split; the
    # other eight datasets were previously split and then never used.
    train_4_p, test_4_p = split_data_2(data_7.copy())
    train_4_c, test_4_c = split_data_2(data_8.copy())

    features_n = []
    auc_array_p_AST4 = []
    auc_array_c_AST4 = []
    auc_array_b_AST4 = []

    i_max = 500
    for i in range(i_max,24,-25):
        print(i)
        features_n.append(i)

        train_4_p, test_4_p, auc_value_p, elapsed = reduce_and_score(train_4_p, test_4_p, i)
        auc_array_p_AST4.append(auc_value_p)
        if i == i_max:
            print('AST4_p:')
            print(round(elapsed,4),'\n')

        train_4_c, test_4_c, auc_value_c, elapsed = reduce_and_score(train_4_c, test_4_c, i)
        auc_array_c_AST4.append(auc_value_c)
        if i == i_max:
            print('AST4_c:')
            print(round(elapsed,4),'\n')

        # convert_to_binary returns new arrays, so the count arrays carried
        # into the next round are left untouched.
        b_train_data = convert_to_binary(train_4_c)
        b_test_data = convert_to_binary(test_4_c)
        auc_value_b = classification_result(b_train_data,b_test_data,c_method)[1]
        auc_array_b_AST4.append(auc_value_b)

    return (
        features_n,
        [], [], [],                                              # AST1 (not evaluated)
        [], [], [],                                              # AST2 (not evaluated)
        [], [], [],                                              # AST3 (not evaluated)
        auc_array_p_AST4, auc_array_c_AST4, auc_array_b_AST4,    # AST4
        [], [], [],                                              # word (not evaluated)
    )
        

In [None]:
# RFE Logistic Regression
# step_value (and m_depth for the random-forest runs below) are globals read
# by select_features_2 and the classifier functions. Each of these cells
# overwrites `result`, so only the most recently run cell's output is kept.
step_value = 25
result = Evaluation_RFE(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 1)

In [None]:
# RFE Logistic Regression
# Same evaluation on the b_* arrays (defined outside this section).
step_value = 25
result = Evaluation_RFE(b_1_p, b_1_c, b_2_p, b_2_c, b_3_p, b_3_c, b_4_p, b_4_c, b_5_p, b_5_c, 1)

In [None]:
# RFE Random Forest
step_value = 25
m_depth = 10
result = Evaluation_RFE(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 2)

In [None]:
# RFE  Logistic Regression AST4
step_value = 25
result = Evaluation_RFE_3(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 1)

In [None]:
# RFE  Random Forest AST4
step_value = 25
m_depth = 10
result = Evaluation_RFE_3(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 2)

In [None]:
# RFE  GNB AST4
step_value = 25
result = Evaluation_RFE_3(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 4)

In [None]:
# RFE Ada_B AST4
step_value = 25
result = Evaluation_RFE_3(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 6)

In [None]:
# Load a previously pickled result object from disk, replacing `result`.
# NOTE: pickle.load can execute arbitrary code from the file; only open
# result files that this notebook wrote itself.
with open("./data_2/results/poi_GNB_trigrams_chi2.txt","rb") as fp:
    result = pickle.load(fp)

In [None]:
# Unpack the 16 result series in the order returned by Evaluation_RFE and
# Evaluation_RFE_3: feature counts, then p/c/b AUC series for AST1-AST4 and word.
features_n,auc_array_p_AST1,auc_array_c_AST1,auc_array_b_AST1,auc_array_p_AST2,auc_array_c_AST2,auc_array_b_AST2,auc_array_p_AST3,auc_array_c_AST3,auc_array_b_AST3,auc_array_p_AST4,auc_array_c_AST4,auc_array_b_AST4,auc_array_p_word,auc_array_c_word,auc_array_b_word = result

In [None]:
# Display names for the entries of `result`, in positional order.
r_names = ['features','AST1_p','AST1_c','AST1_b','AST2_p','AST2_c','AST2_b','AST3_p','AST3_c','AST3_b','AST4_p','AST4_c','AST4_b','Word_p','Word_c','Word_b']

# Print the mean and standard deviation (4 decimals) of every result series.
for position, series in enumerate(result):
    series_mean = round(np.mean(series),4)
    series_std = round(np.std(series),4)
    print(r_names[position])
    print(series_mean)
    print(series_std,'\n')

In [None]:
# Plot AUC against the number of features: one colour per representation and
# one line style per feature-value type (solid, dashed, dotted).
plt.rcParams["figure.figsize"] = (10,10)

auc_series_by_representation = [
    ('magenta', 'Level-order AST', auc_array_p_AST1, auc_array_c_AST1, auc_array_b_AST1),
    ('grey', 'Pre-order AST', auc_array_p_AST2, auc_array_c_AST2, auc_array_b_AST2),
    ('firebrick', 'Post-order AST', auc_array_p_AST3, auc_array_c_AST3, auc_array_b_AST3),
    ('blue', 'Path-based AST', auc_array_p_AST4, auc_array_c_AST4, auc_array_b_AST4),
    ('black', 'Word level', auc_array_p_word, auc_array_c_word, auc_array_b_word),
]

for line_colour, representation, probability_auc, count_auc, binary_auc in auc_series_by_representation:
    plt.plot(features_n,probability_auc,color=line_colour,marker='o',markersize=5,label=representation + ', probability')
    plt.plot(features_n,count_auc,color=line_colour,marker='o',markersize=5,linestyle='--',dashes=(3,3),label=representation + ', count')
    plt.plot(features_n,binary_auc,color=line_colour,marker='o',markersize=5,linestyle='dotted',label=representation + ', binary')

plt.title('AUC per number of features',fontsize=16)
plt.xlabel('Number of features',fontsize=14)
plt.ylabel('AUC',fontsize=14)

# Fix the y range and place the legend outside the axes, to the right.
axes = plt.gca()
axes.set_ylim([0.5,1])
axes.legend(loc='center left', bbox_to_anchor=(1,0.5))
plt.show()

In [None]:
# Copy the current result into a plain list (this is what gets pickled below).
results_list = list(result)

In [None]:
# Pickle results_list to disk. The file name is hard-coded, so it has to be
# edited by hand to match the run whose results are being stored.
with open("./data_2/results/poi_RF_trigrams_FE_AST4.txt","wb") as fp:
    pickle.dump(results_list, fp)

In [None]:
# Load a stored result file and print its contents and number of entries.
# NOTE: pickle.load can execute arbitrary code from the file; only open
# result files that this notebook wrote itself.
with open("./data_2/results/poi_Ada_B_trigrams_FE_AST4.txt","rb") as fp:
    result = pickle.load(fp)
print(result,len(result))

# Chi2 evaluation:

In [None]:
def _chi2_select_and_score(train, test, n_features, score_f_type, c_method):
    """Select features on ``train``, apply the same columns to ``test`` and classify.

    Args:
        train: Training matrix handed to ``select_features_2``.
        test: Test matrix; indexed as a 2-D array.
        n_features: Number of features requested from ``select_features_2``.
        score_f_type: Scoring-function code forwarded to ``select_features_2``.
        c_method: Classifier code forwarded to ``classification_result``.

    Returns:
        Tuple ``(auc, train_selected, test_selected, extra)``. ``auc`` is element 1
        of ``classification_result``'s return value; ``extra`` is element 2 of
        ``select_features_2``'s return value (the caller rounds and prints it).
    """
    selection = select_features_2(train, n_features, score_f_type)
    train_selected = selection[0]
    # Keep the last column of the test matrix next to the selected feature columns
    # (presumably the class label - TODO confirm against split_data_2).
    columns = np.append(selection[1], test.shape[1] - 1)
    test_selected = test[:, columns]
    auc = classification_result(train_selected, test_selected, c_method)[1]
    return auc, train_selected, test_selected, selection[2]


def _chi2_score_binary(train_c, test_c, c_method):
    """Binarise copies of the selected count data and return the classifier's AUC."""
    # Work on copies: the selected count matrices must not be touched in case
    # convert_to_binary modifies its argument (TODO confirm).
    train_copy = train_c.copy()
    test_copy = test_c.copy()
    b_train_data = convert_to_binary(train_copy)
    b_test_data = convert_to_binary(test_copy)
    return classification_result(b_train_data, b_test_data, c_method)[1]


def Evaluation_chi2(data_1,data_2,data_3,data_4,data_5,data_6,data_7,data_8,data_9,data_10,c_method):
    """Evaluate AUC against the number of selected features for five representations.

    The ten datasets come in (probability, count) pairs, in the order
    AST1, AST2, AST3, AST4, word level. For every feature count from 400 down to 25
    in steps of 25, features are selected on the training split, the same columns
    are taken from the test split, and the classifier chosen by ``c_method`` is
    scored on the probability data, the count data and a binarised copy of the
    selected count data.

    Args:
        data_1 .. data_10: Arrays supporting ``.copy()``; odd positions are the
            probability variants, even positions the count variants.
        c_method: Classifier code forwarded to ``classification_result``.

    Returns:
        A 16-tuple ``(features_n, p_AST1, c_AST1, b_AST1, ..., p_word, c_word, b_word)``
        of lists; every AUC list is aligned with ``features_n``.
    """
    labels = ('AST1', 'AST2', 'AST3', 'AST4', 'Word')
    inputs = (data_1, data_2, data_3, data_4, data_5,
              data_6, data_7, data_8, data_9, data_10)

    # Copy everything first, then split in the original order, so the caller's
    # arrays are never handed to split_data_2 directly.
    copies = [d.copy() for d in inputs]
    splits = [split_data_2(d) for d in copies]

    # Scoring-function code for select_features_2 (presumably the chi2 scorer - confirm).
    score_f_type = 2
    i_max = 400

    features_n = []
    auc_p = [[] for _ in labels]
    auc_c = [[] for _ in labels]
    auc_b = [[] for _ in labels]

    # One selection pass on the word-level probability data whose result is discarded.
    # NOTE(review): looks redundant; kept so printed output and any side effects of
    # select_features_2 stay exactly as before.
    print('Start training')
    select_features_2(splits[8][0], i_max, score_f_type)
    print('Stop training')

    for i in range(i_max,24,-25):
        print(i)
        features_n.append(i)

        for k, label in enumerate(labels):
            train_p, test_p = splits[2 * k]
            train_c, test_c = splits[2 * k + 1]

            # Probability-valued features.
            auc_value_p, _, _, extra = _chi2_select_and_score(
                train_p, test_p, i, score_f_type, c_method)
            auc_p[k].append(auc_value_p)
            if (i == i_max):
                print(label + '_p:')
                print(round(extra,4),'\n')

            # Count-valued features.
            auc_value_c, train_c_sel, test_c_sel, extra = _chi2_select_and_score(
                train_c, test_c, i, score_f_type, c_method)
            auc_c[k].append(auc_value_c)
            if (i == i_max):
                print(label + '_c:')
                print(round(extra,4),'\n')

            # Binary features derived from the selected count columns.
            auc_b[k].append(_chi2_score_binary(train_c_sel, test_c_sel, c_method))

    flat = [features_n]
    for k in range(len(labels)):
        flat.extend((auc_p[k], auc_c[k], auc_b[k]))
    return tuple(flat)
        

### Classifier codes (c_method): 1: LR, 2: RF, 4: GNB, 5: Gradient Boosting, 6: Ada-Boost

In [None]:
# Chi2 Logistic Regression on the d_* arrays (c_method = 1).
# NOTE(review): Evaluation_chi2 does not read step_value itself (its step of 25 is
# hard-coded); presumably a helper defined elsewhere reads this global - confirm.
step_value = 25
result = Evaluation_chi2(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 1)

In [None]:
# Chi2 Logistic Regression on the b_* (bigram) arrays (c_method = 1).
# NOTE(review): identical to the following "(bigrams)" cell.
step_value = 25
result = Evaluation_chi2(b_1_p, b_1_c, b_2_p, b_2_c, b_3_p, b_3_c, b_4_p, b_4_c, b_5_p, b_5_c, 1)

In [None]:
# Chi2 Logistic Regression (bigrams), c_method = 1.
# NOTE(review): exact duplicate of the previous cell; one of the two can be removed.
step_value = 25
result = Evaluation_chi2(b_1_p, b_1_c, b_2_p, b_2_c, b_3_p, b_3_c, b_4_p, b_4_c, b_5_p, b_5_c, 1)

In [None]:
# Chi2 Random Forest on the d_* arrays (c_method = 2).
# m_depth is a notebook global that Evaluation_chi2 does not read directly;
# presumably the random-forest helper uses it as its tree depth - confirm.
step_value = 25
m_depth = 10
result = Evaluation_chi2(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 2)

In [None]:
# Chi2 Gaussian Naive Bayes on the d_* arrays (c_method = 4).
step_value = 25
result = Evaluation_chi2(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 4)

In [None]:
# Chi2 Gradient Boosting Classifier on the d_* arrays (c_method = 5).
# m_depth = 1 here versus 10 in the Random Forest cell; the global keeps whichever
# value was assigned last, so re-run the matching cell before switching classifier.
step_value = 25
m_depth = 1
result = Evaluation_chi2(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 5)

In [None]:
# Chi2 Ada Boost Classifier on the d_* arrays (c_method = 6).
step_value = 25
result = Evaluation_chi2(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 6)

# Concatenating data:

In [None]:
def _rfe_narrow_and_score(train, test, n_features, score_f_type, c_method):
    """Reduce ``train``/``test`` to ``n_features`` selected columns and classify.

    Returns:
        Tuple ``(train_narrowed, test_narrowed, auc, extra)``. ``auc`` is element 1
        of ``classification_result``'s return value; ``extra`` is element 2 of
        ``select_features_2``'s return value.
    """
    selection = select_features_2(train, n_features, score_f_type)
    train_narrowed = selection[0]
    # Keep the last column of the test matrix next to the selected columns
    # (presumably the class label - TODO confirm).
    columns = np.append(selection[1], test.shape[1] - 1)
    test_narrowed = test[:, columns]
    auc = classification_result(train_narrowed, test_narrowed, c_method)[1]
    return train_narrowed, test_narrowed, auc, selection[2]


def Evaluation_RFE_2(data_1,data_2,c_method):
    """Evaluate AUC against feature count on one (probability, count) dataset pair.

    Feature counts run from 500 down to 25 in steps of 25. The train and test
    matrices are overwritten with their reduced versions after every step, so each
    iteration selects from the features that survived the previous one.

    Args:
        data_1: Probability-valued dataset (supports ``.copy()``).
        data_2: Count-valued dataset (supports ``.copy()``).
        c_method: Classifier code; only 1 and 2 are supported here.

    Returns:
        ``(features_n, auc_array_p, auc_array_c, auc_array_b)`` - four aligned lists.

    Raises:
        ValueError: If ``c_method`` is neither 1 nor 2. Previously this surfaced
            later as an unbound ``score_f_type``.
    """
    # Validate before doing any work. The codes 7 / 8 are forwarded to
    # select_features_2 (presumably RFE with the matching estimator - confirm).
    if (c_method == 1):
        score_f_type = 7
    elif (c_method == 2):
        score_f_type = 8
    else:
        raise ValueError(
            'Evaluation_RFE_2 supports c_method 1 or 2, got ' + repr(c_method))

    d_1 = data_1.copy()
    d_2 = data_2.copy()

    train_1_p, test_1_p = split_data_2(d_1)
    train_1_c, test_1_c = split_data_2(d_2)

    features_n = []

    auc_array_p = []
    auc_array_c = []
    auc_array_b = []

    i_max = 500
    for i in range(i_max,24,-25):
        print(i)
        features_n.append(i)

        # Probability data: narrow in place for the next iteration.
        train_1_p, test_1_p, auc_value_p, extra = _rfe_narrow_and_score(
            train_1_p, test_1_p, i, score_f_type, c_method)
        auc_array_p.append(auc_value_p)
        if (i == i_max):
            print('p:')
            print(round(extra,4),'\n')

        # Count data: narrowed the same way.
        train_1_c, test_1_c, auc_value_c, extra = _rfe_narrow_and_score(
            train_1_c, test_1_c, i, score_f_type, c_method)
        auc_array_c.append(auc_value_c)
        if (i == i_max):
            print('c:')
            print(round(extra,4),'\n')

        # Binary data: derived from copies so the count matrices stay usable in
        # the next iteration even if convert_to_binary modifies its argument.
        train_1_c_2 = train_1_c.copy()
        test_1_c_2 = test_1_c.copy()
        b_train_data = convert_to_binary(train_1_c_2)
        b_test_data = convert_to_binary(test_1_c_2)
        auc_value_b = classification_result(b_train_data,b_test_data,c_method)[1]
        auc_array_b.append(auc_value_b)

    return(features_n,auc_array_p,auc_array_c,auc_array_b)

In [None]:
# RFE Logistic Regression (c_method = 1) on the concatenated data.
# new_data_p / new_data_c are built by the c_1(...) cells further down; run one of
# those first.
step_value = 25
result_2 = Evaluation_RFE_2(new_data_p,new_data_c,1)

In [None]:
# Unpack the 4-tuple returned by Evaluation_RFE_2: feature counts plus the
# probability / count / binary AUC lists for the concatenated data.
features_n,auc_array_p_concatenated,auc_array_c_concatenated,auc_array_b_concatenated = result_2

In [None]:
# Combine the b_4_* and d_4_* arrays with c_1 (defined elsewhere; presumably a
# column-wise concatenation - confirm) and print the resulting shapes.
new_data_p = c_1(b_4_p,d_4_p)
print(new_data_p.shape)

new_data_c = c_1(b_4_c,d_4_c)
print(new_data_c.shape)

In [None]:
# Alternative to the previous cell: combine four sources (b_4, d_4, b_5, d_5) via
# nested c_1 calls. Whichever of the two cells runs last defines new_data_p / new_data_c.
new_data_p = c_1(c_1(b_4_p,d_4_p),c_1(b_5_p,d_5_p))
print(new_data_p.shape)

new_data_c = c_1(c_1(b_4_c,d_4_c),c_1(b_5_c,d_5_c))
print(new_data_c.shape)

# Without feature selection:

In [None]:
def without_fs(d_1,c_method):
    """Classify on the full feature set (no selection) and report the wall time.

    Splits ``d_1`` with ``split_data_2``, runs ``classification_result`` with the
    classifier code ``c_method``, prints the elapsed seconds rounded to four
    decimals and returns whatever ``classification_result`` returned.
    """
    train_part, test_part = split_data_2(d_1)

    # Only the classification call is timed; the split is excluded.
    t_begin = time.time()
    outcome = classification_result(train_part, test_part, c_method)
    t_end = time.time()

    print(round(t_end - t_begin, 4))
    return outcome

In [None]:
# Baseline without feature selection: Random Forest (code 2) on d_1_p.
# m_depth is a notebook global not read by without_fs itself; presumably the
# random-forest helper uses it - confirm.
m_depth = 10
without_fs_result = without_fs(d_1_p,2)
print(without_fs_result)

# K-fold average:

In [None]:
def classification_result_2(train_2,test_2,c):
    """Run the classifier selected by ``c`` and measure how long the call takes.

    Args:
        train_2: Training data passed unchanged to the classifier helper.
        test_2: Test data passed unchanged to the classifier helper.
        c: Classifier code - 1: logistic regression, 2: random forest,
            4: Gaussian naive Bayes, 5: gradient boosting, 6: Ada-Boost, 7: SVM.

    Returns:
        ``(r, elapsed_time)`` where ``r`` is the helper's return value and
        ``elapsed_time`` the wall-clock seconds spent inside the helper.

    Raises:
        ValueError: If ``c`` is not one of the codes above. Previously this failed
            with an unbound ``finish_time``.
    """
    # The helper name is looked up only in the selected branch, so only the
    # classifier actually requested has to be defined in the notebook session.
    if (c == 1):
        classifier = logistic_regression_3_2
    elif (c == 2):
        classifier = RF_classifier_3_2
    elif (c == 4):
        classifier = G_NB_3_2
    elif (c == 5):
        classifier = Gradient_B_C_3_2
    elif (c == 6):
        classifier = Ada_B_3_2
    elif (c == 7):
        classifier = SVM_3_2
    else:
        raise ValueError(
            'Unknown classifier code ' + repr(c) + '; expected one of 1, 2, 4, 5, 6, 7')

    start_time = time.time()
    r = classifier(train_2,test_2)
    finish_time = time.time()

    elapsed_time = finish_time - start_time
    return r, elapsed_time

In [None]:
def kf_average(d_1,c_method):
    """Run the classifier on every fold of ``d_1`` and print per-metric averages.

    ``kf_data_split(d_1)`` supplies the (train, test) folds. Each fold is scored
    with ``classification_result_2``; precision, recall, accuracy, F1, false-positive
    rate and AUC are collected, skipping values reported as ``'not_defined'``.
    The collected arrays, their means, rounded means and the summed classifier
    time are printed. Nothing is returned.
    """
    metric_keys = ('precision', 'recall', 'accuracy', 'f1_score', 'fp_rate', 'auc')
    summary_labels = ('precision', 'recall', 'accuracy', 'f1 score', 'fp rate', 'auc')

    elapsed_total = 0
    trials_n = 1
    collected = {key: [] for key in metric_keys}

    # The folds are computed once and reused for every trial.
    folds = kf_data_split(d_1)
    for trial in range(trials_n):
        print('\nTrial number:',trial+1)
        for fold_no, fold in enumerate(folds, start=1):
            print('\nIteration number:',fold_no)

            timed = classification_result_2(fold[0],fold[1],c_method)
            outcome = timed[0]
            elapsed_total += timed[1]

            precision,recall,accuracy,f1_score,fp_rate = outcome[0]
            fold_values = (precision, recall, accuracy, f1_score, fp_rate, outcome[1])

            # Undefined metrics are left out of the averages.
            for key, value in zip(metric_keys, fold_values):
                if (value != 'not_defined'):
                    collected[key].append(value)

    arrays = {key: np.array(collected[key]) for key in metric_keys}

    # Detailed report: every collected value plus the mean.
    for key in metric_keys:
        print('\n' + key + ' array:',arrays[key],'\naverage ' + key + ':',arrays[key].mean(),'\n')

    # Compact report: means rounded to four decimals.
    for key, label in zip(metric_keys, summary_labels):
        print(label + ':',round(arrays[key].mean(),4))
    print('Total time',round(elapsed_total,4))

In [None]:
# Settings for the k-fold run: c1 is the classifier code handed to kf_average
# (1 = logistic regression); m_depth is a global presumably read by the
# tree-based classifier helpers - confirm.
c1 = 1
m_depth = 10

In [None]:
# K-fold evaluation of the probability data for AST4 (d_4_p), word level (d_5_p)
# and their combination built with c_1.
# kf_average only prints and has no return statement, so `a` is always None.
c1 = 1

print('AST4:')
print('\n\nProbability:')
a = kf_average(d_4_p,c1)

print('Word:')
print('\n\nProbability:')
a = kf_average(d_5_p,c1)

# Combined dataset of the two sources above.
d_6_p = c_1(d_4_p,d_5_p)
print('Combination:')
print('\n\nProbability:')
a = kf_average(d_6_p,c1)

# Loading bigrams:

In [None]:
# ant: project-specific value for the global `threshold` (its consumer is not in
# this part of the notebook). Run only the cell of the project being analysed;
# the threshold cell executed last wins.
threshold = 0.2052

In [None]:
# jEdit: project-specific value for the global `threshold` (run instead of the
# other project cells).
threshold = 0.1623

In [None]:
# poi: project-specific value for the global `threshold` (run instead of the
# other project cells).
threshold = 0.5130

In [None]:
# xalan: project-specific value for the global `threshold` (run instead of the
# other project cells).
threshold = 0.5432

In [None]:
# Project-independent default; running this cell overrides any project-specific
# threshold chosen above.
threshold = 0.5

In [None]:
# bigrams
# Load the dense bigram matrices of one project into b_<k>_p / b_<k>_c.
# k = 1..4 are the AST1..AST4 files, k = 5 the files without an AST suffix; the
# "_c" files go into the *_c names (labelled "count" in the plots) and the others
# into the *_p names (labelled "probability").
# NOTE: path_1 is also used by the plotting cells for the results directory, so it
# holds whichever value was assigned last.
path_1 = 'D:/SW_defect_prediction_data/'

# Project sub-directory; change this to load another project.
path_2 = 'poi'

path_3 = os.path.join(path_1,path_2)


path_4 = os.path.join(path_3,'dense_bigram_data_AST1.npy')
b_1_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_bigram_data_c_AST1.npy')
b_1_c = np.load(path_4)


path_4 = os.path.join(path_3,'dense_bigram_data_AST2.npy')
b_2_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_bigram_data_c_AST2.npy')
b_2_c = np.load(path_4)


path_4 = os.path.join(path_3,'dense_bigram_data_AST3.npy')
b_3_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_bigram_data_c_AST3.npy')
b_3_c = np.load(path_4)


path_4 = os.path.join(path_3,'dense_bigram_data_AST4.npy')
b_4_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_bigram_data_c_AST4.npy')
b_4_c = np.load(path_4)


# Files without an AST suffix (used as the word-level data, k = 5).
path_4 = os.path.join(path_3,'dense_bigram_data.npy')
b_5_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_bigram_data_c.npy')
b_5_c = np.load(path_4)

In [None]:
# Binary variants of the bigram count matrices.
# NOTE(review): the Evaluation_* functions copy their arrays before calling
# convert_to_binary, which suggests it may modify its argument in place; if so,
# the b_<k>_c arrays passed here are altered as well - confirm.
b_1_b = convert_to_binary(b_1_c)
b_2_b = convert_to_binary(b_2_c)
b_3_b = convert_to_binary(b_3_c)
b_4_b = convert_to_binary(b_4_c)
b_5_b = convert_to_binary(b_5_c)

In [None]:
# Sanity check: print the shape of every loaded bigram matrix (p then c, k = 1..5).
print(b_1_p.shape)
print(b_1_c.shape)
print(b_2_p.shape)
print(b_2_c.shape)
print(b_3_p.shape)
print(b_3_c.shape)
print(b_4_p.shape)
print(b_4_c.shape)
print(b_5_p.shape)
print(b_5_c.shape)

# Loading trigrams:

In [None]:
# ant: project-specific value for the global `threshold`. Run only the cell of
# the project whose trigram data is loaded below.
threshold = 0.2052

In [None]:
# jEdit: project-specific value for the global `threshold`.
threshold = 0.1623

In [None]:
# poi: project-specific value for the global `threshold`.
threshold = 0.5130

In [None]:
# xalan: project-specific value for the global `threshold`.
threshold = 0.5432

In [None]:
# trigrams
# Load the dense trigram matrices of one project into d_<k>_p / d_<k>_c.
# k = 1..4 are the AST1..AST4 files, k = 5 the files without an AST suffix; the
# "_c" files go into the *_c names (labelled "count" in the plots) and the others
# into the *_p names (labelled "probability").
# NOTE: path_1 is also used by the plotting cells for the results directory, so it
# holds whichever value was assigned last.
path_1 = 'D:/SW_defect_prediction_data/'

# Project sub-directory; change this to load another project.
path_2 = 'poi'

path_3 = os.path.join(path_1,path_2)


path_4 = os.path.join(path_3,'dense_trigram_data_AST1.npy')
d_1_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_c_AST1.npy')
d_1_c = np.load(path_4)


path_4 = os.path.join(path_3,'dense_trigram_data_AST2.npy')
d_2_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_c_AST2.npy')
d_2_c = np.load(path_4)


path_4 = os.path.join(path_3,'dense_trigram_data_AST3.npy')
d_3_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_c_AST3.npy')
d_3_c = np.load(path_4)


path_4 = os.path.join(path_3,'dense_trigram_data_AST4.npy')
d_4_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_c_AST4.npy')
d_4_c = np.load(path_4)


# Files without an AST suffix (used as the word-level data, k = 5).
path_4 = os.path.join(path_3,'dense_trigram_data.npy')
d_5_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_c.npy')
d_5_c = np.load(path_4)

In [None]:
# Binary variants of the trigram count matrices.
# NOTE(review): the Evaluation_* functions copy their arrays before calling
# convert_to_binary, which suggests it may modify its argument in place; if so,
# the d_<k>_c arrays passed here are altered as well - confirm.
d_1_b = convert_to_binary(d_1_c)
d_2_b = convert_to_binary(d_2_c)
d_3_b = convert_to_binary(d_3_c)
d_4_b = convert_to_binary(d_4_c)
d_5_b = convert_to_binary(d_5_c)

In [None]:
# Sanity check: print the shapes of the probability, count and binary trigram
# matrices for k = 1..5.
print(d_1_p.shape)
print(d_1_c.shape)
print(d_1_b.shape)

print(d_2_p.shape)
print(d_2_c.shape)
print(d_2_b.shape)

print(d_3_p.shape)
print(d_3_c.shape)
print(d_3_b.shape)

print(d_4_p.shape)
print(d_4_c.shape)
print(d_4_b.shape)

print(d_5_p.shape)
print(d_5_c.shape)
print(d_5_b.shape)

In [None]:
# xalan
# Draw a reproducible random subsample (seed 0) of 1700 row indices out of
# range(3304) without replacement and apply the same rows to every trigram
# p / c matrix.
# NOTE(review): 3304 is hard-coded, presumably the xalan row count - confirm it
# matches d_1_p.shape[0]. The d_<k>_b arrays built earlier are NOT subsampled here.
np.random.seed(0)
index_list = np.random.choice(3304,1700,replace=False)

d_1_p = d_1_p[index_list]
d_1_c = d_1_c[index_list]

d_2_p = d_2_p[index_list]
d_2_c = d_2_c[index_list]

d_3_p = d_3_p[index_list]
d_3_c = d_3_c[index_list]

d_4_p = d_4_p[index_list]
d_4_c = d_4_c[index_list]

d_5_p = d_5_p[index_list]
d_5_c = d_5_c[index_list]

# Print multiple plots:

## One classification method, multiple projects:

In [None]:
p_name = ['Ant','jEdit','POI','Xalan']
path_1 = "./data_2/results/"
def print_interval(file_names, results_dir="./data_2/results/", project_names=('Ant','jEdit','POI','Xalan')):
    """Print the overall minimum and maximum AUC stored in each pickled result file.

    Every file is expected to hold a sequence whose first element is the list of
    feature counts and whose remaining elements are AUC arrays (as produced by
    ``Evaluation_chi2``).

    Args:
        file_names: Result file names, one per project, in ``project_names`` order.
        results_dir: Directory containing the files. Taken as a parameter instead
            of the global ``path_1``, because the data-loading cells rebind
            ``path_1`` to the data directory.
        project_names: Display names paired with ``file_names``.

    Returns:
        Always 0.
    """
    for project, file_name in zip(project_names, file_names):
        # Only load pickles written by this notebook; pickle is unsafe on untrusted files.
        with open(join(results_dir, file_name),"rb") as fp:
            result = pickle.load(fp)

        # Element 0 is features_n; every other element is an AUC array.
        auc_arrays = result[1:]
        min_1 = np.min([np.min(values) for values in auc_arrays])
        max_1 = np.max([np.max(values) for values in auc_arrays])

        print('\nFinal result:',project,min_1,max_1)

    return 0

In [None]:
# AUC ranges for Chi2 + Logistic Regression, one result file per project.
# NOTE(review): print_interval prints its own report and returns 0, so the outer
# print() additionally prints "0"; `a` receives print's return value, None.
file_names = ['ant_LR_trigrams_chi2.txt','jEdit_LR_trigrams_chi2.txt','poi_LR_trigrams_chi2.txt','xalan_LR_trigrams_chi2.txt']
a = print(print_interval(file_names))

In [None]:
# AUC ranges for Chi2 + Random Forest, one result file per project.
# The outer print() only adds the returned 0 to the output; `a` ends up None.
file_names = ['ant_RF_trigrams_chi2.txt','jEdit_RF_trigrams_chi2.txt','poi_RF_trigrams_chi2.txt','xalan_RF_trigrams_chi2.txt']
a = print(print_interval(file_names))

In [None]:
# AUC ranges for the "FE" result files with Logistic Regression, one per project.
# The outer print() only adds the returned 0 to the output; `a` ends up None.
file_names = ['ant_LR_trigrams_FE.txt','jEdit_LR_trigrams_FE.txt','poi_LR_trigrams_FE.txt','xalan_LR_trigrams_FE.txt']
a = print(print_interval(file_names))

In [None]:
# Chi2, LR
# Input selection for the four-subplot figure below: run exactly one of the
# three file_names cells before plotting.
file_names = ['ant_LR_trigrams_chi2.txt','jEdit_LR_trigrams_chi2.txt','poi_LR_trigrams_chi2.txt','xalan_LR_trigrams_chi2.txt']

In [None]:
# Chi2, RF
# Alternative input selection for the four-subplot figure below.
file_names = ['ant_RF_trigrams_chi2.txt','jEdit_RF_trigrams_chi2.txt','poi_RF_trigrams_chi2.txt','xalan_RF_trigrams_chi2.txt']

In [None]:
# FE, LR
# Alternative input selection for the four-subplot figure below.
file_names = ['ant_LR_trigrams_FE.txt','jEdit_LR_trigrams_FE.txt','poi_LR_trigrams_FE.txt','xalan_LR_trigrams_FE.txt']

In [None]:
# Four stacked subplots (one per project) of AUC versus number of features,
# showing only the binary-feature curves of the five representations.
# Expects `file_names` to be a flat list of four result files in p_name order.
fig, a = plt.subplots(4,figsize=(9,17))
plt.rcParams.update({'font.size':10})
path_1 = "./data_2/results/"
p_name = ['Ant','jEdit','POI','Xalan']
# Hard-coded y-axis limits per project, in p_name order (presumably taken from
# the print_interval output - confirm when the result files change).
intervals = [[0.58,0.84],[0.63,0.9],[0.55,0.74],[0.58,0.84]]
for i in range(4):
    path_2 = join(path_1,file_names[i])
    with open(path_2,"rb") as fp:
        result = pickle.load(fp)
    # Same 16-element layout as the tuple returned by Evaluation_chi2.
    features_n,auc_array_p_AST1,auc_array_c_AST1,auc_array_b_AST1,auc_array_p_AST2,auc_array_c_AST2,auc_array_b_AST2,auc_array_p_AST3,auc_array_c_AST3,auc_array_b_AST3,auc_array_p_AST4,auc_array_c_AST4,auc_array_b_AST4,auc_array_p_word,auc_array_c_word,auc_array_b_word = result

    a[i].plot(features_n,auc_array_b_AST1,color='magenta',marker='o',markersize=5,linestyle='dotted',label='Level-order AST, binary')

    a[i].plot(features_n,auc_array_b_AST2,color='grey',marker='o',markersize=5,linestyle='dotted',label='Pre-order AST, binary')

    a[i].plot(features_n,auc_array_b_AST3,color='firebrick',marker='o',markersize=5,linestyle='dotted',label='Post-order AST, binary')

    a[i].plot(features_n,auc_array_b_AST4,color='blue',marker='o',markersize=5,linestyle='dotted',label='Path-based AST, binary')

    a[i].plot(features_n,auc_array_b_word,color='black',marker='o',markersize=5,linestyle='dotted',label='Word level, binary')
    
    a[i].set_title(p_name[i] + ' project',fontsize=18)
    a[i].set_xlabel('Number of features',fontsize=14)
    a[i].set_ylabel('AUC',fontsize=14)
    a[i].set_ylim(intervals[i])

# One shared legend, positioned above the first subplot.
a[0].legend(fontsize=12,loc=(0.14,1.25),ncol = 2)
fig.subplots_adjust(hspace = 0.4)
plt.show()

# One project, multiple classification methods:

In [None]:
# jEdit: Chi2 results for LR, RF, GNB and Ada-Boost (order must match the subplot
# titles of the figure below). Run exactly one of these three file_names cells.
file_names = ['jEdit_LR_trigrams_chi2.txt','jEdit_RF_trigrams_chi2.txt','jEdit_GNB_trigrams_chi2.txt','jEdit_Ada_B_trigrams_chi2.txt']

In [None]:
# ant: Chi2 results for LR, RF, GNB and Ada-Boost (same order as the subplot titles).
file_names = ['ant_LR_trigrams_chi2.txt','ant_RF_trigrams_chi2.txt','ant_GNB_trigrams_chi2.txt','ant_Ada_B_trigrams_chi2.txt']

In [None]:
# poi: Chi2 results for LR, RF, GNB and Ada-Boost (same order as the subplot titles).
file_names = ['poi_LR_trigrams_chi2.txt','poi_RF_trigrams_chi2.txt','poi_GNB_trigrams_chi2.txt','poi_Ada_B_trigrams_chi2.txt']

In [None]:
# Four stacked subplots (one per classifier) for a single project: AUC versus
# number of features for the path-based AST (AST4) probability / count / binary data.
# Expects `file_names` to be a flat list of four files in p_name order.
fig, a = plt.subplots(4,figsize=(9,12))
path_1 = "./data_2/results/"

# Here p_name holds the classifier titles, not project names.
p_name = ['Logistic Regression Classifier','Random Forest Classifier','Gaussian Naive Bayes Classifier','Ada-Boost Classifier']
for i in range(4):
    path_2 = join(path_1,file_names[i])
    with open(path_2,"rb") as fp:
        result = pickle.load(fp)
    # Same 16-element layout as the tuple returned by Evaluation_chi2.
    features_n,auc_array_p_AST1,auc_array_c_AST1,auc_array_b_AST1,auc_array_p_AST2,auc_array_c_AST2,auc_array_b_AST2,auc_array_p_AST3,auc_array_c_AST3,auc_array_b_AST3,auc_array_p_AST4,auc_array_c_AST4,auc_array_b_AST4,auc_array_p_word,auc_array_c_word,auc_array_b_word = result

    a[i].plot(features_n,auc_array_p_AST4,color='blue',marker='o',markersize=5,label='Path-based AST, probability')
    a[i].plot(features_n,auc_array_c_AST4,color='blue',marker='o',markersize=5,linestyle='--',dashes=(3,3),label='Path-based AST, count')
    a[i].plot(features_n,auc_array_b_AST4,color='blue',marker='o',markersize=5,linestyle='dotted',label='Path-based AST, binary')
    
    a[i].set_title(p_name[i],fontsize=18)

    a[i].set_xlabel('Number of features',fontsize=14)
    a[i].set_ylabel('AUC',fontsize=14)

# One shared legend, positioned above the first subplot.
a[0].legend(fontsize=14,loc=(0.08,1.25),ncol = 2)
fig.tight_layout()
plt.show()

## Different feature selection methods:

In [None]:
# One row per project (ant, jEdit, poi, xalan); column 0 = RFE results,
# column 1 = Chi2 results, both with Logistic Regression.
file_names = [['ant_LR_trigrams_RFE.txt', 'ant_LR_trigrams_chi2.txt'],['jEdit_LR_trigrams_RFE.txt', 'jEdit_LR_trigrams_chi2.txt'],['poi_LR_trigrams_RFE.txt', 'poi_LR_trigrams_chi2.txt'],['xalan_LR_trigrams_RFE.txt', 'xalan_LR_trigrams_chi2.txt']]

In [None]:
# 4 x 2 grid: rows are projects, columns are feature-selection methods
# (0 = RFE, 1 = Chi2). Each panel shows the probability-feature AUC curves of the
# five representations. Expects `file_names` as a 4 x 2 nested list.
fig, a = plt.subplots(4,2,figsize=(20,18))
path_1 = "./data_2/results/"

p_name = ['ant','jEdit','poi','xalan']
for i in range(4):
    for j in range(2):
        path_2 = join(path_1,file_names[i][j])
        with open(path_2,"rb") as fp:
            result = pickle.load(fp)
        # Same 16-element layout as the tuple returned by Evaluation_chi2.
        features_n,auc_array_p_AST1,auc_array_c_AST1,auc_array_b_AST1,auc_array_p_AST2,auc_array_c_AST2,auc_array_b_AST2,auc_array_p_AST3,auc_array_c_AST3,auc_array_b_AST3,auc_array_p_AST4,auc_array_c_AST4,auc_array_b_AST4,auc_array_p_word,auc_array_c_word,auc_array_b_word = result

        a[i,j].plot(features_n,auc_array_p_AST1,color='magenta',marker='o',markersize=5,label='Level-order AST, probability')
    
        a[i,j].plot(features_n,auc_array_p_AST2,color='grey',marker='o',markersize=5,label='Pre-order AST, probability')
    
        a[i,j].plot(features_n,auc_array_p_AST3,color='firebrick',marker='o',markersize=5,label='Post-order AST, probability')
    
        a[i,j].plot(features_n,auc_array_p_AST4,color='blue',marker='o',markersize=5,label='Path-based AST, probability')
    
        a[i,j].plot(features_n,auc_array_p_word,color='black',marker='o',markersize=5,label='Word level, probability')

        # The column index decides the method name shown in the title.
        if (j == 0):
            m_1 = 'RFE'
        else:
            m_1 = 'Chi2'
            
        a[i,j].set_title(p_name[i] + ' project' + '_' + str(m_1),fontsize=18)
        a[i,j].set_xlabel('Number of features',fontsize=14)
        a[i,j].set_ylabel('AUC',fontsize=14)

# One shared legend, positioned above the top-left panel.
a[0,0].legend(fontsize=14,loc=(0.40,1.20),ncol =3)
fig.subplots_adjust(hspace = 0.4)
fig.subplots_adjust(wspace = 0.2)
plt.show()

## Probability, count, binary:

In [None]:
# One row per classifier (LR, RF, GNB); column 0 = jEdit, column 1 = poi.
# Input for the 3 x 2 figure below.
file_names = [['jEdit_LR_trigrams_chi2.txt', 'poi_LR_trigrams_chi2.txt'],['jEdit_RF_trigrams_chi2.txt', 'poi_RF_trigrams_chi2.txt'],['jEdit_GNB_trigrams_chi2.txt', 'poi_GNB_trigrams_chi2.txt']]

In [None]:
# 3 x 2 grid: rows are classifiers, columns are projects (0 = jEdit, 1 = POI).
# Each panel compares probability / count / binary features for the path-based
# AST (AST4). Expects `file_names` as a 3 x 2 nested list.
fig, a = plt.subplots(3,2,figsize=(20,13))
plt.rcParams.update({'font.size':10})
path_1 = "./data_2/results/"
# Hard-coded y-axis limits per column (jEdit, POI).
intervals_2 = [[0.63,0.9],[0.55,0.74]]

p_name = ['Logistic Regression Classifier','Random Forest Classifier','Gaussian Naive Bayes Classifier']

for i in range(3):
    for j in range(2):
        path_2 = join(path_1,file_names[i][j])
        with open(path_2,"rb") as fp:
            result = pickle.load(fp)
        # Same 16-element layout as the tuple returned by Evaluation_chi2.
        features_n,auc_array_p_AST1,auc_array_c_AST1,auc_array_b_AST1,auc_array_p_AST2,auc_array_c_AST2,auc_array_b_AST2,auc_array_p_AST3,auc_array_c_AST3,auc_array_b_AST3,auc_array_p_AST4,auc_array_c_AST4,auc_array_b_AST4,auc_array_p_word,auc_array_c_word,auc_array_b_word = result

        a[i,j].plot(features_n,auc_array_p_AST4,color='blue',marker='o',markersize=5,label='Path-based AST, probability')
        a[i,j].plot(features_n,auc_array_c_AST4,color='blue',marker='o',markersize=5,linestyle='--',dashes=(3,3),label='Path-based AST, count')
        a[i,j].plot(features_n,auc_array_b_AST4,color='blue',marker='o',markersize=5,linestyle='dotted',label='Path-based AST, binary')

        # The column index decides the project name shown in the title.
        if (j == 0):
            m_1 = 'jEdit'
        else:
            m_1 = 'POI'
            
        a[i,j].set_title(p_name[i] + ', ' + str(m_1) + ' project',fontsize=18)
        a[i,j].set_xlabel('Number of features',fontsize=14)
        a[i,j].set_ylabel('AUC',fontsize=14)
        
        a[i,j].set_ylim(intervals_2[j])

# One shared legend, positioned above the top-left panel.
a[0,0].legend(fontsize=12,loc=(0.40,1.20),ncol =3)
fig.subplots_adjust(hspace = 0.4)
fig.subplots_adjust(wspace = 0.2)
plt.show()

In [None]:
# One row per classifier (LR, RF, GNB, Ada-Boost); column 0 = jEdit, column 1 = poi.
# Input for the 4 x 2 figure below.
file_names = [['jEdit_LR_trigrams_chi2.txt', 'poi_LR_trigrams_chi2.txt'],['jEdit_RF_trigrams_chi2.txt', 'poi_RF_trigrams_chi2.txt'],['jEdit_GNB_trigrams_chi2.txt', 'poi_GNB_trigrams_chi2.txt'],['jEdit_Ada_B_trigrams_chi2.txt', 'poi_Ada_B_trigrams_chi2.txt']]

In [None]:
# 4 x 2 grid: rows are classifiers, columns are projects (0 = jEdit, 1 = poi).
# Each panel compares probability / count / binary features for the path-based
# AST (AST4); unlike the 3 x 2 figure, no fixed y-axis limits are applied.
# Expects `file_names` as a 4 x 2 nested list.
fig, a = plt.subplots(4,2,figsize=(20,18))
path_1 = "./data_2/results/"

p_name = ['Logistic Regression Classifier','Random Forest Classifier','Gaussian Naive Bayes Classifier','Ada-Boost Classifier']

for i in range(4):
    for j in range(2):
        path_2 = join(path_1,file_names[i][j])
        with open(path_2,"rb") as fp:
            result = pickle.load(fp)
        # Same 16-element layout as the tuple returned by Evaluation_chi2.
        features_n,auc_array_p_AST1,auc_array_c_AST1,auc_array_b_AST1,auc_array_p_AST2,auc_array_c_AST2,auc_array_b_AST2,auc_array_p_AST3,auc_array_c_AST3,auc_array_b_AST3,auc_array_p_AST4,auc_array_c_AST4,auc_array_b_AST4,auc_array_p_word,auc_array_c_word,auc_array_b_word = result

        a[i,j].plot(features_n,auc_array_p_AST4,color='blue',marker='o',markersize=5,label='Path-based AST, probability')
        a[i,j].plot(features_n,auc_array_c_AST4,color='blue',marker='o',markersize=5,linestyle='--',dashes=(3,3),label='Path-based AST, count')
        a[i,j].plot(features_n,auc_array_b_AST4,color='blue',marker='o',markersize=5,linestyle='dotted',label='Path-based AST, binary')

        # The column index decides the project name shown in the title.
        if (j == 0):
            m_1 = 'jEdit'
        else:
            m_1 = 'poi'
            
        a[i,j].set_title(p_name[i] + ', ' + str(m_1) + ' project',fontsize=18)
        a[i,j].set_xlabel('Number of features',fontsize=14)
        a[i,j].set_ylabel('AUC',fontsize=14)

# One shared legend, positioned above the top-left panel.
a[0,0].legend(fontsize=14,loc=(0.40,1.20),ncol =3)
fig.subplots_adjust(hspace = 0.4)
fig.subplots_adjust(wspace = 0.2)
plt.show()

# Trigram, Bigram, Combination:

In [None]:
# Same value as the "# ant" threshold cells; set it before loading the ant data below.
threshold = 0.2052

In [None]:
# ant, AST4 only: bigram matrices go into b_4_p / b_4_c and trigram matrices into
# d_4_p / d_4_c (inputs for the bigram / trigram / combination comparison).
# This overwrites any b_4_* / d_4_* arrays loaded earlier.
path_1 = 'D:/SW_defect_prediction_data/'

path_2 = 'ant'

path_3 = os.path.join(path_1,path_2)

path_4 = os.path.join(path_3,'dense_bigram_data_AST4.npy')
b_4_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_bigram_data_c_AST4.npy')
b_4_c = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_AST4.npy')
d_4_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_c_AST4.npy')
d_4_c = np.load(path_4)

In [None]:
# jEdit: unlike the ant cell above, BOTH pairs come from trigram files - b_4_*
# receives the AST4 trigram data and d_4_* the trigram data without an AST suffix.
# NOTE(review): the b_ prefix denotes bigram data elsewhere in this notebook;
# confirm this AST4-vs-word pairing is intended and not a copy/paste slip.
path_1 = 'D:/SW_defect_prediction_data/'

path_2 = 'jEdit'

path_3 = os.path.join(path_1,path_2)

path_4 = os.path.join(path_3,'dense_trigram_data_AST4.npy')
b_4_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_c_AST4.npy')
b_4_c = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data.npy')
d_4_p = np.load(path_4)

path_4 = os.path.join(path_3,'dense_trigram_data_c.npy')
d_4_c = np.load(path_4)

In [None]:
def _chi2_2_select_and_classify(train, test, n_features, score_f_type, c_method):
    """Select ``n_features`` on ``train``, slice ``test`` to match and classify.

    Returns ``(train_selected, test_selected, result, feature_s)`` where
    ``result`` is whatever ``classification_result`` returns and ``feature_s``
    is the raw return value of ``select_features_2``.
    """
    feature_s = select_features_2(train, n_features, score_f_type)
    train_selected = feature_s[0]
    # feature_s[1] holds the selected column indices; the last column of the
    # test matrix is appended so it is carried along as well
    # (presumably the label column — confirm against split_data_2).
    columns = np.append(feature_s[1], test.shape[1] - 1)
    test_selected = test[:, columns]
    result = classification_result(train_selected, test_selected, c_method)
    return train_selected, test_selected, result, feature_s


def _chi2_2_auc_triplet(train_p, test_p, train_c, test_c,
                        n_features, score_f_type, c_method, label, report):
    """Return the AUC for the 'p', 'c' and binarised-'c' variants of one data view.

    When ``report`` is true, ``feature_s[2]`` of each selection is printed
    (rounded to 4 decimals) under the headings ``<label>_p:`` / ``<label>_c:``.
    """
    _, _, result, feature_s = _chi2_2_select_and_classify(
        train_p, test_p, n_features, score_f_type, c_method)
    auc_p = result[1]
    if report:
        print(label + '_p:')
        print(round(feature_s[2],4),'\n')

    train_c_sel, test_c_sel, result, feature_s = _chi2_2_select_and_classify(
        train_c, test_c, n_features, score_f_type, c_method)
    auc_c = result[1]
    if report:
        print(label + '_c:')
        print(round(feature_s[2],4),'\n')

    # The binary variant reuses the columns selected for the 'c' data; copies
    # are handed to convert_to_binary so the selected 'c' matrices stay untouched.
    train_b = train_c_sel.copy()
    test_b = test_c_sel.copy()
    b_train_data = convert_to_binary(train_b)
    b_test_data = convert_to_binary(test_b)
    auc_b = classification_result(b_train_data, b_test_data, c_method)[1]

    return auc_p, auc_c, auc_b


def Evaluation_chi2_2(data_1,data_2,data_3,data_4,c_method):
    """Sweep the number of selected features and record AUC for three data views.

    The views are 'bigram' (``data_1``/``data_2``), 'trigram'
    (``data_3``/``data_4``) and 'combination' (``c_1`` applied to the two 'p'
    matrices and to the two 'c' matrices). For every feature count in
    400, 375, ..., 25 each view is evaluated in three variants: the 'p' matrix,
    the 'c' matrix, and the 'c' matrix passed through ``convert_to_binary``.

    Parameters
    ----------
    data_1, data_2 : 'p' and 'c' matrices of the first view (copied, not modified here).
    data_3, data_4 : 'p' and 'c' matrices of the second view (copied, not modified here).
    c_method : classifier selector forwarded to ``classification_result``.

    Returns
    -------
    tuple
        ``(features_n, p_bigram, c_bigram, b_bigram, p_trigram, c_trigram,
        b_trigram, p_combination, c_combination, b_combination)`` — the list of
        feature counts followed by nine lists of AUC values aligned with it.
    """
    d_1 = data_1.copy()
    d_2 = data_2.copy()
    d_3 = data_3.copy()
    d_4 = data_4.copy()
    # NOTE(review): c_1 presumably joins the feature columns of its two inputs — confirm.
    d_5 = c_1(d_1,d_3)
    d_6 = c_1(d_2,d_4)

    print(d_5.shape)
    print(d_6.shape)

    # The splits are made in a fixed order (bigram, trigram, combination; 'p' before 'c').
    train_1_p, test_1_p = split_data_2(d_1)
    train_1_c, test_1_c = split_data_2(d_2)

    train_2_p, test_2_p = split_data_2(d_3)
    train_2_c, test_2_c = split_data_2(d_4)

    train_3_p, test_3_p = split_data_2(d_5)
    train_3_c, test_3_c = split_data_2(d_6)

    views = (
        ('bigram', train_1_p, test_1_p, train_1_c, test_1_c),
        ('trigram', train_2_p, test_2_p, train_2_c, test_2_c),
        ('combination', train_3_p, test_3_p, train_3_c, test_3_c),
    )

    features_n = []
    # Per view: three parallel lists holding the (p, c, b) AUC series.
    auc_arrays = {label: ([], [], []) for label, *_ in views}

    score_f_type = 2
    i_max = 400

    # Initial selection pass bracketed by progress messages; its result is not used.
    # NOTE(review): kept in case select_features_2 has side effects — drop if it has none.
    print('Start training')
    select_features_2(train_1_p,i_max,score_f_type)
    print('Stop training')

    for i in range(i_max,24,-25):
        print(i)
        features_n.append(i)

        for label, train_p, test_p, train_c, test_c in views:
            triplet = _chi2_2_auc_triplet(
                train_p, test_p, train_c, test_c,
                i, score_f_type, c_method, label, report=(i == i_max))
            for series, value in zip(auc_arrays[label], triplet):
                series.append(value)

    return (features_n,
            *auc_arrays['bigram'],
            *auc_arrays['trigram'],
            *auc_arrays['combination'])
        

In [None]:
# Feature-count sweep over the matrices currently bound to b_4_* / d_4_*,
# using classifier code 2 (logistic regression).
result = Evaluation_chi2_2(
    data_1=b_4_p,
    data_2=b_4_c,
    data_3=d_4_p,
    data_4=d_4_c,
    c_method=2,
)

In [None]:
# Split the 10-tuple returned by Evaluation_chi2_2 into the feature counts and
# the nine AUC series (p / c / b for bigram, trigram and combination).
(
    features_n,
    auc_array_p_bigram,
    auc_array_c_bigram,
    auc_array_b_bigram,
    auc_array_p_trigram,
    auc_array_c_trigram,
    auc_array_b_trigram,
    auc_array_p_combination,
    auc_array_c_combination,
    auc_array_b_combination,
) = result

In [None]:
plt.rcParams["figure.figsize"] = (10,10)

# Line style per feature encoding: solid for probability, dashed for count,
# dotted for binary.
_auc_encodings = (
    (', probability', {}),
    (', count', {'linestyle': '--', 'dashes': (3,3)}),
    (', binary', {'linestyle': 'dotted'}),
)

# One colour per family of series.
# NOTE(review): the legend names say Level-/Pre-/Post-order AST while the plotted
# series are the bigram / trigram / combination results — confirm the intended labels.
_auc_families = (
    ('Level-order AST', 'skyblue',
     (auc_array_p_bigram, auc_array_c_bigram, auc_array_b_bigram)),
    ('Pre-order AST', 'blue',
     (auc_array_p_trigram, auc_array_c_trigram, auc_array_b_trigram)),
    ('Post-order AST', 'brown',
     (auc_array_p_combination, auc_array_c_combination, auc_array_b_combination)),
)

for _family_name, _family_colour, _family_series in _auc_families:
    for _auc_values, (_label_suffix, _line_kwargs) in zip(_family_series, _auc_encodings):
        plt.plot(features_n, _auc_values,
                 color=_family_colour, marker='o', markersize=5,
                 label=_family_name + _label_suffix, **_line_kwargs)

plt.title('AUC per number of features',fontsize=16)
plt.xlabel('Number of features',fontsize=14)
plt.ylabel('AUC',fontsize=14)

# Put the legend outside the plot area, to the right of the axes.
axes = plt.gca()
axes.legend(loc='center left', bbox_to_anchor=(1,0.5))
plt.show()

# Bar plot:

In [None]:
# Bar chart of the number of trigram types per representation.
plt.rcParams["figure.figsize"] = (12,5)
plt.rcParams.update({'font.size':14})
fig = plt.figure()
# A single axes that fills the whole figure.
a = fig.add_axes([0,0,1,1])
methods = ['Word-level','Level-order AST','Pre-order AST','Post-order AST','Path-based AST']

# Hard-coded counts, one per entry of `methods` (same order).
trigrams_n = [4409,11828,3659,4101,1809]
a.bar(methods,trigrams_n,color = 'blue')
# Axis label spelling corrected ('trifram' -> 'trigram').
a.set_ylabel('Number of trigram types')
plt.show()

# Classification using linear SVM (accuracy):

In [None]:
def kf_average_accuracy(d_1,c_method):
    """Classify every fold of ``d_1`` and print per-fold and averaged metrics.

    The folds come from ``kf_data_split(d_1)``; each element is indexed as
    ``fold[0]`` (training data) and ``fold[1]`` (test data). Every fold is
    classified with ``classification_result_2(train, test, c_method)``, whose
    result is read as ``[0]`` = (precision, recall, accuracy, f1_score, fp_rate)
    and ``[1]`` = elapsed time, which is accumulated into the printed total.
    A metric equal to the string ``'not_defined'`` is left out of its average.

    Parameters
    ----------
    d_1 : data set handed to ``kf_data_split``.
    c_method : classifier selector, forwarded unchanged to
        ``classification_result_2``.

    Returns
    -------
    None. All results are printed.
    """
    metric_names = ('precision', 'recall', 'accuracy', 'f1_score', 'fp_rate')
    collected = {name: [] for name in metric_names}

    total_time = 0
    trials_n = 1

    data = kf_data_split(d_1)
    for t in range(trials_n):
        print('\nTrial number:',t+1)
        for i, fold in enumerate(data):
            print('\nIteration number:',(i+1))
            train_data = fold[0]
            test_data = fold[1]

            total_result = classification_result_2(train_data,test_data,c_method)
            total_time += total_result[1]

            # Explicit unpacking keeps the "exactly five metrics" check.
            precision,recall,accuracy,f1_score,fp_rate = total_result[0]
            fold_values = (precision, recall, accuracy, f1_score, fp_rate)

            for name, value in zip(metric_names, fold_values):
                if (value != 'not_defined'):
                    collected[name].append(value)

    # Detailed report: the collected values and their mean, metric by metric.
    averages = {}
    for name in metric_names:
        values = np.array(collected[name])
        averages[name] = values.mean()
        print('\n' + name + ' array:',values,'\naverage ' + name + ':',averages[name],'\n')

    # Compact summary, rounded to 4 decimals ('f1_score' is shown as 'f1 score', etc.).
    for name in metric_names:
        print(name.replace('_', ' ') + ':',round(averages[name],4))
    print('Total time',round(total_time,4))

In [None]:
# Run the k-fold evaluation on the matrix currently bound to d_4_p with
# classifier code 7 (this cell sits under the linear-SVM heading).
c1 = 7
print('AST4:')
print('\n\nProbability:')
# kf_average_accuracy only prints its metrics and has no return statement, so `a` ends up None.
# NOTE(review): this also rebinds the name `a`, which the plotting cells use for Matplotlib axes.
a = kf_average_accuracy(d_4_p,c1)

# Chi2 for accuracy:

In [None]:
def _chi2_acc_select_and_classify(train, test, n_features, score_f_type, c_method):
    """Select ``n_features`` on ``train``, slice ``test`` to match and classify.

    Returns ``(train_selected, test_selected, result, feature_s)`` where
    ``result`` is whatever ``classification_result`` returns and ``feature_s``
    is the raw return value of ``select_features_2``.
    """
    feature_s = select_features_2(train, n_features, score_f_type)
    train_selected = feature_s[0]
    # feature_s[1] holds the selected column indices; the last column of the
    # test matrix is appended so it is carried along as well
    # (presumably the label column — confirm against split_data_2).
    columns = np.append(feature_s[1], test.shape[1] - 1)
    test_selected = test[:, columns]
    result = classification_result(train_selected, test_selected, c_method)
    return train_selected, test_selected, result, feature_s


def _chi2_acc_triplet(train_p, test_p, train_c, test_c,
                      n_features, score_f_type, c_method, label, report):
    """Return element [2] of the classification result for the 'p', 'c' and
    binarised-'c' variants of one representation.

    When ``report`` is true, ``feature_s[2]`` of each selection is printed
    (rounded to 4 decimals) under the headings ``<label>_p:`` / ``<label>_c:``.
    """
    _, _, result_p, feature_s = _chi2_acc_select_and_classify(
        train_p, test_p, n_features, score_f_type, c_method)
    if report:
        print(label + '_p:')
        print(round(feature_s[2],4),'\n')

    train_c_sel, test_c_sel, result_c, feature_s = _chi2_acc_select_and_classify(
        train_c, test_c, n_features, score_f_type, c_method)
    if report:
        print(label + '_c:')
        print(round(feature_s[2],4),'\n')

    # The binary variant reuses the columns selected for the 'c' data; copies
    # are handed to convert_to_binary so the selected 'c' matrices stay untouched.
    train_b = train_c_sel.copy()
    test_b = test_c_sel.copy()
    b_train_data = convert_to_binary(train_b)
    b_test_data = convert_to_binary(test_b)
    result_b = classification_result(b_train_data, b_test_data, c_method)

    return result_p[2], result_c[2], result_b[2]


def Evaluation_chi2_accuracy(data_1,data_2,data_3,data_4,data_5,data_6,data_7,data_8,data_9,data_10,c_method):
    """Sweep the number of selected features and record accuracy for five representations.

    The ten inputs are five ('p', 'c') pairs, reported under the labels
    AST1, AST2, AST3, AST4 and Word (in that order). For every feature count in
    500, 475, ..., 25 each pair is evaluated in three variants: the 'p' matrix,
    the 'c' matrix, and the 'c' matrix passed through ``convert_to_binary``.
    The recorded value is element ``[2]`` of the ``classification_result`` tuple.

    Parameters
    ----------
    data_1 ... data_10 : matrices ordered as (p, c) for AST1, AST2, AST3, AST4
        and Word; they are copied, not modified here.
    c_method : classifier selector forwarded to ``classification_result``.

    Returns
    -------
    tuple
        ``features_n`` followed by fifteen lists aligned with it: the
        (p, c, b) series for AST1, AST2, AST3, AST4 and word, in that order.
    """
    labels = ('AST1', 'AST2', 'AST3', 'AST4', 'Word')

    copies = [data.copy() for data in (data_1, data_2, data_3, data_4, data_5,
                                       data_6, data_7, data_8, data_9, data_10)]
    # Split in input order: (p, c) of AST1 first, (p, c) of Word last.
    splits = [split_data_2(data) for data in copies]

    views = []
    for k, label in enumerate(labels):
        train_p, test_p = splits[2 * k]
        train_c, test_c = splits[2 * k + 1]
        views.append((label, train_p, test_p, train_c, test_c))

    features_n = []
    # Per representation: three parallel lists holding the (p, c, b) series.
    accuracy_arrays = {label: ([], [], []) for label in labels}

    score_f_type = 2

    i_max = 500
    for i in range(i_max,24,-25):
        print(i)
        features_n.append(i)

        for label, train_p, test_p, train_c, test_c in views:
            triplet = _chi2_acc_triplet(
                train_p, test_p, train_c, test_c,
                i, score_f_type, c_method, label, report=(i == i_max))
            for series, value in zip(accuracy_arrays[label], triplet):
                series.append(value)

    flat = [features_n]
    for label in labels:
        flat.extend(accuracy_arrays[label])
    return tuple(flat)
        

In [None]:
# Chi2 Logistic Regression
# NOTE(review): the call below passes c_method=7, the code used in the linear-SVM
# section, whereas the cell commented "logistic regression" earlier passes 2 —
# confirm that this heading is still accurate.
# step_value is not passed to the call below; Evaluation_chi2_accuracy steps
# through the feature counts with its own hard-coded 25.
step_value = 25
result = Evaluation_chi2_accuracy(d_1_p, d_1_c, d_2_p, d_2_c, d_3_p, d_3_c, d_4_p, d_4_c, d_5_p, d_5_c, 7)