<a href="https://colab.research.google.com/github/Kasperbang/AEEEM/blob/main/AUX_Notebook_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is intended to show the computations behind the scenes. That is, here one can expect to find the helper functions that are used in our main notebook:
https://colab.research.google.com/drive/1igEkqsbo0Zo9jfq1jWqeqvsavkoUEjRD#scrollTo=eQGC48qm1W2r 

In [2]:
#Libraries and packages:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import scipy as sp
from scipy.io.arff import loadarff 
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


#Auxiliary function to make our data readable:

def numerical_response(df, data_set):
    """Recode the defect label of ``df`` in place as a 0/1 ``class`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Project data. The frame is modified in place.
    data_set : str
        Which label convention to decode:

        * ``'AEEEM'``   - ``class`` holds clean/buggy labels; clean -> 0,
          everything else -> 1.
        * ``'NASA'``    - column ``Defective`` is renamed to ``class``;
          'Y' -> 1, 'N' -> 0.
        * ``'PROMISE'`` - column ``bug`` is renamed to ``class``; any
          positive count -> 1.

        Any other value leaves ``df`` untouched.

    Returns
    -------
    None
    """
    if data_set == 'AEEEM':
        def _is_clean(label):
            # Labels may be raw bytes (b'clean') or the text form of such
            # bytes ("b'clean'"), in which characters 2..6 spell the label.
            if isinstance(label, bytes):
                return label == b'clean'
            return label[2:7] == 'clean'

        clean_mask = df["class"].map(_is_clean).astype(bool)
        # One whole-column assignment instead of chained
        # df["class"].iloc[i] = ... writes, which act on a temporary object
        # under pandas copy-on-write and then leave df unchanged.
        df["class"] = np.where(clean_mask, 0, 1)

    if data_set == 'NASA':
        df.rename(columns={'Defective':'class'}, inplace=True)
        df.loc[df["class"] == 'Y', "class"] = 1
        df.loc[df["class"] == 'N', "class"] = 0

    if data_set == 'PROMISE':
        df.rename(columns={'bug':'class'}, inplace=True)
        # Bug counts are collapsed to a binary "has at least one bug" flag.
        df.loc[df["class"] > 0, "class"] = 1

    return

In [3]:
#Functions to help investigate data:

def investigate_df(df):
    """Print the size of ``df`` and how much of it is labelled buggy.

    Reads the ``class`` column: its length is the number of data points and
    its sum is reported as the number of buggy points. The buggy share is
    printed as a percentage rounded to one decimal. Returns None.
    """
    labels = df["class"]
    total = len(labels)
    defective = labels.sum()
    share = round((defective / total) * 100, 1)

    summary = (
        ('Number of data points = ', total),
        ('Number of buggy data points = ', defective),
    )
    for caption, value in summary:
        print(caption, value)
    print('Percent buggy data points = ', share, '%')
    return



def plot_two_columns(df, col_name1, col_name2):
    """Show two side-by-side views of ``col_name2`` against ``col_name1``.

    The right panel is a 2-D histogram (YlGn colour map, with colour bar),
    the left panel a scatter plot of the same two columns. The figure is
    displayed with ``plt.show()``; nothing is returned.
    """
    # Right-hand panel: 2-D histogram.
    plt.subplot(1, 2, 2)
    x_values, y_values = df[col_name1], df[col_name2]
    plt.hist2d(x_values, y_values, cmap=plt.cm.YlGn)
    plt.colorbar()
    plt.xlabel(col_name1)

    # Left-hand panel: raw scatter of the same pair.
    plt.subplot(1, 2, 1)
    plt.scatter(df[col_name1], df[col_name2])
    plt.xlabel(col_name1)
    plt.ylabel(col_name2)

    plt.show()
    return



def plot_var(df):
    """Plot every column of ``df`` against the ``class`` column.

    Calls ``plot_two_columns(df, 'class', <column>)`` once per column, in
    column order (this includes ``class`` itself). Returns None.
    """
    for column in df.columns.tolist():
        plot_two_columns(df, 'class', column)
    return


#Return a heatmap of pairwise correlation among all columns in a dataframe.
def pearsoncor(df):
    """Draw a seaborn heatmap of ``df.corr()``; returns None."""
    correlation_matrix = df.corr()
    sns.heatmap(correlation_matrix)
    return


#Create a histogram of all columns in a dataframe.
def histofdf(df):
    """Draw one 20-bin histogram per column of ``df`` via ``df.hist``.

    The plots are laid out on a 16 x 4 grid in a 25 x 55 figure, without
    grid lines. Returns None.
    """
    hist_options = {
        "bins": 20,
        "figsize": (25, 55),
        "xlabelsize": 12,
        "grid": False,
        "linewidth": 3.0,
        "layout": (16, 4),
    }
    df.hist(**hist_options)
    return


#Find columns in a project with correlation > X
def find_correlated_columns(df, treshold):
    """List columns that are strongly correlated with a column that is kept.

    Columns are visited in order, except the last one. A column is flagged,
    and ignored from then on, when its absolute pairwise correlation with
    any column not yet flagged exceeds ``treshold``. Dropping the returned
    columns therefore leaves no remaining pair above the threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose ``df.corr()`` matrix is analysed.
    treshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    list
        Names of the flagged columns, in column order.
    """
    cor = df.corr()
    column_names = cor.columns.to_list()

    # Work on a private writable copy: writing through ``cor.values`` only
    # works when pandas hands out a writable view. Undefined correlations
    # (NaN, e.g. for constant columns) count as "not correlated".
    strength = np.nan_to_num(np.abs(cor.to_numpy(dtype=float, copy=True)), nan=0.0)
    np.fill_diagonal(strength, 0)

    still_present = np.ones(len(column_names), dtype=bool)
    correlated_columns = []

    # The bound is fixed. Shrinking it after every hit (as the previous
    # implementation did) stops the scan early and leaves correlated pairs.
    # The last column needs no check: every surviving column has already
    # been compared with it.
    for i in range(len(column_names) - 1):
        if strength[i, still_present].max() > treshold:
            correlated_columns.append(column_names[i])
            still_present[i] = False

    return correlated_columns



#Drop specific features of a project to decrease dimensionality.
def drop_variables(df, drop_list):
    """Remove the columns named in ``drop_list`` from ``df`` in place.

    Returns None; the caller's frame itself is modified.
    """
    df.drop(labels=drop_list, axis=1, inplace=True)
    return


# **Functions to help tune the models:**

In [4]:
def backward_feature_selection(df_train, df_test, func, scale = False):
    """Greedy backward elimination of features, scored by AUC on ``df_test``.

    Starting from all features, each round removes the single feature whose
    removal gives the highest AUC, until one feature is left. The AUC of
    the full feature set is recorded first, so "drop nothing" is a possible
    outcome.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Train and test data containing a ``class`` column. Features are
        taken as every column but the last, so ``class`` is assumed to be
        the last column.
    func : callable
        Zero-argument factory returning an object with ``fit(X, y)`` and
        ``predict(X)``.
    scale : bool, default False
        When true, features come from ``scale_data(df_train, df_test)``
        instead of the raw frames.

    Returns
    -------
    list
        Names of the columns to drop, in elimination order, that lead to
        the best AUC seen. Empty when the full feature set scores best.
    """
    number_of_variables = len(df_train.columns) - 1

    y_train = list(df_train['class'])
    y_test = list(df_test['class'])

    # A plain if/else, so that a truthy non-bool ``scale`` cannot leave the
    # feature frames undefined.
    if scale:
        X_train, X_test = scale_data(df_train, df_test)
    else:
        X_train = df_train.iloc[:,:-1]
        X_test = df_test.iloc[:,:-1]

    X_train_copy = X_train.copy()
    X_test_copy = X_test.copy()

    def _auc(train_X, test_X):
        # Fit a fresh model and score its hard predictions on the test data.
        model = func()
        model.fit(train_X, y_train)
        return roc_auc_score(y_test, model.predict(test_X))

    # auc_list[k] is the AUC after dropping the first k eliminated columns.
    # Entry 0 is the baseline with every feature, which keeps the final
    # slice and the printed variable count consistent with the best AUC.
    auc_list = [_auc(X_train_copy, X_test_copy)]
    dropped_column_list = []

    for j in range(number_of_variables - 1):
        columns = X_train_copy.columns.to_list()
        print(j," iteration done: ", len(columns) - 2, " iterations left.")

        # Every remaining feature, including the last one, is a removal
        # candidate.
        auc_temp_list = [
            _auc(X_train_copy.drop(columns=name), X_test_copy.drop(columns=name))
            for name in columns
        ]

        idx = int(np.argmax(auc_temp_list))
        auc_list.append(auc_temp_list[idx])
        dropped_column_list.append(columns[idx])
        print("Removed ", columns[idx])
        print("Shape of data frame: ",np.shape(X_train_copy),"\n")
        X_train_copy = X_train_copy.drop(columns=columns[idx])
        X_test_copy = X_test_copy.drop(columns=columns[idx])

    idx_max_auc = int(np.argmax(auc_list))
    print("\nMax auc value: ",np.round(auc_list[idx_max_auc], 2))
    print("and is found using ", number_of_variables-idx_max_auc ," variables.")

    return dropped_column_list[0:idx_max_auc]



def scale_data_unsupervised(df):
    """Return a standardised copy of ``df``.

    Every column is passed through a freshly fitted ``StandardScaler``. The
    result is a new DataFrame with the same column names and a default
    integer index.
    """
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df)
    return pd.DataFrame(standardized_values, columns=df.columns)



def scale_data(train_data, test_data, refit_on_test = False):
    """Standardise the feature columns of a train/test pair.

    All columns but the last are treated as features (the last column is
    assumed to be the label - confirm against callers). The scaler is
    fitted on the training features only and the same transformation is
    applied to the test features, so no test-set statistics leak into the
    preprocessing.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames with the same feature columns followed by one trailing
        column that is left out.
    refit_on_test : bool, default False
        When true, the test features are standardised with their own mean
        and variance (the previous behaviour of this function).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled train and test features, with the original column names and
        a default integer index.
    """
    X_train = train_data.iloc[:,:-1]
    X_test = test_data.iloc[:,:-1]

    std_scaler = StandardScaler()
    scaled_train_X = pd.DataFrame(std_scaler.fit_transform(X_train), columns=X_train.columns)

    if refit_on_test:
        test_values = StandardScaler().fit_transform(X_test)
    else:
        # transform, not fit_transform: reuse the training mean/variance.
        test_values = std_scaler.transform(X_test)
    scaled_test_X = pd.DataFrame(test_values, columns=X_test.columns)

    return scaled_train_X, scaled_test_X


# **Performance of the models**

In [5]:

def AUC_unsupervised(function, df, project_list = 0):
    """AUC of an unsupervised predictor on one project, rounded to 2 decimals.

    ``function(df)`` must return ``(y_pred, y_test)``. ``project_list`` is
    accepted but not used here.
    """
    predictions, truth = function(df)
    score = roc_auc_score(truth, predictions)
    mean_score = np.mean(score)
    return np.round(mean_score, 2)
    
    
    
def average_AUC_cross(function, df_test, projects = 0):
    """Mean cross-project AUC on ``df_test``, rounded to 2 decimals.

    For every training frame in ``projects``, ``function(df_train, df_test)``
    must return ``(y_pred, y_test)``. The resulting AUC scores are averaged.
    ``projects`` has to be a sized sequence; the default ``0`` is not
    usable.
    """
    project_count = len(projects)
    scores = np.zeros(project_count)
    for position, training_frame in enumerate(projects[k] for k in range(project_count)):
        predictions, truth = function(training_frame, df_test)
        scores[position] = roc_auc_score(truth, predictions)

    return np.round(np.mean(scores), 2)



def average_AUC_within(function, df, project_list = 0, t = 500, random_state = None):
    """Mean within-project AUC over repeated 50/50 splits, rounded to 2 decimals.

    Each repetition splits ``df`` in half, evaluates
    ``function(train, test)`` in both directions and records both AUC
    scores, giving ``2 * t`` scores in total.

    Parameters
    ----------
    function : callable
        ``function(df_train, df_test)`` returning ``(y_pred, y_test)``.
    df : pandas.DataFrame
        Project data to split.
    project_list : any, default 0
        Not used here; it lets the function be called with the same three
        positional arguments as the other AUC helpers.
    t : int, default 500
        Number of random splits. Lower it for quick experiments.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the sequence of splits. ``None`` keeps the previous
        unseeded behaviour.

    Raises
    ------
    ValueError
        If ``t`` is smaller than 1.
    """
    if t < 1:
        raise ValueError("t must be at least 1")

    # A single generator is advanced across repetitions. Passing the same
    # integer seed to every split would yield t identical splits.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    auc = np.zeros(2*t)
    for i in range(t):
        df1, df2 = train_test_split(df, test_size=0.5, random_state=rng)
        y_pred, y_test = function(df1, df2)
        auc[i] = roc_auc_score(y_test, y_pred)
        # Swap the halves; this score is stored from the end of the array.
        y_pred, y_test = function(df2, df1)
        auc[2*t - i - 1] = roc_auc_score(y_test, y_pred)
    return np.round(np.mean(auc), 2)



def get_auc_scores(project_dict, func, auc_func):
    """Score ``func`` on every project and print and return the AUC values.

    For each project, ``auc_func(func, project, other_projects)`` is called,
    where ``other_projects`` is the list of all remaining projects (only
    meaningful for cross-project evaluation). Each score is printed,
    followed by the mean over all projects rounded to 2 decimals.

    Returns the list of per-project scores in dictionary order.
    """
    names = list(project_dict.keys())
    frames = list(project_dict.values())

    scores = []
    for position, (name, held_out) in enumerate(zip(names, frames)):
        # Every project except the one being evaluated.
        remaining = frames[:position] + frames[position + 1:]

        score = auc_func(func, held_out, remaining)
        print('\nAUC score for ' + name + ' :', score)
        scores.append(score)

    overall = np.round(np.mean(scores),2)
    print('\nMean AUC score for all projects:', overall)
    return scores



def scott_knott(function, df, t = 500, random_state = None):
    """Collect the raw AUC scores of repeated 50/50 within-project splits.

    Each repetition splits ``df`` in half and evaluates
    ``function(train, test)`` in both directions. The number of collected
    scores (``2 * t``) is printed, and the unaveraged scores are returned.

    Parameters
    ----------
    function : callable
        ``function(df_train, df_test)`` returning ``(y_pred, y_test)``.
    df : pandas.DataFrame
        Project data to split.
    t : int, default 500
        Number of random splits. Lower it for quick experiments.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the sequence of splits. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Array of ``2 * t`` AUC scores.

    Raises
    ------
    ValueError
        If ``t`` is smaller than 1.
    """
    if t < 1:
        raise ValueError("t must be at least 1")

    # A single generator is advanced across repetitions. Passing the same
    # integer seed to every split would yield t identical splits.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    auc = np.zeros(2*t)
    for i in range(t):
        df1, df2 = train_test_split(df, test_size=0.5, random_state=rng)
        y_pred, y_test = function(df1, df2)
        auc[i] = roc_auc_score(y_test, y_pred)
        # Swap the halves; this score is stored from the end of the array.
        y_pred, y_test = function(df2, df1)
        auc[2*t - i - 1] = roc_auc_score(y_test, y_pred)
    print(len(auc))

    return auc



# **To present results:**

In [6]:
def to_csv_supervised(project_dict, function_dict, file_name):
    """Write cross-project and within-project AUC for every model to a CSV file.

    For every model in ``function_dict`` and every project in
    ``project_dict``, three values are stored: the cross-project AUC ('CP',
    trained on each other project in turn), the within-project AUC ('WP')
    and their difference WP - CP rounded to 2 decimals ('diff').

    Parameters
    ----------
    project_dict : dict
        Maps project name to project DataFrame; the names become the rows.
    function_dict : dict
        Maps model name to a callable accepted by ``average_AUC_cross`` and
        ``average_AUC_within``. Each model contributes three columns, whose
        header is the model name, a newline, then the measure name.
    file_name : str or path-like
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    None
    """
    measure = ['CP', 'WP', 'diff']
    m = len(measure)
    column_names = [
        function_name + '\n' + label
        for function_name in function_dict
        for label in measure
    ]

    project_list = list(project_dict.values())
    function_list = list(function_dict.values())
    row_names = list(project_dict.keys())

    # One row per project, m consecutive columns per model.
    data = np.zeros((len(project_list), len(function_list)*m))
    for f, func in enumerate(function_list):
        col = f * m
        for p, test_project in enumerate(project_list):
            # Train on every project except the one under test.
            train_project = project_list[:p] + project_list[p + 1:]

            AUC_cross = average_AUC_cross(func, test_project, train_project)
            AUC_within = average_AUC_within(func, test_project)

            data[p,col] = AUC_cross
            data[p,col+1] = AUC_within
            data[p,col+2] = round(AUC_within - AUC_cross,2)

    df = pd.DataFrame(data, index = row_names)
    df.to_csv(file_name, encoding='utf-8', index=True, header=column_names)
    return



def to_csv_unsupervised(project_dict, function_dict, file_name):
    """Write the AUC of every unsupervised model on every project to a CSV file.

    Parameters
    ----------
    project_dict : dict
        Maps project name to project DataFrame; the names become the rows.
    function_dict : dict
        Maps model name to a callable accepted by ``AUC_unsupervised``; the
        names become the column headers.
    file_name : str or path-like
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    None
    """
    column_names = list(function_dict.keys())

    project_list = list(project_dict.values())
    function_list = list(function_dict.values())
    row_names = list(project_dict.keys())

    data = np.zeros((len(project_list), len(function_list)))
    # Each model gets its own column. Previously the column index was never
    # advanced, so every model overwrote column 0 and the rest stayed zero.
    for col, func in enumerate(function_list):
        for p, project in enumerate(project_list):
            data[p,col] = AUC_unsupervised(func, project)

    df = pd.DataFrame(data, index = row_names)
    df.to_csv(file_name, encoding='utf-8', index=True, header=column_names)
    return













