# Import Data

In [None]:
import os
import researchpy as rp
import pandas as pd
import pickle

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Collect every pickled experiment result stored directly under data/.
# The context manager releases the directory handle as soon as the listing
# has been taken, and avoids binding the iterator to the builtin name `dir`.
with os.scandir('data/') as data_dir_entries:
    experiments = [entry for entry in data_dir_entries if entry.name.endswith(".pickle")]

# Columns every experiment frame must provide for the analysis below.
experiment_columns = ['max_epochs', 'executor_memory', 'executor_cores', 'model', 'accuracy', 'time']

experiment_frames = []
for experiment in experiments:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load experiment files that come from a trusted source.
    with open(experiment.path, 'rb') as dffile:
        read_df = pickle.load(dffile)
    experiment_frames.append(read_df[experiment_columns])

# Concatenate once instead of growing the frame inside the loop. With no
# experiments on disk, fall back to an empty frame that still has the expected
# columns so the selections below do not fail with a KeyError.
if experiment_frames:
    df = pd.concat(experiment_frames)
else:
    df = pd.DataFrame(columns=experiment_columns)

# Both analysis frames share the same feature columns; they differ only in
# which measurement ("accuracy" or "time") is exposed as "y".
feature_renames = {'max_epochs': 'epochs', 'executor_memory': 'memory', 'executor_cores': 'cores'}

acc_df = df[['accuracy', 'max_epochs', 'executor_memory', 'executor_cores', 'model']].rename(
    columns={'accuracy': 'y', **feature_renames})

time_df = df[['time', 'max_epochs', 'executor_memory', 'executor_cores', 'model']].rename(
    columns={'time': 'y', **feature_renames})

print(f"Imported {len(experiments)} experiments!")

# Data Preparation

In [None]:
def df_to_x_and_y(data_df, output_column="epochs"):
    """Split a frame into its feature columns and a flat array of labels.

    Args:
        data_df: DataFrame holding the label column plus the feature columns.
        output_column: Name of the column used as the label. Defaults to
            "epochs", the only label column used by the existing callers.

    Returns:
        A ``(data_X, data_y)`` tuple. ``data_X`` is a DataFrame with every
        column of ``data_df`` except ``output_column``, in the original column
        order. ``data_y`` is a one-dimensional NumPy array with the values of
        ``output_column``.

    Raises:
        KeyError: If ``output_column`` is not a column of ``data_df``.
    """
    feature_columns = [column for column in data_df.columns if column != output_column]
    data_X = data_df[feature_columns]
    # Select as a one-column frame, then flatten to the 1-D shape estimators expect.
    data_y = data_df[[output_column]].to_numpy().flatten()
    return data_X, data_y

def numerize_model_names(df_data):
    """Return a copy of ``df_data`` whose 'model' column is encoded numerically.

    The value "bi-rnn" becomes -1 and every other value becomes 1. The input
    frame itself is left untouched.
    """
    def encode(model_name):
        if model_name == "bi-rnn":
            return -1
        return 1

    encoded_df = df_data.copy()
    encoded_df['model'] = encoded_df['model'].apply(encode)
    return encoded_df

In [None]:
from imblearn import over_sampling
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, linear_model, svm, metrics, tree
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt

# Encode the 'model' column numerically for the accuracy frame and the time
# frame (in that order) so both can be fed to the regressors below.
acc_df_numerized_models, time_df_numerized_models = (
    numerize_model_names(frame) for frame in (acc_df, time_df)
)


# All Models

In [None]:
import numpy as np
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


def model_testing(input_df, learning_models, x_axis, plot_name_prefix, x_label,
                  *, baseline_epochs=40, n_repeats=10):
    """Evaluate regressors that predict epochs and plot their prediction curves.

    Each model is fitted ``n_repeats`` times on a fresh random 80/20 split of
    ``input_df``. The mean absolute error, the mean squared error and the mean
    absolute error of a constant baseline are averaged over the repetitions.
    For every model a figure is shown with the predictions of the last fitted
    instance over ``x_axis`` for each combination of model code (-1, 1),
    cores (8, 16) and memory (4, 32), on top of a scatter of the input data.

    Args:
        input_df: DataFrame with the columns "y", "epochs", "memory", "cores"
            and "model" (numeric). "epochs" is the label; the rest are features.
        learning_models: Iterable of estimators exposing ``fit`` and ``predict``.
        x_axis: Values of the "y" feature at which prediction curves are drawn.
        plot_name_prefix: Prefix used in the progress line printed per model.
        x_label: Label for the x axis of the figures.
        baseline_epochs: Constant prediction used as the baseline (default 40).
        n_repeats: Number of random train/test repetitions per model (default 10).

    Returns:
        DataFrame with one row per model and the columns "model", "err",
        "baseline_err", "err^2" and "% improv"; numeric columns are rounded to
        two decimals.

    Raises:
        ValueError: If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    # The feature/label split and the split sizes are identical for every model.
    feature_vectors, labels = df_to_x_and_y(input_df)
    train_size = int(.80 * len(input_df))
    test_size = int(.20 * len(input_df))

    error_rows = []

    for model in learning_models:
        print(f"=========   {plot_name_prefix}: {model}   ==========")
        absolute_errors = []
        absolute_errors_baseline = []
        squared_errors = []

        for _ in range(n_repeats):
            x_train, x_test, y_train, y_test = train_test_split(feature_vectors,
                                                                labels,
                                                                train_size=train_size,
                                                                test_size=test_size)

            trained_model = model.fit(x_train, y_train)
            predictions = trained_model.predict(x_test)
            absolute_errors.append(mean_absolute_error(y_test, predictions))
            squared_errors.append(mean_squared_error(y_test, predictions))
            baseline_predictions = np.full(len(predictions), baseline_epochs)
            absolute_errors_baseline.append(mean_absolute_error(y_test, baseline_predictions))

        mean_abs_err = np.mean(absolute_errors)
        mean_abs_baseline_error = np.mean(absolute_errors_baseline)
        mean_sqrd_err = np.mean(squared_errors)
        print(f'Mean absolute error: {mean_abs_err}')
        print(f'Mean absolute error baseline: {mean_abs_baseline_error}')
        print(f'Mean squared error: {mean_sqrd_err}')

        # Rows are collected in a list; DataFrame.append no longer exists in pandas 2.
        error_rows.append({"model": model,
                           "err": mean_abs_err,
                           "baseline_err": mean_abs_baseline_error,
                           "err^2": mean_sqrd_err})

        for learning_model in [-1, 1]:
            for cores in [8, 16]:
                for memory in [4, 32]:
                    # Build the grid with named columns and put them in the
                    # training column order, so the estimator receives the
                    # features it was fitted on regardless of column order.
                    grid = pd.DataFrame({"y": x_axis,
                                         "memory": memory,
                                         "cores": cores,
                                         "model": learning_model})[feature_vectors.columns]
                    curve = trained_model.predict(grid)
                    learning_model_name = "le" if learning_model == 1 else "br"
                    plt.plot(x_axis, curve, label=f"{learning_model_name} C:{cores} M:{memory}")

        plt.scatter(input_df["y"], input_df["epochs"])
        plt.legend()
        plt.title(str(model))
        plt.ylim(0, 100)
        plt.ylabel("Epochs")
        plt.xlabel(x_label)
        plt.show()

    error_df = pd.DataFrame(error_rows, columns=["model", "err", "baseline_err", "err^2"])
    # Derive the improvement from the unrounded errors, then round once, so
    # every row is computed the same way.
    error_df["% improv"] = (error_df["baseline_err"] - error_df["err"]) / error_df["baseline_err"] * 100
    numeric_columns = ["err", "baseline_err", "err^2", "% improv"]
    error_df[numeric_columns] = error_df[numeric_columns].round(2)

    return error_df

## Time model results

In [None]:

# Candidate regressors for predicting the number of epochs from the run time.
time_learning_models = [
    SVR(),
    tree.DecisionTreeRegressor(),
    MLPRegressor(),
    linear_model.BayesianRidge(),
    linear_model.ARDRegression(),
]

# Prediction curves are drawn for 0, 10, ..., 990 seconds; the resulting error
# table is ordered by mean absolute error.
error_df = model_testing(
    time_df_numerized_models,
    time_learning_models,
    list(range(0, 1000, 10)),
    "Time",
    "Time (s)",
).sort_values(by='err')

print(
    error_df.to_latex(
        index=False,
        caption='Black Box model errors for predicting epochs based on time',
        label='black-box-time-model-errors',
    )
)
error_df

## Accuracy model results

In [None]:

# Candidate regressors for predicting the number of epochs from the accuracy.
acc_learning_models = [
    SVR(),
    tree.DecisionTreeRegressor(),
    MLPRegressor(),
    linear_model.BayesianRidge(),
    linear_model.ARDRegression(),
]

# Prediction curves are drawn for accuracies 0.00, 0.01, ..., 0.99; the
# resulting error table is ordered by mean absolute error.
error_df = model_testing(
    acc_df_numerized_models,
    acc_learning_models,
    [step / 100 for step in range(100)],
    "Accuracy",
    "Accuracy",
).sort_values(by='err')

print(
    error_df.to_latex(
        index=False,
        caption='Black Box model errors for predicting epochs based on accuracy',
        label='black-box-accuracy-model-errors',
    )
)
error_df

## CPU Cores comparisons

In [None]:
import numpy as np

def plot_cpu_graph(input_df, model, target, plot_title,
                   *, memory=32, learning_model=1, max_cores=64):
    """Plot the epochs a model predicts for a varying number of cores.

    The model is fitted on a random 80% sample of ``input_df`` and then asked
    to predict epochs for 1..``max_cores`` cores while the other features are
    held fixed.

    Args:
        input_df: DataFrame with the columns "y", "epochs", "memory", "cores"
            and "model" (numeric). "epochs" is the label; the rest are features.
        model: Estimator exposing ``fit`` and ``predict``.
        target: Fixed value of the "y" feature used for every prediction.
        plot_title: Text appended to the model description in the figure title.
        memory: Fixed value of the "memory" feature (default 32).
        learning_model: Fixed numeric model code (default 1).
        max_cores: Largest core count on the x axis (default 64).
    """
    print(f"=========   Varying cores using {model}   ==========")

    feature_vectors, labels = df_to_x_and_y(input_df)

    # Only the training part of the split is used; the held-out part is discarded.
    x_train, _, y_train, _ = train_test_split(feature_vectors,
                                              labels,
                                              train_size=int(.80 * len(input_df)),
                                              test_size=int(.20 * len(input_df)))

    trained_model = model.fit(x_train, y_train)

    x_axis = list(range(1, max_cores + 1))
    # Build the grid with named columns and put them in the training column
    # order, so the estimator receives the features it was fitted on.
    input_data = pd.DataFrame({"y": target,
                               "memory": memory,
                               "cores": x_axis,
                               "model": learning_model})[feature_vectors.columns]
    predictions = trained_model.predict(input_data)

    # Draw the curve once; the original plotted the same line twice.
    plt.plot(x_axis, predictions)
    plt.title(f"{model}: {plot_title}")
    plt.ylim(0, 100)
    plt.ylabel("Epochs")
    plt.xlabel("Cores")
    plt.show()


In [None]:
# Predicted epochs per core count for a fixed accuracy target.
target = 0.9
plot_cpu_graph(
    input_df=acc_df_numerized_models,
    model=linear_model.ARDRegression(),
    target=target,
    plot_title=f"Epochs for {target * 100}% accuracy",
)

In [None]:
# Predicted epochs per core count for a fixed run-time target.
target = 300
plot_cpu_graph(
    input_df=time_df_numerized_models,
    model=linear_model.ARDRegression(),
    target=target,
    plot_title=f"Epochs for {target} seconds",
)