# Import Data

In [None]:
import os
import researchpy as rp
import pandas as pd
import pickle

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Location of the pickled experiment results.
dataset_pickle = 'data/dataset.pickle'

# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# A missing file raises FileNotFoundError right here, before the assert below is reached.
with open(dataset_pickle, 'rb') as handle:
    df = pickle.load(handle)

# The only way to get here with `df is None` is a pickle that stores None.
assert df is not None, f'The dataset pickle file {dataset_pickle} does not contain a dataset'

# Accuracy as the response variable `y`, with the experiment factors as regressors.
acc_df = pd.DataFrame({'y': df["accuracy"],
                       'epochs': df["max_epochs"],
                       'memory': df["executor_memory"],
                       'cores': df["executor_cores"],
                       'model': df["model"]})

# Same factors, but with the measured time as the response variable `y`.
time_df = pd.DataFrame({'y': df["time"],
                        'epochs': df["max_epochs"],
                        'memory': df["executor_memory"],
                        'cores': df["executor_cores"],
                        'model': df["model"]})

# Count the rows of the frame that was built (one per experiment), not the
# characters of the file path.
print(f"Imported {len(acc_df)} experiments!")



## DataFrame Summaries

In [None]:
# Summary of `y` (accuracy) for every (model, epochs, cores, memory) combination,
# as produced by researchpy's summary_cont on the grouped frame.
summary_acc_df = rp.summary_cont(acc_df.groupby(['model', 'epochs', 'cores', 'memory']))['y']
# Bare expression: shown as the cell output when run in a notebook.
summary_acc_df

In [None]:
# Summary of `y` (time) for every (model, epochs, cores, memory) combination,
# as produced by researchpy's summary_cont on the grouped frame.
summary_time_df = rp.summary_cont(time_df.groupby(['model', 'epochs', 'cores', 'memory']))['y']
# Bare expression: shown as the cell output when run in a notebook.
summary_time_df

## ANOVA analysis

In [None]:
def anova_analysis(input_df):
    """Fit the full-factorial OLS model for ``y`` and build its ANOVA table.

    The model regresses ``y`` on ``C(model)``, ``epochs``, ``cores`` and
    ``memory`` including all of their interactions.

    Parameters
    ----------
    input_df : data with the columns ``y``, ``model``, ``epochs``, ``cores``
        and ``memory``.

    Returns
    -------
    tuple
        ``(table, fitted)`` where ``table`` is the ``typ=2`` ANOVA table with
        an extra boolean column ``'PR(>F) < 0.05'`` and ``fitted`` is the
        fitted OLS results object.
    """
    fitted = ols(formula='y ~ C(model)*epochs*cores*memory', data=input_df).fit()

    table = sm.stats.anova_lm(fitted, typ=2)
    # Flag the terms whose p-value is below the 5% level.
    table['PR(>F) < 0.05'] = table['PR(>F)'].lt(0.05)
    return table, fitted

In [None]:
# ANOVA table and fitted OLS model with accuracy as the response.
acc_res, acc_model = anova_analysis(acc_df)
print("Accuracy ANOVA Analysis")
# Bare expression: shown as the cell output when run in a notebook.
acc_res

In [None]:
# ANOVA table and fitted OLS model with time as the response.
time_res, time_model = anova_analysis(time_df)
print("Time ANOVA Analysis")
# Bare expression: shown as the cell output when run in a notebook.
time_res

In [None]:
# Inverse models: predict the number of epochs from the model type, the
# resources and the response `y` (accuracy for the first fit, time for the
# second). Both use the same formula and differ only in the data frame.
epoch_model_acc, epoch_model_time = (
    ols('epochs ~ C(model)*cores*memory*y', frame).fit()
    for frame in (acc_df, time_df)
)

def epoch_model_predict(trained_model, model, cores, memory, y):
    """Predict with ``trained_model`` for a single configuration.

    Parameters
    ----------
    trained_model : object exposing ``predict(exog=...)``.
    model : model name, matched case-insensitively; must be ``bi-rnn`` or
        ``lenet5``. It is passed on in lower case.
    cores : must be greater than 0.
    memory : must be greater than 0.
    y : must be greater than 0; its meaning (time or accuracy) depends on
        ``trained_model``.

    Returns
    -------
    The first element of the prediction returned by ``trained_model``.

    Raises
    ------
    AssertionError
        If any of the arguments fails the checks above.
    """
    model_name = model.lower()

    assert model_name in ['bi-rnn', 'lenet5'], 'unsupported model (supported: bi-rnn or lenet5)'
    assert cores > 0, 'impossible to run on 0 cores'
    assert memory > 0, 'impossible to run without memory'
    assert y > 0, 'either time or accuracy should be >0 (time or accuracy depends on the trained model)'

    features = {'model': model_name, 'cores': cores, 'memory': memory, 'y': y}
    predictions = trained_model.predict(exog=features)
    return predictions[0]

In [None]:
# Example predictions for one configuration. The model name is lower-cased
# inside epoch_model_predict, so 'Bi-rnn' passes its check.
# For the accuracy-based model `y` is an accuracy; for the time-based model it is a time.
prediction_with_acc = epoch_model_predict(epoch_model_acc, model='Bi-rnn', cores=10, memory=64, y=0.6)
prediction_with_time = epoch_model_predict(epoch_model_time, model='Bi-rnn', cores=10, memory=64, y=300)

print(f'Predicted epochs with accuracy: {prediction_with_acc}')
print(f'Predicted epochs with time: {prediction_with_time}')

In [None]:
def calculate_percentages_variation_explained(acc_model, time_model):
    """Tabulate each ANOVA term's share of the total sum of squares.

    For both fitted models the ``typ=1`` ANOVA table is computed and every
    row's ``sum_sq`` is expressed as a percentage of the column total.

    Parameters
    ----------
    acc_model : fitted model with accuracy as the response.
    time_model : fitted model with time as the response.

    Returns
    -------
    DataFrame indexed by the rows of the accuracy ANOVA table, with the
    columns ``'Percentage of variation explained (accuracy)'`` and
    ``'Percentage of variation explained (time)'``, both formatted as
    strings such as ``'12.34%'``.
    """
    acc_column = 'Percentage of variation explained (accuracy)'
    time_column = 'Percentage of variation explained (time)'

    def _share_of_total(fitted_model):
        # Percentage of the total sum of squares carried by each ANOVA row.
        sum_sq = sm.stats.anova_lm(fitted_model, typ=1)['sum_sq']
        return sum_sq / sum_sq.sum() * 100

    # Build a fresh frame instead of adding columns to a slice of the ANOVA
    # table, which triggers pandas' SettingWithCopyWarning.
    shares = _share_of_total(acc_model).to_frame(acc_column)
    # Column assignment aligns the time shares on the accuracy table's index.
    shares[time_column] = _share_of_total(time_model)

    for column in (acc_column, time_column):
        shares[column] = shares[column].map(lambda x: f'{x:.2f}%')
    return shares

# *PrePoch*

In [None]:

# Weights of the two epoch predictions in the combined estimate:
# w1 scales the accuracy-based prediction, w2 the time-based one.
w1 = 0.1
w2 = 0.9

# System parameters, passed unchanged to both epoch models
model = 'lenet5'
cores = 10
memory = 64

# Target parameters: passed as `y` to the accuracy-based and the time-based
# model respectively (units follow the dataset's "accuracy"/"time" columns — TODO confirm)
target_accuracy = 0.1
target_time = 300

# Predictions: epochs estimated from each target separately
prediction_with_acc = epoch_model_predict(epoch_model_acc, model=model, cores=cores, memory=memory, y=target_accuracy)
prediction_with_time = epoch_model_predict(epoch_model_time, model=model, cores=cores, memory=memory, y=target_time)

# Prepoch result: weighted sum of the two epoch predictions
prepoch = w1 * prediction_with_acc + w2 * prediction_with_time

print(prepoch)


## Check normality of errors

In [None]:
from numpy.random import seed
from numpy.random import randn
from statsmodels.graphics.gofplots import qqplot
from matplotlib import pyplot
from numpy import *
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import shapiro

# Shapiro-Wilk normality test on the accuracy model's prediction errors, run
# separately for every (model, epochs, cores, memory) configuration.
# Rows are collected in a list and turned into a frame once at the end;
# DataFrame.append no longer exists in pandas 2.0.
shapiro_rows = []

# The group frame is named `group_df` so the dataset loaded into `df` at the
# top of the file is not overwritten by the loop.
for counter, (name, group_df) in enumerate(
        acc_df.groupby(['model', 'epochs', 'cores', 'memory']), start=1):
    # Predict the whole group in one call; error = prediction - observation.
    data = acc_model.predict(group_df) - group_df['y']
    stat, p = shapiro(data)

    # manual inspection: Q-Q plot of the 4th configuration only
    # (`==`, not `is`: identity comparison with an int literal is unreliable).
    if counter == 4:
        qqplot(data, line='s')
        pyplot.show()

    shapiro_rows.append({'experiment': name, 'p-value (Shapiro-Wilk)': p})

# Explicit columns keep the frame well-formed even when there are no groups.
result_df = pd.DataFrame(shapiro_rows, columns=['experiment', 'p-value (Shapiro-Wilk)'])

# Normality is rejected at the 5% level.
result_df['Reject Null Hypothesis'] = result_df['p-value (Shapiro-Wilk)'] < 0.05
print(result_df.to_latex(index=False, caption='', label=''))
