In [1]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import PercentFormatter
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
def select_train(list_var, data):
    """Return the training frame without the columns named in ``list_var``.

    The input frame is left untouched: a new DataFrame is returned, so the
    call has no side effect on the caller's object.

    Parameters
    ----------
    list_var : iterable of str
        Names of the columns to discard.
    data : pandas.DataFrame
        Frame to remove the columns from.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``data`` except those in ``list_var``.

    Raises
    ------
    KeyError
        If a name in ``list_var`` is not a column of ``data``.
    """
    # drop() builds a new frame instead of popping columns off the argument.
    return data.drop(columns=list(list_var))

def select_test(list_var, data):
    """Return the test frame without the columns named in ``list_var``.

    The input frame is left untouched: a new DataFrame is returned, so the
    call has no side effect on the caller's object.

    Parameters
    ----------
    list_var : iterable of str
        Names of the columns to discard.
    data : pandas.DataFrame
        Frame to remove the columns from.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``data`` except those in ``list_var``.

    Raises
    ------
    KeyError
        If a name in ``list_var`` is not a column of ``data``.
    """
    # drop() builds a new frame instead of popping columns off the argument.
    return data.drop(columns=list(list_var))

In [3]:
# Column layout shared by the training CSV and the three test CSVs.
column_names = ['Model_name', 'input_shape', 'memory', 'input_width', 'input_size', 'nb_layers',
                'sum_activations', 'params']


def load_dataset(csv_path):
    """Read one dataset CSV and return it with incomplete rows removed.

    The file is parsed with the shared ``column_names`` as header, "?" is
    treated as a missing value, and every row containing a missing value
    is dropped.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed rows that have no missing value.
    """
    raw = pd.read_csv(csv_path, names=column_names,
                      na_values="?", comment='\t', sep=",", skipinitialspace=True)
    return raw.dropna()


# Training data
train_dataset = load_dataset('./data/training_dataset.csv')

# Test data: one file per test set (dim1, dim2, dim3)
test_dataset1 = load_dataset('./data/test_dim1.csv')
test_dataset2 = load_dataset('./data/test_dim2.csv')
test_dataset3 = load_dataset('./data/test_dim3.csv')

In [4]:
# Complete column list of the datasets (not used by the calls below).
list_comp = [
    'Model_name', 'input_shape', 'memory', 'input_width',
    'input_size', 'nb_layers', 'sum_activations', 'params',
]

# Columns removed from every dataset before modelling.
list_var = ['input_width']

train_dataset = select_train(list_var, train_dataset)
test_dataset1, test_dataset2, test_dataset3 = [
    select_test(list_var, frame)
    for frame in (test_dataset1, test_dataset2, test_dataset3)
]

In [5]:
# X_train is an alias of train_dataset (same object, no copy); popping the
# 'memory' column off it yields the regression target and leaves the rest
# as the feature frame.
X_train = train_dataset
y_train = X_train.pop('memory')
X_train.tail()

Unnamed: 0,Model_name,input_shape,input_size,nb_layers,sum_activations,params
112,basic_model_32_23,"(1, 32, 32, 3)",3072,41,17086005248,17190346
113,basic_model_331_48,"(1, 331, 331, 3)",328683,78,65066541656,14519141
114,basic_model_64_21,"(1, 64, 64, 3)",12288,38,29858337280,25511850
115,basic_model_112_22,"(1, 112, 112, 3)",37632,43,9436995840,15188976
116,basic_model_600_42,"(1, 600, 600, 3)",1080000,71,79782663040,12260173


In [6]:
# For each test set: X_e* aliases the test frame (no copy) and Y_e* is the
# 'memory' column popped off that same frame.
X_e1, Y_e1 = test_dataset1, test_dataset1.pop('memory')
X_e2, Y_e2 = test_dataset2, test_dataset2.pop('memory')
X_e3, Y_e3 = test_dataset3, test_dataset3.pop('memory')

In [7]:
# Take the two identifier columns out of every feature frame and keep them
# aside; the test-set ones are reused when results are exported.
X_train_model_names, X_train_input_shapes = X_train.pop('Model_name'), X_train.pop('input_shape')

X_test_model_names1, X_test_input_shapes1 = X_e1.pop('Model_name'), X_e1.pop('input_shape')
X_test_model_names2, X_test_input_shapes2 = X_e2.pop('Model_name'), X_e2.pop('input_shape')
X_test_model_names3, X_test_input_shapes3 = X_e3.pop('Model_name'), X_e3.pop('input_shape')

In [8]:
# Measured targets of the three test sets as plain arrays.
y_test1, y_test2, y_test3 = Y_e1.values, Y_e2.values, Y_e3.values

In [9]:
# Two-stage model: standardise the features, then fit a random forest.
# The step names ('scaler', 'RFR') are the prefixes used by the parameter grid.
steps = [
    ('scaler', StandardScaler()),
    ('RFR', RandomForestRegressor()),
]
pipeline = Pipeline(steps=steps)

In [10]:
# Full GridSearchCV search space, kept disabled inside a string literal.
# NOTE: 'auto' for max_features is no longer accepted by recent scikit-learn
# releases; remove it before re-enabling this grid.
"""
param_grid = [
        {'RFR__bootstrap': [True,False],
         'RFR__max_depth': [None,10,20,30,40,50,60,70,80,90,100,120,140,160,180,200,220,240,260,280,300],
         'RFR__n_estimators': [100,200,300,400,500,600,700,800,900,1000],
         'RFR__max_features': ['auto', 'sqrt', 'log2', None],
         'RFR__min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10],
         'RFR__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
         'RFR__random_state': [42]
        },
    ]
"""

# Optimal hyperparameters: each value is wrapped in a one-element list so the
# grid search evaluates exactly this single configuration.
param_grid = [
    {
        name: [value]
        for name, value in {
            'RFR__bootstrap': True,
            'RFR__max_depth': 20,
            'RFR__n_estimators': 100,
            'RFR__max_features': None,
            'RFR__min_samples_leaf': 2,
            'RFR__min_samples_split': 2,
            'RFR__random_state': 42,
        }.items()
    },
]

In [11]:
# 5-fold cross-validated search of the pipeline over param_grid.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
)

In [12]:
# Run the cross-validated search on the training features and targets.
# As the last expression of the cell, the fitted search object is displayed.
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('RFR', RandomForestRegressor())]),
             param_grid=[{'RFR__bootstrap': [True], 'RFR__max_depth': [20],
                          'RFR__max_features': [None],
                          'RFR__min_samples_leaf': [2],
                          'RFR__min_samples_split': [2],
                          'RFR__n_estimators': [100],
                          'RFR__random_state': [42]}])

In [13]:
# Show the parameter combination selected by the grid search.
print(grid.best_params_)

{'RFR__bootstrap': True, 'RFR__max_depth': 20, 'RFR__max_features': None, 'RFR__min_samples_leaf': 2, 'RFR__min_samples_split': 2, 'RFR__n_estimators': 100, 'RFR__random_state': 42}


In [14]:
# The search was created without a `refit` argument, so GridSearchCV uses its
# default (refit=True): best_estimator_ is already fitted on the whole training
# set with the best parameters. Fitting it a second time would only repeat the
# same deterministic training (random_state is fixed in the grid).
best_pipe = grid.best_estimator_
best_pipe

Pipeline(steps=[('scaler', StandardScaler()),
                ('RFR',
                 RandomForestRegressor(max_depth=20, max_features=None,
                                       min_samples_leaf=2, random_state=42))])

In [15]:
# Predict on the three test sets. The pipeline was fitted on a DataFrame, so the
# test DataFrames are passed as-is: scikit-learn can then check that the columns
# match the ones seen during fit, instead of warning about missing feature names
# as it does when a bare array (`.values`) is supplied.
y_pred1 = best_pipe.predict(X_e1)
y_pred2 = best_pipe.predict(X_e2)
y_pred3 = best_pipe.predict(X_e3)

In [24]:
# Choose which test set the evaluation cells below report on (here: set 3).
# To evaluate another set, switch every suffix in this cell consistently.
y_test, y_pred = y_test3, y_pred3
test_models, test_input_shapes = X_test_model_names3.values, X_test_input_shapes3.values

In [25]:
# Regression metrics for the selected test set.
print('Explained variance : ',explained_variance_score(y_test, y_pred))
print('R2_value : ',r2_score(y_test, y_pred))
print('Mean absolute error :', mean_absolute_error(y_test, y_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
print('Root Mean squared error :', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Median absolute error :', median_absolute_error(y_test, y_pred))

Explained variance :  0.7559091787624019
R2_value :  0.753711952938296
Mean absolute error : 208.6353186904762
Root Mean squared error : 249.14242795894097
Median absolute error : 191.35972023809484


In [26]:
# Absolute percentage error of every prediction, then its mean (MAPE) and
# the derived "accuracy" (100 - MAPE).
errors = np.abs(y_pred - y_test)
# Divide by the magnitude of the measured value so the percentage is never
# negative. NOTE: a measured value of 0 yields inf for that sample.
mape = 100 * (errors / np.abs(y_test))
accuracy = 100 - np.mean(mape)
# np.max is the array-aware maximum (vectorised, and propagates NaN instead of
# depending on element order like the builtin max).
print('MAX MAPE :', np.max(mape))
print('MAPE:', round(np.mean(mape), 2), '%.')
print('Accuracy:', round(accuracy, 2), '%.')

MAX MAPE : 26.783338587641868
MAPE: 10.47 %.
Accuracy: 89.53 %.


In [27]:
# Export one row per test model: measured vs predicted value, absolute
# deviation and percentage error. The file name is hard-coded for test set 3
# and must be kept in sync with the test set selected for y_test / y_pred.
with open('./measured_vs_predicted_RF_dim3.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CNN_model', 'input_shape', 'model&shape', 'measured', 'predicted', 'absolute_deviation', 'MAPE (in %)'])
    for model, shape, measured, predicted in zip(test_models, test_input_shapes, y_test, y_pred):
        # Computed once and reused for both the deviation and percentage columns.
        deviation = abs(predicted - measured)
        writer.writerow([model, shape, model + '_' + shape, measured, predicted,
                         deviation, (deviation / measured) * 100])