In [1]:
import pandas as pd
import os
import plotly.express as px
import re

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from statsmodels.tsa.seasonal import seasonal_decompose

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


In [8]:

class datasets:
    """In-memory store for the model performance and metrics CSV exports.

    On construction, every file in ``<current working directory>/csvs`` whose
    name is ``<set>_performances_<period>.csv`` or ``<set>_metrics_<period>.csv``
    is read with ``pd.read_csv`` and filed under ``<set>``. Only periods 0, 1
    and 7 are kept; files for any other period are ignored.
    """

    # <letters>_<kind>_<digits>.csv, matched against the whole file name.
    _FILE_PATTERN = re.compile(r"([a-zA-Z]+)_(performances|metrics)_([0-9]+)\.csv")

    def __init__(self):
        # One dict per period, mapping set name -> DataFrame.
        self.performance_0 = {}
        self.performance_1 = {}
        self.performance_7 = {}

        self.metrics_0 = {}
        self.metrics_1 = {}
        self.metrics_7 = {}

        self.get_datasets()

    def _stores(self, kind):
        """Return ``{period: dict}`` for ``kind`` ('performances' or 'metrics').

        The mapping is rebuilt on every call so that it always reflects the
        current attribute values, even if a caller rebinds one of them.
        """
        if kind == "performances":
            return {0: self.performance_0, 1: self.performance_1, 7: self.performance_7}
        return {0: self.metrics_0, 1: self.metrics_1, 7: self.metrics_7}

    def _lookup(self, kind, set_name, period):
        """Return the DataFrame stored for ``set_name`` at ``period``.

        Raises:
            ValueError: if ``period`` is not 0, 1 or 7.
            KeyError: if nothing was loaded for ``set_name`` at that period.
        """
        by_period = self._stores(kind)
        if period not in by_period:
            raise ValueError("period must be one of 0, 1 or 7, got %r" % (period,))
        frames = by_period[period]
        if set_name not in frames:
            raise KeyError(
                "no %s data loaded for set %r at period %r" % (kind, set_name, period)
            )
        return frames[set_name]

    def get_datasets(self):
        """Read every recognised CSV in ``<cwd>/csvs`` into the period dicts."""
        # os.path.join keeps the path valid on both Windows and POSIX.
        csv_directory = os.path.join(os.getcwd(), "csvs")

        # Each name is matched on its own, so no file is skipped because of
        # its position in the directory listing.
        for file_name in sorted(os.listdir(csv_directory)):
            match = self._FILE_PATTERN.fullmatch(file_name)
            if match is None:
                continue
            set_name, kind, period = match.group(1), match.group(2), int(match.group(3))
            store = self._stores(kind).get(period)
            if store is None:
                # Period other than 0, 1 or 7: not tracked.
                continue
            store[set_name] = pd.read_csv(os.path.join(csv_directory, file_name))

    def get_performance_data(self, set_name, model, baseline, period):
        """Return the "Date", "Actual" and ``model`` columns for one set.

        Args:
            set_name: key the frame was stored under (file-name prefix).
            model: name of the model column to include.
            baseline: when truthy, the "Baseline" column is included as well.
            period: 0, 1 or 7.

        Raises:
            ValueError: if ``period`` is not 0, 1 or 7.
            KeyError: if no frame is loaded for ``set_name`` at ``period``, or
                a requested column is missing from the frame.
        """
        dataframe = self._lookup("performances", set_name, period)

        columns = ["Date", "Actual", model]
        if baseline:
            columns.append("Baseline")

        return dataframe[columns]

    def get_metrics_data(self, set_name, model, baseline, period):
        """Return the metrics rows whose "Model" column equals ``model``.

        When ``baseline`` is truthy the rows for "Baseline" are placed first
        and the result is re-indexed from 0; otherwise the original index of
        the matching rows is kept.

        Raises:
            ValueError: if ``period`` is not 0, 1 or 7.
            KeyError: if no frame is loaded for ``set_name`` at ``period``.
        """
        dataframe = self._lookup("metrics", set_name, period)

        model_rows = dataframe[dataframe['Model'] == model]
        if not baseline:
            return model_rows

        baseline_rows = dataframe[dataframe['Model'] == "Baseline"]
        return pd.concat([baseline_rows, model_rows], ignore_index=True)

    def get_performance_0(self):
        """Return the ``{set name: DataFrame}`` dict of period-0 performances."""
        return self.performance_0

    def get_performance_1(self):
        """Return the ``{set name: DataFrame}`` dict of period-1 performances."""
        return self.performance_1

    def get_performance_7(self):
        """Return the ``{set name: DataFrame}`` dict of period-7 performances."""
        return self.performance_7

    def get_metrics_0(self):
        """Return the ``{set name: DataFrame}`` dict of period-0 metrics."""
        return self.metrics_0

    def get_metrics_1(self):
        """Return the ``{set name: DataFrame}`` dict of period-1 metrics."""
        return self.metrics_1

    def get_metrics_7(self):
        """Return the ``{set name: DataFrame}`` dict of period-7 metrics."""
        return self.metrics_7

In [9]:
# Build the CSV-backed store and pick the slice inspected by the cells below.
dataset = datasets()
# Key of the frame to look up (the prefix of the CSV file name).
set_name = 'matlab'
# Name of the model column / "Model" value to select.
model = 'xgb'
# Falsy, so the "Baseline" column/row is left out of the selections.
baseline = 0
# Selects the *_0 stores in get_performance_data / get_metrics_data.
period = 0

In [10]:
# Whole period-0 performance mapping (set name -> DataFrame), shown as the cell output.
perf = dataset.get_performance_0()
perf

{'matlab':            Date    Actual    Basic_nn  Complex_nn       xgb           rf
 0    2010-12-07  10288.48  10097.3840  10372.9470  9892.565  8919.886717
 1    2010-12-07  10120.55  10053.4290  10268.4130  9667.802  8919.886717
 2    2010-12-07   9782.41   9925.5750  10139.3460  9559.137  8919.886717
 3    2010-12-07   9533.65   9656.0080   9914.3500  9471.435  8919.886717
 4    2010-12-07   9305.94   9600.7390   9776.5960  9366.166  8919.886717
 ..          ...       ...         ...         ...       ...          ...
 995  2010-12-28   7671.25   7613.3296   8125.3860  8777.749  8919.886717
 996  2010-12-28   7654.08   7668.3955   8063.4087  8935.477  8919.886717
 997  2010-12-28   7637.34   7618.9756   7994.8150  8163.061  8919.886717
 998  2010-12-28   7642.43   7593.0806   7944.6436  8403.816  8919.886717
 999  2010-12-28   7609.50   7567.2620   7910.4680  8283.473  8919.886717
 
 [1000 rows x 6 columns]}

In [12]:
# Column subset of the performance frame for the chosen set/model/period;
# note that this rebinds `perf` from the dict above to a DataFrame.
perf = dataset.get_performance_data(set_name, model, baseline, period)
print(perf.head())
# Metrics rows for the same selection.
met = dataset.get_metrics_data(set_name, model, baseline, period)
print(met.head())

         Date    Actual       xgb
0  2010-12-07  10288.48  9892.565
1  2010-12-07  10120.55  9667.802
2  2010-12-07   9782.41  9559.137
3  2010-12-07   9533.65  9471.435
4  2010-12-07   9305.94  9366.166
  Model        RMSE        R2            MSE         MAE      MAPE
2   xgb  578.530713  0.822339  334697.785827  455.922305  0.053363


In [30]:
class datasets:
    """Loads the performance and metrics CSV exports and serves slices of them.

    Instantiating the class scans ``<current working directory>/csvs`` for
    files named ``<set>_performances_<period>.csv`` or
    ``<set>_metrics_<period>.csv`` and reads each one with ``pd.read_csv``.
    Frames are kept per period (0, 1 or 7) in dicts keyed by ``<set>``; files
    for other periods are skipped. Values are stored exactly as read.
    """

    # <letters>_<kind>_<digits>.csv, matched against the whole file name.
    _FILE_PATTERN = re.compile(r"([a-zA-Z]+)_(performances|metrics)_([0-9]+)\.csv")

    def __init__(self):
        # One dict per period, mapping set name -> DataFrame.
        self.performance_0 = {}
        self.performance_1 = {}
        self.performance_7 = {}

        self.metrics_0 = {}
        self.metrics_1 = {}
        self.metrics_7 = {}

        self.get_datasets()

    def _stores(self, kind):
        """Return ``{period: dict}`` for ``kind`` ('performances' or 'metrics').

        Built on each call so it tracks the attributes even if one is rebound.
        """
        if kind == "performances":
            return {0: self.performance_0, 1: self.performance_1, 7: self.performance_7}
        return {0: self.metrics_0, 1: self.metrics_1, 7: self.metrics_7}

    def _lookup(self, kind, set_name, period):
        """Return the DataFrame stored for ``set_name`` at ``period``.

        Raises:
            ValueError: if ``period`` is not 0, 1 or 7.
            KeyError: if nothing was loaded for ``set_name`` at that period.
        """
        by_period = self._stores(kind)
        if period not in by_period:
            raise ValueError("period must be one of 0, 1 or 7, got %r" % (period,))
        frames = by_period[period]
        if set_name not in frames:
            raise KeyError(
                "no %s data loaded for set %r at period %r" % (kind, set_name, period)
            )
        return frames[set_name]

    def get_datasets(self):
        """Read every recognised CSV in ``<cwd>/csvs`` into the period dicts."""
        # os.path.join keeps the path valid on both Windows and POSIX.
        csv_directory = os.path.join(os.getcwd(), "csvs")

        # Names are matched one at a time, so the first entry of the listing
        # is treated like any other.
        for file_name in sorted(os.listdir(csv_directory)):
            match = self._FILE_PATTERN.fullmatch(file_name)
            if match is None:
                continue
            set_name, kind, period = match.group(1), match.group(2), int(match.group(3))
            store = self._stores(kind).get(period)
            if store is None:
                # Period other than 0, 1 or 7: not tracked.
                continue
            store[set_name] = pd.read_csv(os.path.join(csv_directory, file_name))

    def get_performance_data(self, set_name, model, baseline, period):
        """Return the "Date", "Actual" and ``model`` columns for one set.

        Args:
            set_name: key the frame was stored under (file-name prefix).
            model: name of the model column to include.
            baseline: when truthy, the "Baseline" column is included as well.
            period: 0, 1 or 7.

        Raises:
            ValueError: if ``period`` is not 0, 1 or 7.
            KeyError: if no frame is loaded for ``set_name`` at ``period``, or
                a requested column is missing from the frame.
        """
        dataframe = self._lookup("performances", set_name, period)

        columns = ["Date", "Actual", model]
        if baseline:
            columns.append("Baseline")

        return dataframe[columns]

    def get_metrics_data(self, set_name, model, baseline, period):
        """Return the metrics rows whose "Model" column equals ``model``.

        When ``baseline`` is truthy the rows for "Baseline" are placed first
        and the result is re-indexed from 0; otherwise the original index of
        the matching rows is kept.

        Raises:
            ValueError: if ``period`` is not 0, 1 or 7.
            KeyError: if no frame is loaded for ``set_name`` at ``period``.
        """
        dataframe = self._lookup("metrics", set_name, period)

        model_rows = dataframe[dataframe['Model'] == model]
        if not baseline:
            return model_rows

        baseline_rows = dataframe[dataframe['Model'] == "Baseline"]
        return pd.concat([baseline_rows, model_rows], ignore_index=True)

    def get_performance_0(self):
        """Return the ``{set name: DataFrame}`` dict of period-0 performances."""
        return self.performance_0

    def get_performance_1(self):
        """Return the ``{set name: DataFrame}`` dict of period-1 performances."""
        return self.performance_1

    def get_performance_7(self):
        """Return the ``{set name: DataFrame}`` dict of period-7 performances."""
        return self.performance_7

    def get_metrics_0(self):
        """Return the ``{set name: DataFrame}`` dict of period-0 metrics."""
        return self.metrics_0

    def get_metrics_1(self):
        """Return the ``{set name: DataFrame}`` dict of period-1 metrics."""
        return self.metrics_1

    def get_metrics_7(self):
        """Return the ``{set name: DataFrame}`` dict of period-7 metrics."""
        return self.metrics_7

In [37]:
# NOTE(review): `set` shadows the built-in type of the same name for the rest
# of the kernel session; consider renaming once all dependent cells are known.
set = datasets()
# Period-0 metrics mapping (set name -> DataFrame); this is the store's own
# dict, not a copy.
metrics = set.get_metrics_0()
# Metric names that the scaling loop iterates over.
datatypes = ["RMSE", "MSE", "MAE", "MAPE"]

In [38]:
# Show the period-0 metrics mapping obtained from get_metrics_0().
metrics

{'matlab':          Model Metric         Value
 0     Basic_nn   RMSE  3.535459e+02
 1     Basic_nn    MSE  1.249947e+05
 2     Basic_nn    MAE  2.732168e+02
 3     Basic_nn   MAPE  3.122753e-02
 4     Basic_nn     R2  9.336516e-01
 5   Complex_nn   RMSE  3.046511e+02
 6   Complex_nn    MSE  9.281230e+04
 7   Complex_nn    MAE  2.296959e+02
 8   Complex_nn   MAPE  2.613103e-02
 9   Complex_nn     R2  9.507344e-01
 10         xgb   RMSE  5.785307e+02
 11         xgb    MSE  3.346978e+05
 12         xgb    MAE  4.559223e+02
 13         xgb   MAPE  5.336325e-02
 14         xgb     R2  8.223393e-01
 15          rf   RMSE  1.376828e+03
 16          rf    MSE  1.895656e+06
 17          rf    MAE  1.119793e+03
 18          rf   MAPE  1.345946e-01
 19          rf     R2 -6.231635e-03}

In [39]:
# Rescale each metric listed in `datatypes` by its largest value per dataset.
# The minimum is pinned to 0, so every value is simply divided by the maximum
# and the ratios between models are preserved.
# NOTE: the frames in `metrics` are modified in place.
values = []
names_vals = {}

for datatype in datatypes:
    for set_name, df in metrics.items():
        metric_mask = df["Metric"] == datatype
        models = df.loc[metric_mask, "Model"].values
        values = df.loc[metric_mask, "Value"].values
        if len(values) == 0:
            # This dataset has no rows for the metric; nothing to scale.
            continue
        min_val = 0
        max_val = max(values)
        if max_val == min_val:
            # Degenerate range: scaling would divide by zero.
            continue
        scaled_values = [(value - min_val) / (max_val - min_val) for value in values]
        print(datatype)
        print(models)
        print(values)
        print(scaled_values)
        # The masked rows are in the same order as `scaled_values`, so one
        # assignment replaces the per-model write-back.
        df.loc[metric_mask, "Value"] = scaled_values

        print(df)

RMSE
['Basic_nn' 'Complex_nn' 'xgb' 'rf']
[ 353.54590098  304.6511185   578.53071295 1376.82806946]
[0.25678289746157323, 0.2212702698728134, 0.42019096340664636, 1.0]
         Model Metric         Value
0     Basic_nn   RMSE  2.567829e-01
1     Basic_nn    MSE  1.249947e+05
2     Basic_nn    MAE  2.732168e+02
3     Basic_nn   MAPE  3.122753e-02
4     Basic_nn     R2  9.336516e-01
5   Complex_nn   RMSE  2.212703e-01
6   Complex_nn    MSE  9.281230e+04
7   Complex_nn    MAE  2.296959e+02
8   Complex_nn   MAPE  2.613103e-02
9   Complex_nn     R2  9.507344e-01
10         xgb   RMSE  4.201910e-01
11         xgb    MSE  3.346978e+05
12         xgb    MAE  4.559223e+02
13         xgb   MAPE  5.336325e-02
14         xgb     R2  8.223393e-01
15          rf   RMSE  1.000000e+00
16          rf    MSE  1.895656e+06
17          rf    MAE  1.119793e+03
18          rf   MAPE  1.345946e-01
19          rf     R2 -6.231635e-03
MSE
['Basic_nn' 'Complex_nn' 'xgb' 'rf']
[ 124994.704101     92812.30400169 