<a href="https://colab.research.google.com/github/Idaogah/datalab/blob/main/updated_SPP_MSFT_20000103_20220729_MPN5P_LSTM15_GO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Introduction**
This notebook is the Learning Model Creator (LMB). It loads financial asset data (MSFT, for example) saved in Google Drive into the Colab environment, trains a Long Short-Term Memory (LSTM) network on that data, and generates predictions. The best model is saved as an .h5 file, which is exported to AWS. That file is then used to produce predictions telling subscribers whether to buy or sell.

## Packages

In [None]:
!pip install --upgrade tsmoothie #installaton of the package for applying the Kalman filter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tsmoothie
  Downloading tsmoothie-1.0.4-py3-none-any.whl (21 kB)
Collecting simdkalman
  Downloading simdkalman-1.0.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: simdkalman, tsmoothie
Successfully installed simdkalman-1.0.2 tsmoothie-1.0.4


In [None]:
import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# The device list is empty on a CPU-only runtime, where indexing physical_devices[0]
# would raise IndexError, so the setting is applied only when a GPU is present.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

# Libraries

In [None]:
import pandas as pd #for managing the dataframe

import tsmoothie #for the Kalman filter, it is an efficient recursive filter that evaluates the state of a dynamic system starting from a series of measurements subject to noise.
from sklearn.preprocessing import RobustScaler #for preprocessing, it scales features using statistics that are robust to outliers.
import numpy as np #for the e_logarithmic filter (and also some other mathematical operations)

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM #the two main layers of the model
from tensorflow.keras import optimizers #for the training of the model
from tensorflow.keras.models import load_model

from google.colab import drive #to mount the google drive folders that contain all the data needed

from scipy.stats import linregress #for the slope and the value of Y at X=0 of the linear trend line
from sklearn.linear_model import LinearRegression

from datetime import datetime #for today's date

import random #to set the seed to replicate results

import matplotlib.pyplot as plt

# Set the seed of every random number generator in use to replicate results:
# Python's `random`, NumPy and TensorFlow each keep their own independent state,
# so seeding only `random` leaves the other two unseeded.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Functions

In [None]:
def building_data_sequences(data, timesteps): #timesteps means how many days we consider for each block
    '''Build sequences (blocks) of input data and their matching target values.

    A block is made for every run of `timesteps` consecutive rows. The block's
    inputs are columns 0-6 of those rows; its targets are columns 7-11 of the
    LAST row of the block.

    Parameters
    ----------
    data : 2-D array-like with at least 12 columns
        Rows are periods (days); columns 0-6 are features, columns 7-11 targets.
    timesteps : int
        Number of consecutive rows in each block; must be >= 1.

    Returns
    -------
    X : np.ndarray of shape (n_blocks, timesteps, 7)
    y : list of five 1-D np.ndarray of length n_blocks, in the order
        [MPNxP, HPNxP, LPNxP, HPN1P, LPN1P] (columns 7, 8, 9, 10, 11).

    When `data` has fewer than `timesteps` rows, n_blocks is 0 and X still has
    shape (0, timesteps, 7) so that downstream shape-based code keeps working.

    Raises
    ------
    ValueError
        If `timesteps` is smaller than 1.
    '''
    if timesteps < 1:
        raise ValueError("timesteps must be a positive integer")

    data = np.asarray(data)
    n_feature_cols = 7
    target_cols = (7, 8, 9, 10, 11)

    # Number of complete blocks; never negative, even for short or empty data.
    n_blocks = max(len(data) - timesteps + 1, 0)

    if n_blocks == 0:
        # Keep the rank of X consistent with the non-empty case.
        X = np.empty((0, timesteps, n_feature_cols), dtype=data.dtype)
        return X, [np.empty((0,), dtype=data.dtype) for _ in target_cols]

    X = np.stack([data[i:i + timesteps, :n_feature_cols] for i in range(n_blocks)])

    # The targets of block i live on row i+timesteps-1, i.e. every row from timesteps-1 onwards.
    last_rows = data[timesteps - 1:]
    y = [last_rows[:, col].copy() for col in target_cols]

    return X, y

def sir_parameters(x,y): #sir stands for slope, intercept, rvalue (the average trend line distance, avg_tld, was added later)
    '''Compute the analytical parameters of the linear trend line of y over x.

    Returns a tuple (slope, intercept, r_squared, avg_trend_line_distance):
    slope and intercept come from scipy's linregress; r_squared is the square of
    the correlation coefficient returned by linregress (so the coefficient of
    determination, not the correlation itself); avg_trend_line_distance is the
    mean absolute difference between the trend line and y.
    '''
    fit = linregress(x, y)

    # Values of the fitted trend line at every x; only needed for the average distance.
    trend_line_values = fit.slope * x + fit.intercept
    mean_abs_distance = np.mean(np.abs(trend_line_values - y))

    return fit.slope, fit.intercept, fit.rvalue ** 2, mean_abs_distance

# Variables

In [None]:
'''This is the timestep which indicates how many days we're considering to compute our predictions. Change that if you're considering a different dataset and the code will adapt consequently in an automatic way.'''
# Both settings are kept as strings because they are concatenated into column names and file paths;
# the time step is converted with int() wherever a number is needed.
model_case_version_time_steps= '10'
# NOTE(review): the outputs recorded in this notebook (MPN5P columns, ".../MPN5P_LSTM15/..." model path)
# correspond to a target code of '5', not '40' - confirm the intended value before re-running.
model_case_version_main_target_code='40'

#today = '20220706'
today = datetime.today().strftime('%Y%m%d') #just for names of files (for now)
print('Today is', today)

Today is 20220901


In [None]:
''' Here, we define the case name'''
case = 'MSFT'
print(case)

MSFT


In [None]:
'''Here, we define the list of targets we are going to work on and also the average for each target (this value is used during the training for normalization\rescaling of some analytical parameters)'''
targets =['MPN'+model_case_version_main_target_code+'P']  #this must be changed whenever tha targets change
avg_prices_list = []

In [None]:
#1L = dispersion = 1
#4L = slope + intercept + resqr + dispersion = 1
#5L = all the weighting = 1
#new parameters case = intercept + slope + end intercept + correlation + dispersion
# Weighting exponents for the "ratio" form of each analytical parameter.
slope_weighting_exponent_ratio = 1
intercept_weighting_exponent_ratio = 1
end_intercept_weighting_exponent_ratio = 0
rsqr_weighting_exponent_ratio = 1
dispersion_weighting_exponent_ratio = 1

# Weighting exponents for the "predicted versus actual" form of each analytical parameter.
slope_weighting_exponent_predicted_actual = 0
intercept_weighting_exponent_predicted_actual = 0
rsqr_weighting_exponent_predicted_actual = 0
dispersion_weighting_exponent_predicted_actual = 0

# Compact code made of the five ratio exponents, concatenated in the order
# intercept, slope, end intercept, rsqr, dispersion (e.g. '11011').
analytical_parametrs = str(intercept_weighting_exponent_ratio)+str(slope_weighting_exponent_ratio)+str(end_intercept_weighting_exponent_ratio)+str(rsqr_weighting_exponent_ratio)+str(dispersion_weighting_exponent_ratio)
print(analytical_parametrs)

11011


In [None]:
'''Model parameters: in order to understand, consult the original documentation (case_version_cat Tab) '''
n_epochs = 100
batch = 64

'''These are the exponent used to define the number of nodes for each layer'''
twoexp_nodes_number_layer_1 = 7
twoexp_nodes_number_layer_2 = 10
twoexp_nodes_number_layer_3 = 7
twoexp_nodes_number_layer_4 = 6
twoexp_nodes_number_layer_5 = 0

lr=0.0005 #learning rate

In [None]:
'''Other variables to be used during training'''

max_iterations = 200 #maximum number of iterations for the while loop (we will see it later in the code)
precision = 0.00000000001 #this precision is related to the quality of the compound_run_term value we want to obtain (that is representative of the quality of the model)
attenuation_factor = 0.75 #it is used in the computation of the attenuated_padding_value (see custom_loss_function)

# Data

In [None]:
'''Import the dataset and build the df to work on'''

drive.mount('/content/drive') #mount the Google Drive to access the .csv file of the dataset
#model_path = "/content/drive/MyDrive/Models/SPP-"+case+"-2000-2022/SPP-"+case+"_200001-202207_MPN"+model_case_version_main_target_code+"P_LSTM15/DOHLCAV-0-S-KER-710760-64-100-0.0005-10-11011L0.75_GO/"

# Folder of this model case on Google Drive; it is built from `case` and the main target code.
# NOTE(review): the date range and the "DOHLCAV-0-S-KER-..." configuration part are hard-coded in the
# string rather than derived from the variables defined above - keep them in sync by hand.
model_path = "/content/drive/My Drive/SPP-"+case+"-2000-2022/SPP-"+case+"_20000103-20220729_MPN"+model_case_version_main_target_code+"P_LSTM15/DOHLCAV-0-S-KER-710760-64-100-0.0005-10-11011L0.75_GO/"
# The input CSV is read from inside the model folder; its file name also embeds `case` and the target code.
dohlcav_mpnxp_data=pd.read_csv(model_path+"proceedit 20220830 iLab - dySPP- "+case+"-20000103-20220729-MPN5_7_10_20P analysis and trading - MPN"+model_case_version_main_target_code+"P_DOHLCAV_input_data.csv") #always check that the name of the dataset is correct
#dohlcav_mpnxp_data = pd.read_csv("/content/proceedit 20220706 iLab - dySPP-EURUSD=X - MPN2P_DOHLCAV_data.csv")
dohlcav_mpnxp_data.head() #let's have a look at the dataset

Mounted at /content/drive


Unnamed: 0,ID,UUID,DCP_date_current_period,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
0,1,,2000-01-03,36528.0,58.69,59.31,56.0,58.28,36.72,53228400.0,56.13,58.56,53.66,58.56,56.13
1,2,,2000-01-04,36529.0,56.78,58.56,56.13,56.31,35.48,54119000.0,55.72,58.19,53.66,58.19,54.69
2,3,,2000-01-05,36530.0,55.56,58.19,54.69,56.91,35.85,64059600.0,55.48,57.13,52.22,56.94,54.19
3,4,,2000-01-06,36531.0,56.09,56.94,54.19,55.0,34.65,54976600.0,54.44,57.13,50.75,56.13,53.66
4,5,,2000-01-07,36532.0,54.31,56.13,53.66,55.72,35.1,62013600.0,54.44,57.13,50.75,56.84,55.69


In [None]:
dohlcav_mpnxp_data.tail()

Unnamed: 0,ID,UUID,DCP_date_current_period,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
5696,5697,,2022-08-24,44797.0,275.41,277.23,275.11,275.79,275.79,18137000.0,267.69,280.34,260.99,279.02,274.52
5697,5698,,2022-08-25,44798.0,277.33,279.02,274.52,278.85,278.85,16583400.0,266.67,280.34,260.99,280.34,267.98
5698,5699,,2022-08-26,44799.0,279.08,280.34,267.98,268.09,268.09,27532500.0,265.41,267.40,260.99,267.4,263.85
5699,5700,,2022-08-29,44802.0,265.85,267.4,263.85,265.23,265.23,20324600.0,264.27,267.05,260.99,267.05,260.99
5700,5701,,2022-08-30,44803.0,266.67,267.05,260.99,262.35,262.35,8932038.0,#NUM!,#NUM!,#NUM!,,


In [None]:
dohlcav_mpnxp_data = dohlcav_mpnxp_data.replace(',','', regex=True) #remove the ',' otherwise it's impossible to deal with numbers in the dataset
dohlcav_mpnxp_data.tail() #ohoh there's a problem with the last row (there are some NaN values because of course in that day we didn't have yet the real values)

Unnamed: 0,ID,UUID,DCP_date_current_period,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
5696,5697,,2022-08-24,44797.0,275.41,277.23,275.11,275.79,275.79,18137000.0,267.69,280.34,260.99,279.02,274.52
5697,5698,,2022-08-25,44798.0,277.33,279.02,274.52,278.85,278.85,16583400.0,266.67,280.34,260.99,280.34,267.98
5698,5699,,2022-08-26,44799.0,279.08,280.34,267.98,268.09,268.09,27532500.0,265.41,267.40,260.99,267.4,263.85
5699,5700,,2022-08-29,44802.0,265.85,267.4,263.85,265.23,265.23,20324600.0,264.27,267.05,260.99,267.05,260.99
5700,5701,,2022-08-30,44803.0,266.67,267.05,260.99,262.35,262.35,8932038.0,#NUM!,#NUM!,#NUM!,,


In [None]:
# Simply remove the last row: its target values are not available yet.
# .copy() makes the result an independent DataFrame, so the column assignment performed later on
# dohlcav_mpnxp_data does not raise pandas' SettingWithCopyWarning.
# NOTE(review): this cell overwrites its own input, so running it twice removes two rows.
dohlcav_mpnxp_data = dohlcav_mpnxp_data.iloc[:-1].copy()
dohlcav_mpnxp_data.tail() #now it's perfect!

Unnamed: 0,ID,UUID,DCP_date_current_period,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
5695,5696,,2022-08-23,44796.0,276.44,278.86,275.4,276.44,276.44,17527400.0,273.87,280.34,260.99,277.23,275.11
5696,5697,,2022-08-24,44797.0,275.41,277.23,275.11,275.79,275.79,18137000.0,267.69,280.34,260.99,279.02,274.52
5697,5698,,2022-08-25,44798.0,277.33,279.02,274.52,278.85,278.85,16583400.0,266.67,280.34,260.99,280.34,267.98
5698,5699,,2022-08-26,44799.0,279.08,280.34,267.98,268.09,268.09,27532500.0,265.41,267.4,260.99,267.4,263.85
5699,5700,,2022-08-29,44802.0,265.85,267.4,263.85,265.23,265.23,20324600.0,264.27,267.05,260.99,267.05,260.99


In [None]:
''' Here we prepare the split of the data in two parts: rows dated on or before end_date (defined in a following cell)
are used for training the model; any later rows
(how many depends on the last update we did for the dataset) are used for testing the model.
To do that, we simply compute the size of the training set and then we use this value (later in the code) to split the dataset '''

dohlcav_mpnxp_data['DCP_date_current_period'] = pd.to_datetime(dohlcav_mpnxp_data['DCP_date_current_period']) #date values in the dataset are converted

In [None]:
# NOTE(review): start_date is not used by the mask cell that follows - confirm whether it is needed.
start_date = pd.Timestamp('2000-01-03') #start date of the training set
# NOTE(review): in the recorded run this is the date of the last row of the dataset, so every row
# ends up in the training set and the test set is empty (see "Test size: 0" in the recorded output).
end_date = pd.Timestamp('2022-08-29') #end date of the training set #to define as variable at the beginning

In [None]:
'''This is the mask of booleans that will be used to filter data and take just what we need (rows dated on or before end_date)'''

# Only the upper bound (end_date) is applied here; no lower bound on the date is checked.
mask = (dohlcav_mpnxp_data['DCP_date_current_period'] <= end_date)

In [None]:

# Number of rows selected by the boolean mask: this is the size of the training set.
training_size = len(dohlcav_mpnxp_data[mask])

# Every row that is not part of the training set is left for testing.
print('Training size: ', training_size)
print('Test size: ', len(dohlcav_mpnxp_data) - training_size)

Training size:  5700
Test size:  0


In [None]:
'''These will be used in the predictions output file (in order to know from which point starting to paste the results).'''
dates = dohlcav_mpnxp_data.iloc[int(model_case_version_time_steps)-1:,2].apply(lambda x: x.date().strftime('%Y-%m-%d')).reset_index(drop=True)
dates

0       2000-01-14
1       2000-01-18
2       2000-01-19
3       2000-01-20
4       2000-01-21
           ...    
5686    2022-08-23
5687    2022-08-24
5688    2022-08-25
5689    2022-08-26
5690    2022-08-29
Name: DCP_date_current_period, Length: 5691, dtype: object

In [None]:
len(dates)

5691

In [None]:
dohlcav_mpnxp_data

Unnamed: 0,ID,UUID,DCP_date_current_period,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
0,1,,2000-01-03,36528.00,58.69,59.31,56.00,58.28,36.72,53228400.0,56.13,58.56,53.66,58.56,56.13
1,2,,2000-01-04,36529.00,56.78,58.56,56.13,56.31,35.48,54119000.0,55.72,58.19,53.66,58.19,54.69
2,3,,2000-01-05,36530.00,55.56,58.19,54.69,56.91,35.85,64059600.0,55.48,57.13,52.22,56.94,54.19
3,4,,2000-01-06,36531.00,56.09,56.94,54.19,55.00,34.65,54976600.0,54.44,57.13,50.75,56.13,53.66
4,5,,2000-01-07,36532.00,54.31,56.13,53.66,55.72,35.10,62013600.0,54.44,57.13,50.75,56.84,55.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5695,5696,,2022-08-23,44796.00,276.44,278.86,275.40,276.44,276.44,17527400.0,273.87,280.34,260.99,277.23,275.11
5696,5697,,2022-08-24,44797.00,275.41,277.23,275.11,275.79,275.79,18137000.0,267.69,280.34,260.99,279.02,274.52
5697,5698,,2022-08-25,44798.00,277.33,279.02,274.52,278.85,278.85,16583400.0,266.67,280.34,260.99,280.34,267.98
5698,5699,,2022-08-26,44799.00,279.08,280.34,267.98,268.09,268.09,27532500.0,265.41,267.40,260.99,267.40,263.85


In [None]:
'''Building the dataframe with just the necessary columns (removing the 'ID', 'UUID' and 'DCP_date_current_period' columns)'''

#pay attention here because every time the targets change, the names of the columns change too
df = dohlcav_mpnxp_data.drop(["ID", "UUID", "DCP_date_current_period"], axis=1)
df

Unnamed: 0,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
0,36528.00,58.69,59.31,56.00,58.28,36.72,53228400.0,56.13,58.56,53.66,58.56,56.13
1,36529.00,56.78,58.56,56.13,56.31,35.48,54119000.0,55.72,58.19,53.66,58.19,54.69
2,36530.00,55.56,58.19,54.69,56.91,35.85,64059600.0,55.48,57.13,52.22,56.94,54.19
3,36531.00,56.09,56.94,54.19,55.00,34.65,54976600.0,54.44,57.13,50.75,56.13,53.66
4,36532.00,54.31,56.13,53.66,55.72,35.10,62013600.0,54.44,57.13,50.75,56.84,55.69
...,...,...,...,...,...,...,...,...,...,...,...,...
5695,44796.00,276.44,278.86,275.40,276.44,276.44,17527400.0,273.87,280.34,260.99,277.23,275.11
5696,44797.00,275.41,277.23,275.11,275.79,275.79,18137000.0,267.69,280.34,260.99,279.02,274.52
5697,44798.00,277.33,279.02,274.52,278.85,278.85,16583400.0,266.67,280.34,260.99,280.34,267.98
5698,44799.00,279.08,280.34,267.98,268.09,268.09,27532500.0,265.41,267.40,260.99,267.40,263.85


In [None]:
# '''Renaming columns to have a more compact and a better reading of the df'''

# #pay attention here because everytime targets change, also the name of the columns change
# df = df.rename(columns={"DNCP_day_number_current_period": "DNCP","OPCP_open_price_current_period":"OPCP", "HPCP_high_price_current_period":"HPCP",
#                         "LPCP_low_price_current_period":"LPCP", "CPCP_close_price_current_period":"CPCP", "ACPCP_adjusted_close_price_current_period": "ACPCP",
#                         "VTCP_volume_of_transactions_current_period":"VTCP", "MPN"+timestep+"P_median_price_next_"+timestep+"_period":"MPN"+timestep+"P",
#                         "HPN"+timestep+"P_highest_price_next_"+timestep+"_period":"HPN"+timestep+"P", "LPN"+timestep+"P_lowest_price_next_"+timestep+"_period":"LPN"+timestep+"P",
#                         'HPN1P_high_price_next_1_period':'hpn1P','LPN1P_low_price_next_1_period':'lpn1P'})

# #df.columns = ['DNCP', 'OPCP', 'HPCP', 'LPCP', 'CPCP', 'ACPCP', 'VTCP', 'MPN1P', 'HPN1P', 'LPN1P', 'HPN1P', 'LPN1P']

# df.head()

In [None]:
'''Renaming columns to have a more compact and a better reading of the df'''

# Pay attention here: the three "next x periods" column names embed the main target code,
# so whenever the targets change the names of these columns change as well.
df = df.rename(columns={
    "DNCP_day_number_current_period": "DNCP",
    "OPCP_open_price_current_period": "OPCP",
    "HPCP_high_price_current_period": "HPCP",
    "LPCP_low_price_current_period": "LPCP",
    "CPCP_close_price_current_period": "CPCP",
    "ACPCP_adjusted_close_price_current_period": "ACPCP",
    "VTCP_volume_of_transactions_current_period": "VTCP",
    f"MPN{model_case_version_main_target_code}P_median_price_next_{model_case_version_main_target_code}_periods": f"MPN{model_case_version_main_target_code}P",
    f"HPN{model_case_version_main_target_code}P_highest_price_next_{model_case_version_main_target_code}_periods": f"HPN{model_case_version_main_target_code}P",
    f"LPN{model_case_version_main_target_code}P_lowest_price_next_{model_case_version_main_target_code}_periods": f"LPN{model_case_version_main_target_code}P",
    'HPN1P_high_price_next_1_period': 'hpn1P',
    'LPN1P_low_price_next_1_period': 'lpn1P',
})

#df.columns = ['DNCP', 'OPCP', 'HPCP', 'LPCP', 'CPCP', 'ACPCP', 'VTCP', 'MPN1P', 'HPN1P', 'LPN1P', 'HPN1P', 'LPN1P']

df.head()

Unnamed: 0,DNCP,OPCP,HPCP,LPCP,CPCP,ACPCP,VTCP,MPN5P,HPN5P,LPN5P,hpn1P,lpn1P
0,36528.0,58.69,59.31,56.0,58.28,36.72,53228400.0,56.13,58.56,53.66,58.56,56.13
1,36529.0,56.78,58.56,56.13,56.31,35.48,54119000.0,55.72,58.19,53.66,58.19,54.69
2,36530.0,55.56,58.19,54.69,56.91,35.85,64059600.0,55.48,57.13,52.22,56.94,54.19
3,36531.0,56.09,56.94,54.19,55.0,34.65,54976600.0,54.44,57.13,50.75,56.13,53.66
4,36532.0,54.31,56.13,53.66,55.72,35.1,62013600.0,54.44,57.13,50.75,56.84,55.69


In [None]:
'''Let's check also whether there are missing values or not'''

print("Number of NaN values:")
print(df.isna().sum())

Number of NaN values:
DNCP     0
OPCP     0
HPCP     0
LPCP     0
CPCP     0
ACPCP    0
VTCP     0
MPN5P    0
HPN5P    0
LPN5P    0
hpn1P    0
lpn1P    0
dtype: int64


In [None]:
# Average price of each target column (used later for normalising/rescaling some analytical parameters).
# The list is created in an earlier cell, so it is emptied first: without this, running the cell
# again would append a second copy of every average.
avg_prices_list.clear()
for target in targets:
  avg_prices_list.append(df[target].astype(float).mean())

avg_prices_list

[66.20393333333332]

In [None]:
# Day numbers of the rows from index time_steps-1 onwards (the first time_steps-1 rows are skipped),
# with any ',' removed before the conversion to float.
dncp = dohlcav_mpnxp_data['DNCP_day_number_current_period'].replace(',','', regex=True).astype(float) [int(model_case_version_time_steps)-1:]
dncp = dncp.astype(int).to_numpy()
# Distance between the first and the last retained day number, counting both ends.
span_dncp=dncp[-1] - dncp[0] + 1
# 1-based position of every retained row, measured in day-number units from the first retained row
# (so gaps in the day numbers show up as gaps in the positions).
positions_day_number = dncp-dncp[0]+1
positions_day_number

array([   1,    5,    6, ..., 8260, 8261, 8264])

In [None]:
'''The padding point value is calculated for computing the value at the end of the trend line. We'll see better during the training and the application of vertical padding '''
padding_point = positions_day_number[-1]
padding_point

8264

# Preprocessing

In [None]:
'''Preprocessing consists, in this case, to transform the dataset through 3 filters: Kalman, E_logrithmic and RobustScaler (in this order)'''
df_array=np.zeros((df.shape), dtype=np.float64) #necessary to deal with filter results
print(df_array)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
'''Application of the Kalman filter (rounding data)'''
# The values of the Kalman filter parameters are taken from an example in the library's own code.
kalman_smoother = tsmoothie.KalmanSmoother(
    component='level_trend',
    component_noise={'level': 0.1, 'trend': 0.1},
)

# Smooth one column at a time and write the result into the matching column of df_array.
for i in range(df.shape[1]):
  kalman_smoother.smooth(df.iloc[:, i].astype(float))
  df_array[:, i] = kalman_smoother.smooth_data

In [None]:
print(pd.DataFrame(df_array)) #I use pd.DataFrame(array) to print because in this way it's neater

                0           1           2           3           4   \
0     35704.479103   56.564318   57.798188   54.759012   56.342212   
1     36057.347464   56.317438   57.739525   54.847002   56.224995   
2     36303.754354   56.009570   57.518248   54.711727   56.067093   
3     36454.939890   55.764494   57.151265   54.517023   55.769625   
4     36535.246617   55.492314   56.796090   54.324608   55.555046   
...            ...         ...         ...         ...         ...   
5695  44795.642462  280.294835  281.817455  277.695997  278.848469   
5696  44796.979474  277.858853  279.414185  274.867268  276.206265   
5697  44798.326971  275.773847  277.317710  271.889444  273.481873   
5698  44799.762746  273.307802  274.812177  268.404116  269.838533   
5699  44801.340723  269.863216  271.358056  264.860182  266.266406   

              5             6           7           8           9   \
0      35.498992  5.307338e+07   54.779940   56.875451   52.403562   
1      35.423595  5

In [None]:
'''Application of the E_logarithmic filter (detrending)'''
# NOTE(review): the cell overwrites its own input, so running it twice applies the logarithm twice.
df_array=np.log(df_array) #logarithm with base e

In [None]:
print(pd.DataFrame(df_array))

             0         1         2         3         4         5          6   \
0     10.483031  4.035378  4.056957  4.002942  4.031444  3.569504  17.787186   
1     10.492866  4.031004  4.055942  4.004548  4.029361  3.567378  17.814997   
2     10.499676  4.025523  4.052102  4.002078  4.026549  3.564514  17.839440   
3     10.503832  4.021137  4.045702  3.998513  4.021229  3.559160  17.841278   
4     10.506033  4.016245  4.039467  3.994977  4.017374  3.555274  17.847435   
...         ...       ...       ...       ...       ...       ...        ...   
5695  10.709866  5.635842  5.641260  5.626527  5.630669  5.630669  16.798512   
5696  10.709896  5.627113  5.632695  5.616288  5.621148  5.621148  16.803460   
5697  10.709926  5.619581  5.625164  5.605396  5.611235  5.611235  16.831328   
5698  10.709958  5.610599  5.616088  5.592494  5.597824  5.597824  16.888232   
5699  10.709993  5.597915  5.603439  5.579202  5.584497  5.584497  16.907986   

            7         8         9      

In [None]:
'''Application of the Robust scaler (scaling)'''
# One RobustScaler is fitted per column and all of them are kept in `robust_scalers` (same order as
# df.columns), so that any column can be converted back later on with
# robust_scalers[i].inverse_transform(...). Refitting a single scaler object for every column
# would leave only the statistics of the last column available.
robust_scalers = []

for i in range(len(df.columns)):
    robust_scaler = RobustScaler()
    col_scaled = robust_scaler.fit_transform(df_array[:,i].reshape(-1,1))
    df_array[:,i] = col_scaled[:, 0]
    robust_scalers.append(robust_scaler)
# After the loop, `robust_scaler` still refers to the scaler fitted on the last column.

In [None]:
print(pd.DataFrame(df_array)) #this is the final result

            0         1         2         3         4         5         6   \
0    -1.278462  0.707668  0.721822  0.679094  0.701303  0.393339  0.074161   
1    -1.181814  0.702497  0.720616  0.680988  0.698842  0.391421  0.110750   
2    -1.114884  0.696017  0.716056  0.678075  0.695517  0.388837  0.142908   
3    -1.074043  0.690832  0.708453  0.673869  0.689229  0.384008  0.145326   
4    -1.052418  0.685048  0.701048  0.669698  0.684672  0.380503  0.153427   
...        ...       ...       ...       ...       ...       ...       ...   
5695  0.950736  2.599739  2.603610  2.594338  2.591631  2.252516 -1.226572   
5696  0.951029  2.589419  2.593437  2.582260  2.580378  2.243928 -1.220062   
5697  0.951325  2.580515  2.584492  2.569411  2.568661  2.234987 -1.183399   
5698  0.951640  2.569896  2.573711  2.554191  2.552808  2.222890 -1.108535   
5699  0.951986  2.554902  2.558688  2.538512  2.537056  2.210869 -1.082545   

            7         8         9         10        11  
0     

# Splitting Data

In [None]:
'''Now, here we are going to use the training_size value we computed before at the beginning of the code to generate the training data and the test data'''
train_data = df_array[:training_size]
test_data = df_array[training_size: ]

In [None]:
print(train_data.shape[0],"days for training.")
print(test_data.shape[0],"days for testing.")

5700 days for training.
0 days for testing.


In [None]:
'''Creating the input blocks for the models. The timestep value must be changed according to the targets and dataset we are working with'''
X_train, y_train = building_data_sequences(train_data, timesteps=int(model_case_version_time_steps)) #see Functions section above;
X_test, y_test = building_data_sequences(test_data, timesteps=int(model_case_version_time_steps))

In [None]:
print(X_train.shape)
print(X_test.shape)

(5691, 10, 7)
(0,)


In [None]:
len(positions_day_number)

5691

# Configuration parameters

In [None]:
'''In this section, we're going to define some variables that will be useful during the training and testing of the model'''

input_shape=((X_train).shape[1],(X_train).shape[2])
print("Input shape obtained is:",input_shape)

Input shape obtained is: (10, 7)


In [None]:
'''We need actual values (true values) to compute some analytical parameters during the training'''

df_actual = df.iloc[int(model_case_version_time_steps)-1:,:].reset_index()
print(df_actual)

      index      DNCP    OPCP    HPCP    LPCP    CPCP   ACPCP        VTCP  \
0         9  36539.00   53.59   56.97   52.88   56.13   35.36  73416400.0   
1        10  36543.00   55.91   58.25   55.88   57.66   36.32  81483600.0   
2        11  36544.00   55.25   55.75   53.00   53.50   33.71  97568200.0   
3        12  36545.00   53.53   54.84   52.94   53.00   33.39  56349800.0   
4        13  36546.00   53.50   53.63   51.63   51.88   32.68  68416200.0   
...     ...       ...     ...     ...     ...     ...     ...         ...   
5686   5695  44796.00  276.44  278.86  275.40  276.44  276.44  17527400.0   
5687   5696  44797.00  275.41  277.23  275.11  275.79  275.79  18137000.0   
5688   5697  44798.00  277.33  279.02  274.52  278.85  278.85  16583400.0   
5689   5698  44799.00  279.08  280.34  267.98  268.09  268.09  27532500.0   
5690   5699  44802.00  265.85  267.40  263.85  265.23  265.23  20324600.0   

       MPN5P   HPN5P   LPN5P   hpn1P   lpn1P  
0      53.50   58.25   50.41

In [None]:
# Put each target column of df_actual, converted to a float NumPy array, into a list
# (same order as `targets`) so that it can be iterated during training.
actuals_cols = [np.array(df_actual[target].astype(float)) for target in targets]

actuals_cols

[array([ 53.5 ,  52.84,  51.75, ..., 266.67, 265.41, 264.27])]

In [None]:
best_models_path = model_path + 'best_model/' #this path is used just to make everything more orderly

In [None]:
print(best_models_path)

/content/drive/My Drive/SPP-MSFT-2000-2022/SPP-MSFT_20000103-20220729_MPN5P_LSTM15/DOHLCAV-0-S-KER-710760-64-100-0.0005-10-11011L0.75_GO/best_model/


# Training

In [None]:
def custom_loss_function(attenuated_padding_value):
  '''Build the custom loss used to apply a sort of padding during the training of the model.

  The returned loss is a normal mean squared error, except that the predictions of the
  model are first multiplied by `attenuated_padding_value`.
  '''

  def padding_loss_function(y_true, y_pred):
    # Scale the predictions by the attenuated padding value before comparing them with the true values.
    scaled_prediction = tf.multiply(y_pred, attenuated_padding_value)

    residual = y_true - scaled_prediction

    # Mean of the squared residuals over the last axis (mse).
    return tf.reduce_mean(tf.square(residual), axis=-1)

  return padding_loss_function

# Trend-line values of the current best run; the training loop clears and refills
# these lists every time a new best model is found.
intercepts_and_slopes = {
    series_name: [] for series_name in ('intercepts', 'slopes', 'end_intercepts')
}

In [None]:
# Weights row of the analytical parameters file: for every normalized/rescaled
# parameter it records the exponent used to weight that parameter inside the
# compound_run_term (computed in the training loop). Columns that carry no weight
# (raw parameters, compound terms, best_run, padding factors) are left empty.
#
# The row is built from a column-name -> weight mapping instead of a positional
# list: the positional version was one entry short after the end-intercept column
# was introduced (8 leading blanks for 9 raw columns, 22 values for 23 names), which
# shifted every weight one column to the left in the saved workbook.

# Column order of the analytical parameters file (same order as the per-run table).
weights_column_names = [
    'slope_predicted_calculated_target_ratio_versus_period',
    'intercept_predicted_calculated_target_ratio_versus_period',
    'end_intercept_predicted_calculated_target_ratio_versus_period',
    'rsqr_predicted_calculated_target_ratio_versus_period',
    'average_tld_predicted_calculated_target_ratio_versus_period',
    'slope_predicted_versus_calculated_target',
    'intercept_predicted_versus_calculated_target',
    'rsqr_predicted_versus_calculated_target',
    'average_tld_predicted_versus_calculated_target',
    'normalized_slope_predicted_calculated_target_ratio_versus_period',
    'normalized_intercept_predicted_calculated_target_ratio_versus_period',
    'normalized_end_intercept_predicted_calculated_target_ratio_versus_period',
    'normalized_rsqr_predicted_calculated_target_ratio_versus_period',
    'rescaled_normalized_average_tld_predicted_calculated_target_ratio_versus_period',
    'normalized_slope_predicted_versus_calculated_target',
    'rescaled_normalized_intercept_predicted_versus_calculated_target',
    'normalized_rsqr_predicted_versus_calculated_target',
    'rescaled_normalized_average_tld_predicted_versus_calculated_target',
    'compound_run_term',
    'best_run',
    'adjusted_compound_run_term',
    'padding_correction_factor',
    'padding_correction_factor_attenuation',
]

# Each exponent is attached to the parameter it is applied to in compound_run_term.
weights_by_column = {
    'normalized_slope_predicted_calculated_target_ratio_versus_period': slope_weighting_exponent_ratio,
    'normalized_intercept_predicted_calculated_target_ratio_versus_period': intercept_weighting_exponent_ratio,
    'normalized_end_intercept_predicted_calculated_target_ratio_versus_period': end_intercept_weighting_exponent_ratio,
    'normalized_rsqr_predicted_calculated_target_ratio_versus_period': rsqr_weighting_exponent_ratio,
    'rescaled_normalized_average_tld_predicted_calculated_target_ratio_versus_period': dispersion_weighting_exponent_ratio,
    'normalized_slope_predicted_versus_calculated_target': slope_weighting_exponent_predicted_actual,
    'rescaled_normalized_intercept_predicted_versus_calculated_target': intercept_weighting_exponent_predicted_actual,
    'normalized_rsqr_predicted_versus_calculated_target': rsqr_weighting_exponent_predicted_actual,
    'rescaled_normalized_average_tld_predicted_versus_calculated_target': dispersion_weighting_exponent_predicted_actual,
}

# Single row (hence the transpose) labelled 'weights'; unweighted columns get None.
weights_df = pd.DataFrame([weights_by_column.get(column_name) for column_name in weights_column_names]).T
weights_df = weights_df.rename({0: 'weights'})
weights_df = weights_df.rename(columns=dict(enumerate(weights_column_names)))

In [None]:
# Analytical parameters under their new (renamed) labels. Every label starts with an
# empty list; the training loop appends one value per label whenever a new best run
# is found.
analytical_parameters_updated = {
    parameter_name: []
    for parameter_name in (
        'normalized_trend_slope',
        'normalized_trend_intercept',
        'normalized_trend_end_intercept',
        'normalized_trend_correlation',
        'rescaled_normalized_trend_dispersion',
        'trend_slope_weighting_exponent',
        'trend_intercept_weighting_exponent',
        'trend_end_intercept_weighting_exponent',
        'trend_correlation_weighting_exponent',
        'trend_dispersion_weighting_exponent',
    )
}

In [None]:
# Core of the notebook: the model is defined and trained with the 'Multiple Run' approach.
# For every target the same network (same layers, same hyper-parameters) is rebuilt and
# retrained from scratch, run after run. After each run the compound_run_term is computed
# to assess the quality of that run; the run whose term is closest to 1 is kept as the
# best model and saved to disk. The loop stops when the term is within `precision` of 1
# or when `max_iterations` runs have been done. This approach was born because of an
# instability problem that affects the model.

after_training_predictions = []

# intercepts_and_slopes is created in the cell that defines custom_loss_function.

for target in range(len(targets)): #for each target (`target` is the position inside `targets`)

  target_name = targets[target]

  # Output files of this target. The names are assembled once here; they must stay in
  # sync with the names used by the Predictions section, which reads these files back.
  output_file_prefix = best_models_path+target_name+'/proceedit '+today+' '
  case_file_stem = case+'_200001-202207_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_GO - '
  best_model_file = output_file_prefix+'SPP-'+case_file_stem+'Best model for '+target_name+'.h5'
  analytical_parameters_file = output_file_prefix+case_file_stem+'Analytical parameters.xlsx'
  updated_analytical_parameters_file = output_file_prefix+case_file_stem+'Updated Names Analytical parameters.xlsx'

  # Analytical parameters file initialization. Analytical parameters are values referred to
  # two graphs (ratio between predicted and actual versus period, and predicted versus actual)
  # that are used to assess the quality of the predictions. The normalized/rescaled versions
  # are stored too because they enter the compound_run_term, together with bookkeeping such
  # as the best iteration so far (best_run) and the padding values. One entry per run.
  analytical_parameters = {
    # trend line of the predicted/actual ratio versus period
    'slope_predicted_calculated_target_ratio_versus_period':[],
    'intercept_predicted_calculated_target_ratio_versus_period':[],
    'end_intercept_predicted_calculated_target_ratio_versus_period':[], #trend line value at padding_point
    'rsqr_predicted_calculated_target_ratio_versus_period':[],
    'average_tld_predicted_calculated_target_ratio_versus_period':[], #average trend line distance

    # trend line with the actual values as x and the predicted values as y
    'slope_predicted_versus_calculated_target':[],
    'intercept_predicted_versus_calculated_target':[],
    'rsqr_predicted_versus_calculated_target':[],
    'average_tld_predicted_versus_calculated_target':[], #average trend line distance

    # normalized/rescaled values of the ratio graph
    'normalized_slope_predicted_calculated_target_ratio_versus_period': [],
    'normalized_intercept_predicted_calculated_target_ratio_versus_period':[],
    'normalized_end_intercept_predicted_calculated_target_ratio_versus_period':[],
    'normalized_rsqr_predicted_calculated_target_ratio_versus_period': [],
    'rescaled_normalized_average_tld_predicted_calculated_target_ratio_versus_period': [],

    # normalized/rescaled values of the predicted versus actual graph
    'normalized_slope_predicted_versus_calculated_target': [],
    'rescaled_normalized_intercept_predicted_versus_calculated_target': [],
    'normalized_rsqr_predicted_versus_calculated_target': [],
    'rescaled_normalized_average_tld_predicted_versus_calculated_target': [],

    # run quality and padding bookkeeping
    'compound_run_term': [],
    'best_run': [],
    'adjusted_compound_run_term':[],
    'padding_correction_factor':[],
    'padding_correction_factor_attenuation':[],
    'absolute_difference_normalized_intercept': []
    }

  # Initialization of the per-target state
  y_target = y_train[target] #the current target we're considering
  diff = 1e3 #distance from 1 of the best adjusted compound run term found so far
  best_run = -1 #iteration number of the best run so far (-1 = none yet)
  adjusted_compound_run_term = 1e3 #very big value, just to enter the while loop at least once
  iteration = 0 #counter for iterations
  attenuated_padding_value = 1 #initial value for the attenuated padding value (1 = no padding)

  # Values of the best run so far. They start as None so that the progress prints below
  # cannot fail when no run has qualified yet (e.g. the first compound run term is NaN).
  best_adjusted_compound_run_term = None
  best_model_intercept_first = None
  best_model_slope_first = None
  best_model_end_intercept_first = None

  # Keep training while the precision has not been reached and runs are left. The test is
  # written as "not (distance < precision)" so that a NaN compound run term counts as
  # "not reached" instead of silently ending the search.
  while not (abs(adjusted_compound_run_term - 1.0) < precision) and iteration < max_iterations:

    print('Iteration', iteration, 'for target',target_name)
    print('Attenuated padding value', attenuated_padding_value)

    # The layers of the model (see case_version_cat Tab); a fresh model is built on every run
    model = tf.keras.Sequential()
    model.add(LSTM(2**twoexp_nodes_number_layer_1,input_shape=input_shape,return_sequences=True))
    model.add(LSTM(2**twoexp_nodes_number_layer_2, return_sequences=True))
    model.add(LSTM(2**twoexp_nodes_number_layer_3))
    model.add(Dense(2**twoexp_nodes_number_layer_4))
    model.add(Dense(2**twoexp_nodes_number_layer_5))

    model.compile(optimizer = optimizers.Adam(learning_rate = lr), loss = custom_loss_function(attenuated_padding_value))

    # Training of the model
    model.fit(X_train, y_target, batch_size=batch, epochs=n_epochs, verbose=0)

    # Computation of the predictions on the training data
    train_predictions = model.predict(X_train)
    train_predictions = np.exp(robust_scaler.inverse_transform(train_predictions)) #invert first the Robust scaler transformation and then the e_logarithmic one
    train_array = np.concatenate(train_predictions).ravel() #flatten the predictions into a single 1-D array

    multiple_run_predictions = train_array
    actual = actuals_cols[target][:len(multiple_run_predictions)] #actual values truncated to the number of predictions

    # Computation of the analytical parameter values
    predicted_actual_ratio = (multiple_run_predictions/actual) #ratio between predicted values and actual values

    slope_first, intercept_first, r_squared_first, avg_tld_first = sir_parameters(positions_day_number, predicted_actual_ratio) #ratio graph (see Functions section above for more details)
    slope_second, intercept_second, r_squared_second, avg_tld_second = sir_parameters(actual, multiple_run_predictions) #predicted versus actual graph (see Functions section above for more details)
    end_intercept_first = slope_first * padding_point + intercept_first #value of the ratio trend line at the padding point

    if end_intercept_first < 0: #the end intercept shouldn't be negative: replace it with a small positive value
      end_intercept_first = 0.1

    print(' Predicted actual ratio', predicted_actual_ratio)
    print(' Intercept', intercept_first)
    print(' Slope', slope_first)
    print(' End Intercept', end_intercept_first)
    print(' Postion day number',padding_point)

    # Just adding each value to the correspondent list
    analytical_parameters['slope_predicted_calculated_target_ratio_versus_period'].append(slope_first)
    analytical_parameters['intercept_predicted_calculated_target_ratio_versus_period'].append(intercept_first)
    analytical_parameters['end_intercept_predicted_calculated_target_ratio_versus_period'].append(end_intercept_first)
    analytical_parameters['rsqr_predicted_calculated_target_ratio_versus_period'].append(r_squared_first)
    analytical_parameters['average_tld_predicted_calculated_target_ratio_versus_period'].append(avg_tld_first)
    analytical_parameters['slope_predicted_versus_calculated_target'].append(slope_second)
    analytical_parameters['intercept_predicted_versus_calculated_target'].append(intercept_second)
    analytical_parameters['rsqr_predicted_versus_calculated_target'].append(r_squared_second)
    analytical_parameters['average_tld_predicted_versus_calculated_target'].append(avg_tld_second)

    # Normalization/rescaling of the analytical parameters around the reference value 1:
    # 1 is added to some values, others are first divided by the average price of the target.
    # Intercept and end intercept of the ratio graph are used as they are.
    norm_slope_ratio = slope_first+1
    norm_intercept_ratio = intercept_first
    norm_end_intercept_ratio = end_intercept_first
    norm_rsqr_ratio = r_squared_first+1
    resc_norm_avg_tld_ratio = (avg_tld_first/avg_prices_list[target])+1 #avg_prices_list is computed at the beginning of the code
    resc_norm_inter_predact = (intercept_second/avg_prices_list[target])+1
    resc_norm_avg_tld_predact = (avg_tld_second/avg_prices_list[target])+1

    analytical_parameters['normalized_slope_predicted_calculated_target_ratio_versus_period'].append(norm_slope_ratio)
    analytical_parameters['normalized_intercept_predicted_calculated_target_ratio_versus_period'].append(norm_intercept_ratio)
    analytical_parameters['normalized_end_intercept_predicted_calculated_target_ratio_versus_period'].append(norm_end_intercept_ratio)
    analytical_parameters['normalized_rsqr_predicted_calculated_target_ratio_versus_period'].append(norm_rsqr_ratio)
    analytical_parameters['rescaled_normalized_average_tld_predicted_calculated_target_ratio_versus_period'].append(resc_norm_avg_tld_ratio)
    analytical_parameters['normalized_slope_predicted_versus_calculated_target'].append(slope_second)
    analytical_parameters['rescaled_normalized_intercept_predicted_versus_calculated_target'].append(resc_norm_inter_predact)
    analytical_parameters['normalized_rsqr_predicted_versus_calculated_target'].append(r_squared_second)
    analytical_parameters['rescaled_normalized_average_tld_predicted_versus_calculated_target'].append(resc_norm_avg_tld_predact)

    # Computation of the compound_run_term: the product of all the normalized/rescaled
    # analytical parameters, each one weighted with its own exponent. The best model is
    # the one whose compound_run_term is the closest to the reference value (that is 1).
    compound_run_term = (
        (norm_slope_ratio)**slope_weighting_exponent_ratio
        * (norm_intercept_ratio)**intercept_weighting_exponent_ratio
        * (norm_end_intercept_ratio)**end_intercept_weighting_exponent_ratio
        * (norm_rsqr_ratio)**rsqr_weighting_exponent_ratio
        * (resc_norm_avg_tld_ratio)**dispersion_weighting_exponent_ratio
        * (slope_second)**(slope_weighting_exponent_predicted_actual)
        * (resc_norm_inter_predact)**(intercept_weighting_exponent_predicted_actual)
        * (r_squared_second)**(rsqr_weighting_exponent_predicted_actual)
        * (resc_norm_avg_tld_predact)**(dispersion_weighting_exponent_predicted_actual)
    )

    analytical_parameters['compound_run_term'].append(compound_run_term)

    # The adjusted_compound_run_term is the version of the compound_run_term that can be used
    # for comparing compound run terms of different cases. Remember to change it according to
    # the weights you're working with. The exponent-normalized variant is currently disabled:
    #adjusted_compound_run_term = (compound_run_term)**(8/(slope_weighting_exponent_ratio+intercept_weighting_exponent_ratio+rsqr_weighting_exponent_ratio+dispersion_weighting_exponent_ratio+slope_weighting_exponent_predicted_actual+intercept_weighting_exponent_predicted_actual+rsqr_weighting_exponent_predicted_actual+dispersion_weighting_exponent_predicted_actual))
    adjusted_compound_run_term = (compound_run_term)
    print(' Adjusted Compound run term:', adjusted_compound_run_term)

    analytical_parameters['adjusted_compound_run_term'].append(adjusted_compound_run_term)

    # Here the best model is determined: if the term of the current run is nearer to 1 than
    # the term of the best run so far, the current run becomes the new best model.
    if abs(adjusted_compound_run_term - 1) < diff:

      analytical_parameters['best_run'].append(iteration) #store the best run so far
      diff = abs(adjusted_compound_run_term - 1) #store the new difference (for successive comparisons)
      best_run = iteration #so that the else branch of later runs records this run
      best_adjusted_compound_run_term = adjusted_compound_run_term
      model.save(best_model_file) #save the best model (overwrites the previous best of this target)

      # NOTE(review): this list is created before the target loop but replaced here, so with
      # several targets only the predictions of the most recent best run survive -- confirm
      # that this is what the cells after training expect.
      after_training_predictions = []
      after_training_predictions.append(multiple_run_predictions)

      intercepts_and_slopes['intercepts'].clear()
      intercepts_and_slopes['intercepts'].append(intercept_first)
      intercepts_and_slopes['slopes'].clear()
      intercepts_and_slopes['slopes'].append(slope_first)
      intercepts_and_slopes['end_intercepts'].clear()
      intercepts_and_slopes['end_intercepts'].append(end_intercept_first)
      best_model_intercept_first = intercept_first #intercept of the best model
      best_model_slope_first = slope_first #slope of the best model
      best_model_end_intercept_first = end_intercept_first #end intercept of the best model
      print(' Predictions during training:', multiple_run_predictions) #predictions of the run that has just become the best one

      # NOTE(review): the '*_weighting_exponent' labels below receive the values of the
      # predicted versus actual graph (and an empty string for the end intercept), not the
      # weighting exponents themselves -- confirm that this mapping is intended. The dict
      # also lives outside the target loop, so its rows accumulate across targets.
      analytical_parameters_updated['normalized_trend_slope'].append(norm_slope_ratio)
      analytical_parameters_updated['normalized_trend_intercept'].append(norm_intercept_ratio)
      analytical_parameters_updated['normalized_trend_end_intercept'].append(norm_end_intercept_ratio)
      analytical_parameters_updated['normalized_trend_correlation'].append(norm_rsqr_ratio)
      analytical_parameters_updated['rescaled_normalized_trend_dispersion'].append(resc_norm_avg_tld_ratio)
      analytical_parameters_updated['trend_slope_weighting_exponent'].append(slope_second)
      analytical_parameters_updated['trend_intercept_weighting_exponent'].append(resc_norm_inter_predact)
      analytical_parameters_updated['trend_end_intercept_weighting_exponent'].append("")
      analytical_parameters_updated['trend_correlation_weighting_exponent'].append(r_squared_second)
      analytical_parameters_updated['trend_dispersion_weighting_exponent'].append(resc_norm_avg_tld_predact)

    else: #otherwise, we simply keep track of the best run so far

      analytical_parameters['best_run'].append(best_run)

    print(' Best model so far: ', best_run)
    print(' Best Model Adjusted compound run term:', best_adjusted_compound_run_term) #term of the best run, not of the current one
    print(' Best Model Intercept:', best_model_intercept_first)
    print(' Best model Slope:', best_model_slope_first)
    print(' Best model end Intercept:', best_model_end_intercept_first)
    #print(' Predictions during training:', multiple_run_predictions)

    # Computation of the attenuated_padding_value used by the custom loss of the NEXT run.
    # According to the position of this block there are two approaches:
    #   - inside the if branch above (best model): B approach, the value is computed from the
    #     best model obtained so far;
    #   - outside the if branch (as it is now): L approach, the value is computed from the
    #     last model obtained so far.
    # To switch approach, move the assignments below to the desired position.
    intercept = intercept_first
    slope = slope_first
    padding_ratio_value = slope * padding_point + intercept #ratio trend line at the padding point (not clamped, unlike end_intercept_first)
    padding_value = 1/padding_ratio_value
    attenuated_padding_value = ((padding_value-1)*attenuation_factor)+1 #only a fraction (attenuation_factor) of the correction is applied
    print(' padding_ratio_value',padding_ratio_value)
    print(' padding_value',padding_value)
    print(' Attenuated padding value at end',attenuated_padding_value)

    # The attenuated padding value is only accepted inside [0.8, 1.2]; anything else falls
    # back to 1 (no padding). The range test also rejects NaN, which would otherwise be
    # multiplied into the loss of the next run.
    if not (0.8 <= attenuated_padding_value <= 1.2):
      attenuated_padding_value = 1

    print(' Attenuated padding value at end after the limit',attenuated_padding_value)
    print()

    # Appending the values referred to the padding
    analytical_parameters['padding_correction_factor'].append(padding_value)
    analytical_parameters['padding_correction_factor_attenuation'].append(attenuated_padding_value)

    analytical_parameters['absolute_difference_normalized_intercept'].append(abs(intercept_first-1)) #this value is used for certain graphs (possibly no longer relevant for the analysis)

    # The analytical parameters file is stored inside the loop (and not only at the end) so
    # that the progress can be monitored run by run. The file is overwritten every time.
    analytical_params_df = pd.DataFrame(analytical_parameters)
    analytical_params_df.to_excel(analytical_parameters_file)

    iteration += 1

  # Disabled: exponential fit of absolute_difference_normalized_intercept, used for certain
  # graphs (possibly no longer relevant for the analysis).
  # fit = np.polyfit(np.arange(0, analytical_params_df['absolute_difference_normalized_intercept'].shape[0],1), np.log(analytical_params_df['absolute_difference_normalized_intercept']),1)
  # alpha = fit[0]
  # beta = fit[1]
  # exponent_points = np.exp(beta + alpha*np.arange(0, analytical_params_df['absolute_difference_normalized_intercept'].shape[0],1))
  # analytical_params_df['exponent_points'] = exponent_points

  # Trend line of the LAST run (multiple_run_predictions holds the predictions of the last
  # run, which is not necessarily the best one); the best-model values are printed below.
  last_model_predicted_actual_ratio = (multiple_run_predictions/actual)
  #best_model_predicted_actual_ratio = upload_predictions / actual
  last_model_slope_first, last_model_intercept_first, last_model_r_squared_first, last_model_avg_tld_first = sir_parameters(positions_day_number, last_model_predicted_actual_ratio)
  #print('Last Model Intercept:', last_model_intercept_first)
  #print('Last model Slope:', last_model_slope_first)
  print('Best Model Intercept:', best_model_intercept_first)
  print('Best model Slope:', best_model_slope_first)
  print('Best model End Intercept:', best_model_end_intercept_first)
  print('The Predictions:', multiple_run_predictions)

  # Saving the final analytical parameters file for this target: the weights row goes
  # first, followed by one row per run.
  analytical_params_df = pd.concat([weights_df, analytical_params_df])
  analytical_params_df.to_excel(analytical_parameters_file)

  analytical_parameters_updated_df = pd.DataFrame(analytical_parameters_updated)
  analytical_parameters_updated_df.to_excel(updated_analytical_parameters_file)

Iteration 0 for target MPN5P
Attenuated padding value 1
 Predicted actual ratio [1.02425513 1.0076826  0.98155323 ... 1.04065331 1.02522976 1.01883258]
 Intercept 0.9881889810001112
 Slope 2.0604009279748124e-06
 End Intercept 1.005216134268895
 Postion day number 8264
 Adjusted Compound run term: 1.0836905436140996
 Predictions during training: [ 54.79765  53.24595  50.79538 ... 277.51102 272.10623 269.2469 ]
 Best model so far:  0
 Best Model Adjusted compound run term: 1.0836905436140996
 Best Model Intercept: 0.9881889810001112
 Best model Slope: 2.0604009279748124e-06
 Best model end Intercept: 1.005216134268895
 padding_ratio_value 1.005216134268895
 padding_value 0.9948109326033762
 Attenuated padding value at end 0.9961081994525322
 Attenuated padding value at end after the limit 0.9961081994525322

Iteration 1 for target MPN5P
Attenuated padding value 0.9961081994525322
 Predicted actual ratio [0.98664122 0.98052353 0.96082656 ... 0.94799028 0.93563795 0.93305697]
 Intercept 1

# Predictions

In [None]:
'''This part computes the predictions of the best model and applies first a vertical padding and then a swing padding to the results, in order to obtain more precise outcomes.'''
max_it = 5 #maximum number of iterations, so that we cannot get stuck in the while loops of the paddings
after_upload_predictions = [] #here, we store the predictions of the best model without any padding (one entry per target)
after_vertical_predictions = [] #predictions after the vertical padding (one entry per target)
after_swing_predictions = [] #predictions after the swing padding (one entry per target)

#part of the file names shared by every file of this run; it does not depend on the target, so it is built only once
file_tag = ('_200001-202207_MPN'+model_case_version_main_target_code+
            'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+
            'L'+str(attenuation_factor)+'_GO - ')


def _plot_ratio_trend(ratio_values, title):
    """Plot a predicted/actual ratio series (dotted) together with its linear trend line.

    ratio_values: one-dimensional sequence of ratios, one value per period.
    title: text shown above the figure.

    The trend line is a degree-1 np.polyfit over the positions 0 .. len(ratio_values)-1.
    """
    #np.arange gives exactly one abscissa per sample; np.linspace(0, n, n, dtype=np.int32) truncates to
    #[0, 1, ..., n-2, n], i.e. the last point lands one position too far
    positions = np.arange(len(ratio_values))
    trend = np.poly1d(np.polyfit(positions, ratio_values, 1))

    fig, ax = plt.subplots(figsize=(10,10))
    ax.plot(ratio_values, linestyle = 'dotted')
    ax.plot(positions, trend(positions), linewidth = 2.5)
    ax.set(title=title, xlabel='Position', ylabel='Predicted / actual ratio')
    plt.show()
    plt.close(fig) #one figure per stage and per target: release it explicitly


for idx, target in enumerate(targets): #for each target

    #retrieve the analytical parameters file for the correspondent target
    a_params_df = pd.read_excel(best_models_path+target+'/proceedit '+today+' '+case+file_tag+'Analytical parameters.xlsx')
    path = best_models_path+target+'/proceedit '+today+' SPP-'+case+file_tag+'Best model for '+target+'.h5' #path to the best model for the correspondent target

    number = int(a_params_df['best_run'].iloc[-1]) #this is the number of the best model
    print('Best model for', target,': ',number)
    print()

    if number == 0:
        padding_correction_factor_attenuation = 1
    else:
        #attenuated padding value of the best model (needed to rebuild the custom loss when loading the model)
        #NOTE(review): the rows are addressed with number+1, presumably because of a leading non-run row in the file - confirm
        padding_correction_factor_attenuation = a_params_df['padding_correction_factor_attenuation'].loc[number+1]

    print('Padding_correction_factor_attenuation',padding_correction_factor_attenuation)
    print()

    best_model = load_model(path, custom_objects={'padding_loss_function': custom_loss_function(padding_correction_factor_attenuation)}) #load the best model
    original_intercept_ratio = a_params_df['intercept_predicted_calculated_target_ratio_versus_period'].loc[number+1] #retrieve the intercept value (of ratio graph)
    original_slope_ratio = a_params_df['slope_predicted_calculated_target_ratio_versus_period'].loc[number+1] #retrieve the slope value (of ratio graph)
    original_end_intercept_ratio = a_params_df['end_intercept_predicted_calculated_target_ratio_versus_period'].loc[number+1] #retrieve the end_intercept value (of ratio graph)

    intercepts_and_slopes['intercepts'].append(original_intercept_ratio)
    intercepts_and_slopes['slopes'].append(original_slope_ratio)
    intercepts_and_slopes['end_intercepts'].append(original_end_intercept_ratio)

    print('Best model intercept_ratio:', original_intercept_ratio)
    print('Best model slope_ratio:', original_slope_ratio)
    print('Best model end intercept_ratio:', original_end_intercept_ratio)
    print()

    '''Computation of the predictions'''
    train_predictions = best_model.predict(X_train) #predictions for training data
    train_predictions = np.exp(robust_scaler.inverse_transform(train_predictions)) #convert prediction first by inverting the Robust scaler transformation and then the e_logarithmic one.
    train_array = np.concatenate(train_predictions).ravel() #flatten the predictions into a single one-dimensional array

    upload_predictions = train_array
    after_upload_predictions.append(upload_predictions)

    #actual values of THIS target, cut to the length of the predictions. They must be set before the first ratio
    #below, otherwise that ratio is computed against the values left over from a previous cell or a previous target.
    actual = actuals_cols[idx][:len(upload_predictions)]

    '''Computation of slope and intercept for the best model predictions'''
    after_best_model_predicted_actual_ratio = upload_predictions / actual
    after_best_model_slope_first, after_best_model_intercept_first, after_best_model_r_squared_first, after_best_model_avg_tld_first = sir_parameters(positions_day_number, after_best_model_predicted_actual_ratio)
    print('After Intercept first:', after_best_model_intercept_first)  #calculate the intercept after we defined the best model at the end of the multi-run
    print('After Slope first:', after_best_model_slope_first)    #calculate the slope after we defined the best model at the end of the multi-run


    '''Vertical padding'''
    vertical_predictions = upload_predictions #initialization for the predictions with padding

    vertical_it = 0
    intercept_at_the_end = 1e3 #far away from 1, so that the while loop is entered at least once

    intercept_ratio = -1
    slope_ratio = -1
    end_intercept_ratio = 1

    ratio_array = (vertical_predictions/actual)
    print(upload_predictions)

    _plot_ratio_trend(ratio_array, 'Predicted/actual ratio for '+target+' - after upload')

    vertical_ratio_array = (vertical_predictions/actual)

    '''The idea here is to start from a certain intercept of the trend line (which is referred to the ratio between predicted and actual values) and iterate until we obtain an intercept that is close
    (with a certain precision) to our reference value (that is 1)'''
    while abs(intercept_at_the_end -1) >= precision and vertical_it < max_it: #as long as the end intercept is not close enough to 1

        vertical_it +=1
        print('Iteration:', vertical_it)

        slope_ratio, intercept_ratio, _, _ = sir_parameters(positions_day_number, vertical_ratio_array) #compute the intercept and slope
        end_intercept_ratio = slope_ratio * padding_point + intercept_ratio #value of the trend line at the padding point
        print('Vertical Intercept at the beginning:', intercept_ratio)
        print('Vertical Slope at the beginning:', slope_ratio)

        intercept_at_the_end = end_intercept_ratio #same quantity, kept under both names because both are printed/stored
        print(' Vertical Intercept at the end:', intercept_at_the_end)
        print(' end_intercept_ratio:', end_intercept_ratio)
        print()

        #rescale the whole ratio series so that the trend line goes through 1 at the padding point
        vertical_padding_value = 1/intercept_at_the_end
        vertical_ratio_array = vertical_ratio_array * vertical_padding_value


    vertical_predictions = vertical_ratio_array * actual #this is the formula for vertical padding

    after_vertical_predictions.append(vertical_predictions)

    #these are the values fitted at the start of the last iteration, i.e. before its padding was applied
    intercepts_and_slopes['intercepts'].append(intercept_ratio)
    intercepts_and_slopes['slopes'].append(slope_ratio)
    intercepts_and_slopes['end_intercepts'].append(end_intercept_ratio)

    _plot_ratio_trend(vertical_ratio_array, 'Predicted/actual ratio for '+target+' - after vertical padding')

    '''Swing padding'''

    swing_predictions = vertical_predictions

    swing_intercept_at_the_beginning = 1e3 #far away from 1, so that the while loop is entered at least once
    swing_slope_at_the_beginning = -1
    swing_it = 0
    swing_end_intercept = 1
    swing_ratio_array = (swing_predictions/actual) #computing the ratio

    while abs(swing_intercept_at_the_beginning - 1) >= precision and swing_it < max_it: #as long as the start intercept is not close enough to 1

        swing_it +=1
        print('Iteration:', swing_it)

        swing_slope_at_the_beginning, swing_intercept_at_the_beginning, _, _ = sir_parameters(positions_day_number, swing_ratio_array)

        print('Swing Intercept at the beginning:', swing_intercept_at_the_beginning)
        print('Swing Slope at the beginning:', swing_slope_at_the_beginning)
        swing_intercept_at_the_end = swing_slope_at_the_beginning * padding_point + swing_intercept_at_the_beginning
        print(' Swing Intercept at the end:', swing_intercept_at_the_end)
        swing_end_intercept = swing_intercept_at_the_end #same quantity, stored in intercepts_and_slopes after the loop

        #position-dependent correction: equals 1 where positions_day_number == span_dncp and
        #1/swing_intercept_at_the_beginning where positions_day_number == 1
        swing_factors = 1/(1+(swing_intercept_at_the_beginning - 1)*(span_dncp - positions_day_number)/(span_dncp - 1))
        print('Swing factors', swing_factors)
        print()
        swing_ratio_array = swing_ratio_array * swing_factors


    swing_predictions = swing_ratio_array * actual
    after_swing_predictions.append(swing_predictions)
    print('After swing predictions', after_swing_predictions)

    print()
    intercepts_and_slopes['intercepts'].append(swing_intercept_at_the_beginning)
    intercepts_and_slopes['slopes'].append(swing_slope_at_the_beginning)
    intercepts_and_slopes['end_intercepts'].append(swing_end_intercept)

    _plot_ratio_trend(swing_ratio_array, 'Predicted/actual ratio for '+target+' - after swing padding')

'''Creating the dataframe for the predictions'''
def _stage_column_names(stage):
    """Return the mapping from the positional columns 0-4 to the prediction column names of one stage.

    stage: prefix of the column names, e.g. 'after_upload'.
    """
    main_code = model_case_version_main_target_code
    return {
        0: stage+'_MPN'+main_code+'P_prediction',
        1: stage+'_HPN'+main_code+'P_prediction',
        2: stage+'_LPN'+main_code+'P_prediction',
        3: stage+'_HPN1P_prediction',
        4: stage+'_LPN1P_prediction',
    }


#one frame per stage: each list entry becomes a column after the transpose and is then given its name
after_training_predictions_df = pd.DataFrame(after_training_predictions).T.rename(columns=_stage_column_names('after_training'))

after_upload_predictions_df = pd.DataFrame(after_upload_predictions).T.rename(columns=_stage_column_names('after_upload'))

after_vertical_predictions_df = pd.DataFrame(after_vertical_predictions).T.rename(columns=_stage_column_names('after_vertical'))

after_swing_predictions_df = pd.DataFrame(after_swing_predictions).T.rename(columns=_stage_column_names('after_swing'))

#after the transpose the rows are the keys of intercepts_and_slopes; columns 0-3 are named after the four stages
intercepts_and_slopes_df = pd.DataFrame(intercepts_and_slopes).T.rename(columns={
    position: stage+'_MPN'+model_case_version_main_target_code+'P_prediction'
    for position, stage in enumerate(['after_training', 'after_upload', 'after_vertical', 'after_swing'])
})


In [None]:
#put the day numbers and the three prediction stages side by side
prediction_columns = [
    dohlcav_mpnxp_data['DNCP_day_number_current_period'],
    after_upload_predictions_df,
    after_vertical_predictions_df,
    after_swing_predictions_df,
]
predictions_df = pd.concat(prediction_columns, axis=1)

#stack the intercept/slope rows on top of the prediction rows, then add the dates as first column
predictions_df = pd.concat([intercepts_and_slopes_df, predictions_df])
predictions_df.insert(0, 'DCP_date_current_period', dates)

#write the result to the Excel file of this run
predictions_file = (model_path+'/proceedit '+today+' SPP-'+case+'_200001-202207_MPN'+model_case_version_main_target_code+
                    'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+
                    'L'+str(attenuation_factor)+'_GO - Predictions.xlsx')
predictions_df.to_excel(predictions_file)