-
Notifications
You must be signed in to change notification settings - Fork 68
/
Data_multidimension.py
155 lines (129 loc) · 9.75 KB
/
Data_multidimension.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import _KEYS_DICT
from LogRoot.Logging import Logger
from Model_TF_definitions import ModelDefinition
from Utils.UtilsL import bcolors
from _KEYS_DICT import Y_TARGET
from Utils import Utils_model_predict
class Data_multidimension:
    """Load one stock's training CSV and prepare 3D train/val/test arrays for TF models.

    On construction the CSV is loaded, windowed with ``BACHT_SIZE_LOOKBACK``
    look-back rows, scaled, split into train/validation/test, the training
    part is re-balanced with SMOTE + Tomek links, and everything is reshaped
    to ``(-1, BACHT_SIZE_LOOKBACK, n_columns - 1)``. The results are stored
    on the instance and exposed through :meth:`get_all_data`.
    """

    # Number of look-back rows packed into every window (second axis of the 3D features).
    BACHT_SIZE_LOOKBACK = _KEYS_DICT.BACHT_SIZE_LOOKBACK
    # Year range accepted by the date sanity check on the first feature column.
    # DATE_YEAR_MAX is a floor for the upper bound: the current year is also
    # accepted so that data newer than this constant is not rejected.
    DATE_YEAR_MIN = 2018
    DATE_YEAR_MAX = 2024
    # Passed as ``shuffle=`` to both train_test_split calls.
    will_shuffle = True
    path_csv = None
    op_buy_sell = None
    columns_selection = None
    name_models_stock = None
    # Raw (unscaled, unbalanced) Y_TARGET column of the loaded DataFrame.
    array_aux_np = None
    train_labels = None
    val_labels = None
    test_labels = None
    train_features = None
    val_features = None
    test_features = None
    # Boolean mask, True where the (SMOTE-balanced) training label is non-zero.
    bool_train_labels = None
    cols_df = None
    imput_shape = None
    dict_Model_Definition = None

    def __init__(self, columns_selection_a: list, op_buy_sell_a: _KEYS_DICT.Op_buy_sell, path_csv_a, name_models_stock):
        """Store the configuration, load and split the data, and build the model dictionary.

        Args:
            columns_selection_a: Column selection forwarded to the CSV loader.
            op_buy_sell_a: Operation type forwarded to the CSV loader.
            path_csv_a: Path of the input CSV; it must contain ``"_PLAIN_"``.
            name_models_stock: Name used to build the ``.scal`` scaler file path.

        Raises:
            ValueError: Propagated from :meth:`load_split_data_multidimension`.
        """
        self.columns_selection = columns_selection_a
        self.op_buy_sell = op_buy_sell_a
        self.path_csv = path_csv_a
        self.name_models_stock = name_models_stock
        # Must run first: it sets self.imput_shape and self.cols_df used just below.
        self.load_split_data_multidimension()
        self.dict_Model_Definition = ModelDefinition(shape_inputs_m=self.imput_shape, num_features_m=len(self.cols_df)).get_dicts_models_multi_dimension()
        Logger.logr.debug(bcolors.HEADER + 'Created object TF_multidimension Path from: ' + self.path_csv + bcolors.ENDC)

    def _validate_date_column(self, arr_windows_3d):
        """Check on a sample of windows that column 0 holds plausible timestamps.

        Every ``BACHT_SIZE_LOOKBACK * 3``-th window (starting at index 1) is
        inspected; each value of its first column is converted with
        ``datetime.fromtimestamp`` and its year must lie in the accepted range.

        Raises:
            ValueError: If any sampled value falls outside the accepted year range.
        """
        # Accept anything up to the current year so recent data does not fail spuriously.
        year_max = max(self.DATE_YEAR_MAX, datetime.now().year)
        for i in range(1, arr_windows_3d.shape[0], self.BACHT_SIZE_LOOKBACK * 3):
            has_bad_date = any(
                not (self.DATE_YEAR_MIN <= datetime.fromtimestamp(x).year <= year_max)
                for x in arr_windows_3d[i][:, 0]
            )
            if has_bad_date:
                msg = "The dates of the new 2D array do not appear in the first column. "
                Logger.logr.error(msg)
                raise ValueError(msg)

    def load_split_data_multidimension(self):
        """Load the CSV and fill the train/val/test feature and label attributes.

        Pipeline: load -> window (2D) -> validate dates -> min-max scale ->
        split train/val/test -> SMOTE + Tomek on train only -> shuffle train ->
        reshape features to 3D.

        Raises:
            ValueError: If the path does not contain ``"_PLAIN_"``, if the
                loaded data has a ``ticker`` column, or if the date sanity
                check on the windowed array fails.
        """
        if "_PLAIN_" not in self.path_csv:
            msg = 'The input data must not have any scaling on the input to be correctly scaled. Path: ' + self.path_csv
            Logger.logr.error(bcolors.HEADER + msg + bcolors.ENDC)
            raise ValueError(msg)

        df = Utils_model_predict.load_and_clean_DF_Train_from_csv(self.path_csv, self.op_buy_sell, self.columns_selection)  # e.g. shape (5086, 13)
        self.cols_df = df.columns
        if 'ticker' in self.cols_df:
            msg = '\"ticker\" column detected, development required for multi-stock predictions. Path: ' + self.path_csv
            Logger.logr.error(bcolors.HEADER + msg + bcolors.ENDC)
            raise ValueError(msg)

        # SMOTE and Tomek links
        # The SMOTE oversampling approach could generate noisy samples since it creates synthetic data.
        # To solve this problem, after SMOTE, undersampling (Tomek links) is used to clean up.
        # Utils_plotter.plot_2d_space(df.drop(columns=[Y_TARGET]).iloc[:,4:5] , df[Y_TARGET], path = "SMOTE_antes.png")
        self.array_aux_np = df[Y_TARGET]  # TODO before or after the balancing??

        # In case the predictions give identical numbers:
        # https://datascience.stackexchange.com/questions/21955/tensorflow-regression-model-giving-same-prediction-every-time

        # 1.0 ADD MULTIDIMENSION Get a 2D array with BACHT_SIZE_LOOKBACK "backward glances".
        # Values go from (10000 rows, 10 columns) to (10000 rows, (10-1[ground truth]) * 10 columns);
        # at this point the array is still 2D, e.g. df.shape (1000, 10) -> (1000, 90).
        arr_mul_labels, arr_mul_features = Utils_model_predict.df_to_df_multidimension_array_2D(df.reset_index(drop=True), BACHT_SIZE_LOOKBACK=self.BACHT_SIZE_LOOKBACK)
        shape_imput_3d = (-1, self.BACHT_SIZE_LOOKBACK, len(df.columns) - 1)  # e.g. (-1, 10, 12)

        # 1.1 Validate the structure of the data on a temporary 3D view, e.g. (5077, 10, 12).
        self._validate_date_column(arr_mul_features.reshape(shape_imput_3d))

        # 2.0 SCALER scale the data and save a .scal file (used to scale inputs for future predictions).
        # Scaling range can be configured through _KEYS_DICT.MIN_SCALER, _KEYS_DICT.MAX_SCALER.
        # "... doing scaling before or after may give you the same results (but this depends on the actual scaling function)."
        # https://datascience.stackexchange.com/questions/71515/should-i-scale-data-before-or-after-balancing-dataset
        # TODO verify the correct order of "scaler, split and SMOTE". Sure: SMOTE is only applied on train_df.
        arr_mul_features = Utils_model_predict.scaler_min_max_array(arr_mul_features, path_to_save=_KEYS_DICT.PATH_SCALERS_FOLDER + self.name_models_stock + ".scal")
        arr_mul_labels = Utils_model_predict.scaler_min_max_array(arr_mul_labels.reshape(-1, 1))

        # 2.1 Put the ground-truth Y_TARGET next to the scaled features.
        df_with_target = pd.DataFrame(arr_mul_features)
        df_with_target[Y_TARGET] = arr_mul_labels.reshape(-1,)

        # 3.0 SPLIT into train, val and test.
        # "you divide your data first and then apply synthetic sampling SMOTE on the training data only"
        # https://datascience.stackexchange.com/questions/15630/train-test-split-after-performing-smote
        # CAUTION SMOTE generates twice as many rows.
        train_df, test_df = train_test_split(df_with_target, test_size=0.18, shuffle=self.will_shuffle)  # Shuffle in a time series? hmmm
        train_df, val_df = train_test_split(train_df, test_size=0.35, shuffle=self.will_shuffle)  # Shuffle in a time series? hmmm

        # Be careful not to touch test_df, val_df: SMOTE is applied to train_df only.
        # 3.1 Separate features from Y_TARGET, since the target is what we want to predict.
        train_df_x = np.asarray(train_df.drop(columns=[Y_TARGET]))
        train_df_y = np.asarray(train_df[Y_TARGET])

        # 4.0 SMOTE on the training data to balance it (few positive inputs, so "neighbors" of
        # positives are generated). Done in 2D because SMOTE is not possible with 3D input.
        X_smt, y_smt = Utils_model_predict.prepare_to_split_SMOTETomek_01(train_df_x, train_df_y)

        # 4.1 Re-attach Y_TARGET to the balanced training features.
        train_cleaned_df_target = pd.DataFrame(X_smt)
        train_cleaned_df_target[Y_TARGET] = y_smt.reshape(-1,)
        # SMOTE leaves the positives very close together, so shuffle the rows.
        train_cleaned_df_target = shuffle(train_cleaned_df_target)

        # 5 PREPARE the data for TF with the correct dimensions.
        # 5.1 Labels as float32 column vectors of shape (n, 1).
        train_labels = np.asarray(train_cleaned_df_target[Y_TARGET]).astype('float32').reshape((-1, 1))
        bool_train_labels = (train_labels != 0).reshape((-1))
        val_labels = np.asarray(val_df[Y_TARGET]).astype('float32').reshape((-1, 1))
        test_labels = np.asarray(test_df[Y_TARGET]).astype('float32').reshape((-1, 1))

        # 5.2 Windows were kept 2D to go through the SCALER and SMOTE steps;
        # reshape them to 3D for TF using shape_imput_3d.
        train_features = np.array(train_cleaned_df_target.drop(columns=[Y_TARGET])).reshape(shape_imput_3d)
        test_features = np.array(test_df.drop(columns=[Y_TARGET])).reshape(shape_imput_3d)
        val_features = np.array(val_df.drop(columns=[Y_TARGET])).reshape(shape_imput_3d)

        # 6 DISPLAY log the shapes before handing the data to TF.
        Utils_model_predict.log_shapes_trains_val_data(test_features, test_labels, train_features, train_labels, val_features, val_labels)

        # TODO validate the 2D -> 3D transformation and raise ValueError on mismatch
        # (e.g. compare a few reshaped windows against the matching rows of the source DataFrame).

        self.imput_shape = (train_features.shape[1], train_features.shape[2])
        self.train_labels = train_labels
        self.val_labels = val_labels
        self.test_labels = test_labels
        self.train_features = train_features
        self.val_features = val_features
        self.test_features = test_features
        self.bool_train_labels = bool_train_labels

    def get_all_data(self):
        """Return the prepared data as one tuple.

        Returns:
            ``(array_aux_np, train_labels, val_labels, test_labels,
            train_features, val_features, test_features, bool_train_labels)``.
        """
        return (
            self.array_aux_np,
            self.train_labels,
            self.val_labels,
            self.test_labels,
            self.train_features,
            self.val_features,
            self.test_features,
            self.bool_train_labels,
        )

    def get_dicts_models_multi_dimension(self, model_type: _KEYS_DICT.MODEL_TF_DENSE_TYPE_MULTI_DIMENSI):
        """Return the entry of the model-definition dictionary stored under ``model_type``."""
        return self.dict_Model_Definition[model_type]
        # ModelDefinition(shape_inputs_m=imput_shape, num_features_m=len(columns_df)).get_dicts_models_multi_dimension()