In [48]:
#importando as bibliotecas
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [49]:
#Lendo o arquivo CSV
df = pd.read_csv("C:\Python36\win_5m.csv", delimiter=',')
df

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume
0,2020-06-16 16:30:00,96394.0,96630.0,96281.0,96486.0,794,1,1545
1,2020-06-16 16:35:00,96481.0,96507.0,96235.0,96373.0,439,1,1008
2,2020-06-16 16:40:00,96389.0,96455.0,96255.0,96389.0,587,1,870
3,2020-06-16 16:45:00,96384.0,96461.0,96301.0,96425.0,596,1,1055
4,2020-06-16 16:50:00,96414.0,96558.0,96219.0,96286.0,979,1,1790
...,...,...,...,...,...,...,...,...
29995,2021-07-27 16:40:00,124605.0,124675.0,124575.0,124630.0,6373,0,101761
29996,2021-07-27 16:45:00,124630.0,124740.0,124630.0,124730.0,5941,0,112333
29997,2021-07-27 16:50:00,124730.0,124905.0,124725.0,124880.0,11366,0,213132
29998,2021-07-27 16:55:00,124880.0,125010.0,124870.0,124995.0,9740,0,156870


In [50]:
df.dtypes

time            object
open           float64
high           float64
low            float64
close          float64
tick_volume      int64
spread           int64
real_volume      int64
dtype: object

In [51]:
#Mudar o tipo data
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S')

In [52]:
df.dtypes

time           datetime64[ns]
open                  float64
high                  float64
low                   float64
close                 float64
tick_volume             int64
spread                  int64
real_volume             int64
dtype: object

In [53]:
#criando novos campos (média móvel de 5 e 21)
df['mm5'] = df['close'].rolling(5).mean()
df['mm21'] = df['close'].rolling(21).mean()
df.head(8)

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume,mm5,mm21
0,2020-06-16 16:30:00,96394.0,96630.0,96281.0,96486.0,794,1,1545,,
1,2020-06-16 16:35:00,96481.0,96507.0,96235.0,96373.0,439,1,1008,,
2,2020-06-16 16:40:00,96389.0,96455.0,96255.0,96389.0,587,1,870,,
3,2020-06-16 16:45:00,96384.0,96461.0,96301.0,96425.0,596,1,1055,,
4,2020-06-16 16:50:00,96414.0,96558.0,96219.0,96286.0,979,1,1790,96391.8,
5,2020-06-16 16:55:00,96250.0,96445.0,96183.0,96440.0,712,1,1420,96382.6,
6,2020-06-16 17:00:00,96445.0,96450.0,96271.0,96337.0,534,1,1115,96375.4,
7,2020-06-16 17:05:00,96322.0,96322.0,96235.0,96276.0,500,1,1097,96352.8,


In [54]:
df['close'] = df['close'].shift(-1)
#df.head()

In [55]:
df.dropna(inplace=True)
df

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume,mm5,mm21
20,2020-06-17 09:15:00,97346.0,97443.0,97203.0,97090.0,32493,1,84803,97062.4,96517.571429
21,2020-06-17 09:20:00,97295.0,97305.0,97023.0,96931.0,33416,1,85857,97202.6,96546.333333
22,2020-06-17 09:25:00,97095.0,97228.0,96839.0,96880.0,40779,1,102724,97189.2,96572.904762
23,2020-06-17 09:30:00,96926.0,97075.0,96788.0,96808.0,47364,1,123151,97109.4,96596.285714
24,2020-06-17 09:35:00,96875.0,96900.0,96665.0,96793.0,39472,1,109233,97001.8,96614.523810
...,...,...,...,...,...,...,...,...,...,...
29994,2021-07-27 16:35:00,124545.0,124615.0,124495.0,124630.0,5903,0,106207,124611.0,124436.428571
29995,2021-07-27 16:40:00,124605.0,124675.0,124575.0,124730.0,6373,0,101761,124604.0,124462.619048
29996,2021-07-27 16:45:00,124630.0,124740.0,124630.0,124880.0,5941,0,112333,124621.0,124490.714286
29997,2021-07-27 16:50:00,124730.0,124905.0,124725.0,124995.0,11366,0,213132,124678.0,124523.809524


In [56]:
#verificando quantidade de linhas
qtd_linhas = len(df)
qtd_linhas_treino = (qtd_linhas * 0.7)
qtd_linhas_teste = (qtd_linhas * 0.25)
qtd_linhas_validacao = qtd_linhas - qtd_linhas_treino - qtd_linhas_teste
print(qtd_linhas_treino)
print(qtd_linhas_teste)
print(qtd_linhas_validacao)

20985.3
7494.75
1498.9500000000007


In [57]:
df = df.reset_index(drop=True)

In [58]:
#separando as features e Labels
features = df.drop(['time','spread'],1)
labels = df['close']

In [59]:
#escolhendo as melhores features com Kbest
features_list = ('open', 'real_volume', 'tick_volume','mm5','mm21')

k_best_features = SelectKBest(k='all')
k_best_features.fit_transform(features, labels)
k_best_features_scores = k_best_features.scores_
raw_pairs = zip(features_list[1:], k_best_features_scores)
ordered_pairs = list(reversed(sorted(raw_pairs, key=lambda x: x[1])))

k_best_features_final = dict(ordered_pairs[:15])
best_features = k_best_features_final.keys()
print ('')
print ("Melhores features:")
print (k_best_features_final)


Melhores features:
{'mm21': inf, 'tick_volume': 10763.812717167857, 'mm5': 10552.013147313583, 'real_volume': 8107.930819999416}


  f = msb / msw


In [60]:
#Normalizando os dados de entrada (features) para que não sejam atribuídos pesos a eles
scaler = MinMaxScaler().fit(features)
features_scale = scaler.transform(features)

print('Features: ',features_scale.shape)
print(features_scale)#Normalizando os dados de entrada

Features:  (29979, 8)
[[0.05539145 0.05318947 0.05327447 ... 0.12274846 0.04306006 0.02437115]
 [0.05400938 0.04943875 0.04839933 ... 0.12443496 0.04688241 0.02516048]
 [0.04858947 0.04734596 0.04341585 ... 0.15142385 0.04651708 0.02588969]
 ...
 [0.79477521 0.79509689 0.79611072 ... 0.16679921 0.79440552 0.79205262]
 [0.79748516 0.79958144 0.79868371 ... 0.32808767 0.79595954 0.79296087]
 [0.80155009 0.80243525 0.80261091 ... 0.23806286 0.79841326 0.79393446]]
