In [6]:
#dependencies
import pandas as pd
import numpy as np
import yfinance as yf
import time
from datetime import datetime 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf # This code has been tested with TensorFlow 1.6

ModuleNotFoundError: No module named 'tensorflow'

In [9]:
end = datetime.now()
start = datetime(end.year-5,end.month,end.day)
data = yf.download("AAPL", start=start, end=end)
df = pd.DataFrame(data=data)
df = df.rename(columns={"Date":"date","Open":"open","High":"high","Low":"low","Close":"close","Adj Close":"adj_close","Volume":"volume"})
#df.to_csv(''+symbol+'.csv')
print(len(df.index))
print(df.head)

[*********************100%***********************]  1 of 1 completed
1260
<bound method NDFrame.head of                   open        high         low       close   adj_close  \
Date                                                                     
2016-02-02   23.855000   24.010000   23.570000   23.620001   21.883259   
2016-02-03   23.750000   24.209999   23.520000   24.087500   22.316381   
2016-02-04   23.965000   24.332500   23.797501   24.150000   22.495693   
2016-02-05   24.129999   24.230000   23.422501   23.504999   21.894878   
2016-02-08   23.282499   23.924999   23.260000   23.752501   22.125422   
...                ...         ...         ...         ...         ...   
2021-01-27  143.429993  144.300003  140.410004  142.059998  142.059998   
2021-01-28  139.520004  141.990005  136.699997  137.089996  137.089996   
2021-01-29  135.830002  136.740005  130.210007  131.960007  131.960007   
2021-02-01  133.750000  135.380005  130.929993  134.139999  134.139999   
2021-02-

In [10]:
high_prices = df.loc[:,'high'].to_numpy()
low_prices = df.loc[:,'low'].to_numpy()
mid_prices = (high_prices+low_prices)/2.0

train_data = mid_prices[:630]
test_data = mid_prices[630:]

# Scale the data to be between 0 and 1
# When scaling remember! You normalize both test and train data with respect to training data
# Because you are not supposed to have access to test data
scaler = MinMaxScaler()
train_data = train_data.reshape(-1,1)
test_data = test_data.reshape(-1,1)

# Train the Scaler with training data and smooth data
smoothing_window_size = 150
for di in range(0,600,smoothing_window_size):
    scaler.fit(train_data[di:di+smoothing_window_size,:])
    train_data[di:di+smoothing_window_size,:] = scaler.transform(train_data[di:di+smoothing_window_size,:])

    # You normalize the last bit of remaining data
scaler.fit(train_data[di+smoothing_window_size:,:])
train_data[di+smoothing_window_size:,:] = scaler.transform(train_data[di+smoothing_window_size:,:])

In [11]:
train_data = train_data.reshape(-1)

# Normalize test data
test_data = scaler.transform(test_data).reshape(-1)


In [13]:
# Now perform exponential moving average smoothing
# So the data will have a smoother curve than the original ragged data
EMA = 0.0
gamma = 0.1
for ti in range(253):
    EMA = gamma*train_data[ti] + (1-gamma)*EMA
    train_data[ti] = EMA

# Used for visualization and test purposes
all_mid_data = np.concatenate([train_data,test_data],axis=0)

In [14]:
window_size = 25
N = train_data.size

run_avg_predictions = []
run_avg_x = []

mse_errors = []

running_mean = 0.0
run_avg_predictions.append(running_mean)

decay = 0.5

for pred_idx in range(1,N):

    running_mean = running_mean*decay + (1.0-decay)*train_data[pred_idx-1]
    run_avg_predictions.append(running_mean)
    mse_errors.append((run_avg_predictions[-1]-train_data[pred_idx])**2)
    run_avg_x.append(df.index)

print('MSE error for EMA averaging: %.5f'%(0.5*np.mean(mse_errors)))

MSE error for EMA averaging: 0.00339


In [15]:
class DataGeneratorSeq(object):
    
    def __init__(self,prices,batch_size,num_unroll):
        self._prices = prices
        self._prices_length = len(self._prices) - num_unroll
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._segments = self._prices_length //self._batch_size
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]

    def next_batch(self):
        
        batch_data = np.zeros((self._batch_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size),dtype=np.float32)
        
        for b in range(self._batch_size):
            if self._cursor[b]+1>=self._prices_length:
                #self._cursor[b] = b * self._segments
                self._cursor[b] = np.random.randint(0,(b+1)*self._segments)
                
            batch_data[b] = self._prices[self._cursor[b]]
            batch_labels[b]= self._prices[self._cursor[b]+np.random.randint(1,5)]
            
            self._cursor[b] = (self._cursor[b]+1)%self._prices_length
            
        return batch_data,batch_labels
    
    def unroll_batches(self):
            
        unroll_data,unroll_labels = [],[]
        init_data, init_label = None,None
        for ui in range(self._num_unroll):
            
            data, labels = self.next_batch()    

            unroll_data.append(data)
            unroll_labels.append(labels)

        return unroll_data, unroll_labels
    
    def reset_indices(self):
        for b in range(self._batch_size):
            self._cursor[b] = np.random.randint(0,min((b+1)*self._segments,self._prices_length-1))
        
dg = DataGeneratorSeq(train_data,5,5)
u_data, u_labels = dg.unroll_batches()

for ui,(dat,lbl) in enumerate(zip(u_data,u_labels)):   
    print('\n\nUnrolled index %d'%ui)
    dat_ind = dat
    lbl_ind = lbl
    print('\tInputs: ',dat )
    print('\n\tOutput:',lbl)



Unrolled index 0
	Inputs:  [0.00205708 0.32920447 0.32554197 0.24911918 0.4327254 ]

	Output: [0.01811437 0.38690606 0.3312129  0.2375221  0.3419207 ]


Unrolled index 1
	Inputs:  [0.0059025  0.34219217 0.3312129  0.23502654 0.37112626]

	Output: [0.01811437 0.37097654 0.33857298 0.4497944  0.23938282]


Unrolled index 2
	Inputs:  [0.01153853 0.35617256 0.33857298 0.2375221  0.3053181 ]

	Output: [0.03979395 0.4221821  0.60602325 0.49119204 0.34753206]


Unrolled index 3
	Inputs:  [0.01811437 0.37097654 0.6038015  0.49119204 0.3419207 ]

	Output: [0.03979395 0.38690606 0.6313255  0.48649433 0.14985336]


Unrolled index 4
	Inputs:  [0.02494098 0.38690606 0.60602325 0.43716985 0.34753206]

	Output: [0.0321712  0.44101515 0.68118984 0.4497944  0.23938282]


In [16]:
D = 1 # Dimensionality of the data. Since our data is 1-D this would be 1
num_unrollings = 50 # Number of time steps you look into the future.
batch_size = 500 # Number of samples in a batch
num_nodes = [200,200,150] # Number of hidden nodes in each layer of the deep LSTM stack we're using
n_layers = len(num_nodes) # number of layers
dropout = 0.2 # dropout amount

tf.reset_default_graph() # This is important in case you run this multiple times

NameError: name 'tf' is not defined