In [91]:
import pandas as pd
from collections import deque
import random
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

In [92]:
# Load the raw table from a CSV file in the notebook's working directory.
df = pd.read_csv("stock_data.csv")

In [93]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,Date,SPY,SPY_vol,AAPl,AAPl_vol,MSFT,MSFT_vol
0,2000-01-03,98.96,8164300,3.46,133949200,37.29,53228400
1,2000-01-04,95.09,8089800,3.17,128094400,36.03,54119000
2,2000-01-05,95.26,12177900,3.22,194580400,36.41,64059600
3,2000-01-06,93.73,6227200,2.94,191993200,35.19,54976600
4,2000-01-07,99.17,8066500,3.08,115183600,35.65,62013600


In [94]:
# Summarise column dtypes and non-null counts (a quick check for missing data).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5134 entries, 0 to 5133
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      5134 non-null   object 
 1   SPY       5134 non-null   float64
 2   SPY_vol   5134 non-null   int64  
 3   AAPl      5134 non-null   float64
 4   AAPl_vol  5134 non-null   int64  
 5   MSFT      5134 non-null   float64
 6   MSFT_vol  5134 non-null   int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 280.9+ KB


In [95]:
# Index the rows by their "Date" column. Rebinding `df` (rather than using
# inplace=True) together with the column guard keeps this cell re-runnable:
# once "Date" has become the index a second run is a no-op instead of a
# KeyError on the no-longer-present column.
if "Date" in df.columns:
    df = df.set_index("Date")


In [96]:
SEQ_LEN = 10  # number of preceding rows collected into each input sequence for the RNN (10 rows; the original note calls this 2 trading weeks)
FUTURE_PERIOD_PREDICT = 3  # how many rows into the future the prediction target looks
RATIO_TO_PREDICT = "MSFT"  # name of the column whose future movement is being predicted


def classify(current, future):
    """Label a price move: 1 when ``future`` is strictly above ``current``, else 0.

    Both arguments are converted with ``float()`` before the comparison, so
    anything ``float()`` accepts (numbers, numeric strings) may be passed.
    Equal values, and any comparison involving NaN, yield 0.
    """
    return int(float(future) > float(current))
    
# Value of the predicted column FUTURE_PERIOD_PREDICT rows ahead. The final
# FUTURE_PERIOD_PREDICT rows have nothing to look ahead to, so they are NaN.
df['future'] = df[RATIO_TO_PREDICT].shift(-FUTURE_PERIOD_PREDICT)

# 1 = the future value is higher than the current one, 0 = it is not.
# classify() evaluates a comparison against NaN as "not higher", which would
# fabricate a 0 label for the rows that have no future value. Those labels are
# blanked to NaN instead so that a later dropna() discards the rows rather
# than letting made-up labels into the data. (The column is float as a result.)
_target_labels = pd.Series(
    list(map(classify, df[RATIO_TO_PREDICT], df['future'])),
    index=df.index,
    dtype=float,
)
df['target'] = _target_labels.where(df['future'].notna())

In [97]:
# Index labels in ascending order. The index holds the "Date" strings, which
# appear in YYYY-MM-DD form, so ascending string order is chronological order.
times = sorted(df.index.values)

# Boundary label: the point in the ordered index where the last 5% begins.
last_5pct = times[-int(0.05 * len(times))]

# Rows on or after the boundary are held out for validation ...
validation_main_df = df.loc[df.index >= last_5pct]
# ... and everything strictly before it stays in the main (training) frame.
main_df = df.loc[df.index < last_5pct]

In [98]:
# Inspect the start of the training frame, including the new future/target columns.
main_df.head(20)

Unnamed: 0_level_0,SPY,SPY_vol,AAPl,AAPl_vol,MSFT,MSFT_vol,future,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-03,98.96,8164300,3.46,133949200,37.29,53228400,35.19,0
2000-01-04,95.09,8089800,3.17,128094400,36.03,54119000,35.65,0
2000-01-05,95.26,12177900,3.22,194580400,36.41,64059600,35.91,0
2000-01-06,93.73,6227200,2.94,191993200,35.19,54976600,34.99,0
2000-01-07,99.17,8066500,3.08,115183600,35.65,62013600,33.85,0
2000-01-10,99.51,5741700,3.02,126266000,35.91,44963600,34.49,0
2000-01-11,98.32,7503700,2.87,110387200,34.99,46743600,35.91,1
2000-01-12,97.34,6907700,2.7,244017200,33.85,66532400,36.89,1
2000-01-13,98.66,5158300,2.99,258171200,34.49,83144000,34.23,0
2000-01-14,100.0,7437300,3.11,97594000,35.91,73416400,33.91,0


In [99]:
def preprocess_df(df):
    """Turn a labelled frame into balanced, shuffled fixed-length sequences.

    Steps:
      1. Drop the helper ``future`` column so only features and ``target``
         remain.
      2. Replace every feature column (everything except ``target``) with its
         row-over-row percent change, discard rows containing NaN or +/-inf,
         then min-max scale each feature column independently.
      3. Slide a window of ``SEQ_LEN`` consecutive rows over the frame; every
         full window becomes one sample, labelled with the ``target`` of the
         window's last row.
      4. Balance the classes by randomly down-sampling whichever of
         ``target == 1`` / ``target == 0`` is larger, then shuffle.

    Args:
        df: DataFrame with a ``future`` column, a ``target`` column and one or
            more numeric feature columns. The caller's frame is not modified.

    Returns:
        tuple ``(X, y)`` where ``X`` is ``np.array`` of the stacked windows
        (one ``SEQ_LEN`` x n_features block per sample) and ``y`` is a plain
        list of the matching targets.

    Notes:
        * Shuffling uses the module-level ``random`` state, so results vary
          between runs unless the caller seeds it.
        * NOTE(review): the scaler is fit inside each call, so separate calls
          (e.g. train vs. validation frames) are scaled with different
          min/max values - confirm this is intended.
    """
    # `columns=` keyword: the positional axis form (`drop("future", 1)`) is
    # rejected by pandas >= 2.0. drop() returns a new frame.
    df = df.drop(columns="future")

    feature_cols = [col for col in df.columns if col != "target"]
    # Pin `target` to the last position; the windowing below relies on it.
    df = df[feature_cols + ["target"]].copy()

    for col in feature_cols:
        df[col] = df[col].pct_change()

    # pct_change leaves NaN in the first row and +/-inf after a zero value.
    # Treat inf as missing and drop once, after all columns are converted, so
    # only genuinely incomplete rows are lost (dropping inside the per-column
    # loop threw away one extra leading row per feature column).
    df = df.replace([np.inf, -np.inf], np.nan).dropna().copy()

    # Scale each feature column on its own; skipped when nothing is left to
    # fit on, in which case the function returns empty outputs.
    if feature_cols and not df.empty:
        for col in feature_cols:
            scaler = preprocessing.MinMaxScaler()
            df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1)).ravel()

    # Rolling window: the deque keeps only the most recent SEQ_LEN feature rows.
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)
    for row in df.values:
        prev_days.append(row[:-1])  # features only; the last column is target
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), row[-1]])

    # Split by class. Samples whose target is neither 0 nor 1 are left out.
    buys = [[seq, target] for seq, target in sequential_data if target == 1]
    sells = [[seq, target] for seq, target in sequential_data if target == 0]

    # Shuffle before truncating so the down-sampled class keeps a random
    # subset rather than its earliest windows.
    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))
    balanced = buys[:lower] + sells[:lower]
    random.shuffle(balanced)  # interleave the two classes

    X = [seq for seq, _ in balanced]
    y = [target for _, target in balanced]

    return np.array(X), y


In [35]:
# Build the model inputs: each call returns (array of sequences, list of targets).
# NOTE(review): this cell's execution count (35) is lower than the cells above
# (91-99), so it was last run before they were; re-run the notebook top to
# bottom on a fresh kernel to confirm it still works with the current code.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)