In [1]:
import pandas as pd
import json
import requests
import time
import numpy as np
from functools import reduce
from sklearn import preprocessing

## Introduction

The purpose of this notebook is to show the steps for data prepration for the prediction task.

## Data Collection
We get our raw data from Alpha Vantage APIs as it's refined and free of cost.
The data is got in form of JSON object which is stored as a pandas dataframe in a dictionary with symbol name as the key

In [2]:
# commonly used variables

symbol = "AAPL";
apikey = "V1SZPLE0U8CCXSFW.";
datatype = "json";
url = "https://www.alphavantage.co/query";
outputsize = "full";
interval = "daily";
timeperiod = "10";
seriestype = "close";
index = "date";

In [3]:
# Time Series Data

parameters = {'function':'TIME_SERIES_DAILY_ADJUSTED', 'outputsize':outputsize,'datatype':datatype,'apikey':apikey,'symbol':symbol}

response = requests.get(url, params = parameters);
data = response.json()

AAPL_TS_data = pd.DataFrame(data["Time Series (Daily)"]).T

AAPL_TS_data.reset_index(inplace=True);
AAPL_TS_data.rename(columns={'index': 'date'}, inplace=True);

AAPL_TS_data.rename(columns={'1. open': 'open'}, inplace=True);
AAPL_TS_data.rename(columns={'2. high': 'high'}, inplace=True);
AAPL_TS_data.rename(columns={'3. low': 'low'}, inplace=True);
AAPL_TS_data.rename(columns={'4. close': 'close'}, inplace=True);
AAPL_TS_data.rename(columns={'5. adjusted close': 'adjusted close'}, inplace=True);
AAPL_TS_data.rename(columns={'6. volume': 'volume'}, inplace=True);
AAPL_TS_data.rename(columns={'7. dividend amount': 'dividend amount'}, inplace=True);
AAPL_TS_data.rename(columns={'8. split coefficient': 'split coefficient'}, inplace=True);

# Let's add another column to the frame which is the adjusted close data shifted by 10 days which is what we want to predict

AAPL_TS_data["Target"] = AAPL_TS_data["adjusted close"].shift(-10);

# Now that we donot have the data of the future we need to clip the dataframe where the target is not defined 
AAPL_TS_data = AAPL_TS_data.dropna()

display(AAPL_TS_data.tail())
display(AAPL_TS_data.info())

Unnamed: 0,date,open,high,low,close,adjusted close,volume,dividend amount,split coefficient,Target
5226,2018-10-09,223.64,227.27,222.2462,226.87,226.87,26891029,0.0,1.0,222.73
5227,2018-10-10,225.46,226.35,216.05,216.36,216.36,41990554,0.0,1.0,215.09
5228,2018-10-11,214.52,219.5,212.32,214.45,214.45,53124392,0.0,1.0,219.8
5229,2018-10-12,220.42,222.88,216.84,222.11,222.11,40337851,0.0,1.0,216.3
5230,2018-10-15,221.16,221.83,217.27,217.36,217.36,30791007,0.0,1.0,212.24


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5231 entries, 0 to 5230
Data columns (total 10 columns):
date                 5231 non-null object
open                 5231 non-null object
high                 5231 non-null object
low                  5231 non-null object
close                5231 non-null object
adjusted close       5231 non-null object
volume               5231 non-null object
dividend amount      5231 non-null object
split coefficient    5231 non-null object
Target               5231 non-null object
dtypes: object(10)
memory usage: 449.5+ KB


None

In [4]:
#ADX Data

parameters = {'function':'ADX', 'interval':interval,'time_period':timeperiod,'datatype':datatype,'apikey':apikey,'symbol':symbol}

response = requests.get(url, params = parameters);
data = response.json()

AAPL_ADX_data = pd.DataFrame(data["Technical Analysis: ADX"]).T

AAPL_ADX_data.reset_index(inplace=True);
AAPL_ADX_data.rename(columns={'index': 'date'}, inplace=True);


display(AAPL_ADX_data.head())
display(AAPL_ADX_data.info())

Unnamed: 0,date,ADX
0,1998-01-30,50.0186
1,1998-02-02,45.054
2,1998-02-03,40.8746
3,1998-02-04,37.1132
4,1998-02-05,33.728


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5222 entries, 0 to 5221
Data columns (total 2 columns):
date    5222 non-null object
ADX     5222 non-null object
dtypes: object(2)
memory usage: 81.7+ KB


None

In [5]:
#RSI Data

parameters = {'function':'RSI', 'interval':interval,'time_period':timeperiod,'series_type':seriestype,'datatype':datatype,'apikey':apikey,'symbol':symbol}

response = requests.get(url, params = parameters);
data = response.json()

AAPL_RSI_data = pd.DataFrame(data["Technical Analysis: RSI"]).T

AAPL_RSI_data.reset_index(inplace=True);
AAPL_RSI_data.rename(columns={'index': 'date'}, inplace=True);

display(AAPL_RSI_data.head())
display(AAPL_RSI_data.info())

Unnamed: 0,date,RSI
0,1998-01-16,65.8809
1,1998-01-20,67.0176
2,1998-01-21,65.5614
3,1998-01-22,67.3482
4,1998-01-23,68.676


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5231 entries, 0 to 5230
Data columns (total 2 columns):
date    5231 non-null object
RSI     5231 non-null object
dtypes: object(2)
memory usage: 81.8+ KB


None

In [6]:
#SMA_data

parameters = {'function':'SMA', 'interval':interval,'time_period':timeperiod,'series_type':seriestype,'datatype':datatype,'apikey':apikey,'symbol':symbol}

response = requests.get(url, params = parameters);
data = response.json()

AAPL_SMA_data = pd.DataFrame(data["Technical Analysis: SMA"]).T
AAPL_SMA_data.reset_index(inplace=True);
AAPL_SMA_data.rename(columns={'index': 'date'}, inplace=True);

display(AAPL_SMA_data.head())
display(AAPL_SMA_data.info())

Unnamed: 0,date,SMA
0,1998-01-15,18.164
1,1998-01-16,18.42
2,1998-01-20,18.738
3,1998-01-21,18.735
4,1998-01-22,18.91


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5232 entries, 0 to 5231
Data columns (total 2 columns):
date    5232 non-null object
SMA     5232 non-null object
dtypes: object(2)
memory usage: 81.8+ KB


None

In [7]:
# MACD_data

parameters = {'function':'MACD', 'interval':interval,'series_type':seriestype,'datatype':datatype,'apikey':apikey,'symbol':symbol}

response = requests.get(url, params = parameters);
data = response.json()

AAPL_MACD_data = pd.DataFrame(data["Technical Analysis: MACD"]).T

AAPL_MACD_data.reset_index(inplace=True);
AAPL_MACD_data.rename(columns={'index': 'date'}, inplace=True);

display(AAPL_MACD_data.head())
display(AAPL_MACD_data.info())

Unnamed: 0,date,MACD,MACD_Hist,MACD_Signal
0,1998-02-20,0.4518,0.1604,0.2913
1,1998-02-23,0.5519,0.2084,0.3435
2,1998-02-24,0.6289,0.2283,0.4005
3,1998-02-25,0.7617,0.289,0.4728
4,1998-02-26,0.9521,0.3835,0.5686


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 4 columns):
date           5208 non-null object
MACD           5208 non-null object
MACD_Hist      5208 non-null object
MACD_Signal    5208 non-null object
dtypes: object(4)
memory usage: 162.8+ KB


None

## Structuring the data
Now that we have all the data we can start with the preprocessing of the same.
let's define the fields that we are interested in.

we need below details for every stock

+ simple moving average
+ ADX
+ RSI
+ MACD
+ volume
+ high
+ low
+ adjusted close

And having all this info we would be predicting what the price would be 10 days down the line.

#### Let's first create the single data frame and split that into Train, Test and Validation data.

In [8]:
All_frames = [AAPL_ADX_data,AAPL_SMA_data,AAPL_RSI_data,AAPL_MACD_data,AAPL_TS_data];

# Luckily this cool function reduces the final merged dataframe dropping any of the rows where the values were not present.
# Thus we get a clean dataframe to use which is fully cleaned.

Merged_frames = reduce(lambda  left,right: pd.merge(left,right), All_frames)

display(Merged_frames.head())
display(Merged_frames.info())

Unnamed: 0,date,ADX,SMA,RSI,MACD,MACD_Hist,MACD_Signal,open,high,low,close,adjusted close,volume,dividend amount,split coefficient,Target
0,1998-02-20,35.8066,19.562,62.2914,0.4518,0.1604,0.2913,20.5,20.56,19.81,20.0,0.633,2905500,0.0,1.0,0.7735
1,1998-02-23,38.2897,19.837,72.2154,0.5519,0.2084,0.3435,20.12,21.62,20.0,21.25,0.6726,4263300,0.0,1.0,0.72
2,1998-02-24,40.5245,20.049,72.6,0.6289,0.2283,0.4005,21.31,21.37,20.75,21.31,0.6745,4076700,0.0,1.0,0.7615
3,1998-02-25,43.729,20.336,78.1904,0.7617,0.289,0.4728,21.31,22.75,20.94,22.31,0.7061,6363100,0.0,1.0,0.8267
4,1998-02-26,47.0657,20.786,82.824,0.9521,0.3835,0.5686,22.31,23.56,21.87,23.5,0.7438,5313700,0.0,1.0,0.8546


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5198 entries, 0 to 5197
Data columns (total 16 columns):
date                 5198 non-null object
ADX                  5198 non-null object
SMA                  5198 non-null object
RSI                  5198 non-null object
MACD                 5198 non-null object
MACD_Hist            5198 non-null object
MACD_Signal          5198 non-null object
open                 5198 non-null object
high                 5198 non-null object
low                  5198 non-null object
close                5198 non-null object
adjusted close       5198 non-null object
volume               5198 non-null object
dividend amount      5198 non-null object
split coefficient    5198 non-null object
Target               5198 non-null object
dtypes: object(16)
memory usage: 690.4+ KB


None

In [10]:
# And now for the final touch and creating the Training, Validation and Test sets

total_length = Merged_frames.shape[0];

Train_set = Merged_frames.iloc[:5000];
Validation_set = Merged_frames.iloc[5000:5100];
Test_set = Merged_frames.iloc[5100:total_length];

## Training Data and Labels
Train_labels = Train_set[['Target']].copy()
Train_data = Train_set.drop(['Target','date','split coefficient'], axis=1);

## pre processing the data
train_names = Train_data.columns
train_data_scaler = preprocessing.StandardScaler()
Train_data_scaled = train_data_scaler.fit_transform(Train_data)
Train_data_scaled = pd.DataFrame(Train_data_scaled, columns=train_names)



## Validation data and labels
Validation_labels = Validation_set[['Target']].copy();
Validation_data = Validation_set.drop(['Target','date','split coefficient'], axis=1);


## pre processing the data
validation_names = Validation_data.columns
validation_data_scaler = preprocessing.StandardScaler()
Validation_data_scaled = validation_data_scaler.fit_transform(Validation_data)
Validation_data_scaled = pd.DataFrame(Validation_data_scaled, columns=validation_names)




## Test data and labels
Test_labels = Test_set[['Target']].copy();
Test_data = Test_set.drop(['Target','date','split coefficient'], axis=1);


## pre processing the data
test_names = Test_data.columns
test_data_scaler = preprocessing.StandardScaler()
Test_data_scaled = test_data_scaler.fit_transform(Test_data)
Test_data_scaled = pd.DataFrame(Test_data_scaled, columns=test_names)


display(Train_labels.head())
display(Train_data_scaled.head())


display(Validation_labels.head())
display(Validation_data_scaled.head())

display(Test_labels.head())
display(Test_data_scaled.head())



Unnamed: 0,Target
0,0.7735
1,0.72
2,0.7615
3,0.8267
4,0.8546


Unnamed: 0,ADX,SMA,RSI,MACD,MACD_Hist,MACD_Signal,open,high,low,close,adjusted close,volume,dividend amount
0,0.322917,-0.840074,0.52735,0.024826,0.053747,0.008684,-0.833204,-0.836981,-0.833443,-0.83649,-0.829775,-0.982158,-0.051427
1,0.513779,-0.838401,1.166106,0.035262,0.069881,0.014486,-0.83551,-0.830598,-0.832279,-0.828897,-0.828878,-0.907823,-0.051427
2,0.685556,-0.83711,1.19086,0.04329,0.07657,0.020822,-0.828288,-0.832103,-0.827683,-0.828533,-0.828835,-0.918038,-0.051427
3,0.931868,-0.835364,1.550685,0.057135,0.096973,0.028859,-0.828288,-0.823793,-0.826518,-0.822459,-0.828119,-0.792866,-0.051427
4,1.188341,-0.832625,1.848925,0.076985,0.128738,0.039507,-0.822218,-0.818916,-0.820819,-0.815232,-0.827266,-0.850317,-0.051427


Unnamed: 0,Target
5000,177.229
5001,176.4381
5002,174.9946
5003,175.0342
5004,172.2461


Unnamed: 0,ADX,SMA,RSI,MACD,MACD_Hist,MACD_Signal,open,high,low,close,adjusted close,volume,dividend amount
0,-1.052789,-0.155788,-0.153397,-0.086175,-0.485662,0.199194,-0.202802,-0.182719,-0.074027,-0.243822,-0.309965,-0.416318,-0.142465
1,-1.102008,-0.180237,0.06319,-0.063443,-0.374794,0.157701,-0.201531,-0.329149,-0.059614,-0.14352,-0.212628,-0.925183,-0.142465
2,-0.993046,-0.169713,0.534683,0.023149,-0.182401,0.147063,-0.087097,-0.07154,0.056894,0.103474,0.027079,-0.863208,-0.142465
3,-0.876966,-0.180399,0.317087,0.064411,-0.102161,0.149237,0.028608,-0.039,0.162591,0.021979,-0.052019,-1.08514,-0.142465
4,-0.842889,-0.19141,0.310046,0.092387,-0.059519,0.158288,0.054038,-0.113571,0.100134,0.019471,-0.054444,-0.998239,-0.142465


Unnamed: 0,Target
5100,191.606
5101,190.0316
5102,190.1312
5103,188.1781
5104,188.0784


Unnamed: 0,ADX,SMA,RSI,MACD,MACD_Hist,MACD_Signal,open,high,low,close,adjusted close,volume,dividend amount
0,0.168625,-1.0551,0.187505,0.381486,-0.122516,0.476566,-1.113314,-1.122979,-1.089139,-1.105042,-1.10303,-0.430901,-0.101535
1,0.101142,-1.048158,0.041007,0.278251,-0.290477,0.432837,-1.105676,-1.169518,-1.095039,-1.130616,-1.128031,-0.736391,-0.101535
2,-0.018955,-1.056737,-0.193841,0.154454,-0.467976,0.370263,-1.137503,-1.155246,-1.136996,-1.170895,-1.167408,-0.033914,-0.101535
3,-0.014438,-1.035454,0.53415,0.166433,-0.324788,0.322859,-1.088412,-1.029282,-1.031449,-0.955436,-0.956775,-0.356722,-0.101535
4,0.11522,-0.999307,0.791318,0.217696,-0.127471,0.296401,-0.856466,-0.8332,-0.795443,-0.85378,-0.857398,-0.131118,-0.101535


In [11]:
## Finally add the data to their respective csv files for use by the models


Train_data_scaled.to_csv("..\data\Train_data_scaled.csv", encoding='utf-8');
Train_labels.to_csv("..\data\Train_labels.csv", encoding='utf-8');
Validation_data_scaled.to_csv("..\data\Validation_data_scaled.csv", encoding='utf-8');
Validation_labels.to_csv("..\data\Validation_labels.csv", encoding='utf-8');
Test_data_scaled.to_csv("..\data\Test_data_scaled.csv", encoding='utf-8');
Test_labels.to_csv("..\data\Test_labels.csv", encoding='utf-8');