In [1]:
%matplotlib notebook

# Stock Price Predictions

In [34]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from alpha_vantage.timeseries import TimeSeries
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Conv1D, Flatten, MaxPooling1D
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping
import tools_for_prediction as tfp
from keras.utils import to_categorical
from sklearn.metrics import f1_score, mean_squared_error
import time

In [3]:
# parameters
to_use = ['XOM', 'DAL', 'AAPL', 'GOOGL', 'MSFT']
# to_use = ['AAPL']
to_predict = 'AAPL'

# x_len = 30
x_len = 100

y_len = 10
# y_len = 30

n_features = len(to_use)

alpha_vantage_key = 'K1M63RZKLZKLDHXD'

# problem_type = 'regression'
# type_prediction = 'exact_numbers'

problem_type = 'classification'
type_prediction = 'up_or_down'

past_importance = 'alternate'
nb_period=10
# past_importance = 'shuffle'
seed=0
# past_importance = 'time-consistent'

## Loading data

In [4]:
prices_df = tfp.loading_mid_prices(alpha_vantage_key, to_use, to_predict )

Processing XOM
Processing DAL
Processing AAPL
Processing GOOGL
Processing MSFT


In [5]:
prices_df['AAPL'][0]

14.875

In [6]:
tfp.visualize_stocks(prices_df)

<IPython.core.display.Javascript object>

Prediction via averaging (one-step ahead)

In [7]:
#def MSE(T, value, error):
    #M=0        
    #for i in range(T+1):
     #   M = M + (value[i] - error[i])**2
   # M = M/T
#return(M)

In [77]:
x_stav=prices_df['AAPL'].copy()

window_size = 10
N = len(x_stav)
std_avg_predictions = []

for i in range(0, N):
    
    if i < window_size:
        std_avg_predictions.append(np.mean(x_stav[0:i]))
    else:
        std_avg_predictions.append(np.mean(x_stav[i-window_size:i]))

std_avg_predictions[0]=x_stav[0]
        

m=tfp.nmse_metric_for_np(x_stav, std_avg_predictions)

n=mean_squared_error(x_stav, std_avg_predictions)

print("NMSE for standart averaging:", m)
print("Mean squared error for standard averaging:", n)
print("Prediction for the next day: ", std_avg_predictions[-1])

NMSE for standart averaging: 0.005547603133554655
Mean squared error for standard averaging: 281.2649791365334
Prediction for the next day:  173.29515999999998


In [9]:
plt.figure(figsize=(20,10))
plt.plot(x_stav, label='True')
plt.plot(std_avg_predictions, label='Prediction')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x220ea446978>]

Exponential Moving Average

In [76]:
smoothing_coefficient = 0.4

N = len(x_stav)

model = SimpleExpSmoothing(x_stav)
model_fit = model.fit(smoothing_coefficient)
EMA_predictions = model_fit.predict(0, N)
EMA_predictions=EMA_predictions[1:]

n=mean_squared_error(x_stav, EMA_predictions)

m=tfp.nmse_metric_for_np(x_stav, EMA_predictions)

print("NMSE for EMA:", m)
print("Mean squared error for EMA:", n)
print("Prediction for the next day: ", EMA_predictions[1])

NMSE for EMA: nan
Mean squared error for EMA: 41.106513784105154
Prediction for the next day:  15.668248364250502




In [62]:
EMA_predictions

0        16.197081
1        15.668248
2        15.750949
3        16.400569
4        17.102342
5        17.373405
6        17.798043
7        17.826826
8        18.320095
9        18.830057
10       18.972034
11       19.009221
12       18.991532
13       18.918919
14       19.025352
15       19.203211
16       19.195927
17       19.255556
18       19.151334
19       19.014800
20       18.832880
21       18.473728
22       18.346237
23       18.307742
24       18.284645
25       18.358787
26       18.589272
27       18.877563
28       19.000538
29       19.100323
           ...    
5297    154.881134
5298    154.296680
5299    153.822008
5300    154.783205
5301    154.867923
5302    155.368754
5303    158.497252
5304    161.810351
5305    164.068211
5306    166.227926
5307    169.222776
5308    171.218286
5309    171.586971
5310    170.768183
5311    170.552910
5312    170.471746
5313    170.763047
5314    170.586128
5315    170.641677
5316    170.571006
5317    171.204604
5318    171.

In [63]:
x_stav

1998-01-02     14.87500
1998-01-05     15.87500
1998-01-06     17.37500
1998-01-07     18.15500
1998-01-08     17.78000
1998-01-09     18.43500
1998-01-12     17.87000
1998-01-13     19.06000
1998-01-14     19.59500
1998-01-15     19.18500
1998-01-16     19.06500
1998-01-20     18.96500
1998-01-21     18.81000
1998-01-22     19.18500
1998-01-23     19.47000
1998-01-26     19.18500
1998-01-27     19.34500
1998-01-28     18.99500
1998-01-29     18.81000
1998-01-30     18.56000
1998-02-02     17.93500
1998-02-03     18.15500
1998-02-04     18.25000
1998-02-05     18.25000
1998-02-06     18.47000
1998-02-09     18.93500
1998-02-10     19.31000
1998-02-11     19.18500
1998-02-12     19.25000
1998-02-13     19.43500
                ...    
2019-01-22    154.67500
2019-01-23    153.42000
2019-01-24    153.11000
2019-01-25    156.22500
2019-01-28    154.99500
2019-01-29    156.12000
2019-01-30    163.19000
2019-01-31    166.78000
2019-02-01    167.45500
2019-02-04    169.46750
2019-02-05    17

In [11]:
plt.figure(figsize=(20,10))
plt.plot(x_stav, label='True')
plt.plot(EMA_predictions, label='Prediction')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x220e9f62048>]

## Regression

### Building the training, valiadation and testing datasets

In [12]:
#Using a sliding window model to build input and output time series
df = tfp.create_dataset(prices_df, to_predict, type_prediction, x_len, y_len)

#Dividing in train/val/test our data
(train,val,test) = tfp.split_train_val_test(df, model=past_importance, nb_period=nb_period, seed=seed)

# Reshaping our data so that they can be properly fed to out networks
x_train = tfp.to_multidim_array_train(train[to_use])
x_val =  tfp.to_multidim_array_train(val[to_use])
x_test =  tfp.to_multidim_array_train(test[to_use])
y_train =  tfp.to_multidim_array_y(train['y_' + to_predict])
y_val =  tfp.to_multidim_array_y(val['y_' + to_predict])
y_test =  tfp.to_multidim_array_y(test['y_' + to_predict])

TypeError: 'numpy.int64' object is not iterable

### CNN model 1 (x_len = 100)

In [None]:
a = time.time()

model = Sequential()
model.add(Conv1D(filters=256, kernel_size=4, activation='relu',  input_shape=(x_len, n_features)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(y_len, activation='relu'))
model.compile(optimizer='adam', loss='mse',  metrics=[tfp.nmse_metric, tfp.std_diff_metric])


print(model.summary())

model.fit(x_train, y_train, epochs=400, batch_size=32, verbose=1, validation_data=(x_val, y_val))

predictions = model.predict(x_test)

pred = pd.DataFrame({"index":test.index, "predictions":tfp.to_onedim_array(predictions)}).set_index('index')

tfp.visualize_predictions_training(prices_df, pred, x_len, y_len, problem_type)

b = time.time()
print("Training time = ", b-a)

In [None]:
future_pred = model.predict(tfp.data_for_prediction(to_predict, prices_df, x_len))
print(future_pred)
df_final = tfp.add_predictions_to_vizualization(future_pred, prices_df, to_predict)

### CNN model 2 (x_len = 30)

In [None]:
a = time.time()

model = Sequential()
model.add(Conv1D(filters=256, kernel_size=4, activation='relu',  input_shape=(x_len, n_features)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=32, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(y_len, activation='relu'))
model.compile(optimizer='adam', loss='mse',  metrics=[tfp.nmse_metric, tfp.std_diff_metric])

print(model.summary())

model.fit(x_train, y_train, epochs=600, batch_size=32, verbose=1, validation_data=(x_val, y_val))

predictions = model.predict(x_test)

pred = pd.DataFrame({"index":test.index, "predictions":tfp.to_onedim_array(predictions)}).set_index('index')

tfp.visualize_predictions_training(prices_df, pred, x_len, y_len, problem_type)

b = time.time()
print("Training time = ", b-a)

In [None]:
future_pred = model.predict(tfp.data_for_prediction(to_predict, prices_df, x_len))
print(future_pred)
df_final = tfp.add_predictions_to_vizualization(future_pred, prices_df, to_predict)

### LSTM Model (Put a small x_len (<30-35))

In [None]:
a = time.time()

model = Sequential()
model.add(LSTM(128, activation='relu', return_sequences=True, input_shape=(x_len, n_features)))
model.add(LSTM(128, activation='relu'))
model.add(Dense(y_len, activation='relu'))
model.compile(optimizer="adam", loss='mse',  metrics=[tfp.nmse_metric, tfp.std_diff_metric])


print(model.summary())

model.fit(x_train, y_train, epochs=400, batch_size=32, verbose=1, validation_data=(x_val, y_val))

predictions = model.predict(x_test)

pred = pd.DataFrame({"index":test.index, "predictions":tfp.to_onedim_array(predictions)}).set_index('index')

tfp.visualize_predictions_training(prices_df, pred, x_len, y_len, problem_type)

b = time.time()
print("Training time = ", b-a)

In [None]:
future_pred = model.predict(tfp.data_for_prediction(to_predict, prices_df, x_len))
print(future_pred)
final_df = tfp.add_predictions_to_vizualization(future_pred, prices_df, to_predict)

## Classification

### Building the training, valiadation and testing datasets

In [None]:
#Using a sliding window model to build input time series and the 1 or 0 for classification (1=up, 0=down)
df = tfp.create_dataset(prices_df, to_predict, type_prediction, x_len, y_len)

#Dividing in train/val/test our data
(train,val,test) = tfp.split_train_val_test(df, model=past_importance, nb_period=nb_period, seed=seed)

# Reshaping our data so that they can be properly fed to out networks
x_train = tfp.to_multidim_array_train(train[to_use])
x_val =  tfp.to_multidim_array_train(val[to_use])
x_test =  tfp.to_multidim_array_train(test[to_use])
y_train =  to_categorical(list(train['y_' + to_predict]))
y_val = to_categorical(list(val['y_' + to_predict]))
y_test =  to_categorical(list(test['y_' + to_predict]))

### CNN model 1 (x_len = 100)

In [None]:
a = time.time()

model = Sequential()
model.add(Conv1D(filters=256, kernel_size=4, activation='relu',  input_shape=(x_len, n_features)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer="adam", loss='categorical_crossentropy')


print(model.summary())

model.fit(x_train, y_train, epochs=400, batch_size=32, verbose=1, validation_data=(x_val, y_val))

predictions = model.predict(x_test)
pred = []
for i in range(len(predictions)):
    if predictions[i][0] < 0.5:
        pred.append(0)
    else:
        pred.append(1)
print("F1-score: ", f1_score(y_test, pred))

pred = pd.DataFrame({"index":test.index, "predictions":tfp.to_onedim_array(predictions)}).set_index('index')

b = time.time()
print("Training time = ", b-a)

In [None]:
future_pred = model.predict(tfp.data_for_prediction(to_predict, prices_df, x_len))
print(future_pred[0][0])

### CNN model 2 (x_len = 30)

In [None]:
a = time.time()

model = Sequential()
model.add(Conv1D(filters=256, kernel_size=4, activation='relu',  input_shape=(x_len, n_features)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=32, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer="adam", loss='binary_crossentropy',  metrics=['accuracy'])


print(model.summary())

model.fit(x_train, y_train, epochs=600, batch_size=32, verbose=1, validation_data=(x_val, y_val))

predictions = model.predict(x_test)
pred = []
for i in range(len(predictions)):
    if predictions[i][0] < 0.5:
        pred.append(0)
    else:
        pred.append(1)
print("F1-score: ", f1_score(y_test, pred))

pred = pd.DataFrame({"index":test.index, "predictions":tfp.to_onedim_array(predictions)}).set_index('index')

b = time.time()
print("Training time = ", b-a)

In [None]:
future_pred = model.predict(tfp.data_for_prediction(to_predict, prices_df, x_len))
print(future_pred[0][0])

### LSTM Model (Put a small x_len (<30-35))

In [None]:
a = time.time()

model = Sequential()
model.add(LSTM(128, activation='relu', return_sequences=True, input_shape=(x_len, n_features)))
model.add(LSTM(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer="adam", loss='binary_crossentropy',  metrics=['accuracy'])


print(model.summary())

model.fit(x_train, y_train, epochs=400, batch_size=32, verbose=1, validation_data=(x_val, y_val))

predictions = model.predict(x_test)
pred = []
for i in range(len(predictions)):
    if predictions[i][0] < 0.5:
        pred.append(0)
    else:
        pred.append(1)
print("F1-score: ", f1_score(y_test, pred))

pred = pd.DataFrame({"index":test.index, "predictions":tfp.to_onedim_array(predictions)}).set_index('index')

b = time.time()
print("Training time = ", b-a)

In [None]:
future_pred = model.predict(tfp.data_for_prediction(to_predict, prices_df, x_len))
print(future_pred[0][0])