# Project: Financial time series forecasting - Hamed Ahmadinia

### Grade 1 -  Implement a complete process for forecasting a single stock.

In [1]:
# First upgrade the environment.
# https://pypi.org/project/yfinance
import pip
from subprocess import run
# add what you will need
modules =[
#     'pandas_datareader',
#     'yfinance',
    'pandas_market_calendars',
    'plotly', 
    'numpy',
    'sklearn',
    'pandas'
]
proc = run(f'pip install {" ".join(modules)} --upgrade --no-input', 
       shell=True, 
       text=True, 
       capture_output=True, 
       timeout=120) #a couple of minutes
print(proc.stderr)




In [2]:
## Run this if you need to check your modules
# import pip
# from pip._internal.utils.misc import get_installed_distributions
# pkgs = ''.join(str(get_installed_distributions(local_only=True)))

# with open("modules.txt", "a") as file_object:
#     for p in (get_installed_distributions(local_only=True)):
#         file_object.write(str(p)+'\n')
#         print(str(p))
# file_object.close()

In [3]:
import pandas as pd
pd.options.display.max_rows = 3000
pd.set_option('display.width', 1200)
from pathlib import Path
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter, FuncFormatter, StrMethodFormatter
%matplotlib inline

import plotly as ply
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import sklearn
from sklearn.preprocessing import StandardScaler, Normalizer, PolynomialFeatures
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline

from functools import reduce
from operator import mul
from pprint import PrettyPrinter
pprint = PrettyPrinter().pprint

In [4]:
# step 1: reading EURUSD data set which is a CSV file with data including Date,Timestamp,Open,High,Low,Close,Volume
data = pd.read_csv('EURUSD1m.csv')

In [5]:
# step 2: Creading datetime column from EURUSD data frame. 
# step 3: converting strings to datetime which is readable and useable in pandas.
data = data.set_index(pd.to_datetime(data.Date.astype(str)+ " "+data.Timestamp.astype(str)))

# step 4: Making pandas index according to converted datetime strings.
data = data.drop(['Date', 'Timestamp'], axis=1)

# step 5: checking the first 10 rows for the object based on position.
data.head(10)

Unnamed: 0,Open,High,Low,Close,Volume
2010-01-01 11:00:00,1.43327,1.43335,1.43319,1.43335,56.299999
2010-01-01 11:01:00,1.43333,1.43333,1.43318,1.43327,82.100001
2010-01-01 11:02:00,1.43328,1.43333,1.43319,1.43333,86.9
2010-01-01 11:03:00,1.43325,1.43333,1.43319,1.43326,68.899999
2010-01-01 11:04:00,1.43326,1.43333,1.43319,1.43319,45.3
2010-01-01 11:05:00,1.43325,1.43331,1.43318,1.43318,64.8
2010-01-01 11:06:00,1.43328,1.43333,1.43319,1.43326,53.3
2010-01-01 11:07:00,1.43333,1.43343,1.43328,1.43343,40.6
2010-01-01 11:08:00,1.4334,1.4334,1.4331,1.4331,56.4
2010-01-01 11:09:00,1.43317,1.43323,1.43317,1.43323,64.8


In [6]:
# Step 6: Creating subsample data to one day timesteps with using mean 
# step 7: Analyze and droping Rows/Columns with Null values in different ways by using dropna()
df = data.resample('1D').mean().dropna()

# step 8: Create a Label column for my forecast, by shifting the Close value 1 step. We will predict one day ahead
df["label"] = df.Close.shift(-1)

# Step 9: Droping Rows/Columns with Null values in different ways in our data frame
df = df.dropna()

In [7]:
# step 10: defining pd.Series "X" as a data frame by droping specified labels "lable" with axis='columns'.
X = df.drop("label", axis=1)

# step 11: defining pd.Series "y" by index/slice a single-column DataFrame using variable "lable".
y = df[["label"]]

#Step 12: Spliting data into 80/20 (train/test)
split = int(len(df)*.8)

# Step 13: Spliting data farams in two sets including two sets of tranings 80% for our dataframes "A", and "B".
X_train = X[:split]
Y_train = y[:split]

# Step 14: Here we define two sets of testing with trashhold as 20% for our dataframes "A", and "B".
X_test = X[split:]
Y_test = y[split:]

In [8]:
## Normalize or standardize in order to prevent information leakage to the test subset.
# Step 16: Defining Standardscalers to fit to data, then transform it in traning sets which we have as dataframe A and B.
scaler_x_train = StandardScaler()
X_train = pd.DataFrame(scaler_x_train.fit_transform(X_train), X_train.index, X_train.columns)

scaler_y_train = StandardScaler()
Y_train = pd.DataFrame(scaler_y_train.fit_transform(Y_train), Y_train.index, Y_train.columns)

# Step 17: Defining Standardscalers to fit to data, then transform it in testing sets which we have as dataframe A and B.
scaler_x_test = StandardScaler()
X_test = pd.DataFrame(scaler_x_test.fit_transform(X_test), X_test.index, X_test.columns)

scaler_y_test = StandardScaler()
Y_test = pd.DataFrame(scaler_y_test.fit_transform(Y_test), Y_test.index, Y_test.columns)

In [9]:
# defining Algorithmic Trading with Williams %R in Python
# Step 18: Defining a data framework for Williams based on our orginal dataframework in this project

def william(transactions, n=14):
    highh = transactions.High.rolling(n).max()
    lowl = transactions.Low.rolling(n).min()
    return 100*(highh - transactions.Close)/(highh - lowl)

# highh=Highest price in the lookback
# lowl=Lowest price in the lookback
# n = period, typically 14 days(Qiu and Yu, 2016).

# step 19: setting traning and testing sets according Williams %R roles.
X_train["william"] = william(X_train)
X_test["william"] = william(X_test)

#step 20: removing unrelivant collomns from our traning and testing sets as Williams only use closing price. 
x_train = X_train.drop(["High", "Low", "Open", "Volume"], axis=1)
x_test = X_test.drop(["High", "Low", "Open", "Volume"], axis=1)

#Step 21: removing rows without a valied data from data frame "A" and related rows in datafarame "B"
test_index = x_test.dropna().index

x_train = X_train.dropna()
y_train = Y_train.loc[x_train.index]

x_test = X_test.dropna()
y_test = Y_test.loc[x_test.index]

# reference: Qiu, M., & Song, Y. (2016). Predicting the Direction of Stock Market Index Movement Using an Optimized Artificial Neural Network Model. PloS one, 11(5), e0155133. https://doi.org/10.1371/journal.pone.0155133

In [10]:
# Step 22: fiting a linner regression to the training data
linear = LinearRegression()
linear.fit(x_train, y_train)

# step 23: evaluating predicting train and testing set and showing the dataframe work for the first 10 rows
pred_train = linear.predict(x_train)
pred_test = linear.predict(x_test)
pred_df = pd.DataFrame(pred_test, index=test_index, columns=['prediction'])
pred_df.head(10)

Unnamed: 0,prediction
2018-01-15,1.82316
2018-01-16,1.795266
2018-01-17,1.720848
2018-01-18,1.760362
2018-01-19,1.785974
2018-01-21,1.713888
2018-01-22,1.784803
2018-01-23,1.871892
2018-01-24,2.098352
2018-01-25,2.235305


In [11]:
# step 24: measuring classification performance
from sklearn.metrics import r2_score, mean_squared_error

# step 25:calculating Coefficients of regression.
print('Coefficients of regression:', linear.coef_)

# Step 26: calculating Mean Squared Error on both the training data set and the test
print('Mean Squared Error for training set: %.4f'% mean_squared_error(y_train, pred_train))
print('Mean Squared Error for testing set: %.4f'% mean_squared_error(y_test, pred_test))

# Step 27: calculating the R² error on both the training data set and the test
print('R2 for training set: %.4f'% r2_score(y_train, pred_train))
print('R2 for testing set: %.4f'% r2_score(y_test, pred_test))

Coefficients of regression: [[-3.39730102e+02  6.50794608e+01  6.42889892e+01  2.11360564e+02
   3.79210306e-04 -6.32952000e-05]]
Mean Squared Error for training set: 0.0017
Mean Squared Error for testing set: 0.0039
R2 for training set: 0.9983
R2 for testing set: 0.9961


In [12]:
# step 28: fit a linear regression with williams feautre importance which is Close price 
linear_ = LinearRegression()
linear_.fit(x_train.drop('william', axis=1), y_train)

# step 29: Predection traning and Predection test with william feature 
pred_train = linear_.predict(x_train.drop('william', axis=1))
pred_test = linear_.predict(x_test.drop('william', axis=1))

# step 30:calculating Coefficients of regression for Williams
print('Coefficients - Williams:', linear_.coef_)

# Step 31: calculating Mean Squared Error on both the training data set and the test
print('Mean Squared Error for training set: %.4f'% mean_squared_error(y_train, pred_train))
print('Mean Squared Error for testing set: %.4f'% mean_squared_error(y_test, pred_test))

# Step 32: calculating the R² error on both the training data set and the test
print('R2 for training set: %.4f'% r2_score(y_train, pred_train))
print('R2 for testing set: %.4f'% r2_score(y_test, pred_test))

print("# Williams coefficincy comparing to coefficincy of regression model is showing that applying this model is not providing a better prediction about future market price. It might be related to onely taking one factor as closing price into consideration #")

Coefficients - Williams: [[-3.46908493e+02  6.55858072e+01  6.51657174e+01  2.17156061e+02
   4.54367399e-04]]
Mean Squared Error for training set: 0.0017
Mean Squared Error for testing set: 0.0038
R2 for training set: 0.9983
R2 for testing set: 0.9961
# Williams coefficincy comparing to coefficincy of regression model is showing that applying this model is not providing a better prediction about future market price. It might be related to onely taking one factor as closing price into consideration #


### Grade 2 - Illustrate data using plotly (or other) library.

In [13]:
# step 1: defining Slow Stochastics as a momentum indicator used to signal trend reversals in the stock market.
def stochastics(df, low="Low", high="High", close="Close", k_period=14, d_period=3, append=True):
    low_min  = df[low].rolling( window = k_period ).min()
    high_max = df[high].rolling( window = k_period ).max()

    k_fast = 100 * (df[close] - low_min)/(high_max - low_min)
    d_fast = k_fast.rolling(window = d_period).mean()
    return d_fast.rolling(window = d_period).mean()

# step 2: setting traning and testing sets according slow_stochastic.
X_train["slow_stochastic"] = stochastics(X_train)
X_test["slow_stochastic"] = stochastics(X_test)

# step 3: inversing standard scaler which we defined before in order to draw a plot based on Slow Stochastics
x_test = pd.DataFrame(scaler_x_test.inverse_transform(X_test.iloc[:,:-2]), X_test.index, X_test.iloc[:,:-2].columns)
y_test = pd.DataFrame(scaler_y_test.inverse_transform(Y_test), Y_test.index, Y_test.columns)

In [22]:
# step 4: Illustrating data using plotly for Slow Stochastics
fig = make_subplots(rows=2, cols=1)

fig.append_trace(
    go.Candlestick(
        x=x_test.index,
        open=x_test['Open'], high=x_test['High'],
        low=x_test['Low'], close=x_test['Close'],
        name='Open-High-Low-Close (OHLC)'
    ), row=1, col=1
)

fig.append_trace(
    go.Scatter(
        x=x_test.index,
        y=y_test['label'],
        line=dict(color='brown', width=1),
        name='label',
    ), row=1, col=1  # <------------ upper chart
)

fig.append_trace(
    go.Scatter(
        x=pred_df.index,
        y=scaler_y_test.inverse_transform(pred_df["prediction"]),
        line=dict(color='gray', width=1),
        name='Prediction',
    ), row=1, col=1  # <------------ upper chart
)

fig.append_trace(
    go.Scatter(
        x=X_test.index,
        y=-X_test.william,
        line=dict(color='yellow', width=2),
        name='Low %R',
    ), row=2, col=1
)


fig.append_trace(
    go.Scatter(
        x=X_test.index,
        y=X_test.slow_stochastic,
        line=dict(color='olive', width=2),
        name='Slow %D',
    ), row=2, col=1
)


layout = go.Layout(
    plot_bgcolor='black',
    # Font Families
#     font_family='Tahoma',
    font_color='#000000',
#     font_size=30,
    xaxis=dict(
        rangeslider=dict(
            visible=False
        )
    )
)
fig.update_layout(layout)

fig.show()

# reference: https://www.alpharithms.com/stochastic-oscillator-in-python-483214/

### Grade 3 : Calculate additional feature RSI 

In [15]:
# Step 1: defining Relative Strength Index
def rsi(close):
    dif = close.diff() # Returns a pd.Series with the relative strength index.
    up = dif.clip(lower=0) # Make a series for higher closes 
    down = -1*dif.clip(upper=0) #  Make a series for lower closes
    ema_up = up.ewm(com=14, adjust=False).mean() # Use exponential moving average
    ema_down = down.ewm(com=14, adjust=False).mean() # Use simple moving average
    rs = ema_up/ema_down 
    return 100 - (100/(1+rs))
# I got hints from https://www.roelpeters.be/many-ways-to-calculate-the-rsi-in-python-pandas/

In [16]:
# step 2: setting traning and testing sets according Relative Strength Index roles.
X_test["rsi"] = rsi(X_test.Close)
X_train["rsi"] = rsi(X_train.Close)

In [17]:
# Step 3: Defining elastic net with 0.5 ratio l1_ratio
elastic = ElasticNet(l1_ratio=0.5)
elastic.fit(X_train[17:][["Close", "rsi", "slow_stochastic", "william"]], Y_train[17:])

# I have considered 50 percent contribution of each penalty to the loss function for comparission

ElasticNet()

In [18]:
# Step 4: predicting test data with elastic net
pred_elastic = elastic.predict(X_test[17:][["Close", "rsi", "slow_stochastic", "william"]])

# step 5: calculating Coefficients of regression for Relative Strength Index
print('Coefficients - Elastic net:', elastic.coef_)

# Step 6: calculating Mean Squared Error for testing set
print('Mean Squared Error for testing set: %.4f'% mean_squared_error(Y_test[17:], pred_elastic))

# Step 7: calculating the R² error for training data set
print('R2 for testing set: %.4f'% r2_score(Y_test[17:], pred_elastic))

print("# Elastic Net coefficincy comparing to coefficincy of regression model is showing that applying this model is not providing a better prediction about future market price #")

Coefficients - Elastic net: [ 0.3256702   0.          0.00072447 -0.00041053]
Mean Squared Error for testing set: 0.4376
R2 for testing set: 0.5477
# Elastic Net coefficincy comparing to coefficincy of regression model is showing that applying this model is not providing a better prediction about future market price #


In [21]:
# step 8: Illustrating data using plotly for elastic net
fig = make_subplots(rows=3, cols=1)

fig.append_trace(
    go.Candlestick(
        x=x_test.index,
        open=x_test['Open'], high=x_test['High'],
        low=x_test['Low'], close=x_test['Close'],
        name='Open-High-Low-Close (OHLC)'
    ), row=1, col=1
)

fig.append_trace(
    go.Scatter(
        x=x_test.index,
        y=y_test['label'],
        line=dict(color='purple', width=1),
        name='label',
    ), row=1, col=1  # <------------ upper chart
)

fig.append_trace(
    go.Scatter(
        x=pred_df.index,
        y=scaler_y_test.inverse_transform(pred_df["prediction"]),
        line=dict(color='gray', width=1),
        name='prediction regression',
    ), row=1, col=1  # <------------ upper chart
)

fig.append_trace(
    go.Scatter(
        x=X_test[17:].index,
        y=scaler_y_test.inverse_transform(pred_elastic),
        line=dict(color='yellow', width=1),
        name='prediction elastic',
    ), row=1, col=1  # <------------ upper chart
)

fig.append_trace(
    go.Scatter(
        x=X_test.index,
        y=-X_test.william,
        line=dict(color='red', width=2),
        name='LW%R',
    ), row=2, col=1  # <------------ middle chart
)


fig.append_trace(
    go.Scatter(
        x=X_test.index,
        y=X_test.slow_stochastic,
        line=dict(color='pink', width=2),
        name='SLOW%D',
    ), row=2, col=1  # <------------ middle chart
)

fig.append_trace(
    go.Scatter(
        x=X_test.index,
        y=X_test.rsi,
        line=dict(color='olive', width=2),
        name='RSI',
    ), row=3, col=1  # <------------ lower chart
)

layout = go.Layout(
    plot_bgcolor='black',
    # Font Families
#     font_family='Tahoma',
    font_color='#000000',
#     font_size=30,
    xaxis=dict(
        rangeslider=dict(
            visible=False
        )
    )
)

fig.update_layout(layout)

fig.show()