In [102]:
# Import libs
from yahooquery import Ticker
import pandas as pd
import pandas_ta
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import plotly.express as px
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [103]:
# Ticker under which the company is listed on the stock exchange.
# example: https://finance.yahoo.com/quote/GGBR4.SA?p=GGBR4.SA&.tsrc=fin-srch
symbol = "GGBR4.SA"

# Open a Yahoo Finance query handle for that ticker
stock = Ticker(symbol)

# Pull 48 months of history (enough samples for the 21-period indicator
# computed later) and move the "symbol" index level into a regular column,
# leaving the date as the only index level
history = stock.history(period="48mo").reset_index(level=["symbol"])

# Rebuild the remaining index as a DatetimeIndex
history = history.set_index(pd.DatetimeIndex(history.index))

In [104]:
# Columns of the price history that are of interest to us
features = ['adjclose','low','high','volume','close']

# Work on an independent copy restricted to those columns, so the indicator
# column added below does not touch `history`
data = history.loc[:, features].copy()

# Technical analysis: 21-period EMA of the adjusted close, appended to the
# dataset as a new column (used later under the name "EMA_21")
data.ta.ema(close='adjclose', length=21, append=True)

# Discard every row that still contains a missing value
data = data.dropna()

In [105]:
print(data)

# Plot stock performance data: adjusted close over the full downloaded
# history (taken from `history`, not from the NaN-filtered `data`)
price_trace = go.Scatter(x=history.index, y=history['adjclose'])
fig = go.Figure(data=[price_trace])
fig.show()

             adjclose        low       high    volume      close     EMA_21
date                                                                       
2018-03-19  12.642448  14.920000  15.450000  10213100  15.030000  13.732244
2018-03-20  13.239661  15.020000  15.880000  16580500  15.740000  13.687463
2018-03-21  13.441537  15.670000  15.980000  10501000  15.980000  13.665106
2018-03-22  12.920025  15.190000  15.950000  15905400  15.360000  13.597372
2018-03-23  12.692916  15.000000  15.490000   9854700  15.090000  13.515149
...               ...        ...        ...       ...        ...        ...
2022-02-14  27.780001  27.459999  28.250000   8495200  27.780001  28.348887
2022-02-15  27.670000  27.020000  27.879999   7343800  27.670000  28.287170
2022-02-16  27.450001  27.320000  28.090000   9305700  27.450001  28.211064
2022-02-17  25.990000  25.959999  27.309999  20896900  25.990000  28.009149
2022-02-18  25.809999  25.650000  26.240000   3581400  25.809999  27.809226

[972 rows x

In [106]:
# Features evaluation
# The single input is the adjusted close; the model is fitted against several
# target columns at once. "EMA_21" is deliberately kept as the FIRST target:
# the plotting cells below read column 0 of the predictions as the EMA.
X = data[["adjclose"]]
Y = data[['EMA_21','low','high','volume','close']]

# Split data into testing and training sets (80% train / 20% test).
# A fixed random_state makes the shuffle reproducible, so the coefficients,
# scores and plots below are identical on every "Restart & Run All".
# train_size is omitted because it is simply the complement of test_size.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows. For a forecasting-style evaluation
# consider shuffle=False or a time-ordered split instead.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

# Train model
model = LinearRegression().fit(x_train, y_train)

# Try to predict unseen data
y_pred = model.predict(x_test)

In [107]:
print("Features importance:")
# The model was fitted against several target columns, so model.coef_ holds
# one row per target and one entry per input feature. Iterating over the rows
# directly would hand a one-element array to '%.5f', which relies on an
# implicit array-to-float conversion that NumPy has deprecated. Walk down to
# the scalar entries instead.
importance = np.atleast_2d(model.coef_)

# summarize the coefficients, labelling each row with the target column it
# belongs to (the index alone does not say which column is meant)
for i, (target, row) in enumerate(zip(y_train.columns, importance)):
    for v in row:
        print('Feature: %0d (%s), Score: %.5f' % (i, target, v))

print("--------------------------------------------------------------------------------")

# Cross validation: 5-fold R2 on the training portion only
print(f"Cross Validation: {cross_val_score(model, x_train, y_train, scoring='r2', cv=5)}")

print("--------------------------------------------------------------------------------")

# R2 on the held-out test set. With several target columns r2_score returns,
# by default, the uniform average of the per-target scores.
print(f"R2 Score: {r2_score(y_test, y_pred)}")

Features importance:
Feature: 0, Score: 0.96083
Feature: 1, Score: 1.00147
Feature: 2, Score: 1.03138
Feature: 3, Score: -19138.79072
Feature: 4, Score: 1.01537
--------------------------------------------------------------------------------
Cross Validation: [0.77958572 0.78220194 0.77781645 0.7799689  0.78312438]
--------------------------------------------------------------------------------
R2 Score: 0.7798602015710631


In [108]:
# Squeeze every split so that length-1 axes are dropped: the single-column
# x frames become plain Series, while objects with several columns come back
# unchanged.
df_list = list(
    map(
        lambda obj: obj.squeeze(),
        (x_train, x_test, y_train, y_test, y_pred),
    )
)

# Rebind the original names to their squeezed counterparts
x_train, x_test, y_train, y_test, y_pred = df_list

In [109]:
# Select the technical-analysis prediction to plot: column 0 of the
# predictions, i.e. the first target the model was trained on ("EMA_21"
# given the target order used in the training cell)
prediction = pd.DataFrame(y_pred.tolist()).iloc[:, 0]

# Plot linear regression result: train and test points as markers, the
# model's prediction for the test inputs as a line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x_train, y=y_train["EMA_21"], name='train', mode='markers'))
fig.add_trace(go.Scatter(x=x_test, y=y_test["EMA_21"], name='test', mode='markers'))
fig.add_trace(go.Scatter(x=x_test, y=prediction, name='prediction'))

fig.show()

In [110]:
# Real value (adjusted close of the test set) against the model's prediction,
# with a rolling trendline over a window of 5 points

fig = px.scatter(
    x=x_test,
    y=prediction,
    trendline="rolling",
    trendline_options={"window": 5},
    labels={"x": 'Adjusted Closing Price', "y": 'Prediction'},
    title=f"Linear Regression Stock Prediction: {symbol}",
)

fig.show()