In [1]:
# Import libs
from yahooquery import Ticker
import pandas as pd
import pandas_ta
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Yahoo Finance ticker to analyse
symbol = "GGBR4.SA"

# Build the yahooquery handle for that ticker
stock = Ticker(symbol)

# Download the price history for the requested 12-month period and move the
# "symbol" index level into a regular column, leaving the date in the index
history = stock.history(period="12mo").reset_index(level=["symbol"])

# Rebuild the remaining index as a DatetimeIndex
history = history.set_index(pd.DatetimeIndex(history.index))

In [21]:
# Columns of interest from the downloaded history
features = ['adjclose', 'low', 'high', 'volume', 'close']

# Work on an independent copy so the indicator column is not added to `history`
data = history[features].copy()

# Append a 10-period exponential moving average of the adjusted close
# (stored by pandas_ta as the "EMA_10" column)
data.ta.ema(close='adjclose', length=10, append=True)

# Drop every row that still contains a missing value
# (presumably the leading rows where the EMA is not yet defined)
data = data.dropna()

In [22]:
# Text preview of the prepared feature table
print(data)

# Line chart of the adjusted close over the whole downloaded history
price_trace = go.Scatter(x=history.index, y=history['adjclose'])
fig = go.Figure(data=[price_trace])
fig.show()


             adjclose        low       high    volume      close     EMA_10
date                                                                       
2021-03-03  23.966784  26.360001  27.580000  22429900  26.940001  23.084266
2021-03-04  23.940094  26.580000  27.790001  21293500  26.910000  23.239871
2021-03-05  24.571735  26.480000  27.969999  21959900  27.620001  23.482028
2021-03-08  23.984577  26.809999  28.430000  18089500  26.959999  23.573401
2021-03-09  24.313740  26.120001  28.110001  20635500  27.330000  23.708008
...               ...        ...        ...       ...        ...        ...
2022-02-14  27.780001  27.459999  28.250000   8495200  27.780001  28.451740
2022-02-15  27.670000  27.020000  27.879999   7343800  27.670000  28.309606
2022-02-16  27.450001  27.320000  28.090000   9305700  27.450001  28.153314
2022-02-17  25.990000  25.959999  27.309999  20896900  25.990000  27.759984
2022-02-18  26.110001  25.870001  26.240000    952100  26.110001  27.459987

[243 rows x

In [27]:
# Features evaluation
X = data[["adjclose"]]
Y = data[['EMA_10']]

# Split data into testing and training sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, train_size = 0.80)

# Train model
model = LinearRegression().fit(x_train, y_train)

y_pred = model.predict(x_test)

In [28]:
print("Features importance:")
# For a linear model the fitted coefficients serve as the importance scores.
# The model was fitted on a single-column DataFrame target, so coef_ is 2-D;
# flatten it so each item is a plain scalar that '%.5f' can format (formatting
# a 1-element array relies on a conversion NumPy has deprecated) and so the
# loop index matches the feature position.
importance = model.coef_.ravel()

# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

print("--------------------------------------------------------------------------------")

# Cross validation: 5-fold R2 computed on the training split only
print(f"Cross Validation: {cross_val_score(model, x_train, y_train, scoring='r2', cv=5)}")

print("--------------------------------------------------------------------------------")

# R2 of the predictions on the held-out test split
print(f"R2 Score: {r2_score(y_test, y_pred)}")

Features importance:
Feature: 0, Score: 0.89296
--------------------------------------------------------------------------------
Cross Validation: [0.84935339 0.80920567 0.82874307 0.89947987 0.80707506]
--------------------------------------------------------------------------------
R2 Score: 0.8057160711225254


In [30]:
# Collapse every single-column frame / array to one dimension, then rebind the
# original names to the squeezed versions
df_list = [
    part.squeeze()
    for part in (x_train, x_test, y_train, y_test, y_pred)
]
(x_train, x_test, y_train, y_test, y_pred) = df_list

In [31]:
# Plot linear regression result
fig = go.Figure([
    go.Scatter(x=x_train, y=y_train, name='train', mode='markers'),
    go.Scatter(x=x_test,  y=y_test, name='test', mode='markers'),
    go.Scatter(x=x_test,  y=y_pred,  name='prediction')
])

fig.show()