## Setup

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [3]:
import plotly.express as px

In [2]:
apple_OLS = pd.read_csv(r'C:\Users\12436\Desktop\BrainStation\Capstone project\capstone-Leoyuyuyu\data\OLS_df.csv', index_col=0)
apple_OLS

Unnamed: 0,Adj Close,Volume,Return,Return_tomo
1980-12-13,0.099058,469033600.0,0.000000,0.000000
1980-12-14,0.099058,469033600.0,0.000000,-0.052171
1980-12-15,0.093890,175884800.0,-0.052171,-0.073398
1980-12-16,0.086998,105728000.0,-0.073398,0.024751
1980-12-17,0.089152,86441600.0,0.024751,0.028992
...,...,...,...,...
2024-05-18,189.869995,41282900.0,0.000000,0.000000
2024-05-19,189.869995,41282900.0,0.000000,0.006162
2024-05-20,191.039993,44361300.0,0.006162,0.006857
2024-05-21,192.350006,42309400.0,0.006857,-0.007538


##  Linear regression - without  the train-test split 

In [10]:
# view the return_tomo data

fig = px.line(data_frame=apple_OLS, x=apple_OLS.index, y ='Return_tomo')

fig.update_layout(
    title = 'Tomorrow return of the Apple stock from 1980 to 2024'
)

fig.update_xaxes(rangeslider_visible = True)

fig.show()


In [13]:
# Split the dataset 
X = apple_OLS.drop(columns=['Return_tomo'], axis = 1)
y = apple_OLS['Return_tomo']

In [14]:
# import the linear machine learning package 

from sklearn.linear_model import LinearRegression 

In [18]:
# Instantiate a model 

regression_model1 = LinearRegression()

# fit a model 

regression_fm1 = regression_model1.fit(X, y)



In [19]:
intercept_linear = regression_model1.intercept_
coeff_linear = regression_model1.coef_

print(f'The intercept is: {intercept_linear}')
print(f'The slopes are {coeff_linear}')

The intercept is: 0.0006462404309311076
The slopes are [-1.17064034e-06  4.12706320e-13 -3.39261342e-03]


In [28]:
regression_fm1.score(X,y) # same as the below

5.847335546660393e-05

In [20]:
# Try to visual the prediction here, we need to get the predicted values first:

regression_predicted = regression_fm1.predict(X)
regression_predicted


array([0.0008397 , 0.0008397 , 0.00089571, ..., 0.00042   , 0.00041527,
       0.00046264])

In [22]:
# check the performance of the model: same as the above 
from sklearn.metrics import r2_score

# Evaluate the model
print(f'The R2 score: {r2_score(y, regression_predicted)}')   # This model has extremely bad performance considering the R_2.

The R2 score: 5.847335546660393e-05


In [26]:
# Visualization of the data:


import plotly.graph_objects as go

# Create a figure
fig = go.Figure()

# Add the first line trace for 'Return_tomo'
fig.add_trace(go.Scatter(x=apple_OLS.index, y=apple_OLS['Return_tomo'], mode='lines', name='Return_tomo'))

# Add the second line trace for 'regression_predicted'
fig.add_trace(go.Scatter(x=apple_OLS.index, y=regression_predicted, mode='lines', name='Regression Predicted'))

# Update layout for better visualization
fig.update_layout(title='Apple OLS Return_tomo and Regression Predicted',
                  xaxis_title='Index',
                  yaxis_title='Values')

fig.update_xaxes(rangeslider_visible = True)

# Show the plot
fig.show()

# After viewing this plot we can see that why it is quite rediculous to use a regression model to fit a stock return! Because it is very likely that you will get a
# straight line and it is highly possible to be the mean of the stock return. This is the reason why the R-squared is very small!!! We need to avoid this stupid method. 


## Logit regression - Without train-test split

In [27]:
# Data loading 

apple_Logit = pd.read_csv(r'C:\Users\12436\Desktop\BrainStation\Capstone project\capstone-Leoyuyuyu\data\Logit_df.csv', index_col=0)
apple_Logit

Unnamed: 0,Adj Close,Volume,Target,Target_tomo
1980-12-13,0.099058,469033600.0,0,0
1980-12-14,0.099058,469033600.0,0,0
1980-12-15,0.093890,175884800.0,0,0
1980-12-16,0.086998,105728000.0,0,1
1980-12-17,0.089152,86441600.0,1,1
...,...,...,...,...
2024-05-18,189.869995,41282900.0,0,0
2024-05-19,189.869995,41282900.0,0,1
2024-05-20,191.039993,44361300.0,1,1
2024-05-21,192.350006,42309400.0,1,0


In [29]:
# Split X and y 

X_logit = apple_Logit.drop(columns=['Target_tomo'], axis=1)
y_logit = apple_Logit['Target_tomo']

In [30]:
# machine learning package input
from sklearn.linear_model import LogisticRegression

# Instantiate a model:

Logit_model = LogisticRegression()

# Fit a model 

Logit_fm1 = Logit_model.fit(X_logit,y_logit)

In [41]:
# View the result and performance of the model:

intercept_logit = Logit_fm1.intercept_
coeff_logit = Logit_fm1.coef_
coeff_logit  # What are these numbers? are they the coeffients or the odds ratio?

array([[-3.78857495e-16, -1.00761450e-09, -2.24289034e-18]])

In [48]:
# Qucik way to get the accuracy 
Logit_fm1.score(X_logit, y_logit) # This return the same value as the later code by importing the from lib sklearn.metrics with accuracy_score.

0.6567088926703221

This model need to be saved as we will use it in the finding part 

In [49]:
Logit_predicted = Logit_fm1.predict(X_logit)  # This will return the hard prediction

In [50]:
# Again we cannot visualize the result here as firsty we have used three features and also the index of date is not a feature of the model.
# So we check the accuracy of the model directly  
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_logit, Logit_predicted)
accuracy

0.6567088926703221

In [51]:
# Check the dummy model:
y_logit.value_counts(normalize=True)

# The result indicates that we have made a very inefficient model. We need to come up with something efficient in making a lift in the prediction accuracy. 

Target_tomo
0    0.656709
1    0.343291
Name: proportion, dtype: float64