# Simple Linear Regression for stock using scikit-learn


In [1]:
import pandas as pd
import numpy as np
import math

import os
import sys
import platform

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import yfinance as yf
yf.pdr_override()

In [2]:
print("numpy: ", np.__version__)
print("pandas: ", pd.__version__)
print("sklearn: ", sklearn.__version__)
print("yfinance: ", yf.__version__)
print("os system: ", os.name)
print("Python Version:", sys.version)
print("Platform System: ", platform.system())

numpy:  1.19.5
pandas:  1.1.5
sklearn:  0.24.2
yfinance:  0.1.63
os system:  nt
Python Version: 3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]
Platform System:  Windows


In [3]:
stock = 'AAPL'
start = '2016-01-01' 
end = '2018-01-01'
data = yf.download(stock, start, end)
data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,25.6525,26.342501,25.5,26.3375,24.151493,270597600
2016-01-05,26.4375,26.4625,25.602501,25.6775,23.546276,223164000
2016-01-06,25.139999,25.592501,24.967501,25.174999,23.08548,273829600
2016-01-07,24.67,25.032499,24.1075,24.112499,22.111162,324377600
2016-01-08,24.637501,24.7775,24.190001,24.24,22.22809,283192000


In [4]:
df = data.reset_index()
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2016-01-04,25.6525,26.342501,25.5,26.3375,24.151493,270597600
1,2016-01-05,26.4375,26.4625,25.602501,25.6775,23.546276,223164000
2,2016-01-06,25.139999,25.592501,24.967501,25.174999,23.08548,273829600
3,2016-01-07,24.67,25.032499,24.1075,24.112499,22.111162,324377600
4,2016-01-08,24.637501,24.7775,24.190001,24.24,22.22809,283192000


In [5]:
X = df.drop(['Date','Close'], axis=1)
y = df['Adj Close']

In [6]:

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                   test_size=0.25, random_state=0)

In [7]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression()

In [8]:
intercept = regression_model.intercept_

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -1.637801005927031e-12


In [9]:
regression_model.score(X_test, y_test)

1.0

In [10]:
y_predict = regression_model.predict(X_test)

regression_model_mse = mean_squared_error(y_predict, y_test)

regression_model_mse

2.4917115820171587e-25

In [11]:
math.sqrt(regression_model_mse)

4.991704700818307e-13

In [12]:
# input the latest Open, High, Low, Close, Volume
# predicts the next day price
regression_model.predict([[167.81, 171.75, 165.19, 166.48, 37232900]])

array([166.48])