In [25]:
import pandas as pd
import numpy as np
import FinanceDataReader as fdr

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# To print multiple outputs in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Display data to three deciaml places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# To print all columns
pd.set_option('max_columns', None)

## data used 

In [26]:
code_stock_name = pd.Series(np.array(['NETFLIX', 'Amazon', 'Apple', 'Microsoft', 'NVIDIA', 'Tesla', 
                                     'Autodesk', 'NIKE', 'Pfizer', 'Disney']))
use_stock = pd.DataFrame(
    {
        'stock_name': code_stock_name
    }
)
use_stock.index = np.arange(1,len(use_stock) + 1) # start row index from 1 instead of zero. 

use_stock

Unnamed: 0,stock_name
1,NETFLIX
2,Amazon
3,Apple
4,Microsoft
5,NVIDIA
6,Tesla
7,Autodesk
8,NIKE
9,Pfizer
10,Disney


## load datasets
* Getting the NASDAQ Symbols in [here](https://finance.yahoo.com/)
* Date : '2015-01-01' to '2021-11-25'

In [27]:
NETFLIX_df = fdr.DataReader("NFLX", '2015-01-01', '2021-11-25')
NETFLIX_df

# 전체 주석
# AMAZON_df = fdr.DataReader("AMZN", '2015-01-01', '2021-11-25')
# APPLE_DF = fdr.DataReader("AAPL", '2015-01-01', '2021-11-25')
# MICROSOFT_df = fdr.DataReader("MSFT", '2015-01-01', '2021-11-25')
# NVIDIA_df = fdr.DataReader("NVDA", '2015-01-01', '2021-11-25')
# TESLA_df = fdr.DataReader("TSLA", '2015-01-01', '2021-11-25')
# AUTODESK_DF = fdr.DataReader("ADSK", '2015-01-01', '2021-11-25')
# NIKE_DF = fdr.DataReader("NKE", '2015-01-01', '2021-11-25')
# PFIZER_DF = fdr.DataReader("PFE", '2015-01-01', '2021-11-25')
# DISNEY_DF = fdr.DataReader("DIS", '2015-01-01', '2021-11-25')

# Check data's row and column numbers
NETFLIX_df.shape

# 전체 주석
# AMAZON_df.shape
# APPLE_DF.shape
# MICROSOFT_df.shape
# NVIDIA_df.shape
# TESLA_dF.shape
# AUTODESK_DF.shape
# NIKE_DF.shape
# PFIZER_DF.shape
# DISNEY_DF.shape

Unnamed: 0_level_0,Close,Open,High,Low,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-02,49.850,49.150,50.330,48.730,13470000.000,0.021
2015-01-05,47.310,49.260,49.260,47.150,18170000.000,-0.051
2015-01-06,46.500,47.350,47.640,45.660,16040000.000,-0.017
2015-01-07,46.740,47.350,47.420,46.270,9850000.000,0.005
2015-01-08,47.780,47.120,47.840,46.480,9620000.000,0.022
...,...,...,...,...,...,...
2021-11-18,682.020,691.610,691.740,679.730,2010000.000,-0.014
2021-11-19,678.800,692.350,694.160,675.000,2610000.000,-0.005
2021-11-22,659.200,676.020,679.480,656.470,2720000.000,-0.029
2021-11-23,654.060,658.180,666.430,646.050,2310000.000,-0.008


(1739, 6)

In [28]:
# The shape is not the same because of several reasons.
# First, Suspended trading occurs because of serious concerns about a company's assets, 
# operations, or other financial information.
# Second, a Trading halt is implemented to correct an order imbalance,
# Third, If the provider omits a specific day's data
# etc..
# So we must match the shape because of the imbalance.

# In this project, I'll take df1(NFLX) as a criteria.

# 전체 주석
# # using 'reindex' to make the same shape
# APPLE_df = APPLE_DF.reindex(NETFLIX_df.index)
# AUTODESK_df = AUTODESK_DF.reindex(NETFLIX_df.index)
# NIKE_df = NIKE_DF.reindex(NETFLIX_df.index)
# PFIZER_df = PFIZER_DF.reindex(NETFLIX_df.index)
# DISNEY_df = DISNEY_DF.reindex(NETFLIX_df.index)

# # Fill the data 
# # pandas.DataFrame.fillna : Fill NA/NaN values using the specified method.
# # "ffill" : forward fill. Propagate last valid observation forward to next valid backfil
# APPLE_df = APPLE_df.fillna(method="ffill")
# AUTODESK_df = AUTODESK_df.fillna(method="ffill").head()
# NIKE_df = NIKE_df.fillna(method="ffill").head()
# PFIZER_df = PFIZER_df.fillna(method="ffill").head()
# DISNEY_df = DISNEY_df.fillna(method="ffill").head()
# 여기까지

## Check if dataframe has null values

In [29]:
NETFLIX_df.isnull().values.any()
# AMAZON_df.isnull().values.any()
# APPLE_df.isnull().values.any()
# MICROSOFT_df.isnull().values.any()
# NVIDIA_df.isnull().values.any()
# TESLA_df.isnull().values.any()
# AUTODESK_df.isnull().values.any()
# NIKE_df.isnull().values.any()
# PFIZER_df.isnull().values.any()
# DISNEY_df.isnull().values.any()

False

In [30]:
high_prices = NETFLIX_df['High'].values
low_prices = NETFLIX_df['Low'].values
mid_prices = (high_prices + low_prices) / 2

mid_prices.shape

(1739,)

In [33]:
x_train, x_test, y_train, y_test = train_test_split(NETFLIX_df, mid_prices, test_size = 0.2, random_state = 5)

x_train.shape
x_test.shape
y_train.shape
y_test.shape

(1391, 6)

(348, 6)

(1391,)

(348,)

In [32]:
model = LinearRegression()
model.fit(x_train, y_train)

score = model.score(x_test, y_test)
score

LinearRegression()

1.0