In [1]:
# 6 STOCK PORTFOLIO LINEAR REGRESSION
# Comparing correlation with linear regression by quantifying the direction and strength of the
# relationship between two variables, in this case stock prices.

# First: truly random (uncorrelated) variables, then correlated variables
# Visualise linear regression on a silicon-chip portfolio of stocks

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas_datareader as pdr
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib notebook
# Draw two independent standard-normal samples from a seeded generator so the
# figure looks the same on every Restart & Run All.
rng = np.random.default_rng(42)
X = rng.standard_normal(5000)
Y = rng.standard_normal(5000)

# Label the figure so it can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.set_title("Two independent random variables")
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.scatter(X, Y, alpha=.2)

# This shows two variables that are not correlated.

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1fbadfcc430>

In [2]:
# Looking at the correlation between stocks in a portfolio that focuses on silicon chips

# NOTE: this rebinds `pdr`, shadowing the earlier `import pandas_datareader as pdr`.
import pandas_datareader.data as pdr
import yfinance as yfin
# Works around a breaking change in the Yahoo Finance API.
# Presumably this patches pandas_datareader, so it must run before get_data_yahoo is called - verify.
yfin.pdr_override()

# Four chip stocks plus ^GSPC (the S&P 500), which is used as the market benchmark.
tickers = ['NVDA', 'INTC', 'AMD', 'TSM', '^GSPC']
start = dt.datetime(2019, 1, 1)
# Only a start date is passed; the end of the range is left to the library default.
data = pdr.get_data_yahoo(tickers, start)

[*********************100%***********************]  5 of 5 completed


In [3]:
# Preview the first rows of the download; the columns form a two-level (field, ticker) index.
data.head()

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Close,Close,Close,Close,Close,...,Open,Open,Open,Open,Open,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AMD,INTC,NVDA,TSM,^GSPC,AMD,INTC,NVDA,TSM,^GSPC,...,AMD,INTC,NVDA,TSM,^GSPC,AMD,INTC,NVDA,TSM,^GSPC
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-01-02,18.83,41.513222,33.799736,32.560837,2510.030029,18.83,47.080002,34.055,36.52,2510.030029,...,18.01,45.959999,32.66,36.200001,2476.959961,87148700,18774600,50875200,5273100,3733160000
2019-01-03,17.049999,39.229469,31.757652,30.635014,2447.889893,17.049999,44.490002,31.997499,34.360001,2447.889893,...,18.42,46.150002,33.447498,35.34,2491.919922,117277600,32267300,70555200,15998000,3858830000
2019-01-04,19.0,41.636669,33.792282,31.178888,2531.939941,19.0,47.220001,34.047501,34.970001,2531.939941,...,17.549999,45.84,32.735001,34.459999,2474.330078,111878600,35447300,58562000,14178200,4234140000
2019-01-07,20.57,41.830658,35.581272,31.410696,2549.689941,20.57,47.439999,35.849998,35.23,2549.689941,...,19.440001,47.099998,34.625,35.009998,2535.610107,107157000,22736800,70916000,6850800,4133120000
2019-01-08,20.75,42.095196,34.695465,31.152134,2574.409912,20.75,47.740002,34.9575,34.939999,2574.409912,...,21.190001,47.799999,36.672501,35.110001,2568.110107,121271000,22749200,78601600,11462600,4120060000


In [4]:
# Keep only the adjusted closing prices. Selecting the top-level label drops that
# level, leaving one column per ticker.
# NOTE: `data` is rebound here, so re-running this cell on its own raises KeyError.
data = data.loc[:, 'Adj Close']

In [5]:
# Preview again: after selecting 'Adj Close' there is one price column per ticker.
data.head()

Unnamed: 0_level_0,AMD,INTC,NVDA,TSM,^GSPC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-02,18.83,41.513222,33.799736,32.560837,2510.030029
2019-01-03,17.049999,39.229469,31.757652,30.635014,2447.889893
2019-01-04,19.0,41.636669,33.792282,31.178888,2531.939941
2019-01-07,20.57,41.830658,35.581272,31.410696,2549.689941
2019-01-08,20.75,42.095196,34.695465,31.152134,2574.409912


In [6]:
# Daily log return per ticker: ln(P_t / P_{t-1}), where shift moves each price down one row.
log_returns = np.log(data.div(data.shift(periods=1)))

In [8]:
# Inspect the log returns. The first row is NaN because shift() leaves it without a previous price.
log_returns

Unnamed: 0_level_0,AMD,INTC,NVDA,TSM,^GSPC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-02,,,,,
2019-01-03,-0.099301,-0.056584,-0.062319,-0.060967,-0.025068
2019-01-04,0.108289,0.059553,0.062099,0.017598,0.033759
2019-01-07,0.079395,0.004648,0.051587,0.007407,0.006986
2019-01-08,0.008713,0.006304,-0.025210,-0.008266,0.009649
...,...,...,...,...,...
2023-04-26,0.025216,0.005869,0.026883,-0.000486,-0.003849
2023-04-27,0.017303,0.027501,0.009967,0.017117,0.019377
2023-04-28,0.021832,0.039401,0.019027,0.007501,0.008219
2023-05-01,0.003574,-0.024773,0.040988,-0.003803,-0.000386


In [11]:
# Reshape the return series into the layout the regression expects
# AMD vs S&P 500

ticker_a = 'AMD'
ticker_b = '^GSPC'

# LinearRegression.fit expects a 2-D feature array, so the 1-D return series is converted to a
# NumPy array and reshaped into a single column with reshape(-1, 1).
# The first observation is dropped because it is NaN (it has no previous price), and the
# regression cannot be fitted on missing values.
X = log_returns[ticker_a].to_numpy()[1:].reshape(-1, 1)

In [12]:
# Inspect X: the AMD log returns as a 2-D column vector (one row per date).
X

array([[-0.09930118],
       [ 0.10828882],
       [ 0.07939471],
       ...,
       [ 0.02183221],
       [ 0.00357422],
       [ 0.0024499 ]])

In [14]:
# Response variable: the ticker_b log returns as a single column, with the leading NaN row skipped.
Y = log_returns[ticker_b].to_numpy()[1:].reshape(-1, 1)

In [15]:
# Inspect Y: the ^GSPC log returns as a 2-D column vector (one row per date).
Y

array([[-0.02506833],
       [ 0.0337594 ],
       [ 0.00698598],
       ...,
       [ 0.00821933],
       [-0.00038618],
       [-0.01165391]])

In [22]:
def linear_regression(ticker_a, ticker_b):
    """Fit and plot a least-squares line of one ticker's log returns on another's.

    Reads the module-level ``log_returns`` DataFrame, regresses the ``ticker_b``
    column (y) on the ``ticker_a`` column (x) with ``LinearRegression``, and draws
    a scatter plot of the paired returns with the fitted line in red. The
    intercept (alpha) and slope (beta) are shown in the figure title.

    Parameters
    ----------
    ticker_a : str
        Column of ``log_returns`` used as the explanatory variable (x-axis).
    ticker_b : str
        Column of ``log_returns`` used as the response variable (y-axis).

    Returns
    -------
    None
        The function is called only for its plotting side effect.

    Raises
    ------
    KeyError
        If either ticker is not a column of ``log_returns``.
    """
    returns_a = log_returns[ticker_a]
    returns_b = log_returns[ticker_b]

    # The regression cannot be fitted on NaN. Keep only the dates where both
    # series have a value: this drops the leading NaN row produced by the
    # return calculation and also any later gap in either ticker.
    paired = returns_a.notna() & returns_b.notna()

    # fit() expects 2-D arrays, so each series becomes a single column.
    X = returns_a[paired].to_numpy().reshape(-1, 1)
    Y = returns_b[paired].to_numpy().reshape(-1, 1)

    lin_regr = LinearRegression()
    lin_regr.fit(X, Y)

    Y_pred = lin_regr.predict(X)

    # Y is 2-D, so the intercept is a length-1 array and the coefficients a 1x1 matrix.
    alpha = lin_regr.intercept_[0]
    beta = lin_regr.coef_[0, 0]

    fig, ax = plt.subplots()
    ax.set_title(f"Alpha: {round(alpha, 5)}, Beta: {round(beta, 3)}")
    ax.set_xlabel(f"{ticker_a} log return")
    ax.set_ylabel(f"{ticker_b} log return")
    ax.scatter(X, Y)
    ax.plot(X, Y_pred, c='r')

# Called below to show the relationship between two tickers, starting with AMD and the S&P 500

In [23]:
linear_regression("AMD", "^GSPC")

# An alpha of 6.8 is the y-intercept of the fitted line.
# NOTE(review): the title rounds alpha to 5 decimals, so the plotted value is probably
# 6.8e-05 rather than 6.8 - confirm against the figure.
# A beta of 0.27 means that when the AMD log return rises by 1 unit, the S&P 500 log return
# rises by 0.27 units on average.
# The beta value is the slope (the gradient) of the fitted line.

# The higher the correlation, the closer the scatter points lie to the line.

<IPython.core.display.Javascript object>

In [24]:
# Repeat the comparison against the S&P 500 for the rest of the portfolio
# (tickers = ['NVDA', 'INTC', 'AMD', 'TSM', '^GSPC'])

linear_regression("NVDA", "^GSPC")

<IPython.core.display.Javascript object>

In [25]:
# INTC log returns (x) against S&P 500 log returns (y)
linear_regression("INTC", "^GSPC")

<IPython.core.display.Javascript object>

In [26]:
# TSM log returns (x) against S&P 500 log returns (y)
linear_regression("TSM", "^GSPC")

<IPython.core.display.Javascript object>

In [28]:
# Comparing two individual stocks: AMD log returns (x) against NVDA log returns (y)

linear_regression("AMD", "NVDA")

<IPython.core.display.Javascript object>

In [29]:
# AMD log returns (x) against TSM log returns (y)
linear_regression("AMD", "TSM")

<IPython.core.display.Javascript object>