In [4]:
import yfinance as yf
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
import datetime as dt
from statsmodels.tsa.stattools import adfuller
import pandas_datareader as web
import quantstats as qs
from sklearn.decomposition import PCA
import seaborn as sns

## Section 1: Preparing Data

In [7]:
mixed_stocks = [
    'NEE', 'ED', 'PEG', 'XEL', 'D', 'EXC', 'SRE',  'SO', 'DUK', 'JKS',
    'MSFT', 'SHOP', 'MSTR', 'CRM', 'AMD', 'DIOD', 'INTC', 'MRVL', 'ADI', 'AVGO', 'QCOM', 'NVDA',
    'ATOM', 'AMKR', 'VLO', 'MPC', 'XOM', 'WMB', 'EQT', 'SHEL', 'BP', 'TTE', 'OXY', 'CVX',
    'AMZN', 'IAC', 'GOOGL', 'META', 'IBM', 'NOW', 'DXC',  'WIT', 'TCS.NS', 'CTSH', 'ACN',
    'CVS', 'MRK', 'JNJ', 'UNH', 'NVO', 'LLY', 'KR', 'COST',  'TGT',  'WMT',
    'NFLX', 'DIS', 'TTWO', 'EA', 'ROKU', 'JBL', 'GLW', 'TEL', 'APH', 'CLS', 'ELTK', 'LYTS',
    'PFE', 'ABBV', 'BMY', 'AMGN', 'PYPL', 'ALLY', 'SYF', 'DFS', 'COF', 'AXP', 'MA', 'V',
    'AAPL', 'SONY',  'GPRO', 'LPL',  'IBKR', 'MARA', 'LPLA', 'RJF',  'MS',
    'SCHW', 'GS', 'JPM', 'BAC', 'TSLA',   'GM'
]
print(f'You have {len(mixed_stocks)} mixed _sector stocks in your portfolio:')

def get_stock_data(stocks, start_date, end_date):

  """Fetches historical stock data for a list of stocks."""
  data = yf.download(stocks, start=start_date, end=end_date)
  return data

covid_start_date = dt.datetime(2019, 12, 31) # approximate date- adjust as needed


start_date =  covid_start_date - dt.timedelta(days=2*365)
end_date =  dt.datetime.today()



stock_data = get_stock_data(mixed_stocks, start_date, end_date)
returns = stock_data.pct_change().dropna()

You have 94 mixed _sector stocks in your portfolio:


[*********************100%***********************]  94 of 94 completed


## Section 2: Preparing for PCA

In [8]:
# Calculate daily returns for each portfolio

def calculate_portfolio_returns(weights, returns):
    portfolio_returns = returns.dot(weights)
    cumulative_returns = (1 + portfolio_returns).cumprod() - 1
    return cumulative_returns

# Split the data into pre-COVID and post-COVID periods

returns_pre_covid = returns[returns.index < covid_start_date]
returns_post_covid = returns[returns.index >= covid_start_date]

def create_component_variance_table(explained_variance):
  """Creates a DataFrame with PCA components and cumulative variance."""
  cumulative_variance = np.cumsum(explained_variance)
  component_variance_df = pd.DataFrame({
      'Component': range(1, len(explained_variance) + 1),
      'Explained Variance': explained_variance,
      'Cumulative Explained Variance': cumulative_variance

  })
  # Concatenate the first 17 and the last 2 entries
  result_df = pd.concat([component_variance_df.head(7), component_variance_df.tail(2)])
  return result_df


def kaiser_rule_with_cutoff(explained_variance, cutoff=0.8):
  """Applies Kaiser's rule with a specified variance cutoff."""
  eigenvalues = explained_variance * len(explained_variance)
  kaiser_components = np.sum(eigenvalues > 1)
  # Calculate cumulative variance
  cumulative_variance = np.cumsum(explained_variance)
  cutoff_components = np.argmax(cumulative_variance >= cutoff) + 1
  return min(kaiser_components, cutoff_components)

## Section 3: Proper PCA

In [9]:
pca = PCA()
principalComponents = pca.fit_transform(returns)

explained_variance = pca.explained_variance_ratio_
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(explained_variance) + 1), explained_variance)
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot')
plt.show()

ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
num_components = kaiser_rule_with_cutoff(explained_variance, cutoff=0.8)
print(f"Number of components based on Kaiser's rule with 80% cutoff: {num_components}")
component_variance_table = create_component_variance_table(explained_variance)
print("Entire period PCA :\n")
print(component_variance_table)

pre_principalComponents = pca.fit_transform(returns_pre_covid)
pre_explained_variance = pca.explained_variance_ratio_
pre_component_variance_table = create_component_variance_table(pre_explained_variance)

print("Pre_covid PCA:\n")
print(pre_component_variance_table)

post_principalComponents = pca.fit_transform(returns_post_covid)
post_explained_variance = pca.explained_variance_ratio_
post_component_variance_table = create_component_variance_table(post_explained_variance)

print("Post_covid PCA:\n")
print(post_component_variance_table)
