In [None]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller
# Load the long-format price file (columns used: Date, Ticker, Adj Close) and
# reshape it to one column per ticker, indexed by date.
# Gaps are filled forward first; the backward fill then covers any leading
# rows that have no earlier observation to carry forward.
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` form.
data = (
    pd.read_csv("./data/stock_data/consolidated_stock_data.csv", parse_dates=["Date"])
    .set_index("Date")
    .pivot(columns="Ticker", values="Adj Close")
    .ffill()
    .bfill()
)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the ticker columns of `data`
# (computed on the values as loaded, not on returns).
correlation_matrix = data.corr()

# Draw on an explicit Axes so the title is attached to the heatmap itself.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap="coolwarm",
    center=0,
    cbar=True,
    ax=ax,
)
ax.set_title("Stock Correlation Heatmap")
plt.show()



In [None]:

import numpy as np
# The correlation matrix is symmetric and its diagonal pairs each ticker with
# itself, so keep only the entries strictly above the diagonal (k=1) to list
# every pair exactly once.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
corr_unstacked = correlation_matrix.where(upper_triangle).stack()

# Highest correlations first; the index of the result holds the ticker pairs.
top_10_pairs = corr_unstacked.sort_values(ascending=False).iloc[:10]
print("Top 10 Most Correlated Pairs:")
print(top_10_pairs)


In [None]:
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

def cointegration_test(pair, data, significance=0.05):
    """Test whether the regression residuals of a ticker pair look stationary.

    The first ticker's series is regressed on the second (with an intercept)
    by OLS, and an Augmented Dickey-Fuller test is run on the residuals. The
    ADF statistic and p-value are printed for every pair tested.

    Parameters
    ----------
    pair : tuple
        Two column labels ``(ticker1, ticker2)`` present in ``data``.
    data : pandas.DataFrame
        Frame holding one column per ticker.
    significance : float, optional
        p-value threshold below which the pair is reported as cointegrated.
        Defaults to 0.05, the value previously hard-coded here.

    Returns
    -------
    bool
        True when the ADF p-value is strictly below ``significance``.
    """
    ticker1, ticker2 = pair
    stock1 = data[ticker1]
    stock2 = data[ticker2]

    # Regress stock1 on stock2 (plus a constant) and keep what the fit leaves over.
    model = sm.OLS(stock1, sm.add_constant(stock2)).fit()
    residuals = model.resid

    # NOTE(review): these are plain ADF p-values applied to estimated residuals;
    # an Engle-Granger test (statsmodels.tsa.stattools.coint) uses different
    # critical values. Confirm the plain ADF test is what is intended here.
    adf_statistic, p_value = adfuller(residuals)[:2]
    print(f"Testing pair {ticker1}-{ticker2}: ADF Statistic = {adf_statistic}, p-value = {p_value}")

    return p_value < significance

# Run the cointegration check on each of the most correlated pairs, in ranked
# order, and collect the ones that pass.
cointegrated_pairs = []
for candidate in top_10_pairs.index:
    if cointegration_test(candidate, data):
        cointegrated_pairs.append(candidate)

print("Cointegrated Pairs:")
print(cointegrated_pairs)


In [None]:
# Price-to-earnings ratio per ticker, entered by hand.
pe_ratios = {
    "ANET": 54.34,   # Arista Networks Inc
    "AVGO": 197.19,  # Broadcom Inc
    "MA": 40.26,     # Mastercard
    "V": 32.75,      # Visa Inc
    "NVDA": 53.92,   # NVIDIA Corp
    "PGR": 17.52,    # Progressive Corp
    "COST": 55.18,   # COSTCO Wholesale Corp
    "BSX": 74.93,    # Boston Scientific Corp
    "GE": 29.89,     # GE Aerospace
    "ETN": 35.60,    # Eaton Corp
}

# NOTE(review): this replaces any earlier `cointegrated_pairs` value with a
# fixed list, presumably a snapshot of a previous test run; confirm it is
# still current.
cointegrated_pairs = [
    ("ANET", "AVGO"),
    ("MA", "V"),
    ("AVGO", "NVDA"),
    ("ANET", "PGR"),
    ("ANET", "COST"),
    ("BSX", "NVDA"),
    ("BSX", "GE"),
    ("ANET", "BSX"),
]

# Pair each entry with the absolute gap between its two P/E ratios and order
# the result from the smallest gap to the largest.
all_pairs_with_diff = sorted(
    (
        (tickers, abs(pe_ratios[tickers[0]] - pe_ratios[tickers[1]]))
        for tickers in cointegrated_pairs
    ),
    key=lambda entry: entry[1],
)

print("All Cointegrated Pairs Sorted by Ascending P/E Differences:")
for rank, (pair, pe_diff) in enumerate(all_pairs_with_diff, start=1):
    print(f"{rank}. Pair: {pair}, P/E Difference: {pe_diff:.2f}")


In [None]:
import pandas as pd
import pandas_datareader as pdr
from datetime import datetime
import yfinance as yf

def get_historical_Data(tickers, start=datetime(2020, 10, 27), end=datetime(2021, 10, 27)):
    """Download one price column per ticker and return them side by side.

    Parameters
    ----------
    tickers : iterable of str
        Symbols passed one at a time to ``yf.download``.
    start, end : datetime, optional
        Date window forwarded to ``yf.download``. The defaults are the
        window that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        One column per ticker, labelled with the ticker symbol and aligned on
        the union of the downloaded indexes. Empty when ``tickers`` is empty.
    """
    tickers = list(tickers)
    if not tickers:
        # pd.concat refuses an empty list; an empty frame matches the old result.
        return pd.DataFrame()

    # NOTE(review): the column is picked by position (index 4), presumably
    # "Adj Close"; which column that is depends on the layout yfinance
    # returns, so verify against the installed version.
    columns = [
        yf.download(ticker, start=start, end=end).iloc[:, 4]
        for ticker in tickers
    ]

    # Concatenate once instead of growing a frame inside the loop.
    data = pd.concat(columns, axis=1)
    data.columns = tickers
    return data

# Tickers to download: MA, V, ANET and AVGO.
ticks = ["MA", "V", "ANET", "AVGO"]
d = get_historical_Data(tickers=ticks)

# Print the frame's dimensions, then show its last rows as the cell output.
print(d.shape)
d.tail()

