### Importing Packages & Downloading Data

In [7]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint

# Ticker universe screened for pairwise cointegration in the cells below.
tickers = [
    "AMD", "NVDA", "AVGO", "MU", "INTC", "TSM", "QCOM", "NET",
    "SNOW", "AMZN", "MSFT", "GOOGL", "DDOG", "ESTC", "DOCN", "PLTR", "MDB",
    "OKTA", "CRWD", "ZS", "PANW", "S", "FTNT", "CYBR", "U",
    "TTD", "ROKU", "PINS", "SNAP", "PUBM", "CFLT", "GTLB", "FROG", "PD",
    "ORCL",
]
# "NBIS" is currently commented out of the universe.

# Pin `auto_adjust` explicitly: yfinance changed this default between releases,
# so leaving it implicit makes "Close" depend on the installed version and
# emits a FutureWarning on newer ones.
data = yf.download(tickers, period="2y", auto_adjust=True)[['Close']]

# Convert to log prices. Columns still carry the two-level (field, ticker)
# header at this point, and that is the layout written to the first CSV.
log_prices = np.log(data)
log_prices.to_csv("log_price_data.csv")
n_stocks = len(tickers)

# Drop the outer "Close" level so each column is addressed by its ticker alone.
log_prices_flat = log_prices.droplevel(0, axis=1)
log_prices_flat.to_csv("log_prices_flat.csv")

# Ticker-by-ticker frame that receives the pairwise p-values. A float dtype
# (instead of the default object dtype) keeps zeros and p-values formatted
# consistently when the matrix is printed or saved.
pvalue_matrix = pd.DataFrame(index=tickers, columns=tickers, dtype=float)

  data = yf.download(tickers, period="2y")[['Close']]
[*********************100%***********************]  35 of 35 completed


In [8]:
# Before running cointegration tests, check data quality
# Each entry pairs the printed label with the check that produces its value.
# The checks are evaluated one at a time, in order, right before printing.
quality_checks = {
    "Data shape:": lambda frame: frame.shape,
    "Any NaN values:": lambda frame: frame.isnull().sum(),
    "Any inf values:": lambda frame: np.isinf(frame).sum(),
}
for label, check in quality_checks.items():
    print(label, check(log_prices_flat))



Data shape: (502, 35)
Any NaN values: Ticker
AMD      0
AMZN     0
AVGO     0
CFLT     0
CRWD     0
CYBR     0
DDOG     0
DOCN     0
ESTC     0
FROG     0
FTNT     0
GOOGL    0
GTLB     0
INTC     0
MDB      0
MSFT     0
MU       0
NET      0
NVDA     0
OKTA     0
ORCL     0
PANW     0
PD       0
PINS     0
PLTR     0
PUBM     0
QCOM     0
ROKU     0
S        0
SNAP     0
SNOW     0
TSM      0
TTD      0
U        0
ZS       0
dtype: int64
Any inf values: Ticker
AMD      0
AMZN     0
AVGO     0
CFLT     0
CRWD     0
CYBR     0
DDOG     0
DOCN     0
ESTC     0
FROG     0
FTNT     0
GOOGL    0
GTLB     0
INTC     0
MDB      0
MSFT     0
MU       0
NET      0
NVDA     0
OKTA     0
ORCL     0
PANW     0
PD       0
PINS     0
PLTR     0
PUBM     0
QCOM     0
ROKU     0
S        0
SNAP     0
SNOW     0
TSM      0
TTD      0
U        0
ZS       0
dtype: int64


### Pairwise Cointegration Testing

In [None]:
def test_pairs(log_prices_flat: pd.DataFrame, tickers: list) -> pd.DataFrame:
    """Run the pairwise cointegration test on every ordered pair of tickers.

    Both orderings of each pair are tested, so entry ``[a, b]`` holds the
    p-value of ``coint(a, b)`` and ``[b, a]`` holds that of ``coint(b, a)``.

    Args:
        log_prices_flat: Frame with one column of log prices per ticker.
        tickers: Column labels of ``log_prices_flat`` to test against each other.

    Returns:
        Square float DataFrame indexed and labelled by ``tickers`` holding the
        p-values. The diagonal is filled with 0 as a placeholder because a
        series is never tested against itself.

    Side effects:
        Prints the matrix and writes it to ``cointegration_pvalues.csv``.
    """
    # Build the result locally so the function does not depend on a
    # notebook-level variable having been created by an earlier cell.
    pvalue_matrix = pd.DataFrame(index=tickers, columns=tickers, dtype=float)

    for i, stock1 in enumerate(tickers):
        prices1 = log_prices_flat[stock1]
        for j, stock2 in enumerate(tickers):
            if i == j:
                # Placeholder for the untested self-pair.
                pvalue_matrix.loc[stock1, stock2] = 0
                continue
            prices2 = log_prices_flat[stock2]
            _, pvalue, _ = coint(prices1, prices2)
            pvalue_matrix.loc[stock1, stock2] = pvalue

    print(pvalue_matrix)
    # Save p-value matrix to CSV
    pvalue_matrix.to_csv("cointegration_pvalues.csv")
    return pvalue_matrix


# Keep the result under its usual notebook-level name; assigning it also stops
# the cell from echoing the returned frame a second time.
pvalue_matrix = test_pairs(log_prices_flat, tickers)

            AMD      NVDA      AVGO        MU      INTC       TSM      QCOM  \
AMD           0  0.591255  0.523291   0.62679  0.370683  0.570069  0.706922   
NVDA   0.784917         0  0.591208  0.674104  0.261189   0.05817  0.857445   
AVGO   0.942236  0.716651         0  0.842199  0.741548  0.263648    0.9649   
MU     0.533998  0.586188  0.670222         0  0.568488  0.454412  0.681501   
INTC   0.680183  0.262894  0.613104  0.716525         0  0.509003   0.73247   
TSM    0.910118  0.074081  0.232486  0.716083  0.584193         0  0.980883   
QCOM   0.479586   0.44419  0.451079  0.299416  0.246774  0.534835         0   
NET    0.958206  0.935599  0.388424  0.895591  0.938964  0.787311  0.977411   
SNOW   0.791337  0.784677  0.714736  0.778301  0.775993  0.769376  0.685227   
AMZN   0.523394  0.097873  0.114386  0.556253   0.17778  0.027522   0.78628   
MSFT    0.72616  0.475428  0.181804  0.172606  0.503189  0.308771  0.813821   
GOOGL  0.972107  0.729672  0.318833  0.446452  0.916

### Low P-Value Cointegration Scores

In [10]:
def find_cointegrated_pairs(csv_file: str, significance_level: float = 0.15):
    """Report the stock pairs whose p-value falls below a threshold.

    Only the entries above the diagonal of the stored matrix are examined, so
    each unordered pair is considered once, using the row-before-column entry.

    Args:
        csv_file: Path to a CSV holding a square p-value matrix whose first
            column is the row index.
        significance_level: Pairs with a p-value strictly below this value
            are kept.

    Returns:
        List of ``(row_stock, column_stock, p_value)`` tuples sorted by
        ascending p-value. Each kept pair is also printed on its own line.
    """
    matrix = pd.read_csv(csv_file, index_col=0)
    names = list(matrix.columns)
    count = len(names)

    hits = []
    for row, first in enumerate(names):
        # Walk only the columns to the right of the diagonal.
        for col in range(row + 1, count):
            score = matrix.iat[row, col]
            if score < significance_level:
                hits.append((first, names[col], score))

    # Smallest p-values first, then report.
    ranked = sorted(hits, key=lambda hit: hit[2])
    for first, second, score in ranked:
        print(f"{first}-{second}: {score:.4f}")

    return ranked

# Scan the saved p-value matrix with the default significance level.
pairs = find_cointegrated_pairs(csv_file="cointegration_pvalues.csv")

AVGO-CYBR: 0.0120
PANW-CYBR: 0.0150
PANW-PD: 0.0154
PLTR-PD: 0.0218
PANW-ORCL: 0.0223
PANW-FTNT: 0.0234
SNAP-PD: 0.0336
CFLT-GTLB: 0.0483
PINS-CFLT: 0.0515
PINS-PD: 0.0548
ESTC-SNAP: 0.0580
NVDA-TSM: 0.0582
MDB-PD: 0.0597
PANW-SNAP: 0.0603
TSM-AMZN: 0.0640
AVGO-PD: 0.0642
PINS-ORCL: 0.0674
ESTC-CFLT: 0.0676
ESTC-PD: 0.0711
CFLT-ORCL: 0.0757
GTLB-PD: 0.0786
PD-ORCL: 0.0802
PANW-PUBM: 0.0803
CRWD-PD: 0.0807
DOCN-S: 0.0859
CYBR-PD: 0.0916
CRWD-FROG: 0.0943
CFLT-PD: 0.0954
S-GTLB: 0.0958
PINS-GTLB: 0.0986
AMZN-PD: 0.1009
SNAP-PUBM: 0.1022
PINS-PUBM: 0.1031
TSM-PANW: 0.1038
ESTC-PUBM: 0.1093
ESTC-GTLB: 0.1137
DOCN-GTLB: 0.1182
AMZN-CYBR: 0.1184
QCOM-ROKU: 0.1210
ESTC-ORCL: 0.1237
GTLB-ORCL: 0.1318
PANW-CFLT: 0.1319
OKTA-TTD: 0.1338
MSFT-ORCL: 0.1338
ESTC-PLTR: 0.1363
TSM-PD: 0.1369
CFLT-FROG: 0.1420
DOCN-U: 0.1432
CRWD-SNAP: 0.1436
ESTC-PINS: 0.1441
AVGO-ORCL: 0.1478
ROKU-PUBM: 0.1497


In [11]:
# `find_cointegrated_pairs` is defined once, in the cell under
# "Low P-Value Cointegration Scores". Reuse that definition here rather than
# redefining it, so the two copies cannot drift apart and shadow each other.
pairs = find_cointegrated_pairs("cointegration_pvalues.csv")

AVGO-CYBR: 0.0120
PANW-CYBR: 0.0150
PANW-PD: 0.0154
PLTR-PD: 0.0218
PANW-ORCL: 0.0223
PANW-FTNT: 0.0234
SNAP-PD: 0.0336
CFLT-GTLB: 0.0483
PINS-CFLT: 0.0515
PINS-PD: 0.0548
ESTC-SNAP: 0.0580
NVDA-TSM: 0.0582
MDB-PD: 0.0597
PANW-SNAP: 0.0603
TSM-AMZN: 0.0640
AVGO-PD: 0.0642
PINS-ORCL: 0.0674
ESTC-CFLT: 0.0676
ESTC-PD: 0.0711
CFLT-ORCL: 0.0757
GTLB-PD: 0.0786
PD-ORCL: 0.0802
PANW-PUBM: 0.0803
CRWD-PD: 0.0807
DOCN-S: 0.0859
CYBR-PD: 0.0916
CRWD-FROG: 0.0943
CFLT-PD: 0.0954
S-GTLB: 0.0958
PINS-GTLB: 0.0986
AMZN-PD: 0.1009
SNAP-PUBM: 0.1022
PINS-PUBM: 0.1031
TSM-PANW: 0.1038
ESTC-PUBM: 0.1093
ESTC-GTLB: 0.1137
DOCN-GTLB: 0.1182
AMZN-CYBR: 0.1184
QCOM-ROKU: 0.1210
ESTC-ORCL: 0.1237
GTLB-ORCL: 0.1318
PANW-CFLT: 0.1319
OKTA-TTD: 0.1338
MSFT-ORCL: 0.1338
ESTC-PLTR: 0.1363
TSM-PD: 0.1369
CFLT-FROG: 0.1420
DOCN-U: 0.1432
CRWD-SNAP: 0.1436
ESTC-PINS: 0.1441
AVGO-ORCL: 0.1478
ROKU-PUBM: 0.1497


In [14]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

# Basket taken from the lowest p-value pairs found above.
cybersecurity_symbols = ['AVGO', 'CYBR', 'PANW', 'PD']
basket_data = log_prices_flat[cybersecurity_symbols].dropna()

# det_order=0 includes a constant term; k_ar_diff=1 uses one lagged difference.
result = coint_johansen(basket_data.values, det_order=0, k_ar_diff=1)

# The trace test is sequential: test r = 0, then r <= 1, and so on, stopping at
# the first hypothesis that is NOT rejected. Counting every rejection
# regardless of position can overstate the rank when a later statistic exceeds
# its critical value after an earlier one did not.
rank = 0
for trace_stat, critical_95 in zip(result.lr1, result.cvt[:, 1]):  # column 1 = 95% critical values
    if not trace_stat > critical_95:
        break
    rank += 1

print(f"Cointegration rank: {rank}")
print("Cointegrated" if rank > 0 else "Not cointegrated")

Cointegration rank: 2
Cointegrated
