### Importing Packages & Downloading Data

In [10]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint

# Ticker universe for the pairwise cointegration scan.
tickers = [
    "AMD", "NVDA", "AVGO", "MU", "INTC", "TSM", "QCOM", "MRVL", "ADI", "NET",
    "SNOW", "CRM", "AMZN", "MSFT", "GOOGL", "DDOG", "ESTC", "DOCN", "PLTR", "MDB",
    "OKTA", "CRWD", "ZS", "PANW", "S", "FTNT", "CYBR", "APP", "U",
    "TTD", "ROKU", "PINS", "SN", "ORCL"
]

# "NBIS" is currently left out of the universe (reason not recorded here).

# Two years of closing prices for every ticker. `auto_adjust` is passed
# explicitly so the result does not depend on yfinance's changing default
# (recent versions warn when it is omitted and already default to True).
# NOTE(review): confirm adjusted closes are what the analysis intends.
# The double brackets keep the two-level (price field, ticker) column index.
data = yf.download(tickers, period="2y", auto_adjust=True)[['Close']]

# Convert to log prices
log_prices = np.log(data)
log_prices.to_csv("log_price_data.csv")
n_stocks = len(tickers)
# Empty ticker-by-ticker frame reserved for the pairwise p-values.
pvalue_matrix = pd.DataFrame(index=tickers, columns=tickers)

# Drop the outer column level so each column is addressed by ticker alone.
log_prices_flat = log_prices.droplevel(0, axis=1)
log_prices_flat.to_csv("log_prices_flat.csv")

  data = yf.download(tickers, period="2y")[['Close']]
[*********************100%***********************]  34 of 34 completed


In [11]:
# Sanity-check the log-price panel before any cointegration test is run:
# overall shape, then per-ticker counts of missing and infinite values.
print(f"Data shape: {log_prices_flat.shape}")
print(f"Any NaN values: {log_prices_flat.isna().sum()}")
print(f"Any inf values: {np.isinf(log_prices_flat).sum()}")



Data shape: (501, 34)
Any NaN values: Ticker
ADI      0
AMD      0
AMZN     0
APP      0
AVGO     0
CRM      0
CRWD     0
CYBR     0
DDOG     0
DOCN     0
ESTC     0
FTNT     0
GOOGL    0
INTC     0
MDB      0
MRVL     0
MSFT     0
MU       0
NET      0
NVDA     0
OKTA     0
ORCL     0
PANW     0
PINS     0
PLTR     0
QCOM     0
ROKU     0
S        0
SN       0
SNOW     0
TSM      0
TTD      0
U        0
ZS       0
dtype: int64
Any inf values: Ticker
ADI      0
AMD      0
AMZN     0
APP      0
AVGO     0
CRM      0
CRWD     0
CYBR     0
DDOG     0
DOCN     0
ESTC     0
FTNT     0
GOOGL    0
INTC     0
MDB      0
MRVL     0
MSFT     0
MU       0
NET      0
NVDA     0
OKTA     0
ORCL     0
PANW     0
PINS     0
PLTR     0
QCOM     0
ROKU     0
S        0
SN       0
SNOW     0
TSM      0
TTD      0
U        0
ZS       0
dtype: int64


### Pairwise Cointegration Testing

In [12]:
def test_pairs(log_prices_flat: pd.DataFrame, tickers: list) -> pd.DataFrame:
    """Run the pairwise cointegration test on every ordered pair of tickers.

    Args:
        log_prices_flat: Frame holding one column of log prices per ticker.
        tickers: Column labels to test against each other.

    Returns:
        A ``len(tickers)`` x ``len(tickers)`` float frame whose entry
        ``[a, b]`` is the p-value returned by
        ``coint(log_prices_flat[a], log_prices_flat[b])``. Diagonal entries
        are set to 0 as a placeholder; a series is never tested against itself.

    Side effects:
        Prints the matrix and writes it to ``cointegration_pvalues.csv``.
    """
    # Build the result from the `tickers` argument rather than writing into a
    # module-level frame, so the output always matches the tickers passed in.
    pvalues = pd.DataFrame(0.0, index=tickers, columns=tickers)

    for i, stock1 in enumerate(tickers):
        prices1 = log_prices_flat[stock1]
        for j, stock2 in enumerate(tickers):
            if i == j:
                continue  # diagonal keeps its 0 placeholder
            # Both orderings are evaluated: the test result depends on which
            # series comes first, so [a, b] and [b, a] generally differ.
            _, pvalue, _ = coint(prices1, log_prices_flat[stock2])
            pvalues.loc[stock1, stock2] = pvalue

    print(pvalues)
    # Save p-value matrix to CSV
    pvalues.to_csv("cointegration_pvalues.csv")
    return pvalues


# Rebind the module-level matrix to the freshly computed result.
pvalue_matrix = test_pairs(log_prices_flat, tickers)

            AMD      NVDA      AVGO        MU      INTC       TSM      QCOM  \
AMD           0  0.648883  0.596311  0.651266  0.478137  0.634944  0.729667   
NVDA   0.827296         0  0.684274  0.660568  0.141075  0.015436  0.890079   
AVGO   0.970264   0.81451         0  0.868234  0.669453   0.47067  0.990683   
MU     0.587465  0.560794  0.514338         0  0.516949  0.469051  0.767247   
INTC   0.675844  0.146613  0.284442  0.656381         0  0.309926  0.674004   
TSM    0.907124  0.018021  0.372721  0.717633  0.381784         0  0.983247   
QCOM   0.474596  0.510027  0.555998  0.474418  0.287609  0.606722         0   
MRVL   0.681945  0.589604  0.777294    0.7577  0.370502  0.686465  0.539584   
ADI    0.726175  0.025109  0.042137  0.129394  0.163103  0.011602   0.70831   
NET    0.973858  0.943667  0.370125  0.920264   0.92403  0.832066  0.985851   
SNOW   0.769902  0.766654  0.679559  0.763219  0.783016  0.751872  0.677274   
CRM    0.573915  0.692807  0.750183  0.695887  0.441

### Pairs with Low Cointegration P-Values

In [13]:
def find_cointegrated_pairs(csv_file: str, significance_level: float = 0.15):
    """List the pairs whose stored p-value is below ``significance_level``.

    The p-value matrix is read from ``csv_file`` (first column used as the
    index). Only cells above the diagonal are examined, so each unordered pair
    is considered once, using the row-vs-column direction of the matrix.

    Returns:
        A list of ``(row_label, column_label, p_value)`` tuples sorted by
        ascending p-value. Each tuple is also printed as ``A-B: 0.1234``.
    """
    matrix = pd.read_csv(csv_file, index_col=0)
    names = matrix.columns.tolist()
    count = len(names)

    # Walk the strict upper triangle and keep the cells under the threshold.
    hits = [
        (names[row], names[col], matrix.iloc[row, col])
        for row in range(count)
        for col in range(row + 1, count)
        if matrix.iloc[row, col] < significance_level
    ]

    # Strongest evidence (smallest p-value) first.
    ranked = sorted(hits, key=lambda hit: hit[2])
    for first, second, score in ranked:
        print(f"{first}-{second}: {score:.4f}")

    return ranked

# Scan the saved p-value matrix using the default threshold.
pairs = find_cointegrated_pairs(csv_file="cointegration_pvalues.csv")

TSM-AMZN: 0.0126
AVGO-CYBR: 0.0146
NVDA-TSM: 0.0154
PANW-APP: 0.0207
PANW-CYBR: 0.0259
PANW-SN: 0.0272
ADI-GOOGL: 0.0411
ADI-PANW: 0.0437
ADI-MSFT: 0.0494
TSM-ADI: 0.0510
PANW-ORCL: 0.0532
TSM-SN: 0.0630
PANW-FTNT: 0.0642
NVDA-ADI: 0.0647
ADI-SN: 0.0655
PINS-ORCL: 0.0816
PINS-SN: 0.0844
ADI-ORCL: 0.0849
TSM-PANW: 0.0897
DOCN-S: 0.0927
INTC-SN: 0.1022
MRVL-DDOG: 0.1189
MU-GOOGL: 0.1220
NVDA-SN: 0.1230
NVDA-AMZN: 0.1238
AVGO-ORCL: 0.1261
AMZN-CYBR: 0.1263
ESTC-ORCL: 0.1273
ADI-CYBR: 0.1280
ADI-AMZN: 0.1371
OKTA-CRWD: 0.1377
QCOM-ROKU: 0.1408
NVDA-INTC: 0.1411
ESTC-PLTR: 0.1415
OKTA-TTD: 0.1450
AMZN-APP: 0.1458


In [14]:
def find_cointegrated_pairs(csv_file: str, significance_level: float = 0.15):
    """Report pairs from a saved p-value matrix that fall under a threshold.

    NOTE(review): this repeats a definition that appears in an earlier cell
    under the same name; whichever cell runs last is the one in effect.

    Args:
        csv_file: Path to a CSV p-value matrix whose first column is the index.
        significance_level: Pairs with a p-value strictly below this are kept.

    Returns:
        ``(row_label, column_label, p_value)`` tuples in ascending p-value
        order, taken from the cells above the diagonal only.
    """
    table = pd.read_csv(csv_file, index_col=0)
    labels = table.columns.tolist()

    found = []
    for i, left in enumerate(labels):
        # Start one past the diagonal so every unordered pair is visited once.
        for j, right in enumerate(labels[i + 1:], start=i + 1):
            value = table.iloc[i, j]
            if not value < significance_level:
                continue
            found.append((left, right, value))

    # Sort and display
    found.sort(key=lambda entry: entry[2])
    for left, right, value in found:
        print(f"{left}-{right}: {value:.4f}")

    return found

# Re-run the scan on the saved p-value matrix (default threshold).
pairs = find_cointegrated_pairs(
    "cointegration_pvalues.csv",
)

TSM-AMZN: 0.0126
AVGO-CYBR: 0.0146
NVDA-TSM: 0.0154
PANW-APP: 0.0207
PANW-CYBR: 0.0259
PANW-SN: 0.0272
ADI-GOOGL: 0.0411
ADI-PANW: 0.0437
ADI-MSFT: 0.0494
TSM-ADI: 0.0510
PANW-ORCL: 0.0532
TSM-SN: 0.0630
PANW-FTNT: 0.0642
NVDA-ADI: 0.0647
ADI-SN: 0.0655
PINS-ORCL: 0.0816
PINS-SN: 0.0844
ADI-ORCL: 0.0849
TSM-PANW: 0.0897
DOCN-S: 0.0927
INTC-SN: 0.1022
MRVL-DDOG: 0.1189
MU-GOOGL: 0.1220
NVDA-SN: 0.1230
NVDA-AMZN: 0.1238
AVGO-ORCL: 0.1261
AMZN-CYBR: 0.1263
ESTC-ORCL: 0.1273
ADI-CYBR: 0.1280
ADI-AMZN: 0.1371
OKTA-CRWD: 0.1377
QCOM-ROKU: 0.1408
NVDA-INTC: 0.1411
ESTC-PLTR: 0.1415
OKTA-TTD: 0.1450
AMZN-APP: 0.1458


In [17]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

# Basket of four tickers to test jointly for cointegration (Johansen test).
cybersecurity_symbols = ['AVGO', 'CYBR', 'PANW', 'APP']
basket_data = log_prices_flat[cybersecurity_symbols].dropna()

# det_order=0 and k_ar_diff=1 are passed straight through to statsmodels;
# see the coint_johansen documentation for their exact meaning.
result = coint_johansen(basket_data.values, det_order=0, k_ar_diff=1)

# The trace test is sequential: H0 "rank <= r" is tested for r = 0, 1, ... and
# the procedure stops at the first hypothesis that is NOT rejected. The rank is
# therefore the number of leading rejections, not the total number of
# rejections. cumprod turns everything after the first non-rejection into 0.
# Column 1 of `cvt` holds the 95% critical values for the trace statistic.
rank = int(np.cumprod(result.lr1 > result.cvt[:, 1]).sum())

print(f"Cointegration rank: {rank}")
print("Cointegrated" if rank > 0 else "Not cointegrated")

Cointegration rank: 0
Not cointegrated


In [18]:
# Inspect the saved flat log-price file: overall shape, then observation and
# missing-value counts plus the first/last rows for APP, ESTC and FROG.
# FROG is not in the downloaded ticker list, so it is reported as not found.
import pandas as pd
import numpy as np

# Read the flat log prices data. This rebinds `log_prices_flat`, so
# parse_dates=True is used to parse the date index back into datetimes
# instead of leaving it as plain strings for any later cell.
log_prices_flat = pd.read_csv("log_prices_flat.csv", index_col=0, parse_dates=True)

print("Overall data shape:", log_prices_flat.shape)
print("\nData for APP, ESTC, FROG:")

# Check specific stocks
stocks_to_check = ['APP', 'ESTC', 'FROG']
for stock in stocks_to_check:
    if stock not in log_prices_flat.columns:
        print(f"{stock} not found in columns")
        continue

    stock_data = log_prices_flat[stock]
    print(f"\n{stock}:")
    print(f"  Total observations: {len(stock_data)}")
    print(f"  Non-null observations: {stock_data.notna().sum()}")
    print(f"  Null observations: {stock_data.isna().sum()}")
    print(f"  First few values: {stock_data.head()}")
    print(f"  Last few values: {stock_data.tail()}")

print(f"\nAll columns: {list(log_prices_flat.columns)}")

Overall data shape: (501, 34)

Data for APP, ESTC, FROG:

APP:
  Total observations: 501
  Non-null observations: 501
  Null observations: 0
  First few values: Date
2023-09-18    3.784417
2023-09-19    3.682107
2023-09-20    3.671988
2023-09-21    3.649099
2023-09-22    3.631250
Name: APP, dtype: float64
  Last few values: Date
2025-09-10    6.340571
2025-09-11    6.347932
2025-09-12    6.366470
2025-09-15    6.385430
2025-09-16    6.403243
Name: APP, dtype: float64

ESTC:
  Total observations: 501
  Non-null observations: 501
  Null observations: 0
  First few values: Date
2023-09-18    4.357862
2023-09-19    4.344455
2023-09-20    4.335852
2023-09-21    4.353884
2023-09-22    4.376511
Name: ESTC, dtype: float64
  Last few values: Date
2025-09-10    4.499032
2025-09-11    4.489310
2025-09-12    4.472096
2025-09-15    4.480853
2025-09-16    4.460607
Name: ESTC, dtype: float64
FROG not found in columns

All columns: ['ADI', 'AMD', 'AMZN', 'APP', 'AVGO', 'CRM', 'CRWD', 'CYBR', 'DDOG', '