In [29]:
import pandas as pd
import numpy as np

import seaborn as sns
import seaborn.objects as so
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

# !pip install pingouin
import pingouin as pg
from statsmodels.tsa.stattools import acf

from scipy.stats import linregress
from itertools import product

### Load Data

In [5]:
def _read_dated_csv(path):
    """Read a CSV indexed by its 'Date' column and convert that index to datetimes."""
    frame = pd.read_csv(path, index_col=['Date'])
    frame.index = pd.to_datetime(frame.index)
    return frame


## commodity prices ##
commodities = _read_dated_csv('../Data/raw/commodity_prices.csv')

## stock prices ##
stocks = _read_dated_csv('../Data/raw/stock_prices.csv')

## effr ##
effr = _read_dated_csv('../Data/raw/effective_federal_funds_rate.csv')

## dollar index ##
# this file carries an extra 'Unnamed: 0' column (presumably a saved positional
# index -- confirm against the raw file); it is dropped right after loading
dxy = _read_dated_csv('../Data/raw/us_dollar_index.csv').drop(columns=['Unnamed: 0'])

## world_markets ##
world_markets = _read_dated_csv('../Data/raw/world_market_prices.csv')

## Generate Features for model
1. commodities log returns
2. stock log returns
3. lagged log returns for both datasets

In [None]:
## calculate log return from each closing price ##

# Built as a single expression rather than writing the result back through
# `.iloc[:, :] = ...`: an in-place write into existing columns depends on their
# current dtype, and pandas has deprecated silent upcasting on such writes.
commodities_log_return = (
    commodities.filter(regex='Close')
    # log(P_t / P_{t-1}) for every closing-price column
    .pipe(lambda close: np.log(close.divide(close.shift(1))))
    # e.g. 'PA=F Close' -> 'PA=F_log_return'
    .rename(columns=lambda name: name.replace(' Close', '') + '_log_return')
    # removes the first row (no previous close) and any row with a missing value
    .dropna()
)

In [None]:
## calculate log return from each closing price ##

# Built as a single expression rather than writing the result back through
# `.iloc[:, :] = ...`: an in-place write into existing columns depends on their
# current dtype, and pandas has deprecated silent upcasting on such writes.
# `stocks` itself is left untouched.
stocks_log_return = (
    stocks
    # log(P_t / P_{t-1}) for every column of the stock price frame
    .pipe(lambda prices: np.log(prices.divide(prices.shift(1))))
    # suffix every column; a column literally named 'Date' would be left as is
    .rename(columns=lambda name: name if name == 'Date' else name + '_log_return')
    # removes the first row (no previous price) and any row with a missing value
    .dropna()
)

##### FUNCTION TO CREATE LAGGED DATAFRAMES 

In [23]:
def create_transformed_lagged_df(df, columns, lags, transform_func):
    """
    Create transformed lagged features for a DataFrame.
    Make sure unique date is the index

    For every lag, `transform_func(df[columns], lag)` is evaluated and each
    resulting column is renamed to `<column>_lag_<lag>`; the per-lag frames are
    then joined side by side on the index.

    Parameters:
        df (pd.DataFrame): Input DataFrame with time series data.
        columns (list or str): Column names to transform. A single name given
            as a string is treated as a one-element list.
        lags (iterable): List or range of lags to apply.
        transform_func (function): Called as `transform_func(frame, lag)` and
            expected to return a DataFrame (e.g. `lambda x, lag: x.shift(lag)`).

    Returns:
        pd.DataFrame: Transformed lagged features. When `lags` is empty, a
        frame with no columns that shares `df`'s index.
    """
    # A bare string would select a Series, and Series.add_suffix renames the
    # row labels rather than the column, so always select a DataFrame.
    if isinstance(columns, str):
        columns = [columns]

    lagged_dfs = [
        transform_func(df[columns], lag).add_suffix(f'_lag_{lag}')
        for lag in lags
    ]

    # pd.concat raises ValueError when given nothing to concatenate.
    if not lagged_dfs:
        return pd.DataFrame(index=df.index)

    return pd.concat(lagged_dfs, axis=1)


In [28]:
## lags to generate: 1 through 10 ##
lags = range(1, 11)

## Create lagged columns for commodities ##
lagged_commodity_df = create_transformed_lagged_df(
    commodities_log_return,
    list(commodities_log_return.columns),
    lags,
    lambda frame, periods: frame.shift(periods),
)

## Create lagged columns for value stocks ##
lagged_stock_df = create_transformed_lagged_df(
    stocks_log_return,
    list(stocks_log_return.columns),
    lags,
    lambda frame, periods: frame.shift(periods),
)

In [42]:
def generate_pairs(commodities, stocks):
    """
    Build every (commodity, stock) combination of column labels.

    Parameters:
        commodities: DataFrame whose column labels identify the commodities
        stocks: DataFrame whose column labels identify the stocks

    Returns:
        list: tuples of (commodity column label, stock column label), with all
        stocks listed for the first commodity before moving to the next one
    """
    return [
        (commodity_label, stock_label)
        for commodity_label in commodities.columns
        for stock_label in stocks.columns
    ]

def compute_features(commodity, stock, lag=1, corr_window=21, gradient_window=3):
    """
    Build the feature frame for one (commodity, stock) pair.

    Operations:
        1. Calculates the `lag`-period log return of the commodity,
           log(P_t) - log(P_{t-lag}).
        2. Calculates the 1-period log return of the stock.
        3. Calculates a rolling correlation between the two return series over
           `corr_window` rows (21 by default).
        4. Calculates the gradient of the commodity return: the least-squares
           slope over the last `gradient_window` rows (3 by default).

    Parameters:
        commodity: pd.Series of commodity prices; its `.name` labels the output
            column as `<name>_lag_<lag>`
        stock: pd.Series of stock prices; its `.name` labels the output column
        lag: Integer number of periods used for the commodity return difference
        corr_window: Integer number of rows in the rolling correlation window
        gradient_window: Integer number of rows in the slope window (>= 2)

    Returns:
        pd.DataFrame: commodity return, stock return, rolling_corr and gradient,
        restricted to rows where all four are available

    Raises:
        ValueError: if `gradient_window` is smaller than 2
    """
    if gradient_window < 2:
        raise ValueError('gradient_window must be at least 2 to fit a slope')

    commodity_name = f'{commodity.name}_lag_{lag}'
    stock_name = stock.name

    # NOTE(review): diff(lag) is a lag-period return ending on the current row,
    # not a one-period return shifted back by `lag` rows, so with lag=1 it is
    # contemporaneous with the stock return. Confirm this is the intended
    # meaning of "lag" before relying on the signals as predictive.
    commodity_returns = np.log(commodity).diff(lag)
    stock_returns = np.log(stock).diff(1)

    # Keep only rows where both returns exist so the rolling windows below
    # operate on aligned observations.
    df = pd.DataFrame({
        commodity_name: commodity_returns,
        stock_name: stock_returns,
    }).dropna()

    df['rolling_corr'] = df[commodity_name].rolling(window=corr_window).corr(df[stock_name])

    # The x-axis of the regression is just the row position inside the window.
    x_axis = np.arange(gradient_window)
    df['gradient'] = df[commodity_name].rolling(window=gradient_window).apply(
        lambda window: linregress(x_axis, window).slope, raw=True)

    # Drops the warm-up rows where either rolling statistic is undefined.
    return df.dropna()

def detect_trade_signals(df, threshold=0.95, min_streak=3, quantile=0.5):
    """
    Turn a feature frame into long/short trade signals.

    Operations:
        1. Flags rows whose rolling correlation is strictly above `threshold`.
        2. Counts how many consecutive rows have been above the threshold.
        3. Keeps rows whose streak is at least `min_streak` and whose absolute
           gradient is at or above the `quantile` of those rows
           (0.5 keeps the top 50%).
        4. Calculates trade signals based on the sign of the gradient:
           'long' when it is positive, 'short' otherwise.

    Parameters:
        df: Input DataFrame with 'rolling_corr' and 'gradient' columns.
            It is annotated in place with 'over_thresh' and 'streak'.
        threshold: float - rolling correlation must exceed this value
        min_streak: int - minimum number of consecutive rows above threshold
        quantile: float in [0, 1] - quantile of the absolute gradient used as
            the cutoff among the rows that met the streak requirement

    Returns:
        pd.DataFrame: independent copy of the selected rows of `df` with an
        added 'trade_signal' column
    """
    # The helper columns are written onto the caller's frame on purpose: the
    # notebook inspects them on the feature frame after this call.
    df['over_thresh'] = df['rolling_corr'] > threshold

    # A new run starts whenever over_thresh flips; summing the 0/1 flags inside
    # each run gives the running streak length (0 throughout below-threshold runs).
    run_id = df['over_thresh'].ne(df['over_thresh'].shift()).cumsum()
    df['streak'] = df['over_thresh'].astype(int).groupby(run_id).cumsum()

    # Filter for streaks that meet min_streak requirement
    streak_rows = df[df['streak'] >= min_streak]

    # Keep rows at or above the requested quantile of absolute gradient
    abs_gradient = streak_rows['gradient'].abs()
    cutoff = abs_gradient.quantile(quantile)

    # .copy() makes the result independent of `df`, so adding a column below
    # does not write to a slice (no SettingWithCopyWarning).
    signal_df = streak_rows[abs_gradient >= cutoff].copy()
    signal_df['trade_signal'] = np.where(signal_df['gradient'] > 0, 'long', 'short')

    return signal_df


def print_signals(signals, commodity_name, stock_name, lag):
    """Print one pair's trade signals; nothing is printed when `signals` is empty."""
    if signals.empty:
        return

    header = f"\n📢 Trade Signals for {commodity_name} (lag={lag}) & {stock_name}"
    report_columns = ['rolling_corr', 'gradient', 'trade_signal']
    print(header)
    print(signals[report_columns])

def main(commodities, stocks, threshold=0.95, min_streak=3, lag=1, quantile=0.5):
    """
    Scan every (commodity, stock) pair and print the trade signals found.

    For each pair the features are computed with `compute_features`, filtered
    with `detect_trade_signals`, and any resulting rows are printed with
    `print_signals`. Nothing is returned.

    Parameters:
        commodities: DataFrame of commodity prices, one column per commodity
        stocks: DataFrame of stock prices, one column per stock
        threshold: float - rolling-correlation threshold passed to detect_trade_signals
        min_streak: int - minimum streak length passed to detect_trade_signals
        lag: int - lag passed to compute_features
        quantile: float - absolute-gradient quantile cutoff passed to detect_trade_signals
    """
    for commodity, stock in generate_pairs(commodities, stocks):
        df_features = compute_features(commodities[commodity], stocks[stock], lag=lag)
        trade_signals = detect_trade_signals(
            df_features,
            threshold=threshold,
            min_streak=min_streak,
            quantile=quantile,
        )

        # The label already carries the lag suffix; print_signals also appends
        # "(lag=...)" to the header it prints.
        print_signals(trade_signals, f"{commodity}_lag_{lag}", stock, lag=lag)


### Let's test our feature and signal logic

In [41]:
##### Set Dataframes & Call Function ######
test_commodities = commodities.filter(like='Close')  # closing-price columns only
test_stocks = stocks

# run the scan for lags 1, 2 and 3 with a 0.80 correlation threshold
for lag in (1, 2, 3):
    print(f"\n🚀 Running strategy with LAG = {lag}")
    main(test_commodities, test_stocks, lag=lag, min_streak=3, threshold=0.80)


🚀 Running strategy with LAG = 1

📢 Trade Signals for PA=F Close_lag_1 (lag=1) & SNAP
            rolling_corr  gradient trade_signal
Date                                           
2022-12-02      0.823315 -0.019667        short
2022-12-05      0.821672 -0.031822        short

📢 Trade Signals for SI=F Close_lag_1 (lag=1) & WDC
            rolling_corr  gradient trade_signal
Date                                           
2025-01-15      0.806125  0.035665         long

🚀 Running strategy with LAG = 2

🚀 Running strategy with LAG = 3

📢 Trade Signals for PA=F Close_lag_3 (lag=3) & AAPL
            rolling_corr  gradient trade_signal
Date                                           
2024-08-28      0.836027 -0.014895        short
2024-08-29      0.817821 -0.014134        short

📢 Trade Signals for PL=F Close_lag_3 (lag=3) & RBLX
            rolling_corr  gradient trade_signal
Date                                           
2022-09-12       0.80372  0.004168         long


In [50]:
# single-pair check: silver futures close vs. WDC with looser filters
silver_close = test_commodities['SI=F Close']
wdc_close = test_stocks['WDC']
features = compute_features(silver_close, wdc_close, lag=1)
trade_signals = detect_trade_signals(features, min_streak=2, threshold=0.7)

In [51]:
# First rows of the feature frame. The over_thresh and streak columns are
# present because detect_trade_signals writes them onto the frame it is given.
features.head()

Unnamed: 0_level_0,SI=F Close_lag_1,WDC,rolling_corr,gradient,over_thresh,streak
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-02-02,0.005076,0.016414,0.225579,0.000458,False,0
2022-02-03,-0.014905,-0.021754,0.208866,-0.01192,False,0
2022-02-04,0.004459,-0.009222,0.208326,-0.000309,False,0
2022-02-07,0.02639,0.000772,0.154687,0.020647,False,0
2022-02-08,0.005144,0.009979,0.14115,0.000342,False,0


In [52]:
# First rows that passed the streak and gradient filters
# (threshold=0.7, min_streak=2 in the call above).
trade_signals.head()

Unnamed: 0_level_0,SI=F Close_lag_1,WDC,rolling_corr,gradient,over_thresh,streak,trade_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-12-30,-0.018686,-0.017953,0.711573,-0.010559,True,2,short
2025-01-08,0.000164,-0.012565,0.798344,-0.008829,True,5,short
2025-01-13,-0.032692,-0.012817,0.840615,-0.016428,True,7,short
2025-01-15,0.038637,0.02968,0.806125,0.035665,True,9,long
2025-01-17,-0.018312,0.012221,0.745243,-0.028475,True,11,short


#### The logic generates both long and short signals

##### Next steps
1. backtest logic
2. optimize the correlation threshold, the gradient quantile cutoff, and the minimum streak length
3. visualize