In [53]:
import pandas as pd
import numpy as np

import seaborn as sns
import seaborn.objects as so
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

# !pip install pingouin
import pingouin as pg
from statsmodels.tsa.stattools import acf

from scipy.stats import linregress
from itertools import product

### Load Data

In [54]:
## commodity prices ##
commodities = pd.read_csv('../Data/raw/commodity_prices.csv',index_col=['Date'])
commodities.index = pd.to_datetime(commodities.index)

## stock prices ##
stocks = pd.read_csv('../Data/raw/stock_prices.csv',index_col=['Date'])
stocks.index = pd.to_datetime(stocks.index)

## effr ##
effr = pd.read_csv('../Data/raw/effective_federal_funds_rate.csv',index_col=['Date'])
effr.index = pd.to_datetime(effr.index)

## dollar index ##
dxy = pd.read_csv('../Data/raw/us_dollar_index.csv',index_col=['Date'])
dxy.index = pd.to_datetime(dxy.index)
dxy.drop('Unnamed: 0',axis=1,inplace=True)

## world_markets ##
world_markets = pd.read_csv('../Data/raw/world_market_prices.csv',index_col=['Date'])
world_markets.index = pd.to_datetime(world_markets.index)

## Generate Features for model
1. commodities log returns
2. stock log returns
3. lagged log returns for both datasets

In [55]:
## calculate log return from each closing price ##

commodities_log_return = commodities.filter(regex='Close').copy()
commodities_log_return.iloc[:,:] = np.log(commodities_log_return.iloc[:,:].divide(commodities_log_return.iloc[:,:].shift(1)))
commodities_log_return.columns = [x.replace(' Close','') + '_log_return' for x in commodities_log_return.columns.to_list()]
commodities_log_return.dropna(inplace=True)

In [56]:
## calculate log return from each closing price ##

stocks_log_return = stocks.copy()
stocks_log_return.iloc[:,:] = np.log(stocks_log_return.iloc[:,:].divide(stocks_log_return.iloc[:,:].shift(1)))
stocks_log_return.columns = [x+'_log_return' if x != 'Date' else x for x in stocks_log_return.columns.to_list()]
stocks_log_return.dropna(inplace=True)

##### FUNCTION TO CREATE LAGGED DATAFRAMES 

In [57]:
def create_transformed_lagged_df(df, columns, lags, transform_func):
    """
    Create transformed lagged features for a DataFrame.
    Make sure unique date is the index
    
    Parameters:
        df (pd.DataFrame): Input DataFrame with time series data.
        columns (list): List of column names to transform.
        lags (iterable): List or range of lags to apply.
        transform_func (function): Function to apply for lagging (e.g., shift, pct_change).
    
    Returns:
        pd.DataFrame: Transformed lagged features.
    """
    lagged_dfs = []
    for lag in lags:
        transformed_df = transform_func(df[columns], lag)
        transformed_df = transformed_df.add_suffix(f'_lag_{lag}')
        lagged_dfs.append(transformed_df)
    
    return pd.concat(lagged_dfs, axis=1)


In [58]:
## number of lags
lags = range(1,11)

## Create lagged columns for commodities ##
lagged_commodity_df = create_transformed_lagged_df(
    df=commodities_log_return,
    columns=commodities_log_return.columns.to_list(), 
    lags=lags,
    transform_func=lambda x, lag: x.shift(lag))


## Create lagged columns for value stocks ##
lagged_stock_df = create_transformed_lagged_df(
    df=stocks_log_return,
    columns=stocks_log_return.columns.to_list(),
    lags=lags,
    transform_func=lambda x, lag: x.shift(lag))

In [59]:
def generate_pairs(commodities, stocks):
    """
    Create list of all possible (commodity, stock) pairs.
    
    Parameters:
        commodities: Input Dataframe of commodity closing prices with datetime index
        stocks: Input Dataframe of commodity closing prices with datetime index
    
    Returns:
        list: list of tuples of (commodity name, stock name)
    """
    return list(product(commodities.columns, stocks.columns))

def compute_features(commodity, stock, window=21, lag=1):
    """
    Operations:
        1. Calculates a lagged log returns of a commodity
        2. Calculates a log return of stock
        3. Calculates a 21-day rolling correlation between the commodity and the stock
        4. Calculates a 3-day gradient of the commodity.
    
    Parameters:
        commodity: Input Dataframe of commodity closing prices with datetime index
        stock: Input Dataframe of commodity closing prices with datetime index
        lag: Integer lag number
    
    Returns:
        pd.DataFrame: dataframe with stock, commodity log returns, rolling_corr, and 3-day gradient
    """
    commodity_name = commodity.name + f'_lag_{str(lag)}'
    stock_name = stock.name
    
    commodity_returns =  np.log(commodity).diff(lag)
    stock_returns = np.log(stock).diff(1)
    df = pd.DataFrame(
        {commodity_name: commodity_returns,
         stock_name: stock_returns
    }).dropna()
    df['rolling_corr'] = df[commodity_name].rolling(window=window).corr(df[stock_name])
    df['gradient'] = df[commodity_name].rolling(window=3).apply(lambda x: linregress(range(3), x).slope, raw=True)

    return df.dropna()

def detect_trade_signals(df, threshold=0.95, min_streak=3, quantile=0.5):
    """
    Operations:
        1. Identifies rolling correlation over a threshold.
        2. Identifies streaks of rolling correlations over threshold.
        3. Filters for top 50% absolute gradient values.
        4. Calucluates trade signals based on sign of the gradient.
    
    Parameters:
        df: Input Dataframe of commodity closing prices with datetime index
        thershold: int - threshold of rolling correlation
        min_streak: int - minimum number of consecutive days of correlations above threshold
    
    Returns:
        pd.DataFrame: dataframe with rolling_corr, gradient, and trade_signal
    """

    df['over_thresh'] = df['rolling_corr'] > threshold
    df['streak'] = df['over_thresh'].astype(int).groupby(df['over_thresh'].ne(df['over_thresh'].shift()).cumsum()).cumsum()

    # Filter for streaks that meet min_streak requirement
    signal_df = df[(df['streak'] >= min_streak)]

    # Select top 50% of absolute gradient values
    cutoff = signal_df['gradient'].abs().quantile(quantile)
    signal_df = signal_df[signal_df['gradient'].abs() >= cutoff]
    signal_df['trade_signal'] = signal_df['gradient'].apply(lambda x: 'long' if x > 0 else 'short')

    return signal_df


def print_signals(signals, commodity_name, stock_name, lag):
    """Prints trade signals in a readable format."""
    if not signals.empty:
        print(f"\n📢 Trade Signals for {commodity_name} (lag={lag}) & {stock_name}")
        print(signals[['rolling_corr', 'gradient', 'trade_signal']])

def main(commodities, stocks, threshold=0.95, min_streak=3, window=7, lag=1):
    """
        
    """

    # Generate pairs
    pairs = generate_pairs(commodities, stocks)

    # Process each pair
    for commodity, stock in pairs:
        # print(f"\n🔄 Processing {commodity} & {stock}...")
        df_features = compute_features(commodities[commodity], stocks[stock], window=window, lag=lag)
        trade_signals = detect_trade_signals(df_features, threshold=threshold, min_streak=min_streak)
        
        print_signals(trade_signals, commodity + f"_lag_{lag}", stock, lag=lag)


### Let's test our feature and signal logic

In [60]:
##### Set Dataframes & Call Function ######
test_commodities = commodities.filter(like='Close')
test_stocks = stocks

for lag in range(1, 4):
    print(f"\n🚀 Running strategy with LAG = {lag}")
    main(test_commodities, test_stocks, threshold=0.80, min_streak=3, window=14, lag=lag)


🚀 Running strategy with LAG = 1

📢 Trade Signals for GC=F Close_lag_1 (lag=1) & FTNT
            rolling_corr  gradient trade_signal
Date                                           
2022-11-29       0.80242 -0.000022        short

📢 Trade Signals for GC=F Close_lag_1 (lag=1) & MSFT
            rolling_corr  gradient trade_signal
Date                                           
2024-11-05      0.816867  0.000712         long

📢 Trade Signals for GC=F Close_lag_1 (lag=1) & WDC
            rolling_corr  gradient trade_signal
Date                                           
2022-10-21      0.816994  0.012717         long

📢 Trade Signals for HG=F Close_lag_1 (lag=1) & IBM
            rolling_corr  gradient trade_signal
Date                                           
2023-05-31      0.827258 -0.016525        short

📢 Trade Signals for HG=F Close_lag_1 (lag=1) & MDB
            rolling_corr  gradient trade_signal
Date                                           
2022-06-15      0.870638  0.01064

In [61]:
features = compute_features(test_commodities['SI=F Close'],test_stocks['WDC'], window=14, lag=1)
trade_signals = detect_trade_signals(features, threshold=0.7, min_streak=2)

In [62]:
features.head()

Unnamed: 0_level_0,SI=F Close_lag_1,WDC,rolling_corr,gradient,over_thresh,streak
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-24,-0.021577,-0.01376,0.135706,-0.020699,False,0
2022-01-25,0.004236,-0.034434,0.084839,0.010216,False,0
2022-01-26,-0.003731,-0.009301,0.075389,0.008923,False,0
2022-01-27,-0.048805,-0.013651,-0.109705,-0.02652,False,0
2022-01-28,-0.016498,-0.075996,0.021203,-0.006383,False,0


In [63]:
trade_signals.head()

Unnamed: 0_level_0,SI=F Close_lag_1,WDC,rolling_corr,gradient,over_thresh,streak,trade_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-10-17,0.035927,0.005029,0.705881,0.018494,True,2,long
2022-10-18,-0.005531,-0.008296,0.706681,0.020101,True,3,long
2022-10-19,-0.012735,-0.007466,0.717963,-0.024331,True,4,short
2022-10-20,0.017837,0.003292,0.708633,0.011684,True,5,long
2022-12-27,0.011547,0.018783,0.71901,0.01754,True,2,long


#### logic generates long and short signals

##### Next steps
1. backtest logic
2. optimize threshold, gradient, minimum streak
3. visualize