In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr
from datetime import datetime
import os
from sklearn.linear_model import HuberRegressor, ElasticNet
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data (keeping your exact data loading)
hf_df = pd.read_excel("Data/hedge_funds_returns_data.xlsx")
factors_df = pd.read_excel("Data/factors_returns_data.xlsx")
mrkt_df = pd.read_excel("Data/market_signals_indexes.xlsx")

short_mapping = {
    'Date': 'Date',
    'HFRI 400 (US) Fund Weighted Composite Index (HFRI4FWC)': 'HFRI4FWC',
    'HFRI 400 (US) EH: Long/Short Index (HFRI4ELS)': 'HFRI4ELS',
    'HFRI 400 (US) EH: Fundamental Value Index (HFRI4EHV)': 'HFRI4EHV',
    'HFRI 400 (US) Event-Driven Index (HFRI4ED)': 'HFRI4ED'
    
}

hf_df = hf_df.rename(columns=short_mapping)

In [13]:
df_temp = pd.merge(hf_df, mrkt_df, on='Date')
df = pd.merge(df_temp, factors_df, on='Date')
df.head()

Unnamed: 0,Date,HFRI4FWC,HFRI4ELS,HFRI4ED,HFRI4EHV,vix_index,USYC2Y10_INDEX,LUACOAS_Index,PMI,Mkt-RF,SMB,HML,Mom,RF
0,2005-01-31,0.0047,0.0067,0.0012,0.0049,12.82,85.558,0.82,56.8,-0.0276,-0.0172,0.0206,0.0305,0.0016
1,2005-02-28,0.0198,0.0279,0.0111,0.0219,12.08,78.0,0.77,55.5,0.0189,-0.0057,0.0153,0.0337,0.0016
2,2005-03-31,-0.0103,-0.0172,0.0002,-0.0111,14.02,71.579,0.93,55.2,-0.0197,-0.014,0.0204,0.0041,0.0021
3,2005-04-29,-0.0124,-0.0165,-0.0105,-0.0137,15.31,55.521,1.04,52.2,-0.0261,-0.0393,0.0006,-0.0069,0.0021
4,2005-05-31,0.0079,0.0122,0.0064,0.0122,13.29,40.832,0.87,50.8,0.0365,0.0289,-0.0064,0.0045,0.0024


In [14]:
print(df.head(1))
print(df.tail(1))

        Date  HFRI4FWC  HFRI4ELS  HFRI4ED  HFRI4EHV  vix_index  \
0 2005-01-31    0.0047    0.0067   0.0012    0.0049      12.82   

   USYC2Y10_INDEX  LUACOAS_Index   PMI  Mkt-RF     SMB     HML  Mom         RF  
0          85.558           0.82  56.8 -0.0276 -0.0172  0.0206  0.0305  0.0016  
          Date  HFRI4FWC  HFRI4ELS  HFRI4ED  HFRI4EHV  vix_index  \
220 2023-05-31   -0.0036    0.0007  -0.0189   -0.0067      17.94   

     USYC2Y10_INDEX  LUACOAS_Index   PMI  Mkt-RF     SMB    HML  Mom     \
220         -76.424           1.38  46.4  0.0035  0.0153 -0.078 -0.0063   

         RF  
220  0.0036  


In [15]:
print(df.columns)

Index(['Date', 'HFRI4FWC', 'HFRI4ELS', 'HFRI4ED', 'HFRI4EHV', 'vix_index',
       'USYC2Y10_INDEX', 'LUACOAS_Index', 'PMI', 'Mkt-RF', 'SMB', 'HML',
       'Mom   ', 'RF'],
      dtype='object')


In [16]:
# Define feature variables (macro indicators)
X = df[['vix_index', 'USYC2Y10_INDEX', 'LUACOAS_Index','Mkt-RF','RF' ]]

In [17]:
# Define target variables (7 Hedge Fund Returns)
HF1 = df['HFRI4FWC']  # Fund Weighted Composite
HF2 = df['HFRI4ELS']  # Long/Short
HF3 = df['HFRI4EHV']  # Fundamental Value
HF4 = df['HFRI4ED']   # Event-Driven

In [18]:
print(df.columns.tolist())  # This will show you all column names

['Date', 'HFRI4FWC', 'HFRI4ELS', 'HFRI4ED', 'HFRI4EHV', 'vix_index', 'USYC2Y10_INDEX', 'LUACOAS_Index', 'PMI', 'Mkt-RF', 'SMB', 'HML', 'Mom   ', 'RF']


In [19]:
def get_quarter_info(date):
    """
    Get quarter string and quarter start/end dates
    """
    year = date.year
    month = date.month
    
    if month <= 3:
        quarter = f"Q1 {year}"
        quarter_start = pd.Timestamp(year, 1, 1)
        quarter_end = pd.Timestamp(year, 3, 31)
    elif month <= 6:
        quarter = f"Q2 {year}"
        quarter_start = pd.Timestamp(year, 4, 1)
        quarter_end = pd.Timestamp(year, 6, 30)
    elif month <= 9:
        quarter = f"Q3 {year}"
        quarter_start = pd.Timestamp(year, 7, 1)
        quarter_end = pd.Timestamp(year, 9, 30)
    else:
        quarter = f"Q4 {year}"
        quarter_start = pd.Timestamp(year, 10, 1)
        quarter_end = pd.Timestamp(year, 12, 31)
    
    return quarter, quarter_start, quarter_end

#### **Huber + AdaBoost + ElasticNet = 0.4 + 0.4 + 0.2**

In [20]:
def train_ensemble_model_simple(X_train, y_train, X_predict):
    """
    Train improved models with better balance between bias and variance
    """
    from sklearn.linear_model import HuberRegressor, ElasticNet
    from sklearn.ensemble import AdaBoostRegressor
    
    # Use Huber regression for outlier robustness (40% weight)
    huber_model = HuberRegressor(
        epsilon=1.35,           # Threshold for outliers (default)
        max_iter=100,           # Maximum iterations
        alpha=0.1,              # Regularization strength
        warm_start=False,
        fit_intercept=True,
        tol=1e-05
    )
    huber_model.fit(X_train, y_train)
    
    # Use AdaBoost for adaptive boosting (40% weight)
    adaboost_model = AdaBoostRegressor(
        n_estimators=50,        # Number of weak learners
        learning_rate=0.1,      # Conservative learning rate
        loss='linear',          # Loss function for regression
        random_state=42
    )
    adaboost_model.fit(X_train, y_train)
    
    # Add ElasticNet for combined L1/L2 regularization (20% weight)
    elasticnet_model = ElasticNet(
        alpha=0.01,      # Light regularization
        l1_ratio=0.5,    # 50% L1, 50% L2 (balanced)
        random_state=42
    )
    elasticnet_model.fit(X_train, y_train)
    
    # Make predictions
    huber_pred = huber_model.predict(X_predict)
    adaboost_pred = adaboost_model.predict(X_predict)
    elasticnet_pred = elasticnet_model.predict(X_predict)
    
    # Weighted ensemble (40% Huber, 40% AdaBoost, 20% ElasticNet)
    ensemble_pred = (0.4 * huber_pred + 
                    0.4 * adaboost_pred + 
                    0.2 * elasticnet_pred)
    
    return ensemble_pred

In [21]:
def calculate_expected_returns(df, feature_cols=['vix_index', 'USYC2Y10_INDEX', 'LUACOAS_Index','Mkt-RF','RF' ], 
                             target_cols=['HFRI4FWC', 'HFRI4ELS', 'HFRI4EHV', 'HFRI4ED'], 
                             date_col='Date', training_years=7):
    """
    Calculate expected returns for each quarter using rolling 7-year training periods
    """
    # Ensure date column is datetime
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col).reset_index(drop=True)
    
    results = []
    
    # Find all unique quarters in the data (starting from training_years + 1)
    start_date = df[date_col].min() + pd.DateOffset(years=training_years)
    end_date = df[date_col].max()
    
    print(f"Calculating expected returns from {start_date.strftime('%Y-%m')} to {end_date.strftime('%Y-%m')}")
    
    # Generate all quarters from start_date to end_date
    current_date = pd.Timestamp(start_date.year, ((start_date.month-1)//3)*3 + 1, 1)  # Start of quarter
    
    while current_date <= end_date:
        quarter, quarter_start, quarter_end = get_quarter_info(current_date)
        
        # Check if we have data for this quarter
        quarter_data = df[(df[date_col] >= quarter_start) & (df[date_col] <= quarter_end)]
        
        if len(quarter_data) == 0:
            # Move to next quarter
            if current_date.month <= 3:
                current_date = pd.Timestamp(current_date.year, 4, 1)
            elif current_date.month <= 6:
                current_date = pd.Timestamp(current_date.year, 7, 1)
            elif current_date.month <= 9:
                current_date = pd.Timestamp(current_date.year, 10, 1)
            else:
                current_date = pd.Timestamp(current_date.year + 1, 1, 1)
            continue
        
        # Define training period (7 years before quarter start)
        train_start = quarter_start - pd.DateOffset(years=training_years)
        train_end = quarter_start - pd.DateOffset(days=1)  # Day before quarter starts
        
        # Get training data
        train_data = df[(df[date_col] >= train_start) & (df[date_col] <= train_end)]
        
        # Skip if insufficient training data
        if len(train_data) < 24:  # Need at least 2 years of monthly data
            print(f"Insufficient training data for {quarter}, skipping...")
            # Move to next quarter
            if current_date.month <= 3:
                current_date = pd.Timestamp(current_date.year, 4, 1)
            elif current_date.month <= 6:
                current_date = pd.Timestamp(current_date.year, 7, 1)
            elif current_date.month <= 9:
                current_date = pd.Timestamp(current_date.year, 10, 1)
            else:
                current_date = pd.Timestamp(current_date.year + 1, 1, 1)
            continue
        
        print(f"Processing {quarter} - Training: {train_start.strftime('%b %Y')} to {train_end.strftime('%b %Y')}")
        
        # Prepare training features
        X_train = train_data[feature_cols]
        X_quarter = quarter_data[feature_cols]
        
        # Store predictions for this quarter
        quarter_predictions = {'Quarter': quarter}
        
        # Train model for each target variable
        for target in target_cols:
            try:
                y_train = train_data[target]
                
                # Train ensemble model and predict
                predictions = train_ensemble_model_simple(X_train, y_train, X_quarter)
                
                # Average prediction for the quarter (mean of monthly predictions)
                quarter_predictions[target] = np.mean(predictions)
                
            except Exception as e:
                print(f"Error predicting {target} for {quarter}: {str(e)}")
                quarter_predictions[target] = np.nan
        
        results.append(quarter_predictions)
        
        # Move to next quarter
        if current_date.month <= 3:
            current_date = pd.Timestamp(current_date.year, 4, 1)
        elif current_date.month <= 6:
            current_date = pd.Timestamp(current_date.year, 7, 1)
        elif current_date.month <= 9:
            current_date = pd.Timestamp(current_date.year, 10, 1)
        else:
            current_date = pd.Timestamp(current_date.year + 1, 1, 1)
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    
    print(f"\nCompleted! Generated expected returns for {len(results_df)} quarters")
    
    return results_df


In [22]:
def display_expected_returns_summary(results_df):
    """
    Display summary statistics of the expected returns
    """
    if results_df.empty:
        print("No results to display")
        return
    
    print("\n" + "="*60)
    print("HEDGE FUND EXPECTED RETURNS SUMMARY")
    print("="*60)
    
    # Numeric columns only
    numeric_cols = results_df.select_dtypes(include=[np.number]).columns
    
    print(f"\nNumber of quarters: {len(results_df)}")
    print(f"Date range: {results_df['Quarter'].iloc[0]} to {results_df['Quarter'].iloc[-1]}")
    
    print("\nMean Expected Returns:")
    for col in numeric_cols:
        mean_return = results_df[col].mean()
        print(f"  {col}: {mean_return:.4f} ({mean_return*100:.2f}%)")
    
    print("\nStandard Deviation of Expected Returns:")
    for col in numeric_cols:
        std_return = results_df[col].std()
        print(f"  {col}: {std_return:.4f} ({std_return*100:.2f}%)")
    
    print("\nFirst 5 quarters:")
    print(results_df.head().round(4))
    
    print("\nLast 5 quarters:")
    print(results_df.tail().round(4))


In [23]:
# Main execution function for expected returns
def run_expected_returns_calculation(df):
    """
    Run the complete expected returns calculation for hedge funds
    """
    # Define features and targets
    feature_cols = ['vix_index', 'USYC2Y10_INDEX', 'LUACOAS_Index','Mkt-RF','RF' ]
    target_cols = ['HFRI4FWC', 'HFRI4ELS', 'HFRI4EHV', 'HFRI4ED']
    
    # Calculate expected returns
    expected_returns_df = calculate_expected_returns(
        df, 
        feature_cols=feature_cols, 
        target_cols=target_cols,
        date_col='Date',
        training_years=7
    )
    
    # Display summary
    display_expected_returns_summary(expected_returns_df)
    
    return expected_returns_df

In [24]:
expected_returns = run_expected_returns_calculation(df)
print(expected_returns.head())

Calculating expected returns from 2012-01 to 2023-05
Processing Q1 2012 - Training: Jan 2005 to Dec 2011
Processing Q2 2012 - Training: Apr 2005 to Mar 2012
Processing Q3 2012 - Training: Jul 2005 to Jun 2012
Processing Q4 2012 - Training: Oct 2005 to Sep 2012
Processing Q1 2013 - Training: Jan 2006 to Dec 2012
Processing Q2 2013 - Training: Apr 2006 to Mar 2013
Processing Q3 2013 - Training: Jul 2006 to Jun 2013
Processing Q4 2013 - Training: Oct 2006 to Sep 2013
Processing Q1 2014 - Training: Jan 2007 to Dec 2013
Processing Q2 2014 - Training: Apr 2007 to Mar 2014
Processing Q3 2014 - Training: Jul 2007 to Jun 2014
Processing Q4 2014 - Training: Oct 2007 to Sep 2014
Processing Q1 2015 - Training: Jan 2008 to Dec 2014
Processing Q2 2015 - Training: Apr 2008 to Mar 2015
Processing Q3 2015 - Training: Jul 2008 to Jun 2015
Processing Q4 2015 - Training: Oct 2008 to Sep 2015
Processing Q1 2016 - Training: Jan 2009 to Dec 2015
Processing Q2 2016 - Training: Apr 2009 to Mar 2016
Processing 

In [25]:
expected_returns.head(5)

Unnamed: 0,Quarter,HFRI4FWC,HFRI4ELS,HFRI4EHV,HFRI4ED
0,Q1 2012,0.016628,0.022772,0.020539,0.01604
1,Q2 2012,0.001403,-0.000219,-0.002237,0.001847
2,Q3 2012,0.009219,0.013701,0.012927,0.011619
3,Q4 2012,0.004305,0.004202,0.004129,0.005091
4,Q1 2013,0.01462,0.019952,0.020217,0.01684


In [26]:
expected_returns.tail(5)

Unnamed: 0,Quarter,HFRI4FWC,HFRI4ELS,HFRI4EHV,HFRI4ED
41,Q2 2022,-0.012201,-0.023422,-0.029179,-0.012557
42,Q3 2022,-0.001204,-0.003537,-0.006279,-0.004387
43,Q4 2022,0.006773,0.007284,0.00558,0.004758
44,Q1 2023,0.006924,0.009231,0.010007,0.005268
45,Q2 2023,0.001152,0.004535,0.005377,0.001956


In [None]:
output_folder = 'all_output_results'
os.makedirs(output_folder, exist_ok=True)
expected_returns.to_excel(os.path.join(output_folder, 'ER_ML_HedgeFunds.xlsx'), index=False)