## **Ensemble = 0.4 × Huber + 0.4 × AdaBoost + 0.2 × ElasticNet**

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr
from datetime import datetime
import os
from sklearn.linear_model import HuberRegressor, ElasticNet
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the three input workbooks: hedge-fund index returns, factor returns,
# and market signal indexes.
# NOTE(review): the paths are absolute and machine-specific, so this cell only
# runs where the files exist at exactly these locations.
hf_df = pd.read_excel("/Users/jatindhurve/Desktop/PROJECTS/main_HF_project/Data/hedge_funds_returns_data.xlsx")
factors_df = pd.read_excel("/Users/jatindhurve/Desktop/PROJECTS/main_HF_project/Data/factors_returns_data.xlsx")
mrkt_df = pd.read_excel("/Users/jatindhurve/Desktop/PROJECTS/main_HF_project/Data/market_signals_indexes.xlsx")

# Map the verbose HFRI column headers to the short codes used as target names
# in the rest of the notebook.
short_mapping = {
    'Date': 'Date',
    'HFRI 400 (US) Fund Weighted Composite Index (HFRI4FWC)': 'HFRI4FWC',
    'HFRI 400 (US) EH: Long/Short Index (HFRI4ELS)': 'HFRI4ELS',
    'HFRI 400 (US) EH: Fundamental Value Index (HFRI4EHV)': 'HFRI4EHV',
    'HFRI 400 (US) Event-Driven Index (HFRI4ED)': 'HFRI4ED'
    
}

# rename() returns a new frame; hf_df is rebound to the renamed version.
hf_df = hf_df.rename(columns=short_mapping)

In [3]:
# Preview the first three rows of the renamed hedge-fund returns.
hf_df.head(3)

Unnamed: 0,Date,HFRI4FWC,HFRI4ELS,HFRI4ED,HFRI4EHV
0,2005-01-31,0.0047,0.0067,0.0012,0.0049
1,2005-02-28,0.0198,0.0279,0.0111,0.0219
2,2005-03-31,-0.0103,-0.0172,0.0002,-0.0111


In [4]:
# NOTE(review): identical to the previous cell (same expression, same output);
# one of the two previews is redundant.
hf_df.head(3)

Unnamed: 0,Date,HFRI4FWC,HFRI4ELS,HFRI4ED,HFRI4EHV
0,2005-01-31,0.0047,0.0067,0.0012,0.0049
1,2005-02-28,0.0198,0.0279,0.0111,0.0219
2,2005-03-31,-0.0103,-0.0172,0.0002,-0.0111


In [5]:
# Join the three sources on 'Date'. pd.merge defaults to an inner join, so only
# dates present in all three frames survive.
# NOTE(review): the mrkt_df preview shows 2005-04-29 (the last business day of
# April) rather than 2005-04-30. If hf_df / factors_df stamp that month with the
# calendar month end, the exact-match join silently drops it — confirm that the
# date conventions of the three files agree.
df_temp = pd.merge(hf_df, mrkt_df, on='Date')
df = pd.merge(df_temp, factors_df, on='Date')

In [6]:
# Preview the market-signal inputs (VIX, 2s10s curve, LUACOAS index, PMI).
mrkt_df.head()

Unnamed: 0,Date,vix_index,USYC2Y10_INDEX,LUACOAS_Index,PMI
0,2004-12-31,13.29,117.464,0.81,57.2
1,2005-01-31,12.82,85.558,0.82,56.8
2,2005-02-28,12.08,78.0,0.77,55.5
3,2005-03-31,14.02,71.579,0.93,55.2
4,2005-04-29,15.31,55.521,1.04,52.2


In [7]:
def generate_rolling_periods(df, date_col='Date', years=7, quarter_shift=3, train_frac=0.80):
    """
    Generate overlapping rolling windows, each split chronologically into train/test.

    Windows are `years` long (7 by default) and their start dates advance by
    `quarter_shift` months (3 by default, i.e. one quarter). Generation stops
    at the first window whose end date would fall after the last date in `df`.
    Both window bounds are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing `date_col`. It is not modified: date parsing and
        sorting are done on a copy.
    date_col : str
        Name of the date column (anything `pd.to_datetime` can parse).
    years : int
        Length of each window in years.
    quarter_shift : int
        Number of months between consecutive window starts. Must be positive.
    train_frac : float
        Fraction of each window's rows (earliest first) used for training; the
        remaining rows form the test set. Must lie strictly between 0 and 1.

    Returns
    -------
    list of dict
        One dict per window with keys 'period_name' (e.g. "Jan 2005-Jan 2012"),
        'train_data' and 'test_data' (DataFrames). Windows with fewer than 24
        rows are skipped. An empty list is returned when `df` has no valid dates.

    Raises
    ------
    ValueError
        If `quarter_shift` is not positive or `train_frac` is outside (0, 1).
    """
    if quarter_shift <= 0:
        # A non-positive shift would never advance the window start.
        raise ValueError("quarter_shift must be a positive number of months")
    if not 0 < train_frac < 1:
        raise ValueError("train_frac must be strictly between 0 and 1")

    # assign() returns a new frame, so the caller's date column keeps its dtype.
    df = df.assign(**{date_col: pd.to_datetime(df[date_col])})
    df = df.sort_values(date_col).reset_index(drop=True)

    periods = []
    start_date = df[date_col].min()
    end_date = df[date_col].max()

    # With no valid dates min()/max() are NaT; every NaT comparison is False,
    # so the loop below would never reach its break.
    if pd.isna(start_date):
        return periods

    current_start = start_date

    while True:
        current_end = current_start + pd.DateOffset(years=years)

        if current_end > end_date:
            break

        # Rows inside the current window (inclusive on both ends)
        in_window = (df[date_col] >= current_start) & (df[date_col] <= current_end)
        period_data = df[in_window].copy()

        if len(period_data) >= 24:  # At least 2 years of monthly data
            # Chronological split: earliest train_frac of rows train, rest test
            split_idx = int(len(period_data) * train_frac)
            train_data = period_data.iloc[:split_idx]
            test_data = period_data.iloc[split_idx:]

            period_name = f"{current_start.strftime('%b %Y')}-{current_end.strftime('%b %Y')}"

            periods.append({
                'period_name': period_name,
                'train_data': train_data,
                'test_data': test_data
            })

        # Move the window start forward by quarter_shift months
        current_start = current_start + pd.DateOffset(months=quarter_shift)

    return periods


#### **Ensemble weights: 0.4 × Huber + 0.4 × AdaBoost + 0.2 × ElasticNet**

In [8]:
def train_regularized_model(X_train, y_train, X_test):
    """
    Fit a fixed-weight three-model ensemble and return its predictions.

    The ensemble combines a HuberRegressor (weight 0.4), an AdaBoostRegressor
    (weight 0.4) and an ElasticNet (weight 0.2). Every model is fitted on
    (X_train, y_train) with the hyperparameters spelled out below.

    Returns
    -------
    tuple
        (ensemble_train_pred, ensemble_test_pred): the weighted sums of the
        three models' predictions on X_train and on X_test.
    """
    from sklearn.linear_model import HuberRegressor, ElasticNet
    from sklearn.ensemble import AdaBoostRegressor

    # (weight, unfitted estimator) in the order Huber, AdaBoost, ElasticNet
    weighted_models = (
        (0.4, HuberRegressor(
            epsilon=1.35,
            max_iter=100,
            alpha=0.1,
            warm_start=False,
            fit_intercept=True,
            tol=1e-05,
        )),
        (0.4, AdaBoostRegressor(
            n_estimators=50,
            learning_rate=0.1,
            loss='linear',
            random_state=42,
        )),
        (0.2, ElasticNet(
            alpha=0.01,
            l1_ratio=0.5,
            random_state=42,
        )),
    )

    train_terms = []
    test_terms = []
    for weight, estimator in weighted_models:
        estimator.fit(X_train, y_train)
        train_terms.append(weight * estimator.predict(X_train))
        test_terms.append(weight * estimator.predict(X_test))

    # Sum the weighted terms left to right: Huber + AdaBoost + ElasticNet
    huber_train, adaboost_train, elasticnet_train = train_terms
    huber_test, adaboost_test, elasticnet_test = test_terms

    ensemble_train_pred = huber_train + adaboost_train + elasticnet_train
    ensemble_test_pred = huber_test + adaboost_test + elasticnet_test

    return ensemble_train_pred, ensemble_test_pred

In [9]:
def calculate_metrics(y_true, y_pred):
    """
    Score predictions against realised values with five metrics.

    Returns a dict with keys:
      'R2'       - coefficient of determination (r2_score)
      'RMSE'     - square root of the mean squared error
      'MAE'      - mean absolute error
      'Hit_Rate' - share of observations where "value > 0" agrees between
                   y_true and y_pred
      'IC'       - Pearson correlation between y_true and y_pred; set to 0.0
                   when y_pred holds a single distinct value
    """
    metrics = {}

    metrics['R2'] = r2_score(y_true, y_pred)
    metrics['RMSE'] = np.sqrt(mean_squared_error(y_true, y_pred))
    metrics['MAE'] = mean_absolute_error(y_true, y_pred)

    # Directional accuracy: both positive, or both non-positive
    same_direction = (y_true > 0) == (y_pred > 0)
    metrics['Hit_Rate'] = np.mean(same_direction)

    # Correlation is only computed when the predictions actually vary
    predictions_vary = len(np.unique(y_pred)) > 1
    if predictions_vary:
        correlation, _ = pearsonr(y_true, y_pred)
    else:
        correlation = 0.0
    metrics['IC'] = correlation

    return metrics

In [10]:
def rolling_analysis(df, feature_cols, target_col, date_col='Date'):
    """
    Fit and score the ensemble on every rolling window for one target column.

    Windows come from generate_rolling_periods(df, date_col). A window is
    skipped when it has fewer than 12 training rows or fewer than 6 test rows,
    or when the training features or training target contain nulls. Any
    exception raised while processing a window is printed and that window is
    skipped.

    Returns
    -------
    pandas.DataFrame
        One row per processed window: 'time_period' followed by the train_* and
        test_* versions of R2, rmse, mae, hit_rate and ic.
    """
    windows = generate_rolling_periods(df, date_col)
    n_windows = len(windows)

    print(f"Running rolling analysis for {target_col}...")
    print(f"Total periods: {n_windows}")

    # (key returned by calculate_metrics, suffix used in the result column name)
    metric_columns = (
        ('R2', 'R2'),
        ('RMSE', 'rmse'),
        ('MAE', 'mae'),
        ('Hit_Rate', 'hit_rate'),
        ('IC', 'ic'),
    )

    rows = []

    for position, window in enumerate(windows, start=1):
        try:
            train_part = window['train_data']
            test_part = window['test_data']

            X_train, y_train = train_part[feature_cols], train_part[target_col]
            X_test, y_test = test_part[feature_cols], test_part[target_col]

            # Not enough observations in either split
            too_small = len(X_train) < 12 or len(X_test) < 6
            if too_small:
                continue

            # Missing values in the training data
            has_nulls = X_train.isnull().any().any() or y_train.isnull().any()
            if has_nulls:
                continue

            train_pred, test_pred = train_regularized_model(X_train, y_train, X_test)

            train_metrics = calculate_metrics(y_train, train_pred)
            test_metrics = calculate_metrics(y_test, test_pred)

            row = {'time_period': window['period_name']}
            for split_name, split_metrics in (('train', train_metrics), ('test', test_metrics)):
                for metric_key, suffix in metric_columns:
                    row[f"{split_name}_{suffix}"] = split_metrics[metric_key]

            rows.append(row)

            # Progress message on every fifth window position
            if position % 5 == 0:
                print(f"Completed {position}/{n_windows} periods")

        except Exception as e:
            print(f"Error in period {window['period_name']}: {str(e)}")

    return pd.DataFrame(rows)


In [11]:
def run_full_analysis(df):
    """
    Run the rolling analysis for each of the four HFRI target indexes.

    The same six feature columns are used for every target. For each target a
    banner is printed, rolling_analysis is run, and - when it produced any
    rows - a short summary of the test metrics is printed.

    Returns
    -------
    dict
        Maps each key of the target table below (e.g. 'HF1_HFRI4FWC') to the
        DataFrame returned by rolling_analysis for that target.
    """
    feature_cols = ['vix_index', 'USYC2Y10_INDEX', 'LUACOAS_Index', 'PMI', 'Mkt-RF', 'RF']

    # result key -> target column in df
    target_cols = {
        'HF1_HFRI4FWC': 'HFRI4FWC',
        'HF2_HFRI4ELS': 'HFRI4ELS',
        'HF3_HFRI4EHV': 'HFRI4EHV',
        'HF4_HFRI4ED': 'HFRI4ED',
    }

    banner = '=' * 50
    results = {}

    for key, target_col in target_cols.items():
        print(f"\n{banner}")
        print(f"Processing {key}: {target_col}")
        print(banner)

        df_results = rolling_analysis(df, feature_cols, target_col)
        results[key] = df_results

        # Nothing to summarise when no window was processed
        if df_results.empty:
            continue

        test_r2 = df_results['test_R2']
        positive_r2 = (test_r2 > 0).sum()

        print(f"\nImproved Summary for {target_col}:")
        print(f"Average Test R²: {test_r2.mean():.4f}")
        print(f"Average Test Hit Rate: {df_results['test_hit_rate'].mean():.4f}")
        print(f"Average Test IC: {df_results['test_ic'].mean():.4f}")
        print(f"Periods with positive R²: {positive_r2}/{len(df_results)}")

    return results

In [12]:
# Run the rolling analysis on the merged frame for every target index.
# `results` maps each target key (e.g. 'HF1_HFRI4FWC') to its per-window
# metrics DataFrame and is consumed by the export cell.
print("Starting improved ML analysis with better bias-variance balance...")
results = run_full_analysis(df)

Starting improved ML analysis with better bias-variance balance...

Processing HF1_HFRI4FWC: HFRI4FWC
Running rolling analysis for HFRI4FWC...
Total periods: 46
Completed 5/46 periods
Completed 10/46 periods
Completed 15/46 periods
Completed 20/46 periods
Completed 25/46 periods
Completed 30/46 periods
Completed 35/46 periods
Completed 40/46 periods
Completed 45/46 periods

Improved Summary for HFRI4FWC:
Average Test R²: 0.4942
Average Test Hit Rate: 0.7916
Average Test IC: 0.8159
Periods with positive R²: 44/46

Processing HF2_HFRI4ELS: HFRI4ELS
Running rolling analysis for HFRI4ELS...
Total periods: 46
Completed 5/46 periods
Completed 10/46 periods
Completed 15/46 periods
Completed 20/46 periods
Completed 25/46 periods
Completed 30/46 periods
Completed 35/46 periods
Completed 40/46 periods
Completed 45/46 periods

Improved Summary for HFRI4ELS:
Average Test R²: 0.6294
Average Test Hit Rate: 0.8325
Average Test IC: 0.8670
Periods with positive R²: 45/46

Processing HF3_HFRI4EHV: HFRI4

In [13]:
# Export results: one worksheet per hedge-fund index, all in a single workbook
output_folder = '/Users/jatindhurve/Desktop/PROJECTS/main_HF_project/all_output_results/ML_validation_outputs'
os.makedirs(output_folder, exist_ok=True)
excel_filename = os.path.join(output_folder, "H_AB_EN_valiedation.xlsx")

# Sheet names double as the keys of `results`
sheet_names = ('HF1_HFRI4FWC', 'HF2_HFRI4ELS', 'HF3_HFRI4EHV', 'HF4_HFRI4ED')

with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
    for sheet_name in sheet_names:
        results[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)

print("All hedge fund results exported successfully!")
print(f"File saved at: {excel_filename}")

All hedge fund results exported successfully!
File saved at: /Users/jatindhurve/Desktop/PROJECTS/main_HF_project/all_output_results/ML_validation_outputs/H_AB_EN_valiedation.xlsx


## **Model Validation**

In [14]:
# Read every worksheet back from the exported workbook. With sheet_name=None,
# read_excel returns a dict keyed by sheet name (see the dict_keys output).
# NOTE(review): this rebinds `df`, which until now held the merged input frame
# passed to run_full_analysis; re-running the analysis cell after this one
# would hand it this dict instead.
df = pd.read_excel("/Users/jatindhurve/Desktop/PROJECTS/main_HF_project/all_output_results/ML_validation_outputs/H_AB_EN_valiedation.xlsx", sheet_name=None)
df.keys()

dict_keys(['HF1_HFRI4FWC', 'HF2_HFRI4ELS', 'HF3_HFRI4EHV', 'HF4_HFRI4ED'])

In [15]:
# Pull each per-index results sheet into its own module-level variable; these
# names are read by create_ranked_hedge_fund_summary.
HF1 = df['HF1_HFRI4FWC']
HF2 = df['HF2_HFRI4ELS']
HF3 = df['HF3_HFRI4EHV']
HF4 = df['HF4_HFRI4ED']

In [16]:
def model_comparison_analysis(df):
    """
    Print summary statistics for every column of `df` whose name starts with 'test_'.

    For each such column the average and the min-to-max range are printed.
    Columns containing 'R2' additionally report how many rows are negative;
    columns containing 'hit_rate' (case-insensitive) report how many rows are
    below 0.5. Nothing is returned.
    """
    n_rows = len(df)
    test_columns = [name for name in df.columns if name.startswith('test_')]

    for name in test_columns:
        values = df[name]
        average, lowest, highest = values.mean(), values.min(), values.max()

        print(f"\n{name.upper()}:")
        print(f"  Average: {average:.4f}")
        print(f"  Range: {lowest:.4f} to {highest:.4f}")

        # Pick the extra line, if any, that applies to this metric
        if 'R2' in name:
            label = "Negative"
            flagged = (values < 0).sum()
        elif 'hit_rate' in name.lower():
            label = "Below 50%"
            flagged = (values < 0.5).sum()
        else:
            continue

        print(f"  {label} periods: {flagged}/{n_rows} ({flagged/n_rows*100:.1f}%)")

# Function to create ranked hedge fund summary
def _recommendation_for(test_r2_avg, breakdown_rate):
    """Map average test R² and breakdown rate (in percent) to a recommendation label."""
    if test_r2_avg < -0.1 or breakdown_rate > 40:
        return "REJECT"
    if test_r2_avg > 0.4 and breakdown_rate < 10:
        return "CORE HOLDING"
    if test_r2_avg > 0.3 and breakdown_rate < 20:
        return "STRONG BUY"
    if test_r2_avg > 0.2 and breakdown_rate < 20:
        return "MODERATE BUY"
    if test_r2_avg > 0 and breakdown_rate < 30:
        return "CAUTION"
    return "AVOID"


def create_ranked_hedge_fund_summary(hedge_funds=None):
    """
    Rank hedge-fund result tables by a composite score and print the ranking.

    Parameters
    ----------
    hedge_funds : dict, optional
        Maps a display name to a results DataFrame containing the columns
        'test_R2', 'test_hit_rate' and 'test_ic'. When omitted, the
        module-level frames HF1-HF4 are used under their sheet names.

    Returns
    -------
    pandas.DataFrame
        One row per fund, best score first, with columns Rank, Hedge_Fund,
        R2_Avg, Hit_Rate_Avg, IC_Avg, Breakdown_Rate, Score, Recommendation.
        The averages and the breakdown rate are formatted as percent strings.

    Notes
    -----
    Breakdown rate is the percentage of rows with negative test R².
    Score = 0.4 * mean R² + 0.3 * mean hit rate + 0.2 * mean IC
            - 0.1 * (breakdown rate / 100).
    """
    if hedge_funds is None:
        hedge_funds = {
            'HF1_HFRI4FWC': HF1,
            'HF2_HFRI4ELS': HF2,
            'HF3_HFRI4EHV': HF3,
            'HF4_HFRI4ED': HF4
        }

    output_columns = ['Rank', 'Hedge_Fund', 'R2_Avg', 'Hit_Rate_Avg', 'IC_Avg',
                      'Breakdown_Rate', 'Score', 'Recommendation']

    summary_results = []

    for hf_name, hf_data in hedge_funds.items():
        test_r2_avg = hf_data['test_R2'].mean()
        test_hit_rate_avg = hf_data['test_hit_rate'].mean()
        test_ic_avg = hf_data['test_ic'].mean()

        r2_neg_count = (hf_data['test_R2'] < 0).sum()
        breakdown_rate = (r2_neg_count / len(hf_data)) * 100

        # Higher R2, higher hit rate, higher IC, lower breakdown = better score
        composite_score = (
            (test_r2_avg * 0.4)
            + (test_hit_rate_avg * 0.3)
            + (test_ic_avg * 0.2)
            - (breakdown_rate / 100 * 0.1)
        )

        summary_results.append({
            'Hedge_Fund': hf_name,
            'R2_Avg': f"{test_r2_avg*100:.1f}%",
            'Hit_Rate_Avg': f"{test_hit_rate_avg*100:.1f}%",
            'IC_Avg': f"{test_ic_avg*100:.1f}%",
            'Breakdown_Rate': f"{breakdown_rate:.1f}%",
            'Score': round(composite_score, 3),
            # Unrounded score, used only for ordering and dropped before return
            '_raw_score': composite_score,
            'Recommendation': _recommendation_for(test_r2_avg, breakdown_rate)
        })

    # Explicit columns keep the frame well-formed even with no funds at all
    record_columns = [c for c in output_columns if c != 'Rank'] + ['_raw_score']
    summary_df = pd.DataFrame(summary_results, columns=record_columns)

    # Rank on the unrounded score so rounding cannot create artificial ties;
    # the stable sort keeps genuine ties in input order.
    summary_df = (
        summary_df
        .sort_values('_raw_score', ascending=False, kind='stable')
        .reset_index(drop=True)
    )
    summary_df['Rank'] = range(1, len(summary_df) + 1)

    # Reorder columns (this also drops the helper column)
    summary_df = summary_df[output_columns]

    print("\nRANKED HEDGE FUND ANALYSIS:")
    print("=" * 80)
    print(summary_df.to_string(index=False))

    return summary_df

# Run the ranked analysis: prints the ranking table and keeps the returned
# summary DataFrame for later use.
ranked_summary = create_ranked_hedge_fund_summary()


RANKED HEDGE FUND ANALYSIS:
 Rank   Hedge_Fund R2_Avg Hit_Rate_Avg IC_Avg Breakdown_Rate  Score Recommendation
    1 HF3_HFRI4EHV  68.7%        85.0%  89.0%           0.0%  0.708   CORE HOLDING
    2 HF2_HFRI4ELS  62.9%        83.2%  86.7%           2.2%  0.673   CORE HOLDING
    3 HF1_HFRI4FWC  49.4%        79.2%  81.6%           4.3%  0.594   CORE HOLDING
    4  HF4_HFRI4ED  37.4%        77.2%  75.2%           4.3%  0.527     STRONG BUY
