In [None]:
# Version 1

In [1]:
# Version 4.2
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 100
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 15  # Optimized for 16-core i9-185H

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.YC_Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    JOIN CWA_Broad_Category_Name b ON f.CWA_Broad_Category_ID = b.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    return pd.read_sql(query, engine, parse_dates=["Date"]).pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_db_factors(factor_list, region="Global", table="factor_returns", portfolio_filter=None):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio_in_clause = ','.join([f"'{p}'" for p in portfolio_filter]) if portfolio_filter else ''
    query = f"""
        SELECT date AS Date, factor_symbol AS Factor, value AS Value
        FROM {table}
        WHERE region = ?
        AND factor_symbol IN ({factor_in_clause})
    """
    if portfolio_filter:
        query += f" AND portfolio IN ({portfolio_in_clause})"
    print(f"Executing query: {query} with param: {region}")
    df = pd.read_sql(query, engine, params=(region,), parse_dates=['Date'])
    print(f"DB factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    print(f"DB factors columns after pivot: {pivoted_df.columns.tolist()}")
    return pivoted_df

def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    return pd.read_sql(query, engine, parse_dates=["Date"]).pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def load_century_factors(factor_list, portfolio, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = [portfolio]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    if region != "Global":
        query += " AND region = ?"
        params.append(region)
    print(f"Executing query: {query} with params: {params}")
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    print(f"Century factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    print(f"Century factors columns after pivot: {pivoted_df.columns.tolist()}")
    return pivoted_df

def load_commodity_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, {factor_in_clause} AS Factor, value AS Value
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    print(f"Commodity factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.set_index("Date")

# Section 4: Rolling Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window or y.isnull().any() or X.isnull().any().any():
                continue
            X_const = add_constant(X)
            model = OLS(y, X_const).fit()
            for factor in X.columns:
                results.append({
                    "SymbolCUSIP": fund,
                    "MonthEndDate": end_date,
                    "RollPeriod": f"{window}m",
                    "Factor_Name": factor,
                    "Coefficient": model.params.get(factor, np.nan),
                    "P_Value": model.pvalues.get(factor, np.nan),
                    "Regression_Type": regression_type,
                    "Factor_Set": factor_set
                })
    return results

# Section 5: Main Processing Pipeline
def process_region(region, fund_subset, regression_func):
    records = []
    
    for _, fund_row in fund_subset.iterrows():
        symbol = fund_row["SymbolCUSIP"]
        category = fund_row["Global_Category_Name"]
        broad_category = fund_row["CWA_Broad_Category_Name"]
        
        # Equity
        if category in [
            "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
            "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
            "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
            "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
            "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
            "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
        ]:
            # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, TSM-EQ, BAB
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "USA")
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_USA_1"))
            
            # Regression 2: Century Factors, US Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "US Stock Selection", region="USA")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_4"))
            
            if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
                # Regression 5: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'RF', 'TSM-FX', 'TSM-FI', 'BAB'], "USA")
                fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
                cmdty_factors = load_commodity_factors(['excess_return_eqwt'])
                factors = pd.concat([factors, fi_factors, cmdty_factors], axis=1)
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']], "OLS", "Equity_USA_5"))
                
                # Regression 6: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'TSM-EQ', 'SMB', 'BAB', 'TSM-Com'], "USA")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
                factors = pd.concat([factors, fi_factors], axis=1)
                if 'mkt' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'tsm-eq', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']], "OLS", "Equity_USA_6"))
                
                # Regression 7: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'TSM-EQ', 'SMB', 'BAB', 'TSM-Com', 'TSM-FI', 'TSM-FX'], "USA")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
                factors = pd.concat([factors, fi_factors], axis=1)
                if 'mkt' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'umd', 'tsm-eq', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']], "OLS", "Equity_USA_7"))
        
        elif category in [
            "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
            "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
            "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
            "UK Equity Large Cap", "Thailand Equity"
        ]:
            # Regression 1: International
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "Intl")
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_Intl_1"))
            
            # Regression 2: Century Factors, Intl Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Intl Stock Selection", region="International")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_4"))
        
        elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
            # Regression 1: Global
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "Global")
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_Global_1"))
            
            # Regression 2: Century Factors, All Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_4"))
        
        # Fixed Income
        elif category in [
            "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
            "US Fixed Income", "US Municipal Fixed Income"
        ]:
            # Regression 1: TERM_Int, TERM_Long, CREDIT, CREDIT_HY, TSM-FI, TSM-FX
            factors = load_db_factors(['TSM-FI', 'TSM-FX'])
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']], "OLS", "FI_1"))
            
            # Regression 2: Century Factors, Fixed Income
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_4"))
        
        # Allocation
        elif category in [
            "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
            "Flexible Allocation", "Moderate Allocation"
        ]:
            # Regression 1: Global, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB, TSM-EQ, TSM-FI, TERM_Int, TERM_Long, CREDIT, CREDIT_HY
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'RF'], "Global")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']], "OLS", "Allocation_1"))
            
            # Regression 2: Century Factors, Fixed Income and All Stock Selection
            fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            factors = pd.concat([fi_factors, stock_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_4"))
        
        # Alternatives
        elif category in [
            "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
        ]:
            # Regression 1: Global, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB, TSM-EQ, TSM-FI, TERM_Int, TERM_Long, CREDIT, CREDIT_HY, TSM-COM, TSM-FX
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'TSM-Com', 'TSM-FX', 'RF'], "Global")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']], "OLS", "Alternative_1"))
            
            # Regression 2: Century Factors, Fixed Income, All Stock Selection, Commodities
            fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            cmdty_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Commodities")
            factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_4"))
        
        # Commodities
        elif category in ["Commodities Broad Basket", "Commodities Specified"]:
            # Regression 1: All factors from aqr_cmdty_factors
            factors = load_commodity_factors(['excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt', 'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short', 'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango'])
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Commodity_1"))
    
    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            print(f"ℹ️ Dry run: Would have written {len(records)} records for {region}")

def main():
    logging.basicConfig(level=logging.INFO)
    
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")
    
    regression_func = run_rolling_regression
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset, regression_func)

# Section 6: Database Output
def insert_batch(records):
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        print(f"ℹ️ Dry run: Skipped writing {len(df)} records")

if __name__ == "__main__":
    main()

ProgrammingError: (pyodbc.ProgrammingError) ('42S02', "[42S02] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid object name 'CWA_Broad_Category_Name'. (208) (SQLExecDirectW)")
[SQL: 
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.YC_Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    JOIN CWA_Broad_Category_Name b ON f.CWA_Broad_Category_ID = b.ID
    ]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [3]:
# Version 4.3
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 100
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 15  # Optimized for 16-core i9-185H

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    print("Executing metadata query...")
    try:
        df = pd.read_sql(query, engine)
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        print(f"Metadata loaded: {df.columns.tolist()} | Shape: {df.shape}")
        if df["CWA_Broad_Category_Name"].isnull().all():
            print("⚠️ Warning: CWA_Broad_Category_Name is missing for all rows; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        print(f"❌ Error loading metadata: {e}")
        raise

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    print(f"Fund returns loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_db_factors(factor_list, region="Global", table="factor_returns", portfolio_filter=None):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio_in_clause = ','.join([f"'{p}'" for p in portfolio_filter]) if portfolio_filter else ''
    query = f"""
        SELECT date AS Date, factor_symbol AS Factor, value AS Value
        FROM {table}
        WHERE region = ?
        AND factor_symbol IN ({factor_in_clause})
    """
    if portfolio_filter:
        query += f" AND portfolio IN ({portfolio_in_clause})"
    print(f"Executing query: {query} with param: {region}")
    df = pd.read_sql(query, engine, params=(region,), parse_dates=['Date'])
    print(f"DB factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    print(f"DB factors columns after pivot: {pivoted_df.columns.tolist()}")
    return pivoted_df

def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    print(f"Fixed income factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def load_century_factors(factor_list, portfolio, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = [portfolio]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    if region != "Global":
        query += " AND region = ?"
        params.append(region)
    print(f"Executing query: {query} with params: {params}")
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    print(f"Century factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    print(f"Century factors columns after pivot: {pivoted_df.columns.tolist()}")
    return pivoted_df

def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    print(f"Commodity factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.set_index("Date")

# Section 4: Rolling Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window or y.isnull().any() or X.isnull().any().any():
                continue
            X_const = add_constant(X)
            model = OLS(y, X_const).fit()
            for factor in X.columns:
                results.append({
                    "SymbolCUSIP": fund,
                    "MonthEndDate": end_date,
                    "RollPeriod": f"{window}m",
                    "Factor_Name": factor,
                    "Coefficient": model.params.get(factor, np.nan),
                    "P_Value": model.pvalues.get(factor, np.nan),
                    "T_Stat": model.tvalues.get(factor, np.nan),
                    "Standard_Error": model.bse.get(factor, np.nan),
                    "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                    "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                    "Adj_R2": model.rsquared_adj,
                    "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                    "Regression_Type": regression_type,
                    "Factor_Set": factor_set
                })
    return results

# Section 5: Main Processing Pipeline
def process_region(region, fund_subset):
    records = []
    
    for _, fund_row in fund_subset.iterrows():
        symbol = fund_row["SymbolCUSIP"]
        category = fund_row["Global_Category_Name"]
        broad_category = fund_row.get("CWA_Broad_Category_Name", None)
        
        # Equity
        if category in [
            "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
            "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
            "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
            "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
            "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
            "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
        ]:
            # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, TSM-EQ, BAB
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "USA", portfolio_filter=["U.S. Large Cap"])
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_USA_1"))
            
            # Regression 2: Century Factors, US Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "US Stock Selection", region="USA")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_4"))
            
            if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
                # Regression 5: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'RF', 'TSM-FX', 'TSM-FI', 'BAB'], "USA", portfolio_filter=["U.S. Large Cap"])
                fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
                cmdty_factors = load_commodity_factors()
                factors = pd.concat([factors, fi_factors, cmdty_factors[['excess_return_eqwt']]], axis=1)
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']], "OLS", "Equity_USA_5"))
                
                # Regression 6: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'TSM-EQ', 'SMB', 'BAB', 'TSM-Com'], "USA", portfolio_filter=["U.S. Large Cap"])
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
                factors = pd.concat([factors, fi_factors], axis=1)
                if 'mkt' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'tsm-eq', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']], "OLS", "Equity_USA_6"))
                
                # Regression 7: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'TSM-EQ', 'SMB', 'BAB', 'TSM-Com', 'TSM-FI', 'TSM-FX'], "USA", portfolio_filter=["U.S. Large Cap"])
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
                factors = pd.concat([factors, fi_factors], axis=1)
                if 'mkt' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'umd', 'tsm-eq', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']], "OLS", "Equity_USA_7"))
            else:
                print(f"Skipping Equity regressions 5-7 for {symbol}: CWA_Broad_Category_Name not available or not Quantitative/Tactical, Strategic, Nontraditional")
        
        # Equity (International)
        elif category in [
            "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
            "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
            "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
            "UK Equity Large Cap", "Thailand Equity"
        ]:
            # Regression 1: International
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "Intl", portfolio_filter=["International"])
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_Intl_1"))
            
            # Regression 2: Century Factors, Intl Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Intl Stock Selection", region="International")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_4"))
        
        # Equity (Global)
        elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
            # Regression 1: Global
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "Global", portfolio_filter=["Global"])
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_Global_1"))
            
            # Regression 2: Century Factors, All Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_4"))
        
        # Fixed Income
        elif category in [
            "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
            "US Fixed Income", "US Municipal Fixed Income"
        ]:
            # Regression 1: TERM_Int, TERM_Long, CREDIT, CREDIT_HY, TSM-FI, TSM-FX
            factors = load_db_factors(['TSM-FI', 'TSM-FX'], "Global")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']], "OLS", "FI_1"))
            
            # Regression 2: Century Factors, Fixed Income
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_4"))
        
        # Allocation
        elif category in [
            "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
            "Flexible Allocation", "Moderate Allocation"
        ]:
            # Regression 1: Global, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB, TSM-EQ, TSM-FI, TERM_Int, TERM_Long, CREDIT, CREDIT_HY
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'RF'], "Global", portfolio_filter=["Global"])
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']], "OLS", "Allocation_1"))
            
            # Regression 2: Century Factors, Fixed Income and All Stock Selection
            fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            factors = pd.concat([fi_factors, stock_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_4"))
        
        # Alternatives
        elif category in [
            "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
        ]:
            # Regression 1: Global, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB, TSM-EQ, TSM-FI, TERM_Int, TERM_Long, CREDIT, CREDIT_HY, TSM-COM, TSM-FX
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'TSM-Com', 'TSM-FX', 'RF'], "Global", portfolio_filter=["Global"])
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']], "OLS", "Alternative_1"))
            
            # Regression 2: Century Factors, Fixed Income, All Stock Selection, Commodities
            fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            cmdty_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Commodities")
            factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_4"))
        
        # Commodities
        elif category in ["Commodities Broad Basket", "Commodities Specified"]:
            # Regression 1: All factors from aqr_cmdty_factors
            factors = load_commodity_factors()
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Commodity_1"))
    
    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            print(f"ℹ️ Dry run: Would have written {len(records)} records for {region}")

def main():
    logging.basicConfig(level=logging.INFO)
    
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")
    
    # Add returns to fund_meta for processing
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
    returns = load_fund_returns(fund_ids)
    fund_meta["returns"] = fund_meta["SymbolCUSIP"].map(lambda x: returns[x] if x in returns.columns else pd.Series())
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset)

# Section 6: Database Output
def insert_batch(records):
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        print(f"ℹ️ Dry run: Skipped writing {len(df)} records")

if __name__ == "__main__":
    main()

Executing metadata query...
Metadata loaded: ['SymbolCUSIP', 'Region', 'YC_Global_Category_ID', 'Global_Category_Name', 'YC_Category_ID', 'Category_Name', 'CWA_Broad_Category_ID', 'CWA_Broad_Category_Name', 'FactorProfile'] | Shape: (5584, 9)
🧠 Total mapped funds: 5584
📍 Regions detected: ['Global' 'USA' 'Unknown' 'International']

Fund returns loaded: ['SymbolCUSIP', 'Date', 'ReturnValue'] | Shape: (9955, 3)
Executing query: 
        SELECT date AS Date, factor_symbol AS Factor, value AS Value
        FROM factor_returns
        WHERE region = ?
        AND factor_symbol IN ('MKT','HML_Devil','QMJ','SMB','UMD','BAB','TSM-EQ','TSM-FI','RF')
     AND portfolio IN ('Global') with param: Global


ProgrammingError: (pyodbc.ProgrammingError) ('42S22', "[42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207) (SQLExecDirectW); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'portfolio'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'factor_symbol'. (207); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Statement(s) could not be prepared. (8180)")
[SQL: 
        SELECT date AS Date, factor_symbol AS Factor, value AS Value
        FROM factor_returns
        WHERE region = ?
        AND factor_symbol IN ('MKT','HML_Devil','QMJ','SMB','UMD','BAB','TSM-EQ','TSM-FI','RF')
     AND portfolio IN ('Global')]
[parameters: ('Global',)]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [4]:
# Version 4.4
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 100
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 15  # Optimized for 16-core i9-185H

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    print("Executing metadata query...")
    try:
        df = pd.read_sql(query, engine)
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        print(f"Metadata loaded: {df.columns.tolist()} | Shape: {df.shape}")
        if df["CWA_Broad_Category_Name"].isnull().all():
            print("⚠️ Warning: CWA_Broad_Category_Name is missing for all rows; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        print(f"❌ Error loading metadata: {e}")
        raise

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    print(f"Fund returns loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM {table}
        WHERE region = ?
        AND factor IN ({factor_in_clause})
    """
    params = [region]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    print(f"Executing query: {query} with params: {params}")
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    print(f"DB factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    print(f"DB factors columns after pivot: {pivoted_df.columns.tolist()}")
    return pivoted_df

def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    print(f"Fixed income factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def load_century_factors(factor_list, portfolio, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = [portfolio]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    if region != "Global":
        query += " AND region = ?"
        params.append(region)
    print(f"Executing query: {query} with params: {params}")
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    print(f"Century factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    print(f"Century factors columns after pivot: {pivoted_df.columns.tolist()}")
    return pivoted_df

def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    print(f"Commodity factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.set_index("Date")

# Section 4: Rolling Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window or y.isnull().any() or X.isnull().any().any():
                continue
            X_const = add_constant(X)
            model = OLS(y, X_const).fit()
            for factor in X.columns:
                results.append({
                    "SymbolCUSIP": fund,
                    "MonthEndDate": end_date,
                    "RollPeriod": f"{window}m",
                    "Factor_Name": factor,
                    "Coefficient": model.params.get(factor, np.nan),
                    "P_Value": model.pvalues.get(factor, np.nan),
                    "T_Stat": model.tvalues.get(factor, np.nan),
                    "Standard_Error": model.bse.get(factor, np.nan),
                    "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                    "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                    "Adj_R2": model.rsquared_adj,
                    "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                    "Regression_Type": regression_type,
                    "Factor_Set": factor_set
                })
    return results

# Section 5: Main Processing Pipeline
def process_region(region, fund_subset):
    records = []
    
    for _, fund_row in fund_subset.iterrows():
        symbol = fund_row["SymbolCUSIP"]
        category = fund_row["Global_Category_Name"]
        broad_category = fund_row.get("CWA_Broad_Category_Name", None)
        
        # Equity
        if category in [
            "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
            "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
            "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
            "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
            "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
            "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
        ]:
            # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, TSM-EQ, BAB
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "USA", asset_class="Equity")
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_USA_1"))
            
            # Regression 2: Century Factors, US Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "US Stock Selection", region="USA")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_4"))
            
            if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
                # Regression 5: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'RF', 'TSM-FX', 'TSM-FI', 'BAB'], "USA", asset_class="Equity")
                fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
                cmdty_factors = load_commodity_factors()
                factors = pd.concat([factors, fi_factors, cmdty_factors[['excess_return_eqwt']]], axis=1)
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']], "OLS", "Equity_USA_5"))
                
                # Regression 6: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'TSM-EQ', 'SMB', 'BAB', 'TSM-Com'], "USA", asset_class="Equity")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
                factors = pd.concat([factors, fi_factors], axis=1)
                if 'mkt' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - load_db_factors(['RF'], 'Global', asset_class="Equity")['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'tsm-eq', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']], "OLS", "Equity_USA_6"))
                
                # Regression 7: Quantitative/Tactical, Strategic, Nontraditional
                factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'TSM-EQ', 'SMB', 'BAB', 'TSM-Com', 'TSM-FI', 'TSM-FX'], "USA", asset_class="Equity")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
                factors = pd.concat([factors, fi_factors], axis=1)
                if 'mkt' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - load_db_factors(['RF'], 'Global', asset_class="Equity")['rf']
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'umd', 'tsm-eq', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']], "OLS", "Equity_USA_7"))
            else:
                print(f"Skipping Equity regressions 5-7 for {symbol}: CWA_Broad_Category_Name not Quantitative/Tactical, Strategic, Nontraditional")
        
        # Equity (International)
        elif category in [
            "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
            "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
            "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
            "UK Equity Large Cap", "Thailand Equity"
        ]:
            # Regression 1: International
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "Intl", asset_class="Equity")
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_Intl_1"))
            
            # Regression 2: Century Factors, Intl Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Intl Stock Selection", region="International")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_4"))
        
        # Equity (Global)
        elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
            # Regression 1: Global
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB', 'RF'], "Global", asset_class="Equity")
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_Global_1"))
            
            # Regression 2: Century Factors, All Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_4"))
        
        # Fixed Income
        elif category in [
            "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
            "US Fixed Income", "US Municipal Fixed Income"
        ]:
            # Regression 1: TERM_Int, TERM_Long, CREDIT, CREDIT_HY, TSM-FI, TSM-FX
            factors = load_db_factors(['TSM-FI', 'TSM-FX'], "Global", asset_class="Fixed Income")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']], "OLS", "FI_1"))
            
            # Regression 2: Century Factors, Fixed Income
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_4"))
        
        # Allocation
        elif category in [
            "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
            "Flexible Allocation", "Moderate Allocation"
        ]:
            # Regression 1: Global, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB, TSM-EQ, TSM-FI, TERM_Int, TERM_Long, CREDIT, CREDIT_HY
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'RF'], "Global", asset_class="Equity")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']], "OLS", "Allocation_1"))
            
            # Regression 2: Century Factors, Fixed Income and All Stock Selection
            fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            factors = pd.concat([fi_factors, stock_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_4"))
        
        # Alternatives
        elif category in [
            "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
        ]:
            # Regression 1: Global, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB, TSM-EQ, TSM-FI, TERM_Int, TERM_Long, CREDIT, CREDIT_HY, TSM-COM, TSM-FX
            factors = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'TSM-Com', 'TSM-FX', 'RF'], "Global", asset_class="Equity")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors, fi_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']], "OLS", "Alternative_1"))
            
            # Regression 2: Century Factors, Fixed Income, All Stock Selection, Commodities
            fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            cmdty_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Commodities")
            factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_4"))
        
        # Commodities
        elif category in ["Commodities Broad Basket", "Commodities Specified"]:
            # Regression 1: All factors from aqr_cmdty_factors
            factors = load_commodity_factors()
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Commodity_1"))
    
    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            print(f"ℹ️ Dry run: Would have written {len(records)} records for {region}")

def main():
    logging.basicConfig(level=logging.INFO)
    
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")
    
    # Add returns to fund_meta for processing
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
    returns = load_fund_returns(fund_ids)
    fund_meta["returns"] = fund_meta["SymbolCUSIP"].map(lambda x: returns[x] if x in returns.columns else pd.Series())
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset)

# Section 6: Database Output
def insert_batch(records):
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        print(f"ℹ️ Dry run: Skipped writing {len(df)} records")

if __name__ == "__main__":
    main()

Executing metadata query...
Metadata loaded: ['SymbolCUSIP', 'Region', 'YC_Global_Category_ID', 'Global_Category_Name', 'YC_Category_ID', 'Category_Name', 'CWA_Broad_Category_ID', 'CWA_Broad_Category_Name', 'FactorProfile'] | Shape: (5584, 9)
🧠 Total mapped funds: 5584
📍 Regions detected: ['Global' 'USA' 'Unknown' 'International']

Fund returns loaded: ['SymbolCUSIP', 'Date', 'ReturnValue'] | Shape: (9582, 3)
Executing query: 
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM factor_returns
        WHERE region = ?
        AND factor IN ('MKT','HML_Devil','QMJ','SMB','UMD','BAB','TSM-EQ','TSM-FI','RF')
     AND asset_class = ? with params: ['Global', 'Equity']


ArgumentError: List argument must consist only of tuples or dictionaries

In [9]:
# Version 4.5
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 100
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 15  # Optimized for 16-core i9-185H

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    print("Executing metadata query...")
    try:
        df = pd.read_sql(query, engine)
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        print(f"Metadata loaded: {df.columns.tolist()} | Shape: {df.shape}")
        if df["CWA_Broad_Category_Name"].isnull().all():
            print("⚠️ Warning: CWA_Broad_Category_Name is missing for all rows; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        print(f"❌ Error loading metadata: {e}")
        raise

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    print(f"Fund returns loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM {table}
        WHERE factor IN ({factor_in_clause})
    """
    params = ()
    # Universal factors (RF, TSM-*) don't filter by region unless specified
    if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_list):
        query += " AND region = ?"
        params = (region,)
    elif 'RF' in factor_list:
        query += " AND region = 'USA'"  # RF is under USA per Query1
    if asset_class:
        query += " AND asset_class = ?"
        params += (asset_class,)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logging.warning(f"No data for factors {factor_list} in {table} (region: {region}, asset_class: {asset_class})")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    missing_factors = [f for f in factor_list if f.lower() not in pivoted_df.columns]
    if missing_factors:
        logging.warning(f"Missing factors in {table} (region: {region}): {missing_factors}")
    return pivoted_df

def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    print(f"Fixed income factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def load_century_factors(factor_list, portfolio, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = (portfolio,)
    if asset_class:
        query += " AND asset_class = ?"
        params += (asset_class,)
    if region != "Global":
        query += " AND region = ?"
        params += (region,)
    print(f"Executing query: {query} with params: {params}")
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    print(f"Century factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    print(f"Century factors columns after pivot: {pivoted_df.columns.tolist()}")
    return pivoted_df

def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    print(f"Commodity factors loaded: {df.columns.tolist()} | Shape: {df.shape}")
    return df.set_index("Date")

# Section 4: Rolling Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    returns = returns.dropna()  # Drop NaN values
    factors = factors.dropna()  # Drop NaN values
    returns.index = pd.to_datetime(returns.index, errors='coerce')
    factors.index = pd.to_datetime(factors.index, errors='coerce')
    returns = returns[returns.index.notnull()]  # Remove NaT
    factors = factors[factors.index.notnull()]  # Remove NaT
    if returns.empty or factors.empty:
        print(f"⚠️ Warning: Empty returns or factors for {fund}; skipping regression")
        return results
    viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window or y.isnull().any() or X.isnull().any().any():
                continue
            X_const = add_constant(X)
            model = OLS(y, X_const).fit()
            for factor in X.columns:
                results.append({
                    "SymbolCUSIP": fund,
                    "MonthEndDate": end_date,
                    "RollPeriod": f"{window}m",
                    "Factor_Name": factor,
                    "Coefficient": model.params.get(factor, np.nan),
                    "P_Value": model.pvalues.get(factor, np.nan),
                    "T_Stat": model.tvalues.get(factor, np.nan),
                    "Standard_Error": model.bse.get(factor, np.nan),
                    "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                    "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                    "Adj_R2": model.rsquared_adj,
                    "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                    "Regression_Type": regression_type,
                    "Factor_Set": factor_set
                })
    return results

def process_fund(fund_row):
    """Process regressions for a single fund, returning a list of records."""
    records = []
    symbol = fund_row["SymbolCUSIP"]
    category = fund_row["Global_Category_Name"]
    broad_category = fund_row.get("CWA_Broad_Category_Name", None)
    returns = fund_row["returns"].dropna()
    
    if returns.empty:
        logging.warning(f"No valid returns for {symbol}; skipping")
        return records

    # Equity (USA)
    if category in [
        "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
        "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
        "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
        "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
        "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
        "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
    ]:
        # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA")
        rf_factors = load_db_factors(['RF'], "USA")
        factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Equity_USA_1)")
            return records
        if 'mkt' in factors.columns and 'rf' in factors.columns:
            factors['mkt-rf'] = factors['mkt'] - factors['rf']
        elif 'mkt' not in factors.columns:
            logging.warning(f"No market factor for {symbol} (Equity_USA_1); skipping")
            return records
        factors, returns_aligned = factors.align(returns, join="inner", axis=0)
        if returns_aligned.empty:
            logging.warning(f"No overlapping dates for {symbol} (Equity_USA_1)")
            return records
        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab']
        available_factors = [f for f in desired_factors if f in factors.columns]
        if available_factors:
            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_1"))
        else:
            logging.warning(f"No valid factors for {symbol} (Equity_USA_1)")

        # Regression 2-4: Century Factors
        for portfolio, factor_set in [
            ("US Stock Selection", "Equity_USA_2"),
            ("All Macro", "Equity_USA_3"),
            ("Equity Indices", "Equity_USA_4")
        ]:
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], portfolio, region="USA")
            if factors.empty:
                logging.warning(f"No century factors for {symbol} ({factor_set})")
                continue
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} ({factor_set})")
                continue
            records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
            # Regression 5
            factors_eq = load_db_factors(['MKT', 'BAB'], "USA")
            factors_fi = load_db_factors(['TSM-FI'])  # Universal
            factors_fx = load_db_factors(['TSM-FX'])  # Universal
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
            cmdty_factors = load_commodity_factors()
            factors = pd.concat([factors_eq, factors_fi, factors_fx, rf_factors, fi_factors, cmdty_factors[['excess_return_eqwt']]], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_USA_5)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_USA_5)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_5"))

            # Regression 6
            factors_eq = load_db_factors(['MKT', 'SMB', 'BAB'], "USA")
            factors_com = load_db_factors(['TSM-Com'])  # Universal
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_com, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_USA_6)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_USA_6)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_6"))

            # Regression 7
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB'], "USA")
            factors_com = load_db_factors(['TSM-Com'])  # Universal
            factors_fi = load_db_factors(['TSM-FI'])  # Universal
            factors_fx = load_db_factors(['TSM-FX'])  # Universal
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_USA_7)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_USA_7)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'umd', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_7"))
        else:
            logging.warning(f"Skipping Equity regressions 5-7 for {symbol}: Not Quantitative/Tactical, Strategic, or Nontraditional")

    # Equity (International)
    elif category in [
        "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
        "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
        "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
        "UK Equity Large Cap", "Thailand Equity"
    ]:
        # Regression 1
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl")
        rf_factors = load_db_factors(['RF'], "USA")
        factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Equity_Intl_1)")
        else:
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Equity_Intl_1)")
            else:
                desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Intl_1"))

        # Regression 2-4
        for portfolio, factor_set in [
            ("Intl Stock Selection", "Equity_Intl_2"),
            ("All Macro", "Equity_Intl_3"),
            ("Equity Indices", "Equity_Intl_4")
        ]:
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], portfolio, region="International")
            if factors.empty:
                logging.warning(f"No century factors for {symbol} ({factor_set})")
                continue
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} ({factor_set})")
            else:
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Equity (Global)
    elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
        # Regression 1
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global")
        rf_factors = load_db_factors(['RF'], "USA")
        factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Equity_Global_1)")
        else:
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Equity_Global_1)")
            else:
                desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Global_1"))

        # Regression 2-4
        for portfolio, factor_set in [
            ("All Stock Selection", "Equity_Global_2"),
            ("All Macro", "Equity_Global_3"),
            ("Equity Indices", "Equity_Global_4")
        ]:
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], portfolio)
            if factors.empty:
                logging.warning(f"No century factors for {symbol} ({factor_set})")
                continue
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} ({factor_set})")
            else:
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Fixed Income
    elif category in [
        "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
        "US Fixed Income", "US Municipal Fixed Income"
    ]:
        # Regression 1
        factors_fi = load_db_factors(['TSM-FI'])  # Universal
        factors_fx = load_db_factors(['TSM-FX'])  # Universal
        fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
        factors = pd.concat([factors_fi, factors_fx, fi_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (FI_1)")
        else:
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (FI_1)")
            else:
                desired_factors = ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "FI_1"))

        # Regression 2-4
        for portfolio, factor_set in [
            ("Fixed Income", "FI_2"),
            ("All Macro", "FI_3"),
            ("Equity Indices", "FI_4")
        ]:
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], portfolio)
            if factors.empty:
                logging.warning(f"No century factors for {symbol} ({factor_set})")
                continue
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} ({factor_set})")
            else:
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Allocation
    elif category in [
        "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
        "Flexible Allocation", "Moderate Allocation"
    ]:
        # Regression 1
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
        factors_fi = load_db_factors(['TSM-FI'])  # Universal
        rf_factors = load_db_factors(['RF'], "USA")
        fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
        factors = pd.concat([factors_eq, factors_fi, rf_factors, fi_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Allocation_1)")
        else:
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Allocation_1)")
            else:
                desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Allocation_1"))

        # Regression 2
        fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
        stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
        factors = pd.concat([fi_factors, stock_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Allocation_2)")
        else:
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Allocation_2)")
            else:
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Allocation_2"))

        # Regression 3-4
        for portfolio, factor_set in [
            ("All Macro", "Allocation_3"),
            ("Equity Indices", "Allocation_4")
        ]:
            factors =bea
            logging.warning(f"No century factors for {symbol} ({factor_set})")
            continue
        factors, returns_aligned = factors.align(returns, join="inner", axis=0)
        if returns_aligned.empty:
            logging.warning(f"No overlapping dates for {symbol} ({factor_set})")
        else:
            records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Alternatives
    elif category in [
        "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
    ]:
        # Regression 1
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
        factors_com = load_db_factors(['TSM-Com'])  # Universal
        factors_fi = load_db_factors(['TSM-FI'])  # Universal
        factors_fx = load_db_factors(['TSM-FX'])  # Universal
        rf_factors = load_db_factors(['RF'], "USA")
        fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
        factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Alternative_1)")
        else:
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Alternative_1)")
            else:
                desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Alternative_1"))

        # Regression 2
        fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
        stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
        cmdty_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Commodities")
        factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Alternative_2)")
        else:
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Alternative_2)")
            else:
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Alternative_2"))

        # Regression 3-4
        for portfolio, factor_set in [
            ("All Macro", "Alternative_3"),
            ("Equity Indices", "Alternative_4")
        ]:
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], portfolio)
            if factors.empty:
                logging.warning(f"No century factors for {symbol} ({factor_set})")
                continue
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} ({factor_set})")
            else:
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Commodities
    elif category in ["Commodities Broad Basket", "Commodities Specified"]:
        # Regression 1
        factors = load_commodity_factors()
        if factors.empty:
            logging.warning(f"No commodity factors for {symbol} (Commodity_1)")
        else:
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Commodity_1)")
            else:
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Commodity_1"))

    return records

# Section 5: Main Processing Pipeline
def process_region(region, fund_subset):
    records = []
    
    for _, fund_row in fund_subset.iterrows():
        symbol = fund_row["SymbolCUSIP"]
        category = fund_row["Global_Category_Name"]
        broad_category = fund_row.get("CWA_Broad_Category_Name", None)
        
        # Equity
        if category in [
            "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
            "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
            "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
            "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
            "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
            "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
        ]:
            # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, TSM-EQ, BAB
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "USA")
            rf_factors = load_db_factors(['RF'], None)  # No region/asset_class for RF
            factors = pd.concat([factors_eq, rf_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            else:
                print(f"⚠️ Warning: Cannot calculate mkt-rf for {symbol} (Equity_USA_1); using mkt")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_USA_1"))
            
            # Regression 2: Century Factors, US Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "US Stock Selection", region="USA")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_USA_4"))
            
            if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
                # Regression 5: Quantitative/Tactical, Strategic, Nontraditional
                factors_eq = load_db_factors(['MKT', 'BAB'], "USA")
                factors_fi = load_db_factors(['TSM-FI'], "USA", asset_class="Fixed Income")
                factors_fx = load_db_factors(['TSM-FX'], "USA", asset_class="Currencies")
                rf_factors = load_db_factors(['RF'], None)
                fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
                cmdty_factors = load_commodity_factors()
                factors = pd.concat([factors_eq, factors_fi, factors_fx, rf_factors, fi_factors, cmdty_factors[['excess_return_eqwt']]], axis=1)
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                else:
                    print(f"⚠️ Warning: Cannot calculate mkt-rf for {symbol} (Equity_USA_5); using mkt")
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']], "OLS", "Equity_USA_5"))
                
                # Regression 6: Quantitative/Tactical, Strategic, Nontraditional
                factors_eq = load_db_factors(['MKT', 'TSM-EQ', 'SMB', 'BAB'], "USA")
                factors_com = load_db_factors(['TSM-Com'], "USA", asset_class="Commodities")
                rf_factors = load_db_factors(['RF'], None)
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
                factors = pd.concat([factors_eq, factors_com, rf_factors, fi_factors], axis=1)
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                else:
                    print(f"⚠️ Warning: Cannot calculate mkt-rf for {symbol} (Equity_USA_6); using mkt")
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'tsm-eq', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']], "OLS", "Equity_USA_6"))
                
                # Regression 7: Quantitative/Tactical, Strategic, Nontraditional
                factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'TSM-EQ', 'SMB', 'BAB'], "USA")
                factors_com = load_db_factors(['TSM-Com'], "USA", asset_class="Commodities")
                factors_fi = load_db_factors(['TSM-FI'], "USA", asset_class="Fixed Income")
                factors_fx = load_db_factors(['TSM-FX'], "USA", asset_class="Currencies")
                rf_factors = load_db_factors(['RF'], None)
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
                factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1)
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                else:
                    print(f"⚠️ Warning: Cannot calculate mkt-rf for {symbol} (Equity_USA_7); using mkt")
                records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'umd', 'tsm-eq', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']], "OLS", "Equity_USA_7"))
            else:
                print(f"Skipping Equity regressions 5-7 for {symbol}: CWA_Broad_Category_Name not Quantitative/Tactical, Strategic, Nontraditional")
        
        # Equity (International)
        elif category in [
            "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
            "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
            "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
            "UK Equity Large Cap", "Thailand Equity"
        ]:
            # Regression 1: International
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl")
            rf_factors = load_db_factors(['RF'], None)
            factors = pd.concat([factors_eq, rf_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            else:
                print(f"⚠️ Warning: Cannot calculate mkt-rf for {symbol} (Equity_Intl_1); using mkt")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_Intl_1"))
            
            # Regression 2: Century Factors, Intl Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Intl Stock Selection", region="International")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Intl_4"))
        
        # Equity (Global)
        elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
            # Regression 1: Global
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global")
            rf_factors = load_db_factors(['RF'], None)
            factors = pd.concat([factors_eq, rf_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            else:
                print(f"⚠️ Warning: Cannot calculate mkt-rf for {symbol} (Equity_Global_1); using mkt")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']], "OLS", "Equity_Global_1"))
            
            # Regression 2: Century Factors, All Stock Selection
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Equity_Global_4"))
        
        # Fixed Income
        elif category in [
            "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
            "US Fixed Income", "US Municipal Fixed Income"
        ]:
            # Regression 1: TERM_Int, TERM_Long, CREDIT, CREDIT_HY, TSM-FI, TSM-FX
            factors_fi = load_db_factors(['TSM-FI'], "Global", asset_class="Fixed Income")
            factors_fx = load_db_factors(['TSM-FX'], "Global", asset_class="Currencies")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_fi, factors_fx, fi_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']], "OLS", "FI_1"))
            
            # Regression 2: Century Factors, Fixed Income
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "FI_4"))
        
        # Allocation
        elif category in [
            "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
            "Flexible Allocation", "Moderate Allocation"
        ]:
            # Regression 1: Global, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB, TSM-EQ, TSM-FI, TERM_Int, TERM_Long, CREDIT, CREDIT_HY
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
            factors_fi = load_db_factors(['TSM-FI'], "Global", asset_class="Fixed Income")
            rf_factors = load_db_factors(['RF'], None)
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_fi, rf_factors, fi_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            else:
                print(f"⚠️ Warning: Cannot calculate mkt-rf for {symbol} (Allocation_1); using mkt")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']], "OLS", "Allocation_1"))
            
            # Regression 2: Century Factors, Fixed Income and All Stock Selection
            fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            factors = pd.concat([fi_factors, stock_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Allocation_4"))
        
        # Alternatives
        elif category in [
            "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
        ]:
            # Regression 1: Global, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB, TSM-EQ, TSM-FI, TERM_Int, TERM_Long, CREDIT, CREDIT_HY, TSM-COM, TSM-FX
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
            factors_com = load_db_factors(['TSM-Com'], "Global", asset_class="Commodities")
            factors_fi = load_db_factors(['TSM-FI'], "Global", asset_class="Fixed Income")
            factors_fx = load_db_factors(['TSM-FX'], "Global", asset_class="Currencies")
            rf_factors = load_db_factors(['RF'], None)
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1)
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            else:
                print(f"⚠️ Warning: Cannot calculate mkt-rf for {symbol} (Alternative_1); using mkt")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors[['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']], "OLS", "Alternative_1"))
            
            # Regression 2: Century Factors, Fixed Income, All Stock Selection, Commodities
            fi_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Fixed Income")
            stock_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Stock Selection")
            cmdty_factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Commodities")
            factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1)
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_2"))
            
            # Regression 3: Century Factors, All Macro
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "All Macro")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_3"))
            
            # Regression 4: Century Factors, Equity Indices
            factors = load_century_factors(['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Equity Indices")
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Alternative_4"))
        
        # Commodities
        elif category in ["Commodities Broad Basket", "Commodities Specified"]:
            # Regression 1: All factors from aqr_cmdty_factors
            factors = load_commodity_factors()
            records.extend(run_rolling_regression(symbol, fund_row["returns"], factors, "OLS", "Commodity_1"))
    
    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            print(f"ℹ️ Dry run: Would have written {len(records)} records for {region}")

def main():
    logging.basicConfig(level=logging.INFO)
    
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")
    
    # Add returns to fund_meta for processing
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
    returns = load_fund_returns(fund_ids)
    fund_meta["returns"] = fund_meta["SymbolCUSIP"].map(lambda x: returns[x] if x in returns.columns else pd.Series())
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset)

# Section 6: Database Output
def insert_batch(records):
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        print(f"ℹ️ Dry run: Skipped writing {len(df)} records")

if __name__ == "__main__":
    main()

Executing metadata query...
Metadata loaded: ['SymbolCUSIP', 'Region', 'YC_Global_Category_ID', 'Global_Category_Name', 'YC_Category_ID', 'Category_Name', 'CWA_Broad_Category_ID', 'CWA_Broad_Category_Name', 'FactorProfile'] | Shape: (5584, 9)
🧠 Total mapped funds: 5584
📍 Regions detected: ['Global' 'USA' 'Unknown' 'International']

Fund returns loaded: ['SymbolCUSIP', 'Date', 'ReturnValue'] | Shape: (9449, 3)


ValueError: Index contains duplicate entries, cannot reshape

In [1]:
# Version 4.6
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 50
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 15  # Optimized for 16-core i9-185H

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials SectorEquity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        logging.warning(f"Loaded metadata for {len(df)} funds")
        if df["CWA_Broad_Category_Name"].isnull().all():
            logging.warning("CWA_Broad_Category_Name missing for all rows; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logging.error(f"Error loading metadata: {e}")
        raise

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    logging.warning(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM {table}
        WHERE factor IN ({factor_in_clause})
    """
    params = ()
    if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_list):
        query += " AND region = ?"
        params = (region,)
    elif 'RF' in factor_list:
        query += " AND region = 'USA'"
    if asset_class:
        query += " AND asset_class = ?"
        params += (asset_class,)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logging.warning(f"No data for factors {factor_list} in {table} (region: {region}, asset_class: {asset_class})")
        return pd.DataFrame()
    
    # Handle duplicates
    duplicates = df.duplicated(subset=['Date', 'Factor']).sum()
    if duplicates > 0:
        logging.warning(f"Found {duplicates} duplicate Date-Factor pairs in {table} for {factor_list}; aggregating by mean")
        df = df.groupby(['Date', 'Factor'])['Value'].mean().reset_index()
    
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    missing_factors = [f for f in factor_list if f.lower() not in pivoted_df.columns]
    if missing_factors:
        logging.warning(f"Missing factors in {table} (region: {region}): {missing_factors}")
    return pivoted_df

def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    if df.empty:
        logging.warning(f"No fixed income factors for {factor_list}")
        return pd.DataFrame()
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def load_century_factors(factor_list, portfolio_base, factor, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio = f"{portfolio_base} {factor}"  # e.g., "US Stock Selection Value"
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = (portfolio,)
    if asset_class:
        query += " AND asset_class = ?"
        params += (asset_class,)
    if region != "Global":
        query += " AND region = ?"
        params += (region,)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logging.warning(f"No data for factors {factor_list} in aqr_century_factors (portfolio: {portfolio}, region: {region})")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    return pivoted_df

def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    if df.empty:
        logging.warning("No commodity factors loaded")
        return pd.DataFrame()
    return df.set_index("Date")

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    returns = returns.dropna()
    factors = factors.dropna()
    returns.index = pd.to_datetime(returns.index, errors='coerce')
    factors.index = pd.to_datetime(factors.index, errors='coerce')
    returns = returns[returns.index.notnull()]
    factors = factors[factors.index.notnull()]
    if returns.empty or factors.empty:
        logging.warning(f"Empty returns or factors for {fund} ({factor_set}); skipping")
        return results
    viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window or y.isnull().any() or X.isnull().any().any():
                continue
            X_const = add_constant(X)
            model = OLS(y, X_const).fit()
            for factor in X.columns:
                results.append({
                    "SymbolCUSIP": fund,
                    "MonthEndDate": end_date,
                    "RollPeriod": f"{window}m",
                    "Factor_Name": factor,
                    "Coefficient": model.params.get(factor, np.nan),
                    "P_Value": model.pvalues.get(factor, np.nan),
                    "T_Stat": model.tvalues.get(factor, np.nan),
                    "Standard_Error": model.bse.get(factor, np.nan),
                    "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                    "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                    "Adj_R2": model.rsquared_adj,
                    "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                    "Regression_Type": regression_type,
                    "Factor_Set": factor_set
                })
    return results

# Section 5: Processing Functions
def process_fund(fund_row):
    """Process regressions for a single fund, returning a list of records."""
    records = []
    symbol = fund_row["SymbolCUSIP"]
    category = fund_row["Global_Category_Name"]
    broad_category = fund_row.get("CWA_Broad_Category_Name", None)
    returns = fund_row["returns"].dropna()
    
    if returns.empty:
        logging.warning(f"No valid returns for {symbol}; skipping")
        return records

    # Equity (USA)
    if category in [
        "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
        "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
        "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
        "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
        "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
        "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
    ]:
        # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA")
        rf_factors = load_db_factors(['RF'], "USA")
        factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Equity_USA_1)")
            return records
        if 'mkt' in factors.columns and 'rf' in factors.columns:
            factors['mkt-rf'] = factors['mkt'] - factors['rf']
        elif 'mkt' not in factors.columns:
            logging.warning(f"No market factor for {symbol} (Equity_USA_1); skipping")
            return records
        factors, returns_aligned = factors.align(returns, join="inner", axis=0)
        if returns_aligned.empty:
            logging.warning(f"No overlapping dates for {symbol} (Equity_USA_1)")
            return records
        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab']
        available_factors = [f for f in desired_factors if f in factors.columns]
        if available_factors:
            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_1"))
        else:
            logging.warning(f"No valid factors for {symbol} (Equity_USA_1)")

        # Regression 2-4: Century Factors
        for portfolio_base, factor_set in [
            ("US Stock Selection", "Equity_USA_2"),
            ("All Macro", "Equity_USA_3"),
            ("Equity indices", "Equity_USA_4")
        ]:
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                factors = load_century_factors([factor], portfolio_base, factor, region="USA")
                if factors.empty:
                    logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
            # Regression 5
            factors_eq = load_db_factors(['MKT', 'BAB'], "USA")
            factors_fi = load_db_factors(['TSM-FI'])  # Universal
            factors_fx = load_db_factors(['TSM-FX'])  # Universal
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
            cmdty_factors = load_commodity_factors()
            factors = pd.concat([factors_eq, factors_fi, factors_fx, rf_factors, fi_factors, cmdty_factors[['excess_return_eqwt']]], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_USA_5)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_USA_5)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_5"))

            # Regression 6
            factors_eq = load_db_factors(['MKT', 'SMB', 'BAB'], "USA")
            factors_com = load_db_factors(['TSM-Com'])  # Universal
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_com, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_USA_6)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_USA_6)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_6"))

            # Regression 7
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB'], "USA")
            factors_com = load_db_factors(['TSM-Com'])  # Universal
            factors_fi = load_db_factors(['TSM-FI'])  # Universal
            factors_fx = load_db_factors(['TSM-FX'])  # Universal
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_USA_7)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_USA_7)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'umd', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_7"))
        else:
            logging.warning(f"Skipping Equity regressions 5-7 for {symbol}: Not Quantitative/Tactical, Strategic, or Nontraditional")

    # Equity (International)
    elif category in [
        "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
        "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
        "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
        "UK Equity Large Cap", "Thailand Equity"
    ]:
        # Regression 1
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl")
        rf_factors = load_db_factors(['RF'], "USA")
        factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Equity_Intl_1)")
        else:
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Equity_Intl_1)")
            else:
                desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Intl_1"))

        # Regression 2-4
        for portfolio_base, factor_set in [
            ("Intl Stock Selection", "Equity_Intl_2"),
            ("All Macro", "Equity_Intl_3"),
            ("Equity indices", "Equity_Intl_4")
        ]:
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                factors = load_century_factors([factor], portfolio_base, factor, region="International")
                if factors.empty:
                    logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Equity (Global)
    elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
        # Regression 1
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global")
        rf_factors = load_db_factors(['RF'], "USA")
        factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Equity_Global_1)")
        else:
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Equity_Global_1)")
            else:
                desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Global_1"))

        # Regression 2-4
        for portfolio_base, factor_set in [
            ("All Stock Selection", "Equity_Global_2"),
            ("All Macro", "Equity_Global_3"),
            ("Equity indices", "Equity_Global_4")
        ]:
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                factors = load_century_factors([factor], portfolio_base, factor)
                if factors.empty:
                    logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Fixed Income
    elif category in [
        "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
        "US Fixed Income", "US Municipal Fixed Income"
    ]:
        # Regression 1
        factors_fi = load_db_factors(['TSM-FI'])  # Universal
        factors_fx = load_db_factors(['TSM-FX'])  # Universal
        fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
        factors = pd.concat([factors_fi, factors_fx, fi_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (FI_1)")
        else:
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (FI_1)")
            else:
                desired_factors = ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "FI_1"))

        # Regression 2-4
        for portfolio_base, factor_set in [
            ("Fixed income", "FI_2"),
            ("All Macro", "FI_3"),
            ("Equity indices", "FI_4")
        ]:
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                factors = load_century_factors([factor], portfolio_base, factor)
                if factors.empty:
                    logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Allocation
    elif category in [
        "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
        "Flexible Allocation", "Moderate Allocation"
    ]:
        # Regression 1
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
        factors_fi = load_db_factors(['TSM-FI'])  # Universal
        rf_factors = load_db_factors(['RF'], "USA")
        fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
        factors = pd.concat([factors_eq, factors_fi, rf_factors, fi_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Allocation_1)")
        else:
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Allocation_1)")
            else:
                desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Allocation_1"))

        # Regression 2
        for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
            fi_factors = load_century_factors([factor], "Fixed income", factor)
            stock_factors = load_century_factors([factor], "All Stock Selection", factor)
            factors = pd.concat([fi_factors, stock_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Allocation_2, factor: {factor})")
                continue
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Allocation_2, factor: {factor})")
                continue
            records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Allocation_2"))

        # Regression 3-4
        for portfolio_base, factor_set in [
            ("All Macro", "Allocation_3"),
            ("Equity indices", "Allocation_4")
        ]:
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                factors = load_century_factors([factor], portfolio_base, factor)
                if factors.empty:
                    logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Alternatives
    elif category in [
        "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
    ]:
        # Regression 1
        factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
        factors_com = load_db_factors(['TSM-Com'])  # Universal
        factors_fi = load_db_factors(['TSM-FI'])  # Universal
        factors_fx = load_db_factors(['TSM-FX'])  # Universal
        rf_factors = load_db_factors(['RF'], "USA")
        fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
        factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
        if factors.empty:
            logging.warning(f"No valid factors for {symbol} (Alternative_1)")
        else:
            if 'mkt' in factors.columns and 'rf' in factors.columns:
                factors['mkt-rf'] = factors['mkt'] - factors['rf']
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Alternative_1)")
            else:
                desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']
                available_factors = [f for f in desired_factors if f in factors.columns]
                if available_factors:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Alternative_1"))

        # Regression 2
        for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
            fi_factors = load_century_factors([factor], "Fixed income", factor)
            stock_factors = load_century_factors([factor], "All Stock Selection", factor)
            cmdty_factors = load_century_factors([factor], "Commodities", factor)
            factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Alternative_2, factor: {factor})")
                continue
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Alternative_2, factor: {factor})")
                continue
            records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Alternative_2"))

        # Regression 3-4
        for portfolio_base, factor_set in [
            ("All Macro", "Alternative_3"),
            ("Equity indices", "Alternative_4")
        ]:
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                factors = load_century_factors([factor], portfolio_base, factor)
                if factors.empty:
                    logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

    # Commodities
    elif category in ["Commodities Broad Basket", "Commodities Specified"]:
        # Regression 1
        factors = load_commodity_factors()
        if factors.empty:
            logging.warning(f"No commodity factors for {symbol} (Commodity_1)")
        else:
            factors, returns_aligned = factors.align(returns, join="inner", axis=0)
            if returns_aligned.empty:
                logging.warning(f"No overlapping dates for {symbol} (Commodity_1)")
            else:
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Commodity_1"))

    return records

def process_region(region, fund_subset):
    records = []
    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, row): row["SymbolCUSIP"] for _, row in fund_subset.iterrows()}
        for future in tqdm(future_to_fund, total=len(fund_subset), desc=f"Processing {region}"):
            try:
                records.extend(future.result())
            except Exception as e:
                logging.warning(f"Error processing {future_to_fund[future]}: {e}")

    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            logging.warning(f"Dry run: Would have written {len(records)} records for {region}")

# Section 6: Main Pipeline
def main():
    logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
    
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    logging.warning(f"Total mapped funds: {len(fund_meta)}")
    logging.warning(f"Regions detected: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
    returns = load_fund_returns(fund_ids)
    fund_meta["returns"] = fund_meta["SymbolCUSIP"].map(lambda x: returns[x] if x in returns.columns else pd.Series())
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        logging.warning(f"Processing {len(fund_subset)} funds in {region}")
        process_region(region, fund_subset)

# Section 7: Database Output
def insert_batch(records):
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        logging.warning(f"Dry run: Skipped writing {len(df)} records")

if __name__ == "__main__":
    main()



BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

In [2]:
# Version 4.7
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 50
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 8  # Reduced to avoid resource overload

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        logging.warning(f"Loaded metadata for {len(df)} funds")
        if df["CWA_Broad_Category_Name"].isnull().all():
            logging.warning("CWA_Broad_Category_Name missing for all rows; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logging.error(f"Error loading metadata: {e}")
        raise

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    logging.warning(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM {table}
        WHERE factor IN ({factor_in_clause})
    """
    params = ()
    if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_list):
        query += " AND region = ?"
        params = (region,)
    elif 'RF' in factor_list:
        query += " AND region = 'USA'"
    if asset_class:
        query += " AND asset_class = ?"
        params += (asset_class,)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logging.warning(f"No data for factors {factor_list} in {table} (region: {region}, asset_class: {asset_class})")
        return pd.DataFrame()
    
    # Handle duplicates
    duplicates = df.duplicated(subset=['Date', 'Factor']).sum()
    if duplicates > 0:
        logging.warning(f"Found {duplicates} duplicate Date-Factor pairs in {table} for {factor_list}; aggregating by mean")
        df = df.groupby(['Date', 'Factor'])['Value'].mean().reset_index()
    
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    missing_factors = [f for f in factor_list if f.lower() not in pivoted_df.columns]
    if missing_factors:
        logging.warning(f"Missing factors in {table} (region: {region}): {missing_factors}")
    return pivoted_df

def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    if df.empty:
        logging.warning(f"No fixed income factors for {factor_list}")
        return pd.DataFrame()
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def load_century_factors(factor_list, portfolio_base, factor, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio = f"{portfolio_base} {factor}"  # e.g., "US Stock Selection Value"
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = (portfolio,)
    if asset_class:
        query += " AND asset_class = ?"
        params += (asset_class,)
    if region != "Global":
        query += " AND region = ?"
        params += (region,)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logging.warning(f"No data for factors {factor_list} in aqr_century_factors (portfolio: {portfolio}, region: {region})")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    return pivoted_df

def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    if df.empty:
        logging.warning("No commodity factors loaded")
        return pd.DataFrame()
    return df.set_index("Date")

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = returns.dropna()
        factors = factors.dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        if returns.empty or factors.empty:
            logging.warning(f"Empty returns or factors for {fund} ({factor_set})")
            return results
        viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
        
        for window in viable_periods:
            start = returns.index.min() + relativedelta(months=window)
            for end_date in returns.loc[returns.index >= start].index:
                start_date = end_date - relativedelta(months=window - 1)
                y = returns.loc[start_date:end_date]
                X = factors.loc[start_date:end_date]
                X, y = X.align(y, join="inner", axis=0)
                if len(y) < window or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    })
    except Exception as e:
        logging.warning(f"Error in run_rolling_regression for {fund} ({factor_set}): {e}")
        return results
    return results

# Section 5: Processing Functions
def process_fund(fund_row):
    """Process regressions for a single fund, returning a list of records."""
    records = []
    symbol = fund_row["SymbolCUSIP"]
    category = fund_row["Global_Category_Name"]
    broad_category = fund_row.get("CWA_Broad_Category_Name", None)
    
    try:
        returns = fund_row["returns"].dropna()
        if returns.empty:
            logging.warning(f"No valid returns for {symbol}")
            return records

        # Equity (USA)
        if category in [
            "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
            "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
            "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
            "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
            "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
            "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
        ]:
            # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_USA_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                elif 'mkt' not in factors.columns:
                    logging.warning(f"No market factor for {symbol} (Equity_USA_1)")
                else:
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} (Equity_USA_1)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_1"))
                        else:
                            logging.warning(f"No valid factors for {symbol} (Equity_USA_1)")

            # Regression 2-4: Century Factors
            for portfolio_base, factor_set in [
                ("US Stock Selection", "Equity_USA_2"),
                ("All Macro", "Equity_USA_3"),
                ("Equity indices", "Equity_USA_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor, region="USA")
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

            if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
                # Regression 5
                factors_eq = load_db_factors(['MKT', 'BAB'], "USA")
                factors_fi = load_db_factors(['TSM-FI'])
                factors_fx = load_db_factors(['TSM-FX'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
                cmdty_factors = load_commodity_factors()
                factors = pd.concat([factors_eq, factors_fi, factors_fx, rf_factors, fi_factors, cmdty_factors[['excess_return_eqwt']]], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Equity_USA_5)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} (Equity_USA_5)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_5"))

                # Regression 6
                factors_eq = load_db_factors(['MKT', 'SMB', 'BAB'], "USA")
                factors_com = load_db_factors(['TSM-Com'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
                factors = pd.concat([factors_eq, factors_com, rf_factors, fi_factors], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Equity_USA_6)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} (Equity_USA_6)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_6"))

                # Regression 7
                factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB'], "USA")
                factors_com = load_db_factors(['TSM-Com'])
                factors_fi = load_db_factors(['TSM-FI'])
                factors_fx = load_db_factors(['TSM-FX'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
                factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Equity_USA_7)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} (Equity_USA_7)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'umd', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_7"))
            else:
                logging.warning(f"Skipping Equity regressions 5-7 for {symbol}: Not Quantitative/Tactical, Strategic, or Nontraditional")

        # Equity (International)
        elif category in [
            "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
            "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
            "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
            "UK Equity Large Cap", "Thailand Equity"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_Intl_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_Intl_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Intl_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("Intl Stock Selection", "Equity_Intl_2"),
                ("All Macro", "Equity_Intl_3"),
                ("Equity indices", "Equity_Intl_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor, region="International")
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Equity (Global)
        elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_Global_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_Global_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Global_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("All Stock Selection", "Equity_Global_2"),
                ("All Macro", "Equity_Global_3"),
                ("Equity indices", "Equity_Global_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Fixed Income
        elif category in [
            "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
            "US Fixed Income", "US Municipal Fixed Income"
        ]:
            # Regression 1
            factors_fi = load_db_factors(['TSM-FI'])
            factors_fx = load_db_factors(['TSM-FX'])
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_fi, factors_fx, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (FI_1)")
            else:
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (FI_1)")
                else:
                    desired_factors = ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "FI_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("Fixed income", "FI_2"),
                ("All Macro", "FI_3"),
                ("Equity indices", "FI_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Allocation
        elif category in [
            "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
            "Flexible Allocation", "Moderate Allocation"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
            factors_fi = load_db_factors(['TSM-FI'])
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_fi, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Allocation_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Allocation_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Allocation_1"))

            # Regression 2
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                fi_factors = load_century_factors([factor], "Fixed income", factor)
                stock_factors = load_century_factors([factor], "All Stock Selection", factor)
                factors = pd.concat([fi_factors, stock_factors], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Allocation_2, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Allocation_2, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Allocation_2"))

            # Regression 3-4
            for portfolio_base, factor_set in [
                ("All Macro", "Allocation_3"),
                ("Equity indices", "Allocation_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Alternatives
        elif category in [
            "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
            factors_com = load_db_factors(['TSM-Com'])
            factors_fi = load_db_factors(['TSM-FI'])
            factors_fx = load_db_factors(['TSM-FX'])
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Alternative_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Alternative_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Alternative_1"))

            # Regression 2
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                fi_factors = load_century_factors([factor], "Fixed income", factor)
                stock_factors = load_century_factors([factor], "All Stock Selection", factor)
                cmdty_factors = load_century_factors([factor], "Commodities", factor)
                factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Alternative_2, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Alternative_2, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Alternative_2"))

            # Regression 3-4
            for portfolio_base, factor_set in [
                ("All Macro", "Alternative_3"),
                ("Equity indices", "Alternative_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Commodities
        elif category in ["Commodities Broad Basket", "Commodities Specified"]:
            # Regression 1
            factors = load_commodity_factors()
            if factors.empty:
                logging.warning(f"No commodity factors for {symbol} (Commodity_1)")
            else:
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Commodity_1)")
                else:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Commodity_1"))

    except Exception as e:
        logging.warning(f"Error processing fund {symbol}: {e}")
        return records
    
    return records

def process_region(region, fund_subset):
    records = []
    # Apply sample size to fund_subset
    if SAMPLE_DRY_RUN:
        fund_subset = fund_subset.sample(n=min(SAMPLE_SIZE, len(fund_subset)), random_state=42)
        logging.warning(f"Sampled {len(fund_subset)} funds for {region}")
    
    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, row): row["SymbolCUSIP"] for _, row in fund_subset.iterrows()}
        for future in tqdm(future_to_fund, total=len(fund_subset), desc=f"Processing {region}"):
            try:
                records.extend(future.result())
            except Exception as e:
                logging.warning(f"Error processing {future_to_fund[future]}: {e}")

    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            logging.warning(f"Dry run: Would have written {len(records)} records for {region}")

# Section 6: Main Pipeline
def main():
    logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
    
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    logging.warning(f"Total mapped funds: {len(fund_meta)}")
    logging.warning(f"Regions detected: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
    returns = load_fund_returns(fund_ids)
    fund_meta["returns"] = fund_meta["SymbolCUSIP"].map(lambda x: returns[x] if x in returns.columns else pd.Series())
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset)

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        if not DRY_RUN:
            for i in range(0, len(df), BATCH_INSERT_SIZE):
                batch = df.iloc[i:i + BATCH_INSERT_SIZE]
                batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
        else:
            logging.warning(f"Dry run: Skipped writing {len(df)} records")
    except Exception as e:
        logging.error(f"Error inserting batch: {e}")

if __name__ == "__main__":
    main()

Processing Global: 100%|██████████| 50/50 [00:00<00:00, 283.69it/s]
Processing USA: 100%|██████████| 50/50 [00:00<00:00, 333.08it/s]
Processing Unknown: 100%|██████████| 50/50 [00:00<00:00, 377.68it/s]
Processing International: 100%|██████████| 50/50 [00:00<00:00, 295.14it/s]


In [None]:
# Version 4.8
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import pickle

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 50
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 20  # Further reduced to avoid crashes
USE_THREADS = False  # Set to True to use ThreadPoolExecutor instead of ProcessPoolExecutor

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        logging.warning(f"Loaded metadata for {len(df)} funds")
        if df["CWA_Broad_Category_Name"].isnull().all():
            logging.warning("CWA_Broad_Category_Name missing for all rows; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logging.error(f"Error loading metadata: {e}")
        raise

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    logging.warning(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM {table}
        WHERE factor IN ({factor_in_clause})
    """
    params = ()
    if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_list):
        query += " AND region = ?"
        params = (region,)
    elif 'RF' in factor_list:
        query += " AND region = 'USA'"
    if asset_class:
        query += " AND asset_class = ?"
        params += (asset_class,)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logging.warning(f"No data for factors {factor_list} in {table} (region: {region}, asset_class: {asset_class})")
        return pd.DataFrame()
    
    # Handle duplicates
    duplicates = df.duplicated(subset=['Date', 'Factor']).sum()
    if duplicates > 0:
        logging.warning(f"Found {duplicates} duplicate Date-Factor pairs in {table} for {factor_list}; aggregating by mean")
        df = df.groupby(['Date', 'Factor'])['Value'].mean().reset_index()
    
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    missing_factors = [f for f in factor_list if f.lower() not in pivoted_df.columns]
    if missing_factors:
        logging.warning(f"Missing factors in {table} (region: {region}): {missing_factors}")
    return pivoted_df

def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    if df.empty:
        logging.warning(f"No fixed income factors for {factor_list}")
        return pd.DataFrame()
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def load_century_factors(factor_list, portfolio_base, factor, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio = f"{portfolio_base} {factor}"
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = (portfolio,)
    if asset_class:
        query += " AND asset_class = ?"
        params += (asset_class,)
    if region != "Global":
        query += " AND region = ?"
        params += (region,)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logging.warning(f"No data for factors {factor_list} in aqr_century_factors (portfolio: {portfolio}, region: {region})")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    return pivoted_df

def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    if df.empty:
        logging.warning("No commodity factors loaded")
        return pd.DataFrame()
    return df.set_index("Date")

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        if returns.empty or factors.empty:
            logging.warning(f"Empty returns or factors for {fund} ({factor_set})")
            return results
        viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
        
        for window in viable_periods:
            start = returns.index.min() + relativedelta(months=window)
            for end_date in returns.loc[returns.index >= start].index:
                start_date = end_date - relativedelta(months=window - 1)
                y = returns.loc[start_date:end_date]
                X = factors.loc[start_date:end_date]
                X, y = X.align(y, join="inner", axis=0)
                if len(y) < window or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    })
    except Exception as e:
        logging.warning(f"Error in run_rolling_regression for {fund} ({factor_set}): {e}")
        return results
    return results

# Section 5: Processing Functions
def process_fund(fund_data):
    """Process regressions for a single fund, returning a list of records."""
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    broad_category = fund_data["CWA_Broad_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logging.debug(f"Starting process_fund for {symbol}")
    
    try:
        if returns.empty:
            logging.warning(f"No valid returns for {symbol}")
            return records

        # Equity (USA)
        if category in [
            "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
            "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
            "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
            "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
            "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
            "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
        ]:
            # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_USA_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                elif 'mkt' not in factors.columns:
                    logging.warning(f"No market factor for {symbol} (Equity_USA_1)")
                else:
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} (Equity_USA_1)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_1"))
                        else:
                            logging.warning(f"No valid factors for {symbol} (Equity_USA_1)")

            # Regression 2-4: Century Factors
            for portfolio_base, factor_set in [
                ("US Stock Selection", "Equity_USA_2"),
                ("All Macro", "Equity_USA_3"),
                ("Equity indices", "Equity_USA_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor, region="USA")
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

            if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
                # Regression 5
                factors_eq = load_db_factors(['MKT', 'BAB'], "USA")
                factors_fi = load_db_factors(['TSM-FI'])
                factors_fx = load_db_factors(['TSM-FX'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
                cmdty_factors = load_commodity_factors()
                factors = pd.concat([factors_eq, factors_fi, factors_fx, rf_factors, fi_factors, cmdty_factors[['excess_return_eqwt']]], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Equity_USA_5)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} (Equity_USA_5)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_5"))

                # Regression 6
                factors_eq = load_db_factors(['MKT', 'SMB', 'BAB'], "USA")
                factors_com = load_db_factors(['TSM-Com'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
                factors = pd.concat([factors_eq, factors_com, rf_factors, fi_factors], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Equity_USA_6)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} (Equity_USA_6)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_6"))

                # Regression 7
                factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB'], "USA")
                factors_com = load_db_factors(['TSM-Com'])
                factors_fi = load_db_factors(['TSM-FI'])
                factors_fx = load_db_factors(['TSM-FX'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
                factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Equity_USA_7)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} (Equity_USA_7)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'umd', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_7"))
            else:
                logging.warning(f"Skipping Equity regressions 5-7 for {symbol}: Not Quantitative/Tactical, Strategic, or Nontraditional")

        # Equity (International)
        elif category in [
            "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
            "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
            "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
            "UK Equity Large Cap", "Thailand Equity"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_Intl_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_Intl_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Intl_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("Intl Stock Selection", "Equity_Intl_2"),
                ("All Macro", "Equity_Intl_3"),
                ("Equity indices", "Equity_Intl_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor, region="International")
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Equity (Global)
        elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Equity_Global_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Equity_Global_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Global_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("All Stock Selection", "Equity_Global_2"),
                ("All Macro", "Equity_Global_3"),
                ("Equity indices", "Equity_Global_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Fixed Income
        elif category in [
            "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
            "US Fixed Income", "US Municipal Fixed Income"
        ]:
            # Regression 1
            factors_fi = load_db_factors(['TSM-FI'])
            factors_fx = load_db_factors(['TSM-FX'])
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_fi, factors_fx, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (FI_1)")
            else:
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (FI_1)")
                else:
                    desired_factors = ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "FI_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("Fixed income", "FI_2"),
                ("All Macro", "FI_3"),
                ("Equity indices", "FI_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Allocation
        elif category in [
            "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
            "Flexible Allocation", "Moderate Allocation"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
            factors_fi = load_db_factors(['TSM-FI'])
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_fi, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Allocation_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Allocation_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Allocation_1"))

            # Regression 2
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                fi_factors = load_century_factors([factor], "Fixed income", factor)
                stock_factors = load_century_factors([factor], "All Stock Selection", factor)
                factors = pd.concat([fi_factors, stock_factors], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Allocation_2, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Allocation_2, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Allocation_2"))

            # Regression 3-4
            for portfolio_base, factor_set in [
                ("All Macro", "Allocation_3"),
                ("Equity indices", "Allocation_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Alternatives
        elif category in [
            "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
            factors_com = load_db_factors(['TSM-Com'])
            factors_fi = load_db_factors(['TSM-FI'])
            factors_fx = load_db_factors(['TSM-FX'])
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logging.warning(f"No valid factors for {symbol} (Alternative_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Alternative_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Alternative_1"))

            # Regression 2
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                fi_factors = load_century_factors([factor], "Fixed income", factor)
                stock_factors = load_century_factors([factor], "All Stock Selection", factor)
                cmdty_factors = load_century_factors([factor], "Commodities", factor)
                factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1).dropna()
                if factors.empty:
                    logging.warning(f"No valid factors for {symbol} (Alternative_2, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Alternative_2, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Alternative_2"))

            # Regression 3-4
            for portfolio_base, factor_set in [
                ("All Macro", "Alternative_3"),
                ("Equity indices", "Alternative_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logging.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logging.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Commodities
        elif category in ["Commodities Broad Basket", "Commodities Specified"]:
            # Regression 1
            factors = load_commodity_factors()
            if factors.empty:
                logging.warning(f"No commodity factors for {symbol} (Commodity_1)")
            else:
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logging.warning(f"No overlapping dates for {symbol} (Commodity_1)")
                else:
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Commodity_1"))

    except Exception as e:
        logging.warning(f"Error processing fund {symbol}: {e}")
        return records
    
    logging.debug(f"Completed process_fund for {symbol} with {len(records)} records")
    return records

def process_region(region, fund_data_list):
    records = []
    logging.warning(f"Processing {len(fund_data_list)} funds in {region}")
    
    executor_class = ThreadPoolExecutor if USE_THREADS else ProcessPoolExecutor
    with executor_class(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"] for fund_data in fund_data_list}
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}"):
            try:
                records.extend(future.result())
            except Exception as e:
                logging.warning(f"Error processing {future_to_fund[future]}: {e}")

    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            logging.warning(f"Dry run: Would have written {len(records)} records for {region}")

# Section 6: Main Pipeline
def main():
    logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
    
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    logging.warning(f"Total mapped funds: {len(fund_meta)}")
    logging.warning(f"Regions detected: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
    returns = load_fund_returns(fund_ids)
    
    # Validate returns
    for col in returns.columns:
        if not pd.api.types.is_numeric_dtype(returns[col]):
            logging.warning(f"Non-numeric returns for {col}; converting to numeric")
            returns[col] = pd.to_numeric(returns[col], errors='coerce')
    
    # Prepare fund data
    fund_data_list = []
    for _, row in fund_meta.iterrows():
        symbol = row["SymbolCUSIP"]
        if symbol in returns.columns:
            fund_data_list.append({
                "SymbolCUSIP": symbol,
                "Global_Category_Name": row["Global_Category_Name"],
                "CWA_Broad_Category_Name": row.get("CWA_Broad_Category_Name", None),
                "returns": returns[symbol].to_dict()
            })
    
    for region in regions:
        region_funds = [fd for fd in fund_data_list if fund_meta[fund_meta["SymbolCUSIP"] == fd["SymbolCUSIP"]]["Region"].iloc[0] == region]
        if SAMPLE_DRY_RUN and len(region_funds) > SAMPLE_SIZE:
            region_funds = random.sample(region_funds, SAMPLE_SIZE)
            logging.warning(f"Sampled {len(region_funds)} funds for {region}")
        process_region(region, region_funds)

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        if not DRY_RUN:
            for i in range(0, len(df), BATCH_INSERT_SIZE):
                batch = df.iloc[i:i + BATCH_INSERT_SIZE]
                batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
        else:
            logging.warning(f"Dry run: Skipped writing {len(df)} records")
    except Exception as e:
        logging.error(f"Error inserting batch: {e}")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"Main execution failed: {e}")
        raise

Processing Global: 100%|██████████| 12/12 [00:00<00:00, 81.74it/s]


In [None]:
#supergrok try

In [None]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import pickle
import json
import uuid
import psutil
import functools
import warnings
from contextlib import contextmanager

# Section 1: Configuration and Setup
with open("config.json", "r") as f:
    CONFIG = json.load(f)

DATABASE_CONFIG = CONFIG.get("database", {})
CONNECTION_STRING = (
    f"mssql+pyodbc://{DATABASE_CONFIG.get('server')}/{DATABASE_CONFIG.get('database')}"
    f"?driver={DATABASE_CONFIG.get('driver')}&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(CONNECTION_STRING)

RETURN_METRIC = CONFIG.get("return_metric", "1 Month Return")
ROLLING_PERIODS = CONFIG.get("rolling_periods", [12, 24, 36, 48, 60])  # in months
DRY_RUN = CONFIG.get("dry_run", True)
SAMPLE_DRY_RUN = CONFIG.get("sample_dry_run", True)
SAMPLE_SIZE = CONFIG.get("sample_size", 50)
CHUNK_SIZE = CONFIG.get("chunk_size", 5600)
BATCH_INSERT_SIZE = CONFIG.get("batch_insert_size", 10000)
MAX_WORKERS = min(CONFIG.get("max_workers", 16), psutil.cpu_count(logical=False))
USE_THREADS = CONFIG.get("use_threads", False)
MIN_OBSERVATIONS = CONFIG.get("min_observations", 0.8)  # Minimum fraction of expected data points

# Load category mappings
with open("category_mapping.json", "r") as f:
    CATEGORY_MAPPING = json.load(f)

# Factor configurations for different regressions
FACTOR_CONFIGS = {
    "Equity_USA_1": [
        {"source": "db", "factors": ["MKT", "HML_Devil", "QMJ", "SMB", "UMD", "BAB"], "region": "USA"},
        {"source": "db", "factors": ["RF"], "region": "USA"}
    ],
    "Equity_USA_2": [
        {"source": "century", "factors": ["Carry", "Defensive", "Market", "Momentum", "Multi-Style", "Value"],
         "portfolio_base": "US Stock Selection", "region": "USA"}
    ],
    # Add other configurations similarly
}

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

# Section 2: Helper Functions
def category_to_region(category):
    return CATEGORY_MAPPING.get(category, ("Unknown", "Unknown"))

@contextmanager
def database_transaction():
    """Context manager for database transactions."""
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    """Decorator to log execution time of functions."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        logger.info(f"Loaded metadata for {len(df)} funds")
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing for all rows; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logger.error(f"Error loading metadata: {e}")
        raise

@timer
def load_fund_returns(fund_ids=None):
    query = """
    SELECT SymbolCUSIP, Date, ReturnValue
    FROM Fund_Returns_Timeseries
    WHERE Metric = :metric
    """
    params = {"metric": RETURN_METRIC}
    if fund_ids:
        query += " AND SymbolCUSIP IN :fund_ids"
        params["fund_ids"] = tuple(fund_ids)
    try:
        df = pd.read_sql(text(query), engine, params=params, parse_dates=["Date"])
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        # Validate numeric data
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting to numeric")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except Exception as e:
        logger.error(f"Error loading returns: {e}")
        raise

@timer
def load_factors(factor_config, region="Global", asset_class=None):
    """
    Load factors from various sources.
    """
    source = factor_config.get("source")
    try:
        if source == "db":
            factor_in_clause = ','.join([f"'{f}'" for f in factor_config["factors"]])
            query = f"""
            SELECT date AS Date, factor AS Factor, value AS Value
            FROM {factor_config.get('table', 'factor_returns')}
            WHERE factor IN ({factor_in_clause})
            """
            params = []
            if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_config["factors"]):
                query += " AND region = ?"
                params.append(region)
            elif 'RF' in factor_config["factors"]:
                query += " AND region = 'USA'"
            if asset_class:
                query += " AND asset_class = ?"
                params.append(asset_class)
            df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
            if df.empty:
                logger.warning(f"No data for factors {factor_config['factors']} (region: {region})")
                return pd.DataFrame()
            
            if df.duplicated(subset=['Date', 'Factor']).sum() > 0:
                logger.error(f"Duplicate Date-Factor pairs in {factor_config['factors']}; please clean data")
                raise ValueError("Duplicate factor data detected")
            
            pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
                columns={
                    'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
                    'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
                    'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
                }
            )
            return pivoted_df

        elif source == "century":
            factor_in_clause = ','.join([f"'{f}'" for f in factor_config["factors"]])
            portfolio = f"{factor_config['portfolio_base']} {factor_config['factor']}"
            query = f"""
            SELECT date AS Date, factor AS Factor, value AS Value
            FROM aqr_century_factors
            WHERE factor IN ({factor_in_clause})
            AND portfolio = ?
            """
            params = [portfolio]
            if asset_class:
                query += " AND asset_class = ?"
                params.append(asset_class)
            if region != "Global":
                query += " AND region = ?"
                params.append(region)
            df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
            if df.empty:
                logger.warning(f"No data for century factors {factor_config['factors']} (portfolio: {portfolio})")
                return pd.DataFrame()
            return df.pivot(index="Date", columns="Factor", values="Value")

        elif source == "commodity":
            query = """
            SELECT date AS Date, 
                   excess_return_eqwt, 
                   excess_spot_return_eqwt, 
                   ir_adjusted_carry_eqwt, 
                   spot_return_eqwt, 
                   carry_eqwt, 
                   excess_return_long_short, 
                   excess_spot_return_long_short, 
                   ir_adjusted_carry_long_short, 
                   aggregate_backwardation_contango
            FROM aqr_cmdty_factors
            """
            df = pd.read_sql(query, engine, parse_dates=['Date'])
            if df.empty:
                logger.warning("No commodity factors loaded")
                return pd.DataFrame()
            return df.set_index("Date")

        elif source == "fixed_income":
            factor_in_clause = ','.join([f"'{f}'" for f in factor_config["factors"]])
            query = f"""
            SELECT Date, Factor_Name, ReturnValue
            FROM Fixed_Income_Factor_Returns
            WHERE Factor_Name IN ({factor_in_clause})
            """
            df = pd.read_sql(query, engine, parse_dates=["Date"])
            if df.empty:
                logger.warning(f"No fixed income factors for {factor_config['factors']}")
                return pd.DataFrame()
            return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

        else:
            logger.error(f"Unknown factor source: {source}")
            return pd.DataFrame()
    except Exception as e:
        logger.error(f"Error loading factors {factor_config.get('factors', [])}: {e}")
        raise

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set, min_observations=MIN_OBSERVATIONS):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        
        if returns.empty or factors.empty:
            logger.warning(f"Empty returns or factors for {fund} ({factor_set})")
            return results
        
        viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
        
        for window in viable_periods:
            min_obs = int(window * min_observations)
            start = returns.index.min() + relativedelta(months=window)
            rolling_windows = returns.loc[returns.index >= start].rolling(window=f"{window}M", min_periods=min_obs)
            
            for end_date, y in rolling_windows:
                if y.index[0] < returns.index.min():
                    continue
                X = factors.loc[y.index]
                X, y = X.align(y, join="inner", axis=0)
                if len(y) < min_obs or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    })
    except Exception as e:
        logger.error(f"Error in regression for {fund} ({factor_set}): {e}")
        return results
    return results

# Section 5: Processing Functions
def process_regression(symbol, returns, category, broad_category, factor_set_key):
    records = []
    factor_configs = FACTOR_CONFIGS.get(factor_set_key, [])
    
    if not factor_configs:
        logger.warning(f"No factor configuration for {factor_set_key}")
        return records
    
    try:
        factors_list = []
        for config in factor_configs:
            factors = load_factors(config, region=config.get("region", "Global"), asset_class=config.get("asset_class"))
            if factors.empty:
                logger.warning(f"No factors loaded for {symbol} ({factor_set_key})")
                continue
            factors_list.append(factors)
        
        if not factors_list:
            return records
        
        factors = pd.concat(factors_list, axis=1).dropna()
        if 'mkt' in factors.columns and 'rf' in factors.columns:
            factors['mkt-rf'] = factors['mkt'] - factors['rf']
        
        factors, returns_aligned = factors.align(returns, join="inner", axis=0)
        if returns_aligned.empty:
            logger.warning(f"No overlapping dates for {symbol} ({factor_set_key})")
            return records
        
        desired_factors = factor_configs[0].get("desired_factors", factors.columns.tolist())
        available_factors = [f for f in desired_factors if f in factors.columns]
        if not available_factors:
            logger.warning(f"No valid factors for {symbol} ({factor_set_key})")
            return records
        
        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", factor_set_key))
    except Exception as e:
        logger.error(f"Error processing {symbol} ({factor_set_key}): {e}")
    
    return records

def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    broad_category = fund_data.get("CWA_Broad_Category_Name")
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logger.debug(f"Processing fund {symbol}")
    
    if returns.empty:
        logger.warning(f"No valid returns for {symbol}")
        return records
    
    # Define regression sets based on category
    regression_sets = []
    if category in [
        "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
        "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
        "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
        "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
        "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
        "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading"
    ]:
        regression_sets.extend(["Equity_USA_1", "Equity_USA_2", "Equity_USA_3", "Equity_USA_4"])
        if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
            regression_sets.extend(["Equity_USA_5", "Equity_USA_6", "Equity_USA_7"])
    
    # Add similar logic for other categories (International, Global, Fixed Income, etc.)
    
    for factor_set in regression_sets:
        records.extend(process_regression(symbol, returns, category, broad_category, factor_set))
    
    logger.debug(f"Completed {symbol} with {len(records)} records")
    return records

def process_region(region, fund_data_list):
    records = []
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    executor_class = ThreadPoolExecutor if USE_THREADS else ProcessPoolExecutor
    with executor_class(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"] for fund_data in fund_data_list}
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}"):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {e}")
    
    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            pd.DataFrame(records).to_csv(f"dry_run_{region}_{uuid.uuid4()}.csv", index=False)
            logger.info(f"Dry run: Saved {len(records)} records for {region} to CSV")
    
    return records

# Section 6: Main Pipeline
@timer
def main():
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    logger.info(f"Total mapped funds: {len(fund_meta)}")
    logger.info(f"Regions detected: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds for dry run")
    
    returns = load_fund_returns(fund_ids)
    
    fund_data_list = [
        {
            "SymbolCUSIP": row["SymbolCUSIP"],
            "Global_Category_Name": row["Global_Category_Name"],
            "CWA_Broad_Category_Name": row.get("CWA_Broad_Category_Name"),
            "returns": returns[row["SymbolCUSIP"]].to_dict()
        }
        for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
    ]
    
    summary = {"total_funds": len(fund_data_list), "regions": {}, "errors": 0}
    for region in regions:
        region_funds = [fd for fd in fund_data_list if fund_meta[fund_meta["SymbolCUSIP"] == fd["SymbolCUSIP"]]["Region"].iloc[0] == region]
        if SAMPLE_DRY_RUN and len(region_funds) > SAMPLE_SIZE:
            region_funds = random.sample(region_funds, SAMPLE_SIZE)
            logger.info(f"Sampled {len(region_funds)} funds for {region}")
        records = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records)}
    
    logger.info(f"Pipeline summary: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

In [2]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import json
import uuid
import psutil
import functools
from contextlib import contextmanager
import sys

# Section 1: Configuration and Setup
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 1000,
    "batch_insert_size": 5000,
    "max_workers": 8,
    "use_threads": True,
    "min_observations": 0.8,
    "query_timeout": 30
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
try:
    engine = create_engine(CONNECTION_STRING, connect_args={"timeout": CONFIG["query_timeout"]})
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS = min(CONFIG["max_workers"], psutil.cpu_count(logical=False))
USE_THREADS = CONFIG["use_threads"]
MIN_OBSERVATIONS = CONFIG["min_observations"]

# Simplified factor configuration for testing
FACTOR_CONFIGS = {
    "Equity_USA_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "USA", "desired_factors": ["mkt-rf"]}
    ]
}

# Logging setup with immediate flushing
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()
logger.handlers[1].stream = sys.stdout  # Ensure console output
logger.handlers[1].setLevel(logging.INFO)  # Less verbose console

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

@contextmanager
def database_transaction():
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(text(query), engine)
        logger.info(f"Loaded metadata for {len(df)} fundus")
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing; some regressions may be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except OperationalError as e:
        logger.error(f"Database error loading metadata: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error loading metadata: {e}")
        raise

@timer
def load_fund_returns(fund_ids=None):
    query = """
    SELECT SymbolCUSIP, Date, ReturnValue
    FROM Fund_Returns_Timeseries
    WHERE Metric = :metric
    """
    params = {"metric": RETURN_METRIC}
    if fund_ids:
        query += " AND SymbolCUSIP IN :fund_ids"
        params["fund_ids"] = tuple(fund_ids)
    try:
        chunks = []
        for chunk in pd.read_sql(text(query), engine, params=params, parse_dates=["Date"], chunksize=CHUNK_SIZE):
            logger.debug(f"Loaded chunk of {len(chunk)} rows")
            chunks.append(chunk)
        df = pd.concat(chunks) if chunks else pd.DataFrame()
        if df.empty:
            logger.warning("No returns data loaded")
            return pd.DataFrame()
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except OperationalError as e:
        logger.error(f"Database error loading returns: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error loading returns: {e}")
        raise

@timer
def load_factors(factor_config, region="Global", asset_class=None):
    source = factor_config.get("source")
    try:
        if source == "db":
            factor_in_clause = ','.join([f"'{f}'" for f in factor_config["factors"]])
            query = f"""
            SELECT date AS Date, factor AS Factor, value AS Value
            FROM {factor_config.get('table', 'factor_returns')}
            WHERE factor IN ({factor_in_clause})
            """
            params = []
            if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_config["factors"]):
                query += " AND region = ?"
                params.append(region)
            elif 'RF' in factor_config["factors"]:
                query += " AND region = 'USA'"
            if asset_class:
                query += " AND asset_class = ?"
                params.append(asset_class)
            df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
            if df.empty:
                logger.warning(f"No data for factors {factor_config['factors']}")
                return pd.DataFrame()
            if df.duplicated(subset=['Date', 'Factor']).sum() > 0:
                logger.error(f"Duplicate factors {factor_config['factors']}")
                raise ValueError("Duplicate factor data")
            pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
                columns={'MKT': 'mkt', 'RF': 'rf'}
            )
            return pivoted_df
        else:
            logger.warning(f"Factor source {source} not implemented")
            return pd.DataFrame()
    except Exception as e:
        logger.error(f"Error loading factors {factor_config.get('factors', [])}: {e}")
        return pd.DataFrame()

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        
        if returns.empty or factors.empty:
            logger.warning(f"Empty data for {fund} ({factor_set})")
            return results
        
        viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
        for window in viable_periods:
            min_obs = int(window * MIN_OBSERVATIONS)
            rolling_windows = returns.loc[returns.index >= (returns.index.min() + relativedelta(months=window))].rolling(window=f"{window}M", min_periods=min_obs)
            for end_date, y in rolling_windows:
                if y.index[0] < returns.index.min():
                    continue
                X = factors.loc[y.index]
                X, y = X.align(y, join="inner", axis=0)
                if len(y) < min_obs or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    })
    except Exception as e:
        logger.error(f"Regression error for {fund} ({factor_set}): {e}")
    return results

# Section 5: Processing Functions
def process_regression(symbol, returns, factor_set_key):
    records = []
    factor_configs = FACTOR_CONFIGS.get(factor_set_key, [])
    if not factor_configs:
        logger.warning(f"No config for {factor_set_key}")
        return records
    
    try:
        factors_list = [load_factors(config, region=config.get("region", "Global")) for config in factor_configs]
        factors = pd.concat([f for f in factors_list if not f.empty], axis=1).dropna()
        if factors.empty:
            logger.warning(f"No factors for {symbol} ({factor_set_key})")
            return records
        
        if 'mkt' in factors.columns and 'rf' in factors.columns:
            factors['mkt-rf'] = factors['mkt'] - factors['rf']
        
        factors, returns_aligned = factors.align(returns, join="inner", axis=0)
        if returns_aligned.empty:
            logger.warning(f"No overlapping data for {symbol} ({factor_set_key})")
            return records
        
        desired_factors = factor_configs[0].get("desired_factors", factors.columns.tolist())
        available_factors = [f for f in desired_factors if f in factors.columns]
        if not available_factors:
            logger.warning(f"No valid factors for {symbol} ({factor_set_key})")
            return records
        
        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", factor_set_key))
    except Exception as e:
        logger.error(f"Error processing {symbol} ({factor_set_key}): {e}")
    
    return records

def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logger.debug(f"Processing fund {symbol}")
    
    if returns.empty:
        logger.warning(f"No returns for {symbol}")
        return records
    
    regression_sets = ["Equity_USA_1"] if category in [k for k, v in category_to_region().items() if v[0] == "USA"] else []
    for factor_set in regression_sets:
        records.extend(process_regression(symbol, returns, factor_set))
    
    if records and DRY_RUN:
        pd.DataFrame(records).to_csv(f"dry_run_{symbol}_{uuid.uuid4()}.csv", index=False)
        logger.info(f"Saved {len(records)} records for {symbol}")
    
    return records

def process_region(region, fund_data_list):
    records = []
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"] for fund_data in fund_data_list}
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {e}")
    
    if records and DRY_RUN:
        pd.DataFrame(records).to_csv(f"dry_run_{region}_{uuid.uuid4()}.csv", index=False)
        logger.info(f"Dry run: Saved {len(records)} records for {region}")
    
    return records

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds")
    
    try:
        returns = load_fund_returns(fund_ids)
    except Exception as e:
        logger.error(f"Failed to load returns: {e}")
        return {"error": str(e)}
    
    fund_data_list = [
        {
            "SymbolCUSIP": row["SymbolCUSIP"],
            "Global_Category_Name": row["Global_Category_Name"],
            "returns": returns[row["SymbolCUSIP"]].to_dict()
        }
        for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
    ]
    
    summary = {"total_funds": len(fund_data_list), "regions": {}, "errors": 0}
    for region in regions:
        region_funds = [fd for fd in fund_data_list if fund_meta[fund_meta["SymbolCUSIP"] == fd["SymbolCUSIP"]]["Region"].iloc[0] == region]
        if SAMPLE_DRY_RUN and len(region_funds) > SAMPLE_SIZE:
            region_funds = random.sample(region_funds, SAMPLE_SIZE)
            logger.info(f"Sampled {len(region_funds)} funds for {region}")
        records = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records)}
    
    logger.info(f"Pipeline summary: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

2025-04-13 19:56:08,848 - INFO - Starting main pipeline
2025-04-13 19:56:09,045 - INFO - Loaded metadata for 5584 fundus
2025-04-13 19:56:09,337 - INFO - load_fund_metadata took 0.49 seconds
2025-04-13 19:56:09,339 - INFO - Total funds: 5584, Regions: ['Global' 'USA' 'Unknown' 'International']
2025-04-13 19:56:09,339 - INFO - Sampled 10 funds
2025-04-13 19:56:09,341 - ERROR - Unexpected error loading returns: (pyodbc.ProgrammingError) ("A TVP's rows must all be the same size.", 'HY000')
[SQL: 
    SELECT SymbolCUSIP, Date, ReturnValue
    FROM Fund_Returns_Timeseries
    WHERE Metric = ?
     AND SymbolCUSIP IN ?]
[parameters: ('1 Month Return', ('PLDR', 'VEGI', 'HYXF', 'ACAZX', 'DPTRX', 'LHMIX', 'CBLS', 'TRHBX', 'AIVI', 'GISTX'))]
(Background on this error at: https://sqlalche.me/e/20/f405)
2025-04-13 19:56:09,343 - ERROR - Failed to load returns: (pyodbc.ProgrammingError) ("A TVP's rows must all be the same size.", 'HY000')
[SQL: 
    SELECT SymbolCUSIP, Date, ReturnValue
    FROM Fu

In [3]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import uuid
import psutil
import functools
from contextlib import contextmanager
import sys

# Section 1: Configuration and Setup
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 1000,
    "batch_insert_size": 5000,
    "max_workers": 8,
    "use_threads": True,
    "min_observations": 0.8,
    "query_timeout": 30
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
try:
    engine = create_engine(CONNECTION_STRING, connect_args={"timeout": CONFIG["query_timeout"]})
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS = min(CONFIG["max_workers"], psutil.cpu_count(logical=False))
USE_THREADS = CONFIG["use_threads"]
MIN_OBSERVATIONS = CONFIG["min_observations"]

# Simplified factor configuration
FACTOR_CONFIGS = {
    "Equity_USA_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "USA", "desired_factors": ["mkt-rf"]}
    ]
}

# Logging setup with immediate flushing
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()
logger.handlers[1].stream = sys.stdout
logger.handlers[1].setLevel(logging.INFO)

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

@contextmanager
def database_transaction():
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(text(query), engine)
        logger.info(f"Loaded metadata for {len(df)} funds")
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing; some regressions may be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except OperationalError as e:
        logger.error(f"Database error loading metadata: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error loading metadata: {e}")
        raise

@timer
def load_fund_returns(fund_ids=None):
    try:
        if not fund_ids:
            query = """
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = ?
            """
            params = [RETURN_METRIC]
        else:
            # Validate fund_ids
            fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
            if not fund_ids:
                logger.warning("No valid fund IDs provided")
                return pd.DataFrame()
            # Manual construction to avoid TVP issues
            placeholders = ",".join([f"'{fid.replace('\'', '')}'" for fid in fund_ids])
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = ?
            AND SymbolCUSIP IN ({placeholders})
            """
            params = [RETURN_METRIC]
            logger.debug(f"Querying returns for funds: {fund_ids}")
        
        chunks = []
        for chunk in pd.read_sql(query, engine, params=params, parse_dates=["Date"], chunksize=CHUNK_SIZE):
            logger.debug(f"Loaded chunk of {len(chunk)} rows")
            chunks.append(chunk)
        df = pd.concat(chunks) if chunks else pd.DataFrame()
        if df.empty:
            logger.warning("No returns data loaded")
            return pd.DataFrame()
        
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except OperationalError as e:
        logger.error(f"Database error loading returns: {e}")
        # Fallback: Try individual fund queries
        if fund_ids:
            logger.info("Attempting individual fund queries")
            chunks = []
            for fid in fund_ids:
                try:
                    query = """
                    SELECT SymbolCUSIP, Date, ReturnValue
                    FROM Fund_Returns_Timeseries
                    WHERE Metric = ? AND SymbolCUSIP = ?
                    """
                    chunk = pd.read_sql(query, engine, params=[RETURN_METRIC, fid], parse_dates=["Date"])
                    chunks.append(chunk)
                    logger.debug(f"Loaded returns for {fid}")
                except Exception as e2:
                    logger.warning(f"Failed to load returns for {fid}: {e2}")
            df = pd.concat(chunks) if chunks else pd.DataFrame()
            if df.empty:
                logger.warning("No returns data loaded in fallback")
                return pd.DataFrame()
            pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
            for col in pivoted.columns:
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
            return pivoted
        raise
    except Exception as e:
        logger.error(f"Unexpected error loading returns: {e}")
        raise

@timer
def load_factors(factor_config, region="Global", asset_class=None):
    source = factor_config.get("source")
    try:
        if source == "db":
            factor_in_clause = ','.join([f"'{f}'" for f in factor_config["factors"]])
            query = f"""
            SELECT date AS Date, factor AS Factor, value AS Value
            FROM {factor_config.get('table', 'factor_returns')}
            WHERE factor IN ({factor_in_clause})
            """
            params = []
            if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_config["factors"]):
                query += " AND region = ?"
                params.append(region)
            elif 'RF' in factor_config["factors"]:
                query += " AND region = 'USA'"
            if asset_class:
                query += " AND asset_class = ?"
                params.append(asset_class)
            df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
            if df.empty:
                logger.warning(f"No data for factors {factor_config['factors']}")
                return pd.DataFrame()
            if df.duplicated(subset=['Date', 'Factor']).sum() > 0:
                logger.error(f"Duplicate factors {factor_config['factors']}")
                raise ValueError("Duplicate factor data")
            pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
                columns={'MKT': 'mkt', 'RF': 'rf'}
            )
            return pivoted_df
        else:
            logger.warning(f"Factor source {source} not implemented")
            return pd.DataFrame()
    except Exception as e:
        logger.error(f"Error loading factors {factor_config.get('factors', [])}: {e}")
        return pd.DataFrame()

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        
        if returns.empty or factors.empty:
            logger.warning(f"Empty data for {fund} ({factor_set})")
            return results
        
        viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
        for window in viable_periods:
            min_obs = int(window * MIN_OBSERVATIONS)
            rolling_windows = returns.loc[returns.index >= (returns.index.min() + relativedelta(months=window))].rolling(window=f"{window}M", min_periods=min_obs)
            for end_date, y in rolling_windows:
                if y.index[0] < returns.index.min():
                    continue
                X = factors.loc[y.index]
                X, y = X.align(y, join="inner", axis=0)
                if len(y) < min_obs or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    })
    except Exception as e:
        logger.error(f"Regression error for {fund} ({factor_set}): {e}")
    return results

# Section 5: Processing Functions
def process_regression(symbol, returns, factor_set_key):
    records = []
    factor_configs = FACTOR_CONFIGS.get(factor_set_key, [])
    if not factor_configs:
        logger.warning(f"No config for {factor_set_key}")
        return records
    
    try:
        factors_list = [load_factors(config, region=config.get("region", "Global")) for config in factor_configs]
        factors = pd.concat([f for f in factors_list if not f.empty], axis=1).dropna()
        if factors.empty:
            logger.warning(f"No factors for {symbol} ({factor_set_key})")
            return records
        
        if 'mkt' in factors.columns and 'rf' in factors.columns:
            factors['mkt-rf'] = factors['mkt'] - factors['rf']
        
        factors, returns_aligned = factors.align(returns, join="inner", axis=0)
        if returns_aligned.empty:
            logger.warning(f"No overlapping data for {symbol} ({factor_set_key})")
            return records
        
        desired_factors = factor_configs[0].get("desired_factors", factors.columns.tolist())
        available_factors = [f for f in desired_factors if f in factors.columns]
        if not available_factors:
            logger.warning(f"No valid factors for {symbol} ({factor_set_key})")
            return records
        
        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", factor_set_key))
    except Exception as e:
        logger.error(f"Error processing {symbol} ({factor_set_key}): {e}")
    
    return records

def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logger.debug(f"Processing fund {symbol}")
    
    if returns.empty:
        logger.warning(f"No returns for {symbol}")
        return records
    
    regression_sets = ["Equity_USA_1"] if category in [k for k, v in category_to_region().items() if v[0] == "USA"] else []
    for factor_set in regression_sets:
        records.extend(process_regression(symbol, returns, factor_set))
    
    if records and DRY_RUN:
        pd.DataFrame(records).to_csv(f"dry_run_{symbol}_{uuid.uuid4()}.csv", index=False)
        logger.info(f"Saved {len(records)} records for {symbol}")
    
    return records

def process_region(region, fund_data_list):
    records = []
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"] for fund_data in fund_data_list}
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {e}")
    
    if records and DRY_RUN:
        pd.DataFrame(records).to_csv(f"dry_run_{region}_{uuid.uuid4()}.csv", index=False)
        logger.info(f"Dry run: Saved {len(records)} records for {region}")
    
    return records

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds")
    
    try:
        returns = load_fund_returns(fund_ids)
    except Exception as e:
        logger.error(f"Failed to load returns: {e}")
        return {"error": str(e)}
    
    fund_data_list = [
        {
            "SymbolCUSIP": row["SymbolCUSIP"],
            "Global_Category_Name": row["Global_Category_Name"],
            "returns": returns[row["SymbolCUSIP"]].to_dict()
        }
        for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
    ]
    
    summary = {"total_funds": len(fund_data_list), "regions": {}, "errors": 0}
    for region in regions:
        region_funds = [fd for fd in fund_data_list if fund_meta[fund_meta["SymbolCUSIP"] == fd["SymbolCUSIP"]]["Region"].iloc[0] == region]
        if SAMPLE_DRY_RUN and len(region_funds) > SAMPLE_SIZE:
            region_funds = random.sample(region_funds, SAMPLE_SIZE)
            logger.info(f"Sampled {len(region_funds)} funds for {region}")
        records = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records)}
    
    logger.info(f"Pipeline summary: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

2025-04-13 19:59:06,725 - INFO - Starting main pipeline
2025-04-13 19:59:06,849 - INFO - Loaded metadata for 5584 funds
2025-04-13 19:59:07,131 - INFO - load_fund_metadata took 0.41 seconds
2025-04-13 19:59:07,133 - INFO - Total funds: 5584, Regions: ['Global' 'USA' 'Unknown' 'International']
2025-04-13 19:59:07,133 - INFO - Sampled 10 funds
2025-04-13 19:59:07,148 - ERROR - Unexpected error loading returns: List argument must consist only of tuples or dictionaries
2025-04-13 19:59:07,148 - ERROR - Failed to load returns: List argument must consist only of tuples or dictionaries
2025-04-13 19:59:07,150 - INFO - main took 0.42 seconds


In [6]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import functools
from contextlib import contextmanager
import sys

# Section 1: Configuration and Setup
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 1000,
    "batch_insert_size": 5000,
    "max_workers": 8,
    "use_threads": True,
    "min_observations": 0.8,
    "query_timeout": 30
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
try:
    engine = create_engine(CONNECTION_STRING, connect_args={"timeout": CONFIG["query_timeout"]})
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS = min(CONFIG["max_workers"], psutil.cpu_count(logical=False))
USE_THREADS = CONFIG["use_threads"]
MIN_OBSERVATIONS = CONFIG["min_observations"]

# Factor configuration for all regions
FACTOR_CONFIGS = {
    "Equity_USA_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "USA", "desired_factors": ["mkt-rf"]}
    ],
    "Equity_Global_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "Global", "desired_factors": ["mkt-rf"]}
    ],
    "Equity_International_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "International", "desired_factors": ["mkt-rf"]}
    ]
}

# Logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()
logger.handlers[1].stream = sys.stdout
logger.handlers[1].setLevel(logging.INFO)

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    result = mapping.get(category, ("USA", "Unknown"))
    if result[0] == "Unknown":
        logger.warning(f"Unmapped category '{category}' for fund; defaulting to USA")
    return result

@contextmanager
def database_transaction():
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        logger.info(f"Loaded metadata for {len(df)} funds")
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing; some regressions may be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logger.error(f"Error loading metadata: {e}")
        raise

@timer
def load_fund_returns(fund_ids=None):
    try:
        if not fund_ids:
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
            """
        else:
            fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
            if not fund_ids:
                logger.warning("No valid fund IDs provided")
                return pd.DataFrame()
            placeholders = ",".join([f"'{fid.replace('\'', '')}'" for fid in fund_ids])
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
            AND SymbolCUSIP IN ({placeholders})
            """
            logger.debug(f"Querying returns for funds: {fund_ids}")
        
        chunks = []
        for chunk in pd.read_sql(query, engine, parse_dates=["Date"], chunksize=CHUNK_SIZE):
            logger.debug(f"Loaded chunk of {len(chunk)} rows")
            chunks.append(chunk)
        df = pd.concat(chunks) if chunks else pd.DataFrame()
        if df.empty:
            logger.warning("No returns data loaded")
            return pd.DataFrame()
        
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        # Enforce month-end dates and frequency
        pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
        pivoted = pivoted.asfreq('ME')
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except Exception as e:
        logger.error(f"Error loading returns: {e}")
        if fund_ids:
            logger.info("Attempting individual fund queries")
            chunks = []
            for fid in fund_ids:
                try:
                    query = f"""
                    SELECT SymbolCUSIP, Date, ReturnValue
                    FROM Fund_Returns_Timeseries
                    WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
                    AND SymbolCUSIP = '{fid.replace("'", "")}'
                    """
                    chunk = pd.read_sql(query, engine, parse_dates=["Date"])
                    chunks.append(chunk)
                    logger.debug(f"Loaded returns for {fid}")
                except Exception as e2:
                    logger.warning(f"Failed to load returns for {fid}: {e2}")
            df = pd.concat(chunks) if chunks else pd.DataFrame()
            if df.empty:
                logger.warning("No returns data loaded in fallback")
                return pd.DataFrame()
            pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
            pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
            pivoted = pivoted.asfreq('ME')
            for col in pivoted.columns:
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
            return pivoted
        raise

@timer
def load_factors(factor_config, region="Global", asset_class=None):
    source = factor_config.get("source")
    try:
        if source == "db":
            factor_in_clause = ','.join([f"'{f}'" for f in factor_config["factors"]])
            query = f"""
            SELECT date AS Date, factor AS Factor, value AS Value
            FROM {factor_config.get('table', 'factor_returns')}
            WHERE factor IN ({factor_in_clause})
            """
            params = []
            if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_config["factors"]):
                query += " AND region = ?"
                params.append(region)
            elif 'RF' in factor_config["factors"]:
                query += " AND region = 'USA'"
            if asset_class:
                query += " AND asset_class = ?"
                params.append(asset_class)
            df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
            if df.empty:
                logger.warning(f"No data for factors {factor_config['factors']} in region {region}")
                return pd.DataFrame()
            if df.duplicated(subset=['Date', 'Factor']).sum() > 0:
                logger.error(f"Duplicate factors {factor_config['factors']}")
                raise ValueError("Duplicate factor data")
            pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
                columns={'MKT': 'mkt', 'RF': 'rf'}
            )
            # Enforce month-end dates and frequency
            pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
            pivoted_df = pivoted_df.asfreq('ME')
            logger.debug(f"Factors loaded: {pivoted_df.columns.tolist()}, dates: {pivoted_df.index.min()} to {pivoted_df.index.max()}")
            return pivoted_df
        else:
            logger.warning(f"Factor source {source} not implemented")
            return pd.DataFrame()
    except Exception as e:
        logger.error(f"Error loading factors {factor_config.get('factors', [])}: {e}")
        return pd.DataFrame()

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        
        if returns.empty or factors.empty:
            logger.warning(f"Empty data for {fund} ({factor_set})")
            return results
        
        # Data integrity check
        min_obs = int(min(ROLLING_PERIODS) * MIN_OBSERVATIONS)
        overlap_dates = returns.index.intersection(factors.index)
        if len(overlap_dates) < min_obs:
            logger.warning(f"Insufficient overlapping data for {fund} ({factor_set}): {len(overlap_dates)} < {min_obs}")
            return results
        
        logger.debug(f"Fund {fund}: {len(returns)} returns, {len(factors)} factors, {len(overlap_dates)} overlapping month-end dates")
        
        # Align indices
        factors = factors.loc[overlap_dates]
        returns = returns.loc[overlap_dates]
        
        for window in ROLLING_PERIODS:
            if len(returns) < min_obs:
                logger.warning(f"Insufficient data for {fund} ({factor_set}, window={window}): {len(returns)} < {min_obs}")
                continue
            
            # Roll over fixed periods
            for start_idx in range(len(returns) - window + 1):
                end_idx = start_idx + window
                y = returns.iloc[start_idx:end_idx]
                X = factors.iloc[start_idx:end_idx]
                if len(y) < min_obs or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    result = {
                        "SymbolCUSIP": fund,
                        "MonthEndDate": y.index[-1],
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    }
                    results.append(result)
                # Log sample result
                if results:
                    logger.debug(f"Sample regression for {fund}: Factor={results[-1]['Factor_Name']}, "
                                 f"Coefficient={results[-1]['Coefficient']:.4f}, P_Value={results[-1]['P_Value']:.4f}")
        
    except Exception as e:
        logger.error(f"Regression error for {fund} ({factor_set}): {e}")
    return results

# Section 5: Processing Functions
def process_regression(symbol, returns, factor_set_key):
    records = []
    factor_configs = FACTOR_CONFIGS.get(factor_set_key, [])
    if not factor_configs:
        logger.warning(f"No config for {factor_set_key}")
        return records
    
    try:
        factors_list = [load_factors(config, region=config.get("region", "Global")) for config in factor_configs]
        factors = pd.concat([f for f in factors_list if not f.empty], axis=1).dropna()
        if factors.empty:
            logger.warning(f"No factors for {symbol} ({factor_set_key})")
            return records
        
        if 'mkt' in factors.columns and 'rf' in factors.columns:
            factors['mkt-rf'] = factors['mkt'] - factors['rf']
        
        records.extend(run_rolling_regression(symbol, returns, factors, "OLS", factor_set_key))
    except Exception as e:
        logger.error(f"Error processing {symbol} ({factor_set_key}): {e}")
    
    return records

def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logger.debug(f"Processing fund {symbol}, category: {category}")
    
    if returns.empty:
        logger.info(f"No returns for {symbol}")
        return records
    
    region, profile = category_to_region(category)
    logger.debug(f"Fund {symbol} mapped to region: {region}, profile: {profile}")
    
    # Assign regression set based on region
    regression_sets = {
        "USA": ["Equity_USA_1"],
        "Global": ["Equity_Global_1"],
        "International": ["Equity_International_1"]
    }.get(region, ["Equity_USA_1"])  # Default Unknown to USA
    
    for factor_set in regression_sets:
        records.extend(process_regression(symbol, returns, factor_set))
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    records = []
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"] for fund_data in fund_data_list}
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {e}")
    
    logger.info(f"Region {region} generated {len(records)} total records")
    if not DRY_RUN:
        insert_batch(records)
    
    return records

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds: {fund_ids}")
    
    try:
        returns = load_fund_returns(fund_ids)
    except Exception as e:
        logger.error(f"Failed to load returns: {e}")
        return {"error": str(e)}
    
    fund_data_list = [
        {
            "SymbolCUSIP": row["SymbolCUSIP"],
            "Global_Category_Name": row["Global_Category_Name"],
            "returns": returns[row["SymbolCUSIP"]].to_dict()
        }
        for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
    ]
    
    summary = {"total_funds": len(fund_data_list), "regions": {}, "errors": 0}
    for region in sorted(set(regions) - {'Unknown'} | {'USA'}):  # Exclude Unknown, ensure USA
        region_funds = [fd for fd in fund_data_list if fund_meta[fund_meta["SymbolCUSIP"] == fd["SymbolCUSIP"]]["Region"].iloc[0] == region]
        if SAMPLE_DRY_RUN and len(region_funds) > SAMPLE_SIZE:
            region_funds = random.sample(region_funds, SAMPLE_SIZE)
            logger.info(f"Sampled {len(region_funds)} funds for {region}")
        records = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records)}
    
    logger.info(f"Pipeline summary: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

2025-04-13 20:56:37,116 - INFO - Starting main pipeline
2025-04-13 20:56:37,297 - INFO - Loaded metadata for 5584 funds
2025-04-13 20:56:37,573 - INFO - load_fund_metadata took 0.46 seconds
2025-04-13 20:56:37,574 - INFO - Total funds: 5584, Regions: ['Global' 'USA' 'International']
2025-04-13 20:56:37,574 - INFO - Sampled 10 funds: ['FPRO', 'SPEU', 'BFGIX', 'PCGYX', 'CGIIX', 'FIBR', 'FUSAX', 'BFIX', 'MBNE', 'RVER']
2025-04-13 20:56:37,603 - INFO - Loaded returns for 10 funds
2025-04-13 20:56:37,608 - INFO - load_fund_returns took 0.03 seconds
2025-04-13 20:56:37,733 - INFO - Processing 1 funds in Global
Processing Global:   0%|          | 0/1 [00:00<?, ?it/s]2025-04-13 20:56:37,763 - INFO - load_factors took 0.03 seconds
2025-04-13 20:56:38,227 - INFO - Generated 327 regression records for CGIIX
Processing Global: 100%|██████████| 1/1 [00:00<00:00,  2.03it/s]
2025-04-13 20:56:38,231 - INFO - Region Global generated 327 total records
2025-04-13 20:56:38,235 - INFO - Processing 1 funds 

In [10]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import functools
from contextlib import contextmanager
import sys

# Section 1: Configuration and Setup
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 1000,
    "batch_insert_size": 5000,
    "max_workers": 8,
    "use_threads": True,
    "query_timeout": 30
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
try:
    engine = create_engine(CONNECTION_STRING, connect_args={"timeout": CONFIG["query_timeout"]})
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS = min(CONFIG["max_workers"], psutil.cpu_count(logical=False))
USE_THREADS = CONFIG["use_threads"]

# Factor configuration for all regions
FACTOR_CONFIGS = {
    "Equity_USA_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "USA", "desired_factors": ["mkt-rf"]}
    ],
    "Equity_Global_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "Global", "desired_factors": ["mkt-rf"]}
    ],
    "Equity_International_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "International", "desired_factors": ["mkt-rf"]}
    ]
}

# Logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()
logger.handlers[1].stream = sys.stdout
logger.handlers[1].setLevel(logging.INFO)

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Natural Resources Sector Equity": ("USA", "Natural Resources Sector Equity"),
        "Infrastructure Sector Equity": ("USA", "Infrastructure Sector Equity"),
        "Trading Tools": ("USA", "Trading Tools"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    result = mapping.get(category, ("USA", "Unknown"))
    if result[1] == "Unknown":
        logger.warning(f"UnMapped category '{category}' for fund; defaulting to USA")
    return result

@contextmanager
def database_transaction():
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        logger.info(f"Loaded metadata for {len(df)} funds")
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing; some regressions may be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logger.error(f"Error loading metadata: {e}")
        raise

@timer
def load_fund_returns(fund_ids=None):
    try:
        if not fund_ids:
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
            """
        else:
            fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
            if not fund_ids:
                logger.warning("No valid fund IDs provided")
                return pd.DataFrame()
            placeholders = ",".join([f"'{fid.replace('\'', '')}'" for fid in fund_ids])
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
            AND SymbolCUSIP IN ({placeholders})
            """
            logger.debug(f"Querying returns for funds: {fund_ids}")
        
        chunks = []
        for chunk in pd.read_sql(query, engine, parse_dates=["Date"], chunksize=CHUNK_SIZE):
            logger.debug(f"Loaded chunk of {len(chunk)} rows")
            chunks.append(chunk)
        df = pd.concat(chunks) if chunks else pd.DataFrame()
        if df.empty:
            logger.warning("No returns data loaded")
            return pd.DataFrame()
        
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        # Enforce month-end dates and frequency
        pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
        pivoted.index = pivoted.index.drop_duplicates()
        pivoted = pivoted.asfreq('ME')
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except Exception as e:
        logger.error(f"Error loading returns: {e}")
        if fund_ids:
            logger.info("Attempting individual fund queries")
            chunks = []
            for fid in fund_ids:
                try:
                    query = f"""
                    SELECT SymbolCUSIP, Date, ReturnValue
                    FROM Fund_Returns_Timeseries
                    WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
                    AND SymbolCUSIP = '{fid.replace("'", "")}'
                    """
                    chunk = pd.read_sql(query, engine, parse_dates=["Date"])
                    chunks.append(chunk)
                    logger.debug(f"Loaded returns for {fid}")
                except Exception as e2:
                    logger.warning(f"Failed to load returns for {fid}: {e2}")
            df = pd.concat(chunks) if chunks else pd.DataFrame()
            if df.empty:
                logger.warning("No returns data loaded in fallback")
                return pd.DataFrame()
            pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
            pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
            pivoted.index = pivoted.index.drop_duplicates()
            pivoted = pivoted.asfreq('ME')
            for col in pivoted.columns:
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
            return pivoted
        raise

@timer
def load_factors(factor_config, region="Global", asset_class=None):
    source = factor_config.get("source")
    try:
        if source == "db":
            factor_in_clause = ','.join([f"'{f}'" for f in factor_config["factors"]])
            query = f"""
            SELECT date AS Date, factor AS Factor, value AS Value
            FROM {factor_config.get('table', 'factor_returns')}
            WHERE factor IN ({factor_in_clause})
            """
            params = []
            if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_config["factors"]):
                query += " AND region = ?"
                params.append(region)
            elif 'RF' in factor_config["factors"]:
                query += " AND region = 'USA'"
            if asset_class:
                query += " AND asset_class = ?"
                params.append(asset_class)
            df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
            if df.empty:
                logger.warning(f"No data for factors {factor_config['factors']} in region {region}")
                return pd.DataFrame()
            if df.duplicated(subset=['Date', 'Factor']).sum() > 0:
                logger.error(f"Duplicate factors {factor_config['factors']}")
                raise ValueError("Duplicate factor data")
            pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
                columns={'MKT': 'mkt', 'RF': 'rf'}
            )
            # Enforce month-end dates and frequency
            pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
            pivoted_df.index = pivoted_df.index.drop_duplicates()
            pivoted_df = pivoted_df.asfreq('ME')
            logger.debug(f"Factors loaded: {pivoted_df.columns.tolist()}, dates: {pivoted_df.index.min()} to {pivoted_df.index.max()}")
            return pivoted_df
        else:
            logger.warning(f"Factor source {source} not implemented")
            return pd.DataFrame()
    except Exception as e:
        logger.error(f"Error loading factors {factor_config.get('factors', [])}: {e}")
        return pd.DataFrame()

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        
        if returns.empty or factors.empty:
            logger.warning(f"Empty data for {fund} ({factor_set})")
            return results
        
        # Data integrity check
        overlap_dates = returns.index.intersection(factors.index)
        months_available = len(overlap_dates)
        date_start = overlap_dates.min().strftime('%Y-%m-%d') if months_available > 0 else "none"
        date_end = overlap_dates.max().strftime('%Y-%m-%d') if months_available > 0 else "none"
        
        # Check for smallest window
        min_window = min(ROLLING_PERIODS)
        if months_available < min_window:
            logger.warning(f"Fund {fund} ({factor_set}) skipped: {months_available} months available "
                           f"from {date_start} to {date_end}, need {min_window} for smallest window")
            return results
        
        # Log index details for debugging
        logger.debug(f"Fund {fund}: {len(returns)} returns, {len(factors)} factors, {len(overlap_dates)} overlapping dates")
        
        # Align indices with reindex
        factors = factors.reindex(overlap_dates)
        returns = returns.reindex(overlap_dates)
        
        for window in ROLLING_PERIODS:
            if months_available < window:
                logger.warning(f"Fund {fund} ({factor_set}, window={window}m) skipped: {months_available} months available "
                               f"from {date_start} to {date_end}, need {window}")
                continue
            
            # Roll over fixed periods
            for start_idx in range(len(returns) - window + 1):
                end_idx = start_idx + window
                y = returns.iloc[start_idx:end_idx]
                X = factors.iloc[start_idx:end_idx]
                if len(y) < window or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    result = {
                        "SymbolCUSIP": fund,
                        "MonthEndDate": y.index[-1],
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    }
                    results.append(result)
                # Log sample result
                if results:
                    logger.debug(f"Sample regression for {fund}: Factor={results[-1]['Factor_Name']}, "
                                 f"Coefficient={results[-1]['Coefficient']:.4f}, P_Value={results[-1]['P_Value']:.4f}")
        
    except Exception as e:
        logger.error(f"Regression error for {fund} ({factor_set}): {e}")
    return results

# Section 5: Processing Functions
def process_regression(symbol, returns, factor_set_key):
    records = []
    factor_configs = FACTOR_CONFIGS.get(factor_set_key, [])
    if not factor_configs:
        logger.warning(f"No config for {factor_set_key}")
        return records
    
    try:
        factors_list = [load_factors(config, region=config.get("region", "Global")) for config in factor_configs]
        factors = pd.concat([f for f in factors_list if not f.empty], axis=1).dropna()
        if factors.empty:
            logger.warning(f"No factors for {symbol} ({factor_set_key})")
            return records
        
        if 'mkt' in factors.columns and 'rf' in factors.columns:
            factors['mkt-rf'] = factors['mkt'] - factors['rf']
        
        records.extend(run_rolling_regression(symbol, returns, factors, "OLS", factor_set_key))
    except Exception as e:
        logger.error(f"Error processing {symbol} ({factor_set_key}): {e}")
    
    return records

def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logger.debug(f"Processing fund {symbol}, category: {category}")
    
    if returns.empty:
        logger.info(f"No returns for {symbol}")
        return records
    
    region, profile = category_to_region(category)
    logger.debug(f"Fund {symbol} mapped to region: {region}, profile: {profile}")
    
    # Assign regression set based on region
    regression_sets = {
        "USA": ["Equity_USA_1"],
        "Global": ["Equity_Global_1"],
        "International": ["Equity_International_1"]
    }.get(region, ["Equity_USA_1"])  # Default Unknown to USA
    
    for factor_set in regression_sets:
        records.extend(process_regression(symbol, returns, factor_set))
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    records = []
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"] for fund_data in fund_data_list}
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {e}")
    
    logger.info(f"Region {region} generated {len(records)} total records")
    if not DRY_RUN:
        insert_batch(records)
    
    return records

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds: {fund_ids}")
    
    try:
        returns = load_fund_returns(fund_ids)
    except Exception as e:
        logger.error(f"Failed to load returns: {e}")
        return {"error": str(e)}
    
    fund_data_list = [
        {
            "SymbolCUSIP": row["SymbolCUSIP"],
            "Global_Category_Name": row["Global_Category_Name"],
            "returns": returns[row["SymbolCUSIP"]].to_dict()
        }
        for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
    ]
    
    summary = {"total_funds": len(fund_data_list), "regions": {}, "errors": 0}
    for region in ['Global', 'USA', 'International']:  # Explicitly process only these regions
        region_funds = [fd for fd in fund_data_list if fund_meta[fund_meta["SymbolCUSIP"] == fd["SymbolCUSIP"]]["Region"].iloc[0] == region]
        if SAMPLE_DRY_RUN and len(region_funds) > SAMPLE_SIZE:
            region_funds = random.sample(region_funds, SAMPLE_SIZE)
            logger.info(f"Sampled {len(region_funds)} funds for {region}")
        records = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records)}
    
    logger.info(f"Pipeline summary: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

2025-04-13 21:59:46,933 - INFO - Starting main pipeline
2025-04-13 21:59:47,067 - INFO - Loaded metadata for 5584 funds
2025-04-13 21:59:47,382 - INFO - load_fund_metadata took 0.45 seconds
2025-04-13 21:59:47,383 - INFO - Total funds: 5584, Regions: ['Global' 'USA' 'International']
2025-04-13 21:59:47,384 - INFO - Sampled 10 funds: ['RHRX', 'HYDB', 'LPXIX', 'UFO', 'MNOIX', 'KNOW', 'VEVIX', 'QUAYX', 'RSHO', 'JIESX']
2025-04-13 21:59:47,412 - INFO - Loaded returns for 10 funds
2025-04-13 21:59:47,416 - INFO - load_fund_returns took 0.03 seconds
2025-04-13 21:59:47,561 - INFO - Processing 3 funds in Global
Processing Global:   0%|          | 0/3 [00:00<?, ?it/s]2025-04-13 21:59:47,687 - INFO - load_factors took 0.12 seconds
2025-04-13 21:59:47,725 - INFO - Generated 0 regression records for KNOW
2025-04-13 21:59:47,725 - INFO - load_factors took 0.16 seconds
2025-04-13 21:59:47,741 - INFO - load_factors took 0.17 seconds
2025-04-13 21:59:48,165 - INFO - Generated 126 regression records f

In [12]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import functools
from contextlib import contextmanager
import sys

# Section 1: Configuration and Setup
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 1000,
    "batch_insert_size": 5000,
    "max_workers": 8,
    "use_threads": True,
    "query_timeout": 30
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
try:
    engine = create_engine(CONNECTION_STRING, connect_args={"timeout": CONFIG["query_timeout"]})
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS = min(CONFIG["max_workers"], psutil.cpu_count(logical=False))
USE_THREADS = CONFIG["use_threads"]

# Factor configuration for all regions
FACTOR_CONFIGS = {
    "Equity_USA_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "USA", "desired_factors": ["mkt-rf"]}
    ],
    "Equity_Global_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "Global", "desired_factors": ["mkt-rf"]}
    ],
    "Equity_International_1": [
        {"source": "db", "factors": ["MKT", "RF"], "region": "International", "desired_factors": ["mkt-rf"]}
    ]
}

# Logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()
logger.handlers[1].stream = sys.stdout
logger.handlers[1].setLevel(logging.INFO)

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Natural Resources Sector Equity": ("USA", "Natural Resources Sector Equity"),
        "Infrastructure Sector Equity": ("USA", "Infrastructure Sector Equity"),
        "Trading Tools": ("USA", "Trading Tools"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    result = mapping.get(category, ("USA", "Unknown"))
    if result[1] == "Unknown":
        logger.warning(f"Unmapped category '{category}' for fund; defaulting to USA")
    return result

@contextmanager
def database_transaction():
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        logger.info(f"Loaded metadata for {len(df)} funds")
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing; some regressions may be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logger.error(f"Error loading metadata: {e}")
        raise

@timer
def load_fund_returns(fund_ids=None):
    try:
        if not fund_ids:
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
            """
        else:
            fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
            if not fund_ids:
                logger.warning("No valid fund IDs provided")
                return pd.DataFrame()
            placeholders = ",".join([f"'{fid.replace('\'', '')}'" for fid in fund_ids])
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
            AND SymbolCUSIP IN ({placeholders})
            """
            logger.debug(f"Querying returns for funds: {fund_ids}")
        
        chunks = []
        for chunk in pd.read_sql(query, engine, parse_dates=["Date"], chunksize=CHUNK_SIZE):
            logger.debug(f"Loaded chunk of {len(chunk)} rows")
            chunks.append(chunk)
        df = pd.concat(chunks) if chunks else pd.DataFrame()
        if df.empty:
            logger.warning("No returns data loaded")
            return pd.DataFrame()
        
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        # Enforce month-end dates and frequency
        pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
        pivoted.index = pivoted.index.drop_duplicates()
        pivoted = pivoted.asfreq('ME')
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except Exception as e:
        logger.error(f"Error loading returns: {e}")
        if fund_ids:
            logger.info("Attempting individual fund queries")
            chunks = []
            for fid in fund_ids:
                try:
                    query = f"""
                    SELECT SymbolCUSIP, Date, ReturnValue
                    FROM Fund_Returns_Timeseries
                    WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
                    AND SymbolCUSIP = '{fid.replace("'", "")}'
                    """
                    chunk = pd.read_sql(query, engine, parse_dates=["Date"])
                    chunks.append(chunk)
                    logger.debug(f"Loaded returns for {fid}")
                except Exception as e2:
                    logger.warning(f"Failed to load returns for {fid}: {e2}")
            df = pd.concat(chunks) if chunks else pd.DataFrame()
            if df.empty:
                logger.warning("No returns data loaded in fallback")
                return pd.DataFrame()
            pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
            pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
            pivoted.index = pivoted.index.drop_duplicates()
            pivoted = pivoted.asfreq('ME')
            for col in pivoted.columns:
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
            return pivoted
        raise

@timer
def load_factors(factor_config, region="Global", asset_class=None):
    source = factor_config.get("source")
    try:
        if source == "db":
            factor_in_clause = ','.join([f"'{f}'" for f in factor_config["factors"]])
            query = f"""
            SELECT date AS Date, factor AS Factor, value AS Value
            FROM {factor_config.get('table', 'factor_returns')}
            WHERE factor IN ({factor_in_clause})
            """
            params = []
            if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_config["factors"]):
                query += " AND region = ?"
                params.append(region)
            elif 'RF' in factor_config["factors"]:
                query += " AND region = 'USA'"
            if asset_class:
                query += " AND asset_class = ?"
                params.append(asset_class)
            df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
            if df.empty:
                logger.warning(f"No data for factors {factor_config['factors']} in region {region}")
                return pd.DataFrame()
            if df.duplicated(subset=['Date', 'Factor']).sum() > 0:
                logger.error(f"Duplicate factors {factor_config['factors']}")
                raise ValueError("Duplicate factor data")
            pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
                columns={'MKT': 'mkt', 'RF': 'rf'}
            )
            # Enforce month-end dates and frequency
            pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
            pivoted_df.index = pivoted_df.index.drop_duplicates()
            pivoted_df = pivoted_df.asfreq('ME')
            logger.debug(f"Factors loaded: {pivoted_df.columns.tolist()}, dates: {pivoted_df.index.min()} to {pivoted_df.index.max()}")
            return pivoted_df
        else:
            logger.warning(f"Factor source {source} not implemented")
            return pd.DataFrame()
    except Exception as e:
        logger.error(f"Error loading factors {factor_config.get('factors', [])}: {e}")
        return pd.DataFrame()

# Section 4: Regression Functions
# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        
        if returns.empty or factors.empty:
            logger.warning(f"Empty data for {fund} ({factor_set})")
            return results
        
        # Data integrity check
        overlap_dates = returns.index.intersection(factors.index)
        months_available = len(overlap_dates)
        date_start = overlap_dates.min().strftime('%Y-%m-%d') if months_available > 0 else "none"
        date_end = overlap_dates.max().strftime('%Y-%m-%d') if months_available > 0 else "none"
        
        # Check for smallest window
        min_window = min(ROLLING_PERIODS)
        if months_available < min_window:
            skipped_windows = [f"{w}m" for w in ROLLING_PERIODS]
            logger.warning(f"Fund {fund} ({factor_set}) skipped for windows {skipped_windows}: "
                           f"{months_available} months available from {date_start} to {date_end}, need {min_window}")
            return results
        
        logger.debug(f"Fund {fund}: {len(returns)} returns, {len(factors)} factors, {len(overlap_dates)} overlapping dates")
        
        # Align indices with reindex
        factors = factors.reindex(overlap_dates)
        returns = returns.reindex(overlap_dates)
        
        for window in ROLLING_PERIODS:
            window_results = []
            if months_available < window:
                logger.warning(f"Fund {fund} ({factor_set}, window={window}m) skipped: {months_available} months available "
                               f"from {date_start} to {date_end}, need {window}")
                continue
            
            # Roll over fixed periods
            expected_rolls = max(0, len(returns) - window + 1)
            for start_idx in range(expected_rolls):
                end_idx = start_idx + window
                y = returns.iloc[start_idx:end_idx]
                X = factors.iloc[start_idx:end_idx]
                if len(y) != window or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    result = {
                        "SymbolCUSIP": fund,
                        "MonthEndDate": y.index[-1],
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    }
                    window_results.append(result)
                # Log sample result
                if window_results:
                    logger.debug(f"Sample regression for {fund}: Factor={window_results[-1]['Factor_Name']}, "
                                 f"Coefficient={window_results[-1]['Coefficient']:.4f}, P_Value={window_results[-1]['P_Value']:.4f}")
            
            if window_results:
                logger.info(f"Fund {fund} succeeded for window {window}m with factor {X.columns.tolist()}, generated {len(window_results)} records")
                results.extend(window_results)
        
    except Exception as e:
        logger.error(f"Regression error for {fund} ({factor_set}): {e}")
    return results

# Section 5: Processing Functions
def process_regression(symbol, returns, factor_set_key):
    records = []
    factor_configs = FACTOR_CONFIGS.get(factor_set_key, [])
    if not factor_configs:
        logger.warning(f"No config for {factor_set_key}")
        return records
    
    try:
        factors_list = [load_factors(config, region=config.get("region", "Global")) for config in factor_configs]
        factors = pd.concat([f for f in factors_list if not f.empty], axis=1).dropna()
        if factors.empty:
            logger.warning(f"No factors for {symbol} ({factor_set_key})")
            return records
        
        if 'mkt' in factors.columns and 'rf' in factors.columns:
            factors['mkt-rf'] = factors['mkt'] - factors['rf']
        
        records.extend(run_rolling_regression(symbol, returns, factors, "OLS", factor_set_key))
    except Exception as e:
        logger.error(f"Error processing {symbol} ({factor_set_key}): {e}")
    
    return records

def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logger.debug(f"Processing fund {symbol}, category: {category}")
    
    if returns.empty:
        logger.info(f"No returns for {symbol}")
        return records
    
    region, profile = category_to_region(category)
    logger.debug(f"Fund {symbol} mapped to region: {region}, profile: {profile}")
    
    # Assign regression set based on region
    regression_sets = {
        "USA": ["Equity_USA_1"],
        "Global": ["Equity_Global_1"],
        "International": ["Equity_International_1"]
    }.get(region, ["Equity_USA_1"])  # Default Unknown to USA
    
    for factor_set in regression_sets:
        records.extend(process_regression(symbol, returns, factor_set))
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    records = []
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"] for fund_data in fund_data_list}
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {e}")
    
    logger.info(f"Region {region} generated {len(records)} total records")
    if not DRY_RUN:
        insert_batch(records)
    
    return records

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds: {fund_ids}")
    
    try:
        returns = load_fund_returns(fund_ids)
    except Exception as e:
        logger.error(f"Failed to load returns: {e}")
        return {"error": str(e)}
    
    fund_data_list = [
        {
            "SymbolCUSIP": row["SymbolCUSIP"],
            "Global_Category_Name": row["Global_Category_Name"],
            "returns": returns[row["SymbolCUSIP"]].to_dict()
        }
        for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
    ]
    
    summary = {"total_funds": len(fund_data_list), "regions": {}, "errors": 0}
    for region in ['Global', 'USA', 'International']:  # Explicitly process only these regions
        region_funds = [fd for fd in fund_data_list if fund_meta[fund_meta["SymbolCUSIP"] == fd["SymbolCUSIP"]]["Region"].iloc[0] == region]
        if SAMPLE_DRY_RUN and len(region_funds) > SAMPLE_SIZE:
            region_funds = random.sample(region_funds, SAMPLE_SIZE)
            logger.info(f"Sampled {len(region_funds)} funds for {region}")
        records = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records)}
    
    logger.info(f"Pipeline summary: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

2025-04-13 22:14:20,079 - INFO - Starting main pipeline
2025-04-13 22:14:20,213 - INFO - Loaded metadata for 5584 funds
2025-04-13 22:14:20,520 - INFO - load_fund_metadata took 0.44 seconds
2025-04-13 22:14:20,521 - INFO - Total funds: 5584, Regions: ['Global' 'USA' 'International']
2025-04-13 22:14:20,522 - INFO - Sampled 10 funds: ['JDESX', 'GTEK', 'RWK', 'ONGFX', 'XRLV', 'METV', 'CPII', 'ETEC', 'DTAN', 'PLDR']
2025-04-13 22:14:20,548 - INFO - Loaded returns for 10 funds
2025-04-13 22:14:20,552 - INFO - load_fund_returns took 0.03 seconds
2025-04-13 22:14:20,719 - INFO - Processing 2 funds in Global
Processing Global:   0%|          | 0/2 [00:00<?, ?it/s]2025-04-13 22:14:20,784 - INFO - load_factors took 0.06 seconds
2025-04-13 22:14:20,797 - INFO - load_factors took 0.08 seconds
2025-04-13 22:14:20,814 - INFO - Generated 0 regression records for DTAN
2025-04-13 22:14:21,337 - INFO - Fund ONGFX succeeded for window 12m with factor ['mkt', 'rf', 'mkt-rf'], generated 327 records
2025-0

In [13]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import pickle

# Section 1: Configuration and Setup
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 50,
    "chunk_size": 5600,
    "batch_insert_size": 10000,
    "max_workers": 20,
    "use_threads": False
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
try:
    engine = create_engine(CONNECTION_STRING)
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS = CONFIG["max_workers"]
USE_THREADS = CONFIG["use_threads"]

# Logging setup
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Natural Resources Sector Equity": ("USA", "Natural Resources Sector Equity"),
        "Infrastructure Sector Equity": ("USA", "Infrastructure Sector Equity"),
        "Trading Tools": ("USA", "Trading Tools"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("USA", "Unknown"))

@contextmanager
def database_transaction():
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        logger.info(f"Loaded metadata for {len(df)} funds")
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logger.error(f"Error loading metadata: {e}")
        raise

@timer
def load_fund_returns(fund_ids=None):
    try:
        if not fund_ids:
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
            """
        else:
            fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
            if not fund_ids:
                logger.warning("No valid fund IDs provided")
                return pd.DataFrame()
            placeholders = ",".join([f"'{fid.replace('\'', '')}'" for fid in fund_ids])
            query = f"""
            SELECT SymbolCUSIP, Date, ReturnValue
            FROM Fund_Returns_Timeseries
            WHERE Metric = '{RETURN_METRIC.replace("'", "")}'
            AND SymbolCUSIP IN ({placeholders})
            """
            logger.debug(f"Querying returns for funds: {fund_ids}")
        
        chunks = []
        for chunk in pd.read_sql(query, engine, parse_dates=["Date"], chunksize=CHUNK_SIZE):
            logger.debug(f"Loaded chunk of {len(chunk)} rows")
            chunks.append(chunk)
        df = pd.concat(chunks) if chunks else pd.DataFrame()
        if df.empty:
            logger.warning("No returns data loaded")
            return pd.DataFrame()
        
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
        pivoted.index = pivoted.index.drop_duplicates()
        pivoted = pivoted.asfreq('ME')
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except Exception as e:
        logger.error(f"Error loading returns: {e}")
        if fund_ids:
            logger.info("Attempting individual fund queries")
            chunks = []
            for fid in fund_ids:
                try:
                    query = f"""
                    SELECT SymbolCUSIP, Date, ReturnValue
                    FROM Fund_Returns_Timeseries
                    WHY Metric = '{RETURN_METRIC.replace("'", "")}'
                    AND SymbolCUSIP = '{fid.replace("'", "")}'
                    """
                    chunk = pd.read_sql(query, engine, parse_dates=["Date"])
                    chunks.append(chunk)
                    logger.debug(f"Loaded returns for {fid}")
                except Exception as e2:
                    logger.warning(f"Failed to load returns for {fid}: {e2}")
            df = pd.concat(chunks) if chunks else pd.DataFrame()
            if df.empty:
                logger.warning("No returns data loaded in fallback")
                return pd.DataFrame()
            pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
            pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
            pivoted.index = pivoted.index.drop_duplicates()
            pivoted = pivoted.asfreq('ME')
            for col in pivoted.columns:
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
            return pivoted
        raise

@timer
def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM {table}
        WHERE factor IN ({factor_in_clause})
    """
    params = []
    if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_list):
        query += " AND region = ?"
        params.append(region)
    elif 'RF' in factor_list:
        query += " AND region = 'USA'"
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logger.warning(f"No data for factors {factor_list} in {table} (region: {region}, asset_class: {asset_class})")
        return pd.DataFrame()
    
    duplicates = df.duplicated(subset=['Date', 'Factor']).sum()
    if duplicates > 0:
        logger.warning(f"Found {duplicates} duplicate Date-Factor pairs in {table} for {factor_list}; aggregating by mean")
        df = df.groupby(['Date', 'Factor'])['Value'].mean().reset_index()
    
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    missing_factors = [f for f in factor_list if f.lower() not in pivoted_df.columns]
    if missing_factors:
        logger.warning(f"Missing factors in {table} (region: {region}): {missing_factors}")
    return pivoted_df

@timer
def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    if df.empty:
        logger.warning(f"No fixed income factors for {factor_list}")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

@timer
def load_century_factors(factor_list, portfolio_base, factor, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio = f"{portfolio_base} {factor}"
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = [portfolio]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    if region != "Global":
        query += " AND region = ?"
        params.append(region)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logger.warning(f"No data for factors {factor_list} in aqr_century_factors (portfolio: {portfolio}, region: {region})")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

@timer
def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    if df.empty:
        logger.warning("No commodity factors loaded")
        return pd.DataFrame()
    pivoted_df = df.set_index("Date")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        
        if returns.empty or factors.empty:
            logger.warning(f"Empty data for {fund} ({factor_set})")
            return results
        
        overlap_dates = returns.index.intersection(factors.index)
        months_available = len(overlap_dates)
        date_start = overlap_dates.min().strftime('%Y-%m-%d') if months_available > 0 else "none"
        date_end = overlap_dates.max().strftime('%Y-%m-%d') if months_available > 0 else "none"
        
        min_window = min(ROLLING_PERIODS)
        if months_available < min_window:
            skipped_windows = [f"{w}m" for w in ROLLING_PERIODS]
            logger.warning(f"Fund {fund} ({factor_set}) skipped for windows {skipped_windows}: "
                           f"{months_available} months available from {date_start} to {date_end}, need {min_window}")
            return results
        
        logger.debug(f"Fund {fund}: {len(returns)} returns, {len(factors)} factors, {len(overlap_dates)} overlapping dates")
        
        factors = factors.reindex(overlap_dates)
        returns = returns.reindex(overlap_dates)
        
        for window in ROLLING_PERIODS:
            window_results = []
            if months_available < window:
                logger.warning(f"Fund {fund} ({factor_set}, window={window}m) skipped: {months_available} months available "
                               f"from {date_start} to {date_end}, need {window}")
                continue
            
            expected_rolls = max(0, len(returns) - window + 1)
            logger.debug(f"Fund {fund}: Expected {expected_rolls} rolls for window {window}m with factors {factors.columns.tolist()}")
            for start_idx in range(expected_rolls):
                end_idx = start_idx + window
                y = returns.iloc[start_idx:end_idx]
                X = factors.iloc[start_idx:end_idx]
                if len(y) != window or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    result = {
                        "SymbolCUSIP": fund,
                        "MonthEndDate": y.index[-1],
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    }
                    window_results.append(result)
            
            if window_results:
                factor_counts = {factor: len([r for r in window_results if r["Factor_Name"] == factor]) for factor in factors.columns}
                logger.info(f"Fund {fund} succeeded for window {window}m with factors {factor_counts}, total {len(window_results)} records")
                results.extend(window_results)
        
    except Exception as e:
        logger.error(f"Regression error for {fund} ({factor_set}): {e}")
    return results

# Section 5: Processing Functions
def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    broad_category = fund_data["CWA_Broad_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logger.debug(f"Starting process_fund for {symbol}")
    
    try:
        if returns.empty:
            logger.warning(f"No valid returns for {symbol}")
            return records

        # Equity (USA)
        if category in [
            "Energy Sector Equity", "Equity Miscellaneous", "Financials Sector Equity",
            "Healthcare Sector Equity", "Consumer Goods & Services Sector Equity",
            "Communications Sector Equity", "Industrials Sector Equity", "Other Sector Equity",
            "Real Estate Sector Equity", "Precious Metals Sector Equity", "Technology Sector Equity",
            "Utilities Sector Equity", "US Equity Large Cap Blend", "US Equity Large Cap Growth",
            "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap", "Options Trading",
            "Natural Resources Sector Equity", "Infrastructure Sector Equity"
        ]:
            # Regression 1: USA, MKT-RF, HML-Devil, QMJ, SMB, UMD, BAB
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logger.warning(f"No valid factors for {symbol} (Equity_USA_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (Equity_USA_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_1"))
                    else:
                        logger.warning(f"No valid factors for {symbol} (Equity_USA_1)")

            # Regression 2-4: Century Factors
            for portfolio_base, factor_set in [
                ("US Stock Selection", "Equity_USA_2"),
                ("All Macro", "Equity_USA_3"),
                ("Equity indices", "Equity_USA_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor, region="USA")
                    if factors.empty:
                        logger.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

            if broad_category in ["Quantitative/Tactical", "Strategic", "Nontraditional"]:
                # Regression 5
                factors_eq = load_db_factors(['MKT', 'BAB'], "USA")
                factors_fi = load_db_factors(['TSM-FI'])
                factors_fx = load_db_factors(['TSM-FX'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM', 'CREDIT'])
                cmdty_factors = load_commodity_factors()
                factors = pd.concat([factors_eq, factors_fi, factors_fx, rf_factors, fi_factors, cmdty_factors[['excess_return_eqwt']]], axis=1).dropna()
                if factors.empty:
                    logger.warning(f"No valid factors for {symbol} (Equity_USA_5)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} (Equity_USA_5)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'tsm-fx', 'tsm-fi', 'bab', 'TERM', 'CREDIT', 'excess_return_eqwt']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_5"))

                # Regression 6
                factors_eq = load_db_factors(['MKT', 'SMB', 'BAB'], "USA")
                factors_com = load_db_factors(['TSM-Com'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT_HY'])
                factors = pd.concat([factors_eq, factors_com, rf_factors, fi_factors], axis=1).dropna()
                if factors.empty:
                    logger.warning(f"No valid factors for {symbol} (Equity_USA_6)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} (Equity_USA_6)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT_HY', 'tsm-com']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_6"))

                # Regression 7
                factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB'], "USA")
                factors_com = load_db_factors(['TSM-Com'])
                factors_fi = load_db_factors(['TSM-FI'])
                factors_fx = load_db_factors(['TSM-FX'])
                rf_factors = load_db_factors(['RF'], "USA")
                fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
                factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
                if factors.empty:
                    logger.warning(f"No valid factors for {symbol} (Equity_USA_7)")
                else:
                    if 'mkt' in factors.columns and 'rf' in factors.columns:
                        factors['mkt-rf'] = factors['mkt'] - factors['rf']
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} (Equity_USA_7)")
                    else:
                        desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'umd', 'smb', 'bab', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fi', 'tsm-fx']
                        available_factors = [f for f in desired_factors if f in factors.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_USA_7"))

        # Equity (International)
        elif category in [
            "Asia ex-Japan Equity", "Australia & New Zealand Equity", "Canadian Equity Large Cap",
            "Europe Equity Large Cap", "Europe Equity Mid/Small Cap", "Greater China Equity",
            "India Equity", "Mexico Equity", "Japan Equity", "Korea Equity", "Latin America Equity",
            "UK Equity Large Cap", "Thailand Equity"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logger.warning(f"No valid factors for {symbol} (Equity_Intl_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (Equity_Intl_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Intl_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("Intl Stock Selection", "Equity_Intl_2"),
                ("All Macro", "Equity_Intl_3"),
                ("Equity indices", "Equity_Intl_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor, region="International")
                    if factors.empty:
                        logger.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Equity (Global)
        elif category in ["Global Emerging Markets Equity", "Global Equity Large Cap", "Global Equity Mid/Small Cap"]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global")
            rf_factors = load_db_factors(['RF'], "USA")
            factors = pd.concat([factors_eq, rf_factors], axis=1).dropna()
            if factors.empty:
                logger.warning(f"No valid factors for {symbol} (Equity_Global_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (Equity_Global_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'tsm-eq', 'bab']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Equity_Global_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("All Stock Selection", "Equity_Global_2"),
                ("All Macro", "Equity_Global_3"),
                ("Equity indices", "Equity_Global_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logger.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Fixed Income
        elif category in [
            "Convertibles", "Emerging Markets Fixed Income", "Fixed Income Miscellaneous",
            "US Fixed Income", "US Municipal Fixed Income"
        ]:
            # Regression 1
            factors_fi = load_db_factors(['TSM-FI'])
            factors_fx = load_db_factors(['TSM-FX'])
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_fi, factors_fx, fi_factors], axis=1).dropna()
            if factors.empty:
                logger.warning(f"No valid factors for {symbol} (FI_1)")
            else:
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (FI_1)")
                else:
                    desired_factors = ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-fi', 'tsm-fx']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "FI_1"))

            # Regression 2-4
            for portfolio_base, factor_set in [
                ("Fixed income", "FI_2"),
                ("All Macro", "FI_3"),
                ("Equity indices", "FI_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logger.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Allocation
        elif category in [
            "Aggressive Allocation", "Allocation Miscellaneous", "Cautious Allocation",
            "Flexible Allocation", "Moderate Allocation"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
            factors_fi = load_db_factors(['TSM-FI'])
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_fi, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logger.warning(f"No valid factors for {symbol} (Allocation_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (Allocation_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Allocation_1"))

            # Regression 2
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                fi_factors = load_century_factors([factor], "Fixed income", factor)
                stock_factors = load_century_factors([factor], "All Stock Selection", factor)
                factors = pd.concat([fi_factors, stock_factors], axis=1).dropna()
                if factors.empty:
                    logger.warning(f"No valid factors for {symbol} (Allocation_2, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (Allocation_2, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Allocation_2"))

            # Regression 3-4
            for portfolio_base, factor_set in [
                ("All Macro", "Allocation_3"),
                ("Equity indices", "Allocation_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logger.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Alternatives
        elif category in [
            "Alternative Miscellaneous", "Long/Short Equity", "Market Neutral", "Multialternative"
        ]:
            # Regression 1
            factors_eq = load_db_factors(['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ'], "Global")
            factors_com = load_db_factors(['TSM-Com'])
            factors_fi = load_db_factors(['TSM-FI'])
            factors_fx = load_db_factors(['TSM-FX'])
            rf_factors = load_db_factors(['RF'], "USA")
            fi_factors = load_fixed_income_factors(['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY'])
            factors = pd.concat([factors_eq, factors_com, factors_fi, factors_fx, rf_factors, fi_factors], axis=1).dropna()
            if factors.empty:
                logger.warning(f"No valid factors for {symbol} (Alternative_1)")
            else:
                if 'mkt' in factors.columns and 'rf' in factors.columns:
                    factors['mkt-rf'] = factors['mkt'] - factors['rf']
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (Alternative_1)")
                else:
                    desired_factors = ['mkt-rf' if 'mkt-rf' in factors.columns else 'mkt', 'hml', 'qmj', 'smb', 'umd', 'bab', 'tsm-eq', 'tsm-fi', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'tsm-com', 'tsm-fx']
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Alternative_1"))

            # Regression 2
            for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                fi_factors = load_century_factors([factor], "Fixed income", factor)
                stock_factors = load_century_factors([factor], "All Stock Selection", factor)
                cmdty_factors = load_century_factors([factor], "Commodities", factor)
                factors = pd.concat([fi_factors, stock_factors, cmdty_factors], axis=1).dropna()
                if factors.empty:
                    logger.warning(f"No valid factors for {symbol} (Alternative_2, factor: {factor})")
                    continue
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (Alternative_2, factor: {factor})")
                    continue
                records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", "Alternative_2"))

            # Regression 3-4
            for portfolio_base, factor_set in [
                ("All Macro", "Alternative_3"),
                ("Equity indices", "Alternative_4")
            ]:
                for factor in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']:
                    factors = load_century_factors([factor], portfolio_base, factor)
                    if factors.empty:
                        logger.warning(f"No century factors for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                    if returns_aligned.empty:
                        logger.warning(f"No overlapping dates for {symbol} ({factor_set}, factor: {factor})")
                        continue
                    records.extend(run_rolling_regression(symbol, returns_aligned, factors, "OLS", factor_set))

        # Commodities
        elif category in ["Commodities Broad Basket", "Commodities Specified"]:
            # Regression 1
            factors = load_commodity_factors()
            if factors.empty:
                logger.warning(f"No commodity factors for {symbol} (Commodity_1)")
            else:
                factors, returns_aligned = factors.align(returns, join="inner", axis=0)
                if returns_aligned.empty:
                    logger.warning(f"No overlapping dates for {symbol} (Commodity_1)")
                else:
                    desired_factors = [
                        'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                        'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short',
                        'excess_spot_return_long_short', 'ir_adjusted_carry_long_short',
                        'aggregate_backwardation_contango'
                    ]
                    available_factors = [f for f in desired_factors if f in factors.columns]
                    if available_factors:
                        records.extend(run_rolling_regression(symbol, returns_aligned, factors[available_factors], "OLS", "Commodity_1"))

        else:
            logger.warning(f"Skipping regressions for {symbol}: Category {category} not mapped")

    except Exception as e:
        logger.warning(f"Error processing fund {symbol}: {e}")
        return records
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    records = []
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    executor_class = ThreadPoolExecutor if USE_THREADS else ProcessPoolExecutor
    with executor_class(max_workers=MAX_WORKERS) as executor:
        future_to_fund = {executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"] for fund_data in fund_data_list}
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {e}")
    
    logger.info(f"Region {region} generated {len(records)} total records")
    if not DRY_RUN:
        insert_batch(records)
    
    return records

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds: {fund_ids}")
    
    try:
        returns = load_fund_returns(fund_ids)
    except Exception as e:
        logger.error(f"Failed to load returns: {e}")
        return {"error": str(e)}
    
    fund_data_list = [
        {
            "SymbolCUSIP": row["SymbolCUSIP"],
            "Global_Category_Name": row["Global_Category_Name"],
            "CWA_Broad_Category_Name": row.get("CWA_Broad_Category_Name", None),
            "returns": returns[row["SymbolCUSIP"]].to_dict()
        }
        for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
    ]
    
    summary = {"total_funds": len(fund_data_list), "regions": {}, "errors": 0}
    for region in sorted(set(regions) - {'Unknown'} | {'USA'}):
        region_funds = [fd for fd in fund_data_list if fund_meta[fund_meta["SymbolCUSIP"] == fd["SymbolCUSIP"]]["Region"].iloc[0] == region]
        if SAMPLE_DRY_RUN and len(region_funds) > SAMPLE_SIZE:
            region_funds = random.sample(region_funds, SAMPLE_SIZE)
            logger.info(f"Sampled {len(region_funds)} funds for {region}")
        records = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records)}
    
    logger.info(f"Pipeline summary: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

Processing Global:   0%|          | 0/10 [00:00<?, ?it/s]2025-04-13 22:30:52,563 - ERROR - Error processing HERIX: A process in the process pool was terminated abruptly while the future was running or pending.
Processing Global:  10%|█         | 1/10 [00:00<00:03,  2.95it/s]2025-04-13 22:30:52,565 - ERROR - Error processing PCCIX: A process in the process pool was terminated abruptly while the future was running or pending.
2025-04-13 22:30:52,566 - ERROR - Error processing ROAM: A process in the process pool was terminated abruptly while the future was running or pending.
2025-04-13 22:30:52,567 - ERROR - Error processing CGXU: A process in the process pool was terminated abruptly while the future was running or pending.
2025-04-13 22:30:52,568 - ERROR - Error processing FYLG: A process in the process pool was terminated abruptly while the future was running or pending.
2025-04-13 22:30:52,569 - ERROR - Error processing CGCIX: A process in the process pool was terminated abruptly whil

In [None]:
#new grok review with some fixes

In [9]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import pickle
import sys
from contextlib import contextmanager
from functools import wraps


# Section 1: Configuration and Setup
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 50,
    "chunk_size": 5600,
    "batch_insert_size": 10000,
    "max_workers_cpu": min(13, os.cpu_count()),  # 14 workers for CPU-bound tasks
    "max_workers_io": 30,  # 30 workers for I/O-bound tasks
    "use_threads": False,
    "unmapped_category_behavior": "allocation"  # Options: "allocation", "all"
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
try:
    engine = create_engine(CONNECTION_STRING)
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS_CPU = CONFIG["max_workers_cpu"]
MAX_WORKERS_IO = CONFIG["max_workers_io"]
USE_THREADS = CONFIG["use_threads"]
UNMAPPED_CATEGORY_BEHAVIOR = CONFIG["unmapped_category_behavior"]

# Logging setup
SUMMARY_LOG = "factor_attribution_summary.log"
logging.basicConfig(
    level=logging.INFO,  # Use INFO to reduce verbosity
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()

def log_summary(message):
    with open(SUMMARY_LOG, 'a') as f:
        f.write(f"{datetime.now()}: {message}\n")

# Section 2: Helper Functions

def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Natural Resources Sector Equity": ("USA", "Natural Resources Sector Equity"),
        "Infrastructure Sector Equity": ("USA", "Infrastructure Sector Equity"),
        "Trading Tools": ("USA", "Trading Tools"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("USA", "Unknown"))

@contextmanager
def database_transaction():
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        logger.info(f"Loaded metadata for {len(df)} funds")
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logger.error(f"Error loading metadata: {e}")
        raise

# Global cache for factor data
FACTOR_CACHE = {}

@timer
def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    cache_key = (tuple(factor_list), region, table, asset_class)
    if cache_key in FACTOR_CACHE:
        logger.debug(f"Using cached factors for {cache_key}")
        return FACTOR_CACHE[cache_key]
    
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM {table}
        WHERE factor IN ({factor_in_clause})
    """
    if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_list):
        query += f" AND region = '{region}'"
    elif 'RF' in factor_list:
        query += " AND region = 'USA'"
    if asset_class:
        query += f" AND asset_class = '{asset_class}'"
    
    logger.debug(f"Executing query: {query}")
    df = pd.read_sql_query(query, engine, parse_dates=['Date'])
    if df.empty:
        logger.warning(f"No data for factors {factor_list} in {table} (region: {region}, asset_class: {asset_class})")
        return pd.DataFrame()
    
    duplicates = df.duplicated(subset=['Date', 'Factor']).sum()
    if duplicates > 0:
        logger.warning(f"Found {duplicates} duplicate Date-Factor pairs in {table} for {factor_list}; dropping duplicates")
        df = df.drop_duplicates(subset=['Date', 'Factor'], keep='first')
    
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value").rename(
        columns={
            'MKT': 'mkt', 'SMB': 'smb', 'HML_Devil': 'hml', 'UMD': 'umd', 'QMJ': 'qmj',
            'BAB': 'bab', 'RF': 'rf', 'TSM-Com': 'tsm-com', 'TSM-EQ': 'tsm-eq',
            'TSM-FI': 'tsm-fi', 'TSM-FX': 'tsm-fx', 'TSM-MA': 'tsm-ma'
        }
    )
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    missing_factors = [f for f in factor_list if f.lower() not in pivoted_df.columns]
    if missing_factors:
        logger.warning(f"Missing factors in {table} (region: {region}): {missing_factors}")
    
    FACTOR_CACHE[cache_key] = pivoted_df
    return pivoted_df

@timer
def load_fund_returns(fund_ids=None):
    try:
        query = """
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE Metric = '1 Month Return' AND ReturnValue IS NOT NULL AND Date IS NOT NULL
        """
        if fund_ids:
            fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
            if not fund_ids:
                logger.warning("No valid SymbolCUSIP provided")
                return pd.DataFrame()
            fund_ids_str = ",".join([f"'{fid}'" for fid in fund_ids])
            query += f" AND SymbolCUSIP IN ({fund_ids_str})"
        
        logger.debug(f"Executing query: {query}")
        chunks = []
        for chunk in pd.read_sql_query(query, engine, parse_dates=["Date"], chunksize=CHUNK_SIZE):
            logger.debug(f"Loaded chunk of {len(chunk)} rows")
            chunks.append(chunk)
        df = pd.concat(chunks) if chunks else pd.DataFrame()
        if df.empty:
            logger.warning(f"No returns data loaded for SymbolCUSIP: {fund_ids}")
            return pd.DataFrame()
        
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
        pivoted.index = pivoted.index.drop_duplicates()
        pivoted = pivoted.asfreq('ME')
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except Exception as e:
        logger.error(f"Error loading returns: {e}")
        raise

@timer
def load_century_factors(factor_list, portfolio_base, factor, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio = f"{portfolio_base} {factor}"
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = [portfolio]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    if region != "Global":
        query += " AND region = ?"
        params.append(region)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logger.warning(f"No data for factors {factor_list} in aqr_century_factors (portfolio: {portfolio}, region: {region})")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

@timer
def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    if df.empty:
        logger.warning("No commodity factors loaded")
        return pd.DataFrame()
    pivoted_df = df.set_index("Date")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

@timer
def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    logger.debug(f"Executing query: {query}")
    df = pd.read_sql_query(query, engine, parse_dates=["Date"])
    if df.empty:
        logger.warning(f"No fixed income factors for {factor_list}")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

# Section 4: Regression Functions
def run_rolling_regression(fund, returns, factors, regression_type, factor_set):
    results = []
    try:
        returns = pd.Series(returns).dropna()
        factors = pd.DataFrame(factors).dropna()
        returns.index = pd.to_datetime(returns.index, errors='coerce')
        factors.index = pd.to_datetime(factors.index, errors='coerce')
        returns = returns[returns.index.notnull()]
        factors = factors[factors.index.notnull()]
        
        if returns.empty or factors.empty:
            logger.warning(f"Empty data for {fund} ({factor_set})")
            return results
        
        overlap_dates = returns.index.intersection(factors.index)
        months_available = len(overlap_dates)
        date_start = overlap_dates.min().strftime('%Y-%m-%d') if months_available > 0 else "none"
        date_end = overlap_dates.max().strftime('%Y-%m-%d') if months_available > 0 else "none"
        
        min_window = min(ROLLING_PERIODS)
        if months_available < min_window:
            skipped_windows = [f"{w}m" for w in ROLLING_PERIODS]
            logger.warning(f"Fund {fund} ({factor_set}) skipped for windows {skipped_windows}: "
                           f"{months_available} months available from {date_start} to {date_end}, need {min_window}")
            return results
        
        logger.debug(f"Fund {fund}: {len(returns)} returns, {len(factors)} factors, {len(overlap_dates)} overlapping dates")
        
        factors = factors.reindex(overlap_dates)
        returns = returns.reindex(overlap_dates)
        
        for window in ROLLING_PERIODS:
            window_results = []
            if months_available < window:
                logger.warning(f"Fund {fund} ({factor_set}, window={window}m) skipped: {months_available} months available "
                               f"from {date_start} to {date_end}, need {window}")
                continue
            
            expected_rolls = max(0, len(returns) - window + 1)
            logger.debug(f"Fund {fund}: Expected {expected_rolls} rolls for window {window}m with factors {factors.columns.tolist()}")
            for start_idx in range(expected_rolls):
                end_idx = start_idx + window
                y = returns.iloc[start_idx:end_idx]
                X = factors.iloc[start_idx:end_idx]
                if len(y) != window or y.isnull().any() or X.isnull().any().any():
                    continue
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                for factor in X.columns:
                    result = {
                        "SymbolCUSIP": fund,
                        "MonthEndDate": y.index[-1],
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1] if len(y) > 1 else np.nan,
                        "Regression_Type": regression_type,
                        "Factor_Set": factor_set
                    }
                    window_results.append(result)
            
            if window_results:
                factor_counts = {factor: len([r for r in window_results if r["Factor_Name"] == factor]) for factor in factors.columns}
                logger.info(f"Fund {fund} succeeded for window {window}m with factors {factor_counts}, total {len(window_results)} records")
                results.extend(window_results)
        
    except Exception as e:
        logger.error(f"Regression error for {fund} ({factor_set}): {e}")
    return results

# Section 5: Processing Functions
def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    broad_category = fund_data["CWA_Broad_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    logger.debug(f"Starting process_fund for {symbol}")
    
    try:
        if returns.empty:
            logger.warning(f"No valid returns for {symbol}")
            return records

        # Define regression sets
        regression_sets = {
            "Equity_USA": [
                ("Equity_USA_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA", None),
                ("Equity_USA_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "US Stock Selection"),
                ("Equity_USA_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "All Macro"),
                ("Equity_USA_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "Equity indices"),
                ("Equity_USA_5", ['MKT', 'BAB', 'TSM-FI', 'TSM-FX', 'excess_return_eqwt'], "USA", None),
                ("Equity_USA_6", ['MKT', 'SMB', 'BAB', 'TSM-Com'], "USA", None),
                ("Equity_USA_7", ['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB', 'TSM-Com', 'TSM-FI', 'TSM-FX'], "USA", None)
            ],
            "Equity_Intl": [
                ("Equity_Intl_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl", None),
                ("Equity_Intl_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Intl Stock Selection"),
                ("Equity_Intl_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "All Macro"),
                ("Equity_Intl_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Equity indices")
            ],
            "Equity_Global": [
                ("Equity_Global_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global", None),
                ("Equity_Global_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
                ("Equity_Global_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
                ("Equity_Global_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
            ],
            "Fixed_Income": [
                ("FI_1", ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'TSM-FI', 'TSM-FX'], "Global", None),
                ("FI_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Fixed income"),
                ("FI_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
                ("FI_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
            ],
            "Allocation": [
                ("Allocation_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI'], "Global", None),
                ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", None),
                ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
                ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
            ],
            "Alternative": [
                ("Alternative_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'TSM-Com', 'TSM-FX'], "Global", None),
                ("Alternative_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", None),
                ("Alternative_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
                ("Alternative_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
            ],
            "Commodity": [
                ("Commodity_1", ['excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt', 'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short', 'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango'], "Global", None)
            ]
        }

        # Map categories to regression sets
        category_to_regressions = {
            "Energy Sector Equity": "Equity_USA",
            "Equity Miscellaneous": "Equity_USA",
            "Financials Sector Equity": "Equity_USA",
            "Healthcare Sector Equity": "Equity_USA",
            "Consumer Goods & Services Sector Equity": "Equity_USA",
            "Communications Sector Equity": "Equity_USA",
            "Industrials Sector Equity": "Equity_USA",
            "Other Sector Equity": "Equity_USA",
            "Real Estate Sector Equity": "Equity_USA",
            "Precious Metals Sector Equity": "Equity_USA",
            "Technology Sector Equity": "Equity_USA",
            "Utilities Sector Equity": "Equity_USA",
            "US Equity Large Cap Blend": "Equity_USA",
            "US Equity Large Cap Growth": "Equity_USA",
            "US Equity Large Cap Value": "Equity_USA",
            "US Equity Mid Cap": "Equity_USA",
            "US Equity Small Cap": "Equity_USA",
            "Options Trading": "Equity_USA",
            "Natural Resources Sector Equity": "Equity_USA",
            "Infrastructure Sector Equity": "Equity_USA",
            "Asia ex-Japan Equity": "Equity_Intl",
            "Australia & New Zealand Equity": "Equity_Intl",
            "Canadian Equity Large Cap": "Equity_Intl",
            "Europe Equity Large Cap": "Equity_Intl",
            "Europe Equity Mid/Small Cap": "Equity_Intl",
            "Greater China Equity": "Equity_Intl",
            "India Equity": "Equity_Intl",
            "Mexico Equity": "Equity_Intl",
            "Japan Equity": "Equity_Intl",
            "Korea Equity": "Equity_Intl",
            "Latin America Equity": "Equity_Intl",
            "UK Equity Large Cap": "Equity_Intl",
            "Thailand Equity": "Equity_Intl",
            "Global Emerging Markets Equity": "Equity_Global",
            "Global Equity Large Cap": "Equity_Global",
            "Global Equity Mid/Small Cap": "Equity_Global",
            "Global Fixed Income": "Fixed_Income",
            "Convertibles": "Fixed_Income",
            "Emerging Markets Fixed Income": "Fixed_Income",
            "Fixed Income Miscellaneous": "Fixed_Income",
            "US Fixed Income": "Fixed_Income",
            "US Municipal Fixed Income": "Fixed_Income",
            "Aggressive Allocation": "Allocation",
            "Allocation Miscellaneous": "Allocation",
            "Cautious Allocation": "Allocation",
            "Flexible Allocation": "Allocation",
            "Moderate Allocation": "Allocation",
            "Alternative Miscellaneous": "Alternative",
            "Long/Short Equity": "Alternative",
            "Market Neutral": "Alternative",
            "Multialternative": "Alternative",
            "Commodities Broad Basket": "Commodity",
            "Commodities Specified": "Commodity"
        }

        # Determine regression sets to run
        regression_category = category_to_regressions.get(category)
        if not regression_category:
            if UNMAPPED_CATEGORY_BEHAVIOR == "allocation":
                regression_category = "Allocation"
                logger.warning(f"Unmapped category {category} for {symbol}; defaulting to Allocation regressions")
            else:  # "all"
                regression_category = "all"
                logger.warning(f"Unmapped category {category} for {symbol}; running all regressions")

        # Run regressions
        if regression_category == "all":
            for reg_set in regression_sets.values():
                for reg_name, factors, region, portfolio_base in reg_set:
                    try:
                        if portfolio_base:
                            for factor in factors:
                                factor_data = load_century_factors([factor], portfolio_base, factor, region=region)
                                if factor_data.empty:
                                    logger.warning(f"No factors for {symbol} ({reg_name}, factor: {factor})")
                                    continue
                                factor_data, returns_aligned = factor_data.align(returns, join="inner", axis=0)
                                if returns_aligned.empty:
                                    logger.warning(f"No overlapping dates for {symbol} ({reg_name}, factor: {factor})")
                                    continue
                                records.extend(run_rolling_regression(symbol, returns_aligned, factor_data, "OLS", reg_name))
                        else:
                            if reg_name.startswith("Commodity"):
                                factor_data = load_commodity_factors()
                            elif reg_name.startswith("FI"):
                                factor_data = pd.concat([
                                    load_db_factors([f for f in factors if f not in ['TERM', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']], region),
                                    load_fixed_income_factors([f for f in factors if f in ['TERM', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']])
                                ], axis=1).dropna()
                            else:
                                factor_data = load_db_factors(factors, region)
                            if factor_data.empty:
                                logger.warning(f"No factors for {symbol} ({reg_name})")
                                continue
                            if 'mkt' in factor_data.columns and 'rf' in factor_data.columns:
                                factor_data['mkt-rf'] = factor_data['mkt'] - factor_data['rf']
                            factor_data, returns_aligned = factor_data.align(returns, join="inner", axis=0)
                            if returns_aligned.empty:
                                logger.warning(f"No overlapping dates for {symbol} ({reg_name})")
                                continue
                            desired_factors = [f if f != 'MKT' else ('mkt-rf' if 'mkt-rf' in factor_data.columns else 'mkt') for f in factors]
                            available_factors = [f for f in desired_factors if f in factor_data.columns]
                            if available_factors:
                                records.extend(run_rolling_regression(symbol, returns_aligned, factor_data[available_factors], "OLS", reg_name))
                    except Exception as e:
                        logger.warning(f"Error in regression for {symbol} ({reg_name}): {str(e)}")
                        continue
        else:
            for reg_name, factors, region, portfolio_base in regression_sets[regression_category]:
                try:
                    if portfolio_base:
                        for factor in factors:
                            factor_data = load_century_factors([factor], portfolio_base, factor, region=region)
                            if factor_data.empty:
                                logger.warning(f"No factors for {symbol} ({reg_name}, factor: {factor})")
                                continue
                            factor_data, returns_aligned = factor_data.align(returns, join="inner", axis=0)
                            if returns_aligned.empty:
                                logger.warning(f"No overlapping dates for {symbol} ({reg_name}, factor: {factor})")
                                continue
                            records.extend(run_rolling_regression(symbol, returns_aligned, factor_data, "OLS", reg_name))
                    else:
                        if reg_name.startswith("Commodity"):
                            factor_data = load_commodity_factors()
                        elif reg_name.startswith("FI"):
                            factor_data = pd.concat([
                                load_db_factors([f for f in factors if f not in ['TERM', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']], region),
                                load_fixed_income_factors([f for f in factors if f in ['TERM', 'TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']])
                            ], axis=1).dropna()
                        else:
                            factor_data = load_db_factors(factors, region)
                        if factor_data.empty:
                            logger.warning(f"No factors for {symbol} ({reg_name})")
                            continue
                        if 'mkt' in factor_data.columns and 'rf' in factor_data.columns:
                            factor_data['mkt-rf'] = factor_data['mkt'] - factor_data['rf']
                        factor_data, returns_aligned = factor_data.align(returns, join="inner", axis=0)
                        if returns_aligned.empty:
                            logger.warning(f"No overlapping dates for {symbol} ({reg_name})")
                            continue
                        desired_factors = [f if f != 'MKT' else ('mkt-rf' if 'mkt-rf' in factor_data.columns else 'mkt') for f in factors]
                        available_factors = [f for f in desired_factors if f in factor_data.columns]
                        if available_factors:
                            records.extend(run_rolling_regression(symbol, returns_aligned, factor_data[available_factors], "OLS", reg_name))
                except Exception as e:
                    logger.warning(f"Error in regression for {symbol} ({reg_name}): {str(e)}")
                    continue

    except Exception as e:
        logger.warning(f"Error processing fund {symbol}: {str(e)}")
        return records
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    from concurrent.futures import ThreadPoolExecutor
    records = []
    errors = 0
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_fund = {
            executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"]
            for fund_data in fund_data_list
        }
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {str(e)}")
                errors += 1
    
    logger.info(f"Region {region} generated {len(records)} total records with {errors} errors")
    log_summary(f"Region {region}: {len(fund_data_list)} funds, {len(records)} records, {errors} errors")
    if not DRY_RUN:
        insert_batch(records)
    
    return records, errors

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    log_summary("Pipeline started")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        log_summary(f"Error: Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    log_summary(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds")
        log_summary(f"Sampled {len(fund_ids)} funds")
    
    summary = {"total_funds": len(fund_ids), "regions": {}, "errors": 0}
    for region in sorted(set(regions) - {'Unknown'} | {'USA'}):
        region_fund_ids = fund_meta[fund_meta["Region"] == region]["SymbolCUSIP"].tolist()
        if not region_fund_ids:
            logger.warning(f"No SymbolCUSIP found for region {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        if SAMPLE_DRY_RUN:
            region_fund_ids = random.sample(region_fund_ids, min(SAMPLE_SIZE, len(region_fund_ids)))
            logger.info(f"Sampled {len(region_fund_ids)} SymbolCUSIP for {region}")
        
        try:
            returns = load_fund_returns(region_fund_ids)
        except Exception as e:
            logger.error(f"Failed to load returns for {region} with SymbolCUSIP {region_fund_ids[:5]}...: {e}")
            log_summary(f"Error: Failed to load returns for {region}: {e}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 1}
            summary["errors"] += 1
            continue
        
        region_funds = [
            {
                "SymbolCUSIP": row["SymbolCUSIP"],
                "Global_Category_Name": row["Global_Category_Name"],
                "CWA_Broad_Category_Name": row.get("CWA_Broad_Category_Name", None),
                "returns": returns[row["SymbolCUSIP"]].to_dict()
            }
            for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
        ]
        if not region_funds:
            logger.warning(f"No valid returns data for {region} with SymbolCUSIP {region_fund_ids[:5]}...")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        
        records, errors = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records), "errors": errors}
        summary["errors"] += errors
    
    logger.info(f"Pipeline summary: {summary}")
    log_summary(f"Pipeline completed: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

2025-04-16 16:44:16,557 - INFO - Starting main pipeline
2025-04-16 16:44:16,696 - INFO - Loaded metadata for 5584 funds
2025-04-16 16:44:17,031 - INFO - load_fund_metadata took 0.47 seconds
2025-04-16 16:44:17,032 - INFO - Total funds: 5584, Regions: ['Global' 'USA' 'International']
2025-04-16 16:44:17,033 - INFO - Sampled 50 funds
2025-04-16 16:44:17,035 - INFO - Sampled 50 SymbolCUSIP for Global
2025-04-16 16:44:17,083 - INFO - Loaded returns for 50 funds
2025-04-16 16:44:17,089 - INFO - load_fund_returns took 0.05 seconds
2025-04-16 16:44:17,252 - INFO - Processing 50 funds in Global
2025-04-16 16:44:18,971 - INFO - Generated 0 regression records for DFAI
2025-04-16 16:44:18,984 - INFO - Generated 0 regression records for APDJX
2025-04-16 16:44:19,002 - INFO - Generated 0 regression records for GXUS
2025-04-16 16:44:19,005 - INFO - Generated 0 regression records for DFIC
2025-04-16 16:44:19,142 - INFO - load_db_factors took 0.01 seconds
2025-04-16 16:44:19,173 - INFO - Generated 0 r

In [14]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import pickle
import sys
from contextlib import contextmanager
from functools import wraps


# Section 1: Configuration and Setup
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 5600,
    "batch_insert_size": 10000,
    "max_workers_cpu": min(13, os.cpu_count()),  # 14 workers for CPU-bound tasks
    "max_workers_io": 30,  # 30 workers for I/O-bound tasks
    "use_threads": False,
    "unmapped_category_behavior": "allocation"  # Options: "allocation", "all"
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
try:
    engine = create_engine(CONNECTION_STRING)
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS_CPU = CONFIG["max_workers_cpu"]
MAX_WORKERS_IO = CONFIG["max_workers_io"]
USE_THREADS = CONFIG["use_threads"]
UNMAPPED_CATEGORY_BEHAVIOR = CONFIG["unmapped_category_behavior"]

# Logging setup
SUMMARY_LOG = "factor_attribution_summary.log"
logging.basicConfig(
    level=logging.INFO,  # Use INFO to reduce verbosity
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()

def log_summary(message):
    with open(SUMMARY_LOG, 'a') as f:
        f.write(f"{datetime.now()}: {message}\n")

# Section 2: Helper Functions

def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Natural Resources Sector Equity": ("USA", "Natural Resources Sector Equity"),
        "Infrastructure Sector Equity": ("USA", "Infrastructure Sector Equity"),
        "Trading Tools": ("USA", "Trading Tools"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("USA", "Unknown"))

@contextmanager
def database_transaction():
    connection = engine.connect()
    transaction = connection.begin()
    try:
        yield connection
        transaction.commit()
    except Exception as e:
        transaction.rollback()
        logger.error(f"Transaction failed: {e}")
        raise
    finally:
        connection.close()

def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading Functions
@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    try:
        df = pd.read_sql(query, engine)
        logger.info(f"Loaded metadata for {len(df)} funds")
        df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
        if df["CWA_Broad_Category_Name"].isnull().all():
            logger.warning("CWA_Broad_Category_Name missing; Equity regressions 5-7 will be skipped")
        return df.dropna(subset=["Region", "FactorProfile"])
    except Exception as e:
        logger.error(f"Error loading metadata: {e}")
        raise

# Global cache for factor data
FACTOR_CACHE = {}

@timer
def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    cache_key = (tuple(factor_list), region, table, asset_class)
    if cache_key in FACTOR_CACHE:
        logger.debug(f"Using cached factors for {cache_key}")
        return FACTOR_CACHE[cache_key]
    
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value, region
        FROM {table}
        WHERE factor IN ({factor_in_clause})
        AND date >= '2015-01-01'
    """
    if region and not any(f.startswith('TSM-') or f == 'RF' for f in factor_list):
        query += f" AND region = '{region}'"
    elif 'RF' in factor_list:
        query += " AND region = 'USA'"
    if asset_class:
        query += f" AND asset_class = '{asset_class}'"
    
    logger.debug(f"Executing query: {query}")
    df = pd.read_sql_query(query, engine, parse_dates=['Date'])
    if df.empty:
        logger.warning(f"No data for factors {factor_list} in {table} (region: {region})")
        return pd.DataFrame()
    
    df['Factor'] = df['Factor'].str.strip()
    df['region'] = df['region'].str.strip()
    duplicates = df.duplicated(subset=['Date', 'Factor', 'region']).sum()
    if duplicates > 0:
        logger.warning(f"Found {duplicates} duplicates in {table} for {factor_list}; aggregating mean")
        df = df.groupby(['Date', 'Factor', 'region'])['Value'].mean().reset_index()
    
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df = pivoted_df[~pivoted_df.index.duplicated(keep='first')]
    pivoted_df = pivoted_df.asfreq('ME')
    if pivoted_df.empty or not any(f in pivoted_df.columns for f in factor_list):
        logger.warning(f"No valid factors in {table} (region: {region}): {factor_list}")
        return pd.DataFrame()
    
    FACTOR_CACHE[cache_key] = pivoted_df
    return pivoted_df

@timer
def load_fund_returns(fund_ids=None):
    try:
        query = """
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE Metric = '1 Month Return' AND ReturnValue IS NOT NULL AND Date IS NOT NULL
        """
        if fund_ids:
            fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
            if not fund_ids:
                logger.warning("No valid SymbolCUSIP provided")
                return pd.DataFrame()
            fund_ids_str = ",".join([f"'{fid}'" for fid in fund_ids])
            query += f" AND SymbolCUSIP IN ({fund_ids_str})"
        
        logger.debug(f"Executing query: {query}")
        chunks = []
        for chunk in pd.read_sql_query(query, engine, parse_dates=["Date"], chunksize=CHUNK_SIZE):
            logger.debug(f"Loaded chunk of {len(chunk)} rows")
            chunks.append(chunk)
        df = pd.concat(chunks) if chunks else pd.DataFrame()
        if df.empty:
            logger.warning(f"No returns data loaded for SymbolCUSIP: {fund_ids}")
            return pd.DataFrame()
        
        logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
        pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
        pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
        pivoted.index = pivoted.index.drop_duplicates()
        pivoted = pivoted.asfreq('ME')
        for col in pivoted.columns:
            if not pd.api.types.is_numeric_dtype(pivoted[col]):
                logger.warning(f"Non-numeric returns for {col}; converting")
                pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
        return pivoted
    except Exception as e:
        logger.error(f"Error loading returns: {e}")
        raise

@timer
def load_century_factors(factor_list, portfolio_base, factor, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio = f"{portfolio_base} {factor}"
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
    """
    params = [portfolio]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    if region != "Global":
        query += " AND region = ?"
        params.append(region)
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logger.warning(f"No data for factors {factor_list} in aqr_century_factors (portfolio: {portfolio}, region: {region})")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

@timer
def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    if df.empty:
        logger.warning("No commodity factors loaded")
        return pd.DataFrame()
    pivoted_df = df.set_index("Date")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

@timer
def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    logger.debug(f"Executing query: {query}")
    df = pd.read_sql_query(query, engine, parse_dates=["Date"])
    if df.empty:
        logger.warning(f"No fixed income factors for {factor_list}")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

# Section 4: Regression Functions
def run_rolling_regression(symbol, returns, factor_data, regression_type, regression_name, window_months):
    records = []
    months_needed = window_months
    
    # Ensure indices are datetime and sorted
    returns = returns.sort_index()
    factor_data = factor_data.sort_index()
    
    # Find overlapping dates
    available_dates = returns.index.intersection(factor_data.index)
    if len(available_dates) < months_needed:
        logger.warning(f"Fund {symbol} ({regression_name}, window={window_months}m) skipped: {len(available_dates)} months available, need {months_needed}")
        return records
    
    # Iterate over possible window end dates, stepping one month
    for end_date in available_dates[months_needed-1::1]:
        start_date = end_date - pd.offsets.MonthEnd(months_needed)
        if start_date < available_dates[0]:
            continue
        
        # Slice data for the window
        window_returns = returns.loc[start_date:end_date]
        window_factors = factor_data.loc[start_date:end_date]
        
        # Reindex to ensure alignment
        window_returns = window_returns.reindex(window_factors.index)
        
        if len(window_returns.dropna()) < months_needed or len(window_factors.dropna()) < months_needed:
            logger.debug(f"Skipping regression for {symbol} ({regression_name}, window={window_months}m, end_date={end_date}): insufficient data")
            continue
        
        try:
            if regression_type == "OLS":
                X = sm.add_constant(window_factors)
                model = sm.OLS(window_returns, X, missing='drop').fit()
                coefficients = model.params.to_dict()
                tvalues = model.tvalues.to_dict()
                pvalues = model.pvalues.to_dict()
                r_squared = model.rsquared
                record = {
                    "SymbolCUSIP": symbol,
                    "RegressionName": regression_name,
                    "Window": f"{window_months}m",
                    "EndDate": end_date,
                    "R_Squared": r_squared
                }
                for factor in coefficients:
                    record[f"{factor}_beta"] = coefficients[factor]
                    record[f"{factor}_tvalue"] = tvalues.get(factor, None)
                    record[f"{factor}_pvalue"] = pvalues.get(factor, None)
                records.append(record)
        except Exception as e:
            logger.warning(f"Regression failed for {symbol} ({regression_name}, window={window_months}m, end_date={end_date}): {str(e)}")
            continue
    
    if not records:
        logger.info(f"No regressions completed for {symbol} ({regression_name}, window={window_months}m)")
    return records

# Section 5: Processing Functions
def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    if returns.empty:
        logger.warning(f"No valid returns for {symbol}")
        return records

    regression_sets = {
        "Equity_USA": [
            ("Equity_USA_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA", None),
            ("Equity_USA_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "US Stock Selection"),
            ("Equity_USA_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "All Macro"),
            ("Equity_USA_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "Equity indices"),
            ("Equity_USA_5", ['MKT', 'BAB', 'TSM-FI', 'TSM-FX', 'excess_return_eqwt'], "USA", None),
            ("Equity_USA_6", ['MKT', 'SMB', 'BAB', 'TSM-Com'], "USA", None),
            ("Equity_USA_7", ['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB', 'TSM-Com', 'TSM-FI', 'TSM-FX'], "USA", None)
        ],
        "Equity_Intl": [
            ("Equity_Intl_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl", None),
            ("Equity_Intl_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Intl Stock Selection"),
            ("Equity_Intl_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "All Macro"),
            ("Equity_Intl_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Equity indices")
        ],
        "Equity_Global": [
            ("Equity_Global_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global", None),
            ("Equity_Global_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
            ("Equity_Global_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
            ("Equity_Global_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
        ],
        "Fixed_Income": [
            ("FI_1", ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'TSM-FI', 'TSM-FX'], "Global", None),
            ("FI_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Fixed income"),
            ("FI_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
            ("FI_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
        ],
        "Allocation": [
            ("Allocation_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI'], "Global", None),
            ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", None),
            ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
            ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
        ],
        "Alternative": [
            ("Alternative_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'TSM-Com', 'TSM-FX'], "Global", None),
            ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", None),
            ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
            ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
        ],
        "Commodity": [
            ("Commodity_1", ['excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt', 'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short', 'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango'], "Global", None)
        ]
    }

    category_to_regressions = {
        "Energy Sector Equity": "Equity_USA",
        "Equity Miscellaneous": "Equity_USA",
        "Financials Sector Equity": "Equity_USA",
        "Healthcare Sector Equity": "Equity_USA",
        "Consumer Goods & Services Sector Equity": "Equity_USA",
        "Communications Sector Equity": "Equity_USA",
        "Industrials Sector Equity": "Equity_USA",
        "Other Sector Equity": "Equity_USA",
        "Real Estate Sector Equity": "Equity_USA",
        "Precious Metals Sector Equity": "Equity_USA",
        "Technology Sector Equity": "Equity_USA",
        "Utilities Sector Equity": "Equity_USA",
        "US Equity Large Cap Blend": "Equity_USA",
        "US Equity Large Cap Growth": "Equity_USA",
        "US Equity Large Cap Value": "Equity_USA",
        "US Equity Mid Cap": "Equity_USA",
        "US Equity Small Cap": "Equity_USA",
        "Options Trading": "Equity_USA",
        "Natural Resources Sector Equity": "Equity_USA",
        "Infrastructure Sector Equity": "Equity_USA",
        "Asia ex-Japan Equity": "Equity_Intl",
        "Australia & New Zealand Equity": "Equity_Intl",
        "Canadian Equity Large Cap": "Equity_Intl",
        "Europe Equity Large Cap": "Equity_Intl",
        "Europe Equity Mid/Small Cap": "Equity_Intl",
        "Greater China Equity": "Equity_Intl",
        "India Equity": "Equity_Intl",
        "Mexico Equity": "Equity_Intl",
        "Japan Equity": "Equity_Intl",
        "Korea Equity": "Equity_Intl",
        "Latin America Equity": "Equity_Intl",
        "UK Equity Large Cap": "Equity_Intl",
        "Thailand Equity": "Equity_Intl",
        "Global Emerging Markets Equity": "Equity_Global",
        "Global Equity Large Cap": "Equity_Global",
        "Global Equity Mid/Small Cap": "Equity_Global",
        "Global Fixed Income": "Fixed_Income",
        "Convertibles": "Fixed_Income",
        "Emerging Markets Fixed Income": "Fixed_Income",
        "Fixed Income Miscellaneous": "Fixed_Income",
        "US Fixed Income": "Fixed_Income",
        "US Municipal Fixed Income": "Fixed_Income",
        "Aggressive Allocation": "Allocation",
        "Allocation Miscellaneous": "Allocation",
        "Cautious Allocation": "Allocation",
        "Flexible Allocation": "Allocation",
        "Moderate Allocation": "Allocation",
        "Alternative Miscellaneous": "Alternative",
        "Long/Short Equity": "Alternative",
        "Market Neutral": "Alternative",
        "Multialternative": "Alternative",
        "Commodities Broad Basket": "Commodity",
        "Commodities Specified": "Commodity"
    }

    regression_category = category_to_regressions.get(category, "Allocation")
    if regression_category == "Allocation" and category not in category_to_regressions:
        logger.warning(f"Unmapped category {category} for {symbol}; using Allocation")

    windows = [12, 24, 36, 48, 60]
    
    for reg_name, factors, region, portfolio_base in regression_sets[regression_category]:
        try:
            if any(f in factors for f in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']):
                factor_data = load_century_factors(factors, portfolio_base or "All Stock Selection", None, region=region)
            elif reg_name.startswith("Commodity"):
                factor_data = load_commodity_factors()
            elif reg_name.startswith("FI"):
                fi_factors = [f for f in factors if f in ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']]
                other_factors = [f for f in factors if f not in fi_factors]
                factor_data = pd.concat([
                    load_db_factors(other_factors, region) if other_factors else pd.DataFrame(index=returns.index),
                    load_fixed_income_factors(fi_factors) if fi_factors else pd.DataFrame(index=returns.index)
                ], axis=1).dropna()
            else:
                factor_data = load_db_factors(factors, region)
            if factor_data.empty:
                logger.warning(f"No factors for {symbol} ({reg_name})")
                continue
            factor_data = factor_data[factor_data.index >= '2015-01-01']
            for window in windows:
                records.extend(run_rolling_regression(symbol, returns, factor_data, "OLS", reg_name, window))
        except Exception as e:
            logger.warning(f"Error in regression for {symbol} ({reg_name}): {str(e)}")
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    from concurrent.futures import ThreadPoolExecutor
    records = []
    errors = 0
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_fund = {
            executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"]
            for fund_data in fund_data_list
        }
        for future in tqdm(future_to_fund, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(future.result())
            except Exception as e:
                logger.error(f"Error processing {future_to_fund[future]}: {str(e)}")
                errors += 1
    
    logger.info(f"Region {region} generated {len(records)} total records with {errors} errors")
    log_summary(f"Region {region}: {len(fund_data_list)} funds, {len(records)} records, {errors} errors")
    if not DRY_RUN:
        insert_batch(records)
    
    return records, errors

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    log_summary("Pipeline started")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        log_summary(f"Error: Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    log_summary(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds")
        log_summary(f"Sampled {len(fund_ids)} funds")
    
    summary = {"total_funds": len(fund_ids), "regions": {}, "errors": 0}
    for region in sorted(set(regions) - {'Unknown'} | {'USA'}):
        region_fund_ids = fund_meta[fund_meta["Region"] == region]["SymbolCUSIP"].tolist()
        if not region_fund_ids:
            logger.warning(f"No SymbolCUSIP found for region {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        if SAMPLE_DRY_RUN:
            region_fund_ids = random.sample(region_fund_ids, min(SAMPLE_SIZE, len(region_fund_ids)))
            logger.info(f"Sampled {len(region_fund_ids)} SymbolCUSIP for {region}")
        
        try:
            returns = load_fund_returns(region_fund_ids)
        except Exception as e:
            logger.error(f"Failed to load returns for {region} with SymbolCUSIP {region_fund_ids[:5]}...: {e}")
            log_summary(f"Error: Failed to load returns for {region}: {e}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 1}
            summary["errors"] += 1
            continue
        
        region_funds = [
            {
                "SymbolCUSIP": row["SymbolCUSIP"],
                "Global_Category_Name": row["Global_Category_Name"],
                "CWA_Broad_Category_Name": row.get("CWA_Broad_Category_Name", None),
                "returns": returns[row["SymbolCUSIP"]].to_dict()
            }
            for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
        ]
        if not region_funds:
            logger.warning(f"No valid returns data for {region} with SymbolCUSIP {region_fund_ids[:5]}...")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        
        records, errors = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records), "errors": errors}
        summary["errors"] += errors
    
    logger.info(f"Pipeline summary: {summary}")
    log_summary(f"Pipeline completed: {summary}")
    return summary

# Section 7: Database Output
def insert_batch(records):
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

2025-04-16 17:50:13,714 - INFO - Starting main pipeline
2025-04-16 17:50:13,906 - INFO - Loaded metadata for 5584 funds
2025-04-16 17:50:14,293 - INFO - load_fund_metadata took 0.58 seconds
2025-04-16 17:50:14,295 - INFO - Total funds: 5584, Regions: ['Global' 'USA' 'International']
2025-04-16 17:50:14,296 - INFO - Sampled 10 funds
2025-04-16 17:50:14,298 - INFO - Sampled 10 SymbolCUSIP for Global
2025-04-16 17:50:14,319 - INFO - Loaded returns for 10 funds
2025-04-16 17:50:14,323 - INFO - load_fund_returns took 0.02 seconds
2025-04-16 17:50:14,471 - INFO - Processing 10 funds in Global
2025-04-16 17:50:14,879 - INFO - Generated 0 regression records for SCIEX
2025-04-16 17:50:14,886 - INFO - Generated 0 regression records for VIDI
2025-04-16 17:50:14,943 - INFO - Generated 0 regression records for PIVYX
2025-04-16 17:50:14,954 - INFO - Generated 0 regression records for CIHIX
2025-04-16 17:50:14,965 - INFO - Generated 0 regression records for DWCR
2025-04-16 17:50:15,020 - INFO - Gener

In [17]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
import statsmodels.api as sm
import time
import sys
from contextlib import contextmanager
from functools import wraps

# Section 1: Configuration and Logging
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 5600,
    "batch_insert_size": 10000,
    "max_workers_cpu": 16,  # Optimized for 16-core i9 Ultra
    "batch_size": 100,  # Process 100 funds per batch
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(CONNECTION_STRING, pool_size=20, max_overflow=10)

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS_CPU = CONFIG["max_workers_cpu"]
BATCH_SIZE = CONFIG["batch_size"]

# Logging setup
SUMMARY_LOG = "factor_attribution_summary.log"
logging.basicConfig(
    level=logging.WARNING,  # WARNING for production, DEBUG for targeted sections
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()

def log_summary(message):
    with open(SUMMARY_LOG, 'a') as f:
        f.write(f"{datetime.now()}: {message}\n")

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Natural Resources Sector Equity": ("USA", "Natural Resources Sector Equity"),
        "Infrastructure Sector Equity": ("USA", "Infrastructure Sector Equity"),
        "Trading Tools": ("USA", "Trading Tools"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("USA", "Unknown"))

@contextmanager
def database_transaction():
    with engine.connect() as connection:
        transaction = connection.begin()
        try:
            yield connection
            transaction.commit()
        except Exception as e:
            transaction.rollback()
            logger.error(f"Transaction failed: {e}")
            raise

def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading
FACTOR_CACHE = {}

@timer
def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    cache_key = (tuple(factor_list), region, table, asset_class)
    if cache_key in FACTOR_CACHE:
        logger.debug(f"Using cached factors for {cache_key}")
        return FACTOR_CACHE[cache_key]
    
    factor_dfs = []
    for factor in factor_list:
        query = f"""
            SELECT Date, Factor, Value, region
            FROM {table}
            WHERE Factor = '{factor}'
            AND Date >= '2015-01-01'
            AND UPPER(region) = UPPER('{region}')
        """
        if factor == 'RF':
            query = query.replace(f"region = '{region}'", "region = 'USA'")
        if asset_class:
            query += f" AND asset_class = '{asset_class}'"
        logger.debug(f"Executing query: {query}")
        df = pd.read_sql_query(query, engine, parse_dates=['Date'])
        if df.empty:
            logger.warning(f"No data for factor {factor} in {region}")
            continue
        if df[['Date', 'Factor']].duplicated().any():
            logger.error(f"Non-unique Date-Factor pairs for {factor}: {df[df[['Date', 'Factor']].duplicated()]}")
            return pd.DataFrame()
        factor_dfs.append(df)
    
    if not factor_dfs:
        logger.warning(f"No data for factors {factor_list} in {table} (region: {region})")
        return pd.DataFrame()
    
    df = pd.concat(factor_dfs)
    logger.debug(f"Raw factor data shape: {df.shape}, sample:\n{df.head()}")
    
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df = pivoted_df[~pivoted_df.index.duplicated(keep='first')]
    pivoted_df = pivoted_df.asfreq('ME')
    FACTOR_CACHE[cache_key] = pivoted_df
    return pivoted_df

@timer
def load_century_factors(factor_list, portfolio_base, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value, portfolio
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
        AND date >= '2015-01-01'
    """
    params = [portfolio_base]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    if region != "Global":
        query += " AND region = ?"
        params.append(region)
    logger.debug(f"Executing query: {query}, params: {params}")
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logger.warning(f"No data for factors {factor_list} in aqr_century_factors (portfolio: {portfolio_base}, region: {region})")
        return pd.DataFrame()
    if df[['Date', 'Factor']].duplicated().any():
        logger.error(f"Non-unique Date-Factor pairs in aqr_century_factors: {df[df[['Date', 'Factor']].duplicated()]}")
        return pd.DataFrame()
    logger.debug(f"Raw century factors shape: {df.shape}, portfolio: {df['portfolio'].unique()}, sample:\n{df.head()}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df = pivoted_df[~pivoted_df.index.duplicated(keep='first')]
    pivoted_df = pivoted_df.asfreq('ME')
    if pivoted_df.empty or pivoted_df.shape[1] == 0:
        logger.warning(f"Empty pivoted data for factors {factor_list} (portfolio: {portfolio_base})")
        return pd.DataFrame()
    return pivoted_df

@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    df = pd.read_sql(query, engine)
    logger.info(f"Loaded metadata for {len(df)} funds")
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    if df["CWA_Broad_Category_Name"].isnull().all():
        logger.warning("CWA_Broad_Category_Name missing; Equity regressions 5-7 will be skipped")
    return df.dropna(subset=["Region", "FactorProfile"])

@timer
def load_fund_returns(fund_ids=None):
    query = """
    SELECT SymbolCUSIP, Date, ReturnValue
    FROM Fund_Returns_Timeseries
    WHERE Metric = '1 Month Return' AND ReturnValue IS NOT NULL AND Date IS NOT NULL
    """
    if fund_ids:
        fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
        if not fund_ids:
            logger.warning("No valid SymbolCUSIP provided")
            return pd.DataFrame()
        fund_ids_str = ",".join([f"'{fid}'" for fid in fund_ids])
        query += f" AND SymbolCUSIP IN ({fund_ids_str})"
    
    logger.debug(f"Executing query: {query}")
    chunks = []
    for chunk in pd.read_sql_query(query, engine, parse_dates=["Date"], chunksize=CHUNK_SIZE):
        logger.debug(f"Loaded chunk of {len(chunk)} rows")
        chunks.append(chunk)
    df = pd.concat(chunks) if chunks else pd.DataFrame()
    if df.empty:
        logger.warning(f"No returns data loaded for SymbolCUSIP: {fund_ids}")
        return pd.DataFrame()
    
    logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
    pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
    pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
    pivoted.index = pivoted.index.drop_duplicates()
    pivoted = pivoted.asfreq('ME')
    for col in pivoted.columns:
        if not pd.api.types.is_numeric_dtype(pivoted[col]):
            logger.warning(f"Non-numeric returns for {col}; converting")
            pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
    return pivoted

@timer
def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    if df.empty:
        logger.warning("No commodity factors loaded")
        return pd.DataFrame()
    pivoted_df = df.set_index("Date")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

@timer
def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    logger.debug(f"Executing query: {query}")
    df = pd.read_sql_query(query, engine, parse_dates=["Date"])
    if df.empty:
        logger.warning(f"No fixed income factors for {factor_list}")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

# Section 4: Regression Processing
def run_rolling_regression(symbol, returns, factor_data, regression_type, regression_name, window_months):
    records = []
    months_needed = window_months
    
    returns = returns.sort_index()
    factor_data = factor_data.sort_index()
    
    available_dates = returns.index.intersection(factor_data.index)
    if len(available_dates) < months_needed:
        logger.warning(f"Fund {symbol} ({regression_name}, window={window_months}m) skipped: {len(available_dates)} months available")
        return records
    
    for end_date in available_dates[months_needed-1::1]:
        start_date = end_date - pd.offsets.MonthEnd(months_needed)
        if start_date < available_dates[0]:
            continue
        
        window_returns = returns.loc[start_date:end_date]
        window_factors = factor_data.loc[start_date:end_date]
        
        window_returns = window_returns.reindex(window_factors.index)
        
        if len(window_returns.dropna()) < months_needed * 0.8 or len(window_factors.dropna()) < months_needed * 0.8:
            logger.debug(f"Skipping regression for {symbol} ({regression_name}, window={window_months}m, end_date={end_date}): insufficient data")
            continue
        
        try:
            if regression_type == "OLS":
                X = sm.add_constant(window_factors)
                model = sm.OLS(window_returns, X, missing='drop').fit()
                coefficients = model.params.to_dict()
                tvalues = model.tvalues.to_dict()
                pvalues = model.pvalues.to_dict()
                r_squared = model.rsquared
                record = {
                    "SymbolCUSIP": symbol,
                    "RegressionName": regression_name,
                    "Window": f"{window_months}m",
                    "EndDate": end_date,
                    "R_Squared": r_squared
                }
                for factor in coefficients:
                    record[f"{factor}_beta"] = coefficients[factor]
                    record[f"{factor}_tvalue"] = tvalues.get(factor, None)
                    record[f"{factor}_pvalue"] = pvalues.get(factor, None)
                records.append(record)
        except Exception as e:
            logger.warning(f"Regression failed for {symbol} ({regression_name}, window={window_months}m, end_date={end_date}): {str(e)}")
            continue
    
    if not records:
        logger.info(f"No regressions completed for {symbol} ({regression_name}, window={window_months}m)")
    return records

# Section 5: Fund Processing
category_to_regressions = {
    "Energy Sector Equity": "Equity_USA",
    "Equity Miscellaneous": "Equity_USA",
    "Financials Sector Equity": "Equity_USA",
    "Healthcare Sector Equity": "Equity_USA",
    "Consumer Goods & Services Sector Equity": "Equity_USA",
    "Communications Sector Equity": "Equity_USA",
    "Industrials Sector Equity": "Equity_USA",
    "Other Sector Equity": "Equity_USA",
    "Real Estate Sector Equity": "Equity_USA",
    "Precious Metals Sector Equity": "Equity_USA",
    "Technology Sector Equity": "Equity_USA",
    "Utilities Sector Equity": "Equity_USA",
    "US Equity Large Cap Blend": "Equity_USA",
    "US Equity Large Cap Growth": "Equity_USA",
    "US Equity Large Cap Value": "Equity_USA",
    "US Equity Mid Cap": "Equity_USA",
    "US Equity Small Cap": "Equity_USA",
    "Options Trading": "Equity_USA",
    "Natural Resources Sector Equity": "Equity_USA",
    "Infrastructure Sector Equity": "Equity_USA",
    "Asia ex-Japan Equity": "Equity_Intl",
    "Australia & New Zealand Equity": "Equity_Intl",
    "Canadian Equity Large Cap": "Equity_Intl",
    "Europe Equity Large Cap": "Equity_Intl",
    "Europe Equity Mid/Small Cap": "Equity_Intl",
    "Greater China Equity": "Equity_Intl",
    "India Equity": "Equity_Intl",
    "Mexico Equity": "Equity_Intl",
    "Japan Equity": "Equity_Intl",
    "Korea Equity": "Equity_Intl",
    "Latin America Equity": "Equity_Intl",
    "UK Equity Large Cap": "Equity_Intl",
    "Thailand Equity": "Equity_Intl",
    "Global Emerging Markets Equity": "Equity_Global",
    "Global Equity Large Cap": "Equity_Global",
    "Global Equity Mid/Small Cap": "Equity_Global",
    "Global Fixed Income": "Fixed_Income",
    "Convertibles": "Fixed_Income",
    "Emerging Markets Fixed Income": "Fixed_Income",
    "Fixed Income Miscellaneous": "Fixed_Income",
    "US Fixed Income": "Fixed_Income",
    "US Municipal Fixed Income": "Fixed_Income",
    "Aggressive Allocation": "Allocation",
    "Allocation Miscellaneous": "Allocation",
    "Cautious Allocation": "Allocation",
    "Flexible Allocation": "Allocation",
    "Moderate Allocation": "Allocation",
    "Alternative Miscellaneous": "Alternative",
    "Long/Short Equity": "Alternative",
    "Market Neutral": "Alternative",
    "Multialternative": "Alternative",
    "Commodities Broad Basket": "Commodity",
    "Commodities Specified": "Commodity"
}

regression_sets = {
    "Equity_USA": [
        ("Equity_USA_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA", None),
        ("Equity_USA_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "US Stock Selection"),
        ("Equity_USA_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "All Macro"),
        ("Equity_USA_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "Equity indices"),
        ("Equity_USA_5", ['MKT', 'BAB', 'TSM-FI', 'TSM-FX', 'excess_return_eqwt'], "USA", None),
        ("Equity_USA_6", ['MKT', 'SMB', 'BAB', 'TSM-Com'], "USA", None),
        ("Equity_USA_7", ['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB', 'TSM-Com', 'TSM-FI', 'TSM-FX'], "USA", None)
    ],
    "Equity_Intl": [
        ("Equity_Intl_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl", None),
        ("Equity_Intl_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Intl Stock Selection"),
        ("Equity_Intl_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "All Macro"),
        ("Equity_Intl_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Equity indices")
    ],
    "Equity_Global": [
        ("Equity_Global_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global", None),
        ("Equity_Global_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Equity_Global_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Equity_Global_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Fixed_Income": [
        ("FI_1", ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'TSM-FI', 'TSM-FX'], "Global", None),
        ("FI_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Fixed income"),
        ("FI_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("FI_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Allocation": [
        ("Allocation_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI'], "Global", None),
        ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Alternative": [
        ("Alternative_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'TSM-Com', 'TSM-FX'], "Global", None),
        ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Commodity": [
        ("Commodity_1", ['excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt', 'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short', 'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango'], "Global", None)
    ]
}

def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    if returns.empty:
        logger.warning(f"No valid returns for {symbol}")
        return records
    
    regression_category = category_to_regressions.get(category, "Allocation")
    if regression_category == "Allocation" and category not in category_to_regressions:
        logger.warning(f"Unmapped category {category} for {symbol}; using Allocation")
    
    factor_data_cache = {}
    for reg_name, factors, region, portfolio_base in regression_sets[regression_category]:
        try:
            logger.debug(f"Loading factors for {symbol} ({reg_name}): factors {factors}, region {region}, portfolio_base {portfolio_base}")
            if any(f in factors for f in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']):
                if not portfolio_base:
                    logger.warning(f"Skipping {reg_name} for {symbol}: portfolio_base missing")
                    continue
                factor_data = load_century_factors(factors, portfolio_base, region=region)
            elif reg_name.startswith("Commodity"):
                factor_data = load_commodity_factors()
            elif reg_name.startswith("FI"):
                fi_factors = [f for f in factors if f in ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']]
                other_factors = [f for f in factors if f not in fi_factors]
                factor_data = pd.concat([
                    load_db_factors(other_factors, region) if other_factors else pd.DataFrame(index=returns.index),
                    load_fixed_income_factors(fi_factors) if fi_factors else pd.DataFrame(index=returns.index)
                ], axis=1).dropna()
            else:
                factor_data = load_db_factors(factors, region)
            if not isinstance(factor_data, pd.DataFrame):
                logger.error(f"Invalid factor data type for {reg_name} ({symbol}): {type(factor_data)}")
                continue
            if factor_data.empty or factor_data.shape[1] == 0:
                logger.warning(f"Empty factor data for {reg_name} ({symbol}): factors {factors}")
                continue
            logger.debug(f"Factor data for {reg_name} ({symbol}): shape {factor_data.shape}, columns {factor_data.columns.tolist()}")
            factor_data = factor_data[factor_data.index >= '2015-01-01']
            factor_data_cache[(reg_name, region)] = factor_data
        except Exception as e:
            logger.warning(f"Error loading factors for {symbol} ({reg_name}): {str(e)}")
            continue
    
    for reg_name, factors, region, portfolio_base in regression_sets[regression_category]:
        factor_data = factor_data_cache.get((reg_name, region))
        if factor_data is None:
            continue
        for window in ROLLING_PERIODS:
            records.extend(run_rolling_regression(symbol, returns, factor_data, "OLS", reg_name, window))
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    records = []
    errors = 0
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    if SAMPLE_DRY_RUN:
        # Sequential processing for debugging
        for fund_data in tqdm(fund_data_list, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(process_fund(fund_data))
            except Exception as e:
                logger.error(f"Error processing {fund_data['SymbolCUSIP']}: {str(e)}")
                errors += 1
    else:
        # Parallel processing for production
        for i in range(0, len(fund_data_list), BATCH_SIZE):
            batch = fund_data_list[i:i + BATCH_SIZE]
            with ProcessPoolExecutor(max_workers=MAX_WORKERS_CPU) as executor:
                future_to_fund = {
                    executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"]
                    for fund_data in batch
                }
                for future in tqdm(future_to_fund, total=len(batch), desc=f"Processing {region} batch", file=sys.stdout):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        logger.error(f"Error processing {future_to_fund[future]}: {str(e)}")
                        errors += 1
    
    logger.info(f"Region {region} generated {len(records)} total records with {errors} errors")
    log_summary(f"Region {region}: {len(fund_data_list)} funds, {len(records)} records, {errors} errors")
    if not DRY_RUN:
        insert_batch(records)
    
    return records, errors

# Section 6: Database Output
def insert_batch(records):
    if DRY_RUN:
        logger.info(f"Dry run: Would insert {len(records)} records")
        return
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

# Section 7: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    log_summary("Pipeline started")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        log_summary(f"Error: Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    log_summary(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds")
        log_summary(f"Sampled {len(fund_ids)} funds")
    
    summary = {"total_funds": len(fund_ids), "regions": {}, "errors": 0}
    for region in sorted(set(regions) - {'Unknown'} | {'USA'}):
        region_fund_ids = fund_meta[fund_meta["Region"] == region]["SymbolCUSIP"].tolist()
        if not region_fund_ids:
            logger.warning(f"No SymbolCUSIP found for region {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        if SAMPLE_DRY_RUN:
            region_fund_ids = random.sample(region_fund_ids, min(SAMPLE_SIZE, len(region_fund_ids)))
            logger.info(f"Sampled {len(region_fund_ids)} SymbolCUSIP for {region}")
        
        try:
            returns = load_fund_returns(region_fund_ids)
        except Exception as e:
            logger.error(f"Failed to load returns for {region}: {e}")
            log_summary(f"Error: Failed to load returns for {region}: {e}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 1}
            summary["errors"] += 1
            continue
        
        region_funds = [
            {
                "SymbolCUSIP": row["SymbolCUSIP"],
                "Global_Category_Name": row["Global_Category_Name"],
                "CWA_Broad_Category_Name": row.get("CWA_Broad_Category_Name", None),
                "returns": returns[row["SymbolCUSIP"]].to_dict()
            }
            for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
        ]
        if not region_funds:
            logger.warning(f"No valid returns data for {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        
        records, errors = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records), "errors": errors}
        summary["errors"] += errors
    
    logger.info(f"Pipeline summary: {summary}")
    log_summary(f"Pipeline completed: {summary}")
    return summary

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

Processing Global: 100%|██████████| 10/10 [00:02<00:00,  3.96it/s]
Processing International: 100%|██████████| 10/10 [00:07<00:00,  1.27it/s]
Processing USA: 100%|██████████| 10/10 [00:14<00:00,  1.40s/it]


In [None]:
#chat gpt help

In [None]:
#this attempts to find the version inbetween above, and below when the code worked reasonably well except century factors

In [1]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
import statsmodels.api as sm
import time
import sys
from contextlib import contextmanager
from functools import wraps

# Section 1: Configuration and Logging
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 5600,
    "batch_insert_size": 10000,
    "max_workers_cpu": 16,  # Optimized for 16-core i9 Ultra
    "batch_size": 100,  # Process 100 funds per batch
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(CONNECTION_STRING, pool_size=20, max_overflow=10)

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS_CPU = CONFIG["max_workers_cpu"]
BATCH_SIZE = CONFIG["batch_size"]

# Logging setup
SUMMARY_LOG = "factor_attribution_summary.log"
logging.basicConfig(
    level=logging.WARNING,  # WARNING for production, DEBUG for targeted sections
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()

def log_summary(message):
    with open(SUMMARY_LOG, 'a') as f:
        f.write(f"{datetime.now()}: {message}\n")

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Natural Resources Sector Equity": ("USA", "Natural Resources Sector Equity"),
        "Infrastructure Sector Equity": ("USA", "Infrastructure Sector Equity"),
        "Trading Tools": ("USA", "Trading Tools"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("USA", "Unknown"))

@contextmanager
def database_transaction():
    with engine.connect() as connection:
        transaction = connection.begin()
        try:
            yield connection
            transaction.commit()
        except Exception as e:
            transaction.rollback()
            logger.error(f"Transaction failed: {e}")
            raise

def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading
FACTOR_CACHE = {}

@timer
def load_db_factors(factor_list, region="Global", table="factor_returns", asset_class=None):
    # Adding basic caching mechanism
    cache_key = (tuple(factor_list), region, table, asset_class)
    if cache_key in FACTOR_CACHE:
        logger.debug(f"Using cached factors for {cache_key}")
        return FACTOR_CACHE[cache_key]
    
    factor_dfs = []
    for factor in factor_list:
        query = f"""
            SELECT Date, Factor, Value, region
            FROM {table}
            WHERE Factor = '{factor}'
            AND Date >= '2015-01-01'
            AND UPPER(region) = UPPER('{region}')
        """
        if factor == 'RF':
            query = query.replace(f"region = '{region}'", "region = 'USA'")
        if asset_class:
            query += f" AND asset_class = '{asset_class}'"
        logger.debug(f"Executing query: {query}")
        df = pd.read_sql_query(query, engine, parse_dates=['Date'])
        if df.empty:
            logger.warning(f"No data for factor {factor} in {region}")
            continue
        if df[['Date', 'Factor']].duplicated().any():
            logger.error(f"Non-unique Date-Factor pairs for {factor}: {df[df[['Date', 'Factor']].duplicated()]}")
            return pd.DataFrame()
        factor_dfs.append(df)
    
    if not factor_dfs:
        logger.warning(f"No data for factors {factor_list} in {table} (region: {region})")
        return pd.DataFrame()
    
    df = pd.concat(factor_dfs)
    logger.debug(f"Raw factor data shape: {df.shape}, sample:\n{df.head()}")
    
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df = pivoted_df[~pivoted_df.index.duplicated(keep='first')]
    pivoted_df = pivoted_df.asfreq('ME')
    FACTOR_CACHE[cache_key] = pivoted_df
    return pivoted_df

@timer
def load_century_factors(factor_list, portfolio_base, asset_class=None, region="Global"):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value, portfolio
        FROM aqr_century_factors
        WHERE factor IN ({factor_in_clause})
        AND portfolio = ?
        AND date >= '2015-01-01'
    """
    params = [portfolio_base]
    if asset_class:
        query += " AND asset_class = ?"
        params.append(asset_class)
    if region != "Global":
        query += " AND region = ?"
        params.append(region)
    logger.debug(f"Executing query: {query}, params: {params}")
    df = pd.read_sql(query, engine, params=params, parse_dates=['Date'])
    if df.empty:
        logger.warning(f"No data for factors {factor_list} in aqr_century_factors (portfolio: {portfolio_base}, region: {region})")
        return pd.DataFrame()
    if df[['Date', 'Factor']].duplicated().any():
        logger.error(f"Non-unique Date-Factor pairs in aqr_century_factors: {df[df[['Date', 'Factor']].duplicated()]}")
        return pd.DataFrame()
    logger.debug(f"Raw century factors shape: {df.shape}, portfolio: {df['portfolio'].unique()}, sample:\n{df.head()}")
    pivoted_df = df.pivot(index="Date", columns="Factor", values="Value")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df = pivoted_df[~pivoted_df.index.duplicated(keep='first')]
    pivoted_df = pivoted_df.asfreq('ME')
    if pivoted_df.empty or pivoted_df.shape[1] == 0:
        logger.warning(f"Empty pivoted data for factors {factor_list} (portfolio: {portfolio_base})")
        return pd.DataFrame()
    return pivoted_df

# Section 4: Fund Processing
category_to_regressions = {
    "Energy Sector Equity": "Equity_USA",
    "Equity Miscellaneous": "Equity_USA",
    "Financials Sector Equity": "Equity_USA",
    "Healthcare Sector Equity": "Equity_USA",
    "Consumer Goods & Services Sector Equity": "Equity_USA",
    "Communications Sector Equity": "Equity_USA",
    "Industrials Sector Equity": "Equity_USA",
    "Other Sector Equity": "Equity_USA",
    "Real Estate Sector Equity": "Equity_USA",
    "Precious Metals Sector Equity": "Equity_USA",
    "Technology Sector Equity": "Equity_USA",
    "Utilities Sector Equity": "Equity_USA",
    "US Equity Large Cap Blend": "Equity_USA",
    "US Equity Large Cap Growth": "Equity_USA",
    "US Equity Large Cap Value": "Equity_USA",
    "US Equity Mid Cap": "Equity_USA",
    "US Equity Small Cap": "Equity_USA",
    "Options Trading": "Equity_USA",
    "Natural Resources Sector Equity": "Equity_USA",
    "Infrastructure Sector Equity": "Equity_USA",
    "Asia ex-Japan Equity": "Equity_Intl",
    "Australia & New Zealand Equity": "Equity_Intl",
    "Canadian Equity Large Cap": "Equity_Intl",
    "Europe Equity Large Cap": "Equity_Intl",
    "Europe Equity Mid/Small Cap": "Equity_Intl",
    "Greater China Equity": "Equity_Intl",
    "India Equity": "Equity_Intl",
    "Mexico Equity": "Equity_Intl",
    "Japan Equity": "Equity_Intl",
    "Korea Equity": "Equity_Intl",
    "Latin America Equity": "Equity_Intl",
    "UK Equity Large Cap": "Equity_Intl",
    "Thailand Equity": "Equity_Intl",
    "Global Emerging Markets Equity": "Equity_Global",
    "Global Equity Large Cap": "Equity_Global",
    "Global Equity Mid/Small Cap": "Equity_Global",
    "Global Fixed Income": "Fixed_Income",
    "Convertibles": "Fixed_Income",
    "Emerging Markets Fixed Income": "Fixed_Income",
    "Fixed Income Miscellaneous": "Fixed_Income",
    "US Fixed Income": "Fixed_Income",
    "US Municipal Fixed Income": "Fixed_Income",
    "Aggressive Allocation": "Allocation",
    "Allocation Miscellaneous": "Allocation",
    "Cautious Allocation": "Allocation",
    "Flexible Allocation": "Allocation",
    "Moderate Allocation": "Allocation",
    "Alternative Miscellaneous": "Alternative",
    "Long/Short Equity": "Alternative",
    "Market Neutral": "Alternative",
    "Multialternative": "Alternative",
    "Commodities Broad Basket": "Commodity",
    "Commodities Specified": "Commodity"
}

regression_sets = {
    "Equity_USA": [
        ("Equity_USA_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA", None),
        ("Equity_USA_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "US Stock Selection"),
        ("Equity_USA_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "All Macro"),
        ("Equity_USA_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "Equity indices"),
        ("Equity_USA_5", ['MKT', 'BAB', 'TSM-FI', 'TSM-FX', 'excess_return_eqwt'], "USA", None),
        ("Equity_USA_6", ['MKT', 'SMB', 'BAB', 'TSM-Com'], "USA", None),
        ("Equity_USA_7", ['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB', 'TSM-Com', 'TSM-FI', 'TSM-FX'], "USA", None)
    ],
    "Equity_Intl": [
        ("Equity_Intl_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl", None),
        ("Equity_Intl_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Intl Stock Selection"),
        ("Equity_Intl_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "All Macro"),
        ("Equity_Intl_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Equity indices")
    ],
    "Equity_Global": [
        ("Equity_Global_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global", None),
        ("Equity_Global_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Equity_Global_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Equity_Global_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Fixed_Income": [
        ("FI_1", ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'TSM-FI', 'TSM-FX'], "Global", None),
        ("FI_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Fixed income"),
        ("FI_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("FI_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Allocation": [
        ("Allocation_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI'], "Global", None),
        ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Alternative": [
        ("Alternative_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'TSM-Com', 'TSM-FX'], "Global", None),
        ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Commodity": [
        ("Commodity_1", ['excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt', 'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short', 'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango'], "Global", None)
    ]
}

# Section 5: Fund Processing & Database Insertion remains the same...
# Section 5: Fund Processing


def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    if returns.empty:
        logger.warning(f"No valid returns for {symbol}")
        return records
    
    regression_category = category_to_regressions.get(category, "Allocation")
    if regression_category == "Allocation" and category not in category_to_regressions:
        logger.warning(f"Unmapped category {category} for {symbol}; using Allocation")
    
    factor_data_cache = {}
    for reg_name, factors, region, portfolio_base in regression_sets[regression_category]:
        try:
            logger.debug(f"Loading factors for {symbol} ({reg_name}): factors {factors}, region {region}, portfolio_base {portfolio_base}")
            if any(f in factors for f in ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']):
                if not portfolio_base:
                    logger.warning(f"Skipping {reg_name} for {symbol}: portfolio_base missing")
                    continue
                factor_data = load_century_factors(factors, portfolio_base, region=region)
            elif reg_name.startswith("Commodity"):
                factor_data = load_commodity_factors()
            elif reg_name.startswith("FI"):
                fi_factors = [f for f in factors if f in ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']]
                other_factors = [f for f in factors if f not in fi_factors]
                factor_data = pd.concat([
                    load_db_factors(other_factors, region) if other_factors else pd.DataFrame(index=returns.index),
                    load_fixed_income_factors(fi_factors) if fi_factors else pd.DataFrame(index=returns.index)
                ], axis=1).dropna()
            else:
                factor_data = load_db_factors(factors, region)
            if not isinstance(factor_data, pd.DataFrame):
                logger.error(f"Invalid factor data type for {reg_name} ({symbol}): {type(factor_data)}")
                continue
            if factor_data.empty or factor_data.shape[1] == 0:
                logger.warning(f"Empty factor data for {reg_name} ({symbol}): factors {factors}")
                continue
            logger.debug(f"Factor data for {reg_name} ({symbol}): shape {factor_data.shape}, columns {factor_data.columns.tolist()}")
            factor_data = factor_data[factor_data.index >= '2015-01-01']
            factor_data_cache[(reg_name, region)] = factor_data
        except Exception as e:
            logger.warning(f"Error loading factors for {symbol} ({reg_name}): {str(e)}")
            continue
    
    for reg_name, factors, region, portfolio_base in regression_sets[regression_category]:
        factor_data = factor_data_cache.get((reg_name, region))
        if factor_data is None:
            continue
        for window in ROLLING_PERIODS:
            records.extend(run_rolling_regression(symbol, returns, factor_data, "OLS", reg_name, window))
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    records = []
    errors = 0
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    if SAMPLE_DRY_RUN:
        # Sequential processing for debugging
        for fund_data in tqdm(fund_data_list, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(process_fund(fund_data))
            except Exception as e:
                logger.error(f"Error processing {fund_data['SymbolCUSIP']}: {str(e)}")
                errors += 1
    else:
        # Parallel processing for production
        for i in range(0, len(fund_data_list), BATCH_SIZE):
            batch = fund_data_list[i:i + BATCH_SIZE]
            with ProcessPoolExecutor(max_workers=MAX_WORKERS_CPU) as executor:
                future_to_fund = {
                    executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"]
                    for fund_data in batch
                }
                for future in tqdm(future_to_fund, total=len(batch), desc=f"Processing {region} batch", file=sys.stdout):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        logger.error(f"Error processing {future_to_fund[future]}: {str(e)}")
                        errors += 1
    
    logger.info(f"Region {region} generated {len(records)} total records with {errors} errors")
    log_summary(f"Region {region}: {len(fund_data_list)} funds, {len(records)} records, {errors} errors")
    if not DRY_RUN:
        insert_batch(records)
    
    return records, errors

# Section 6: Database Output
def insert_batch(records):
    if DRY_RUN:
        logger.info(f"Dry run: Would insert {len(records)} records")
        return
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

# Section 7: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    log_summary("Pipeline started")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        log_summary(f"Error: Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    log_summary(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds")
        log_summary(f"Sampled {len(fund_ids)} funds")
    
    summary = {"total_funds": len(fund_ids), "regions": {}, "errors": 0}
    for region in sorted(set(regions) - {'Unknown'} | {'USA'}):
        region_fund_ids = fund_meta[fund_meta["Region"] == region]["SymbolCUSIP"].tolist()
        if not region_fund_ids:
            logger.warning(f"No SymbolCUSIP found for region {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        if SAMPLE_DRY_RUN:
            region_fund_ids = random.sample(region_fund_ids, min(SAMPLE_SIZE, len(region_fund_ids)))
            logger.info(f"Sampled {len(region_fund_ids)} SymbolCUSIP for {region}")
        
        try:
            returns = load_fund_returns(region_fund_ids)
        except Exception as e:
            logger.error(f"Failed to load returns for {region}: {e}")
            log_summary(f"Error: Failed to load returns for {region}: {e}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 1}
            summary["errors"] += 1
            continue
        
        region_funds = [
            {
                "SymbolCUSIP": row["SymbolCUSIP"],
                "Global_Category_Name": row["Global_Category_Name"],
                "CWA_Broad_Category_Name": row.get("CWA_Broad_Category_Name", None),
                "returns": returns[row["SymbolCUSIP"]].to_dict()
            }
            for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
        ]
        if not region_funds:
            logger.warning(f"No valid returns data for {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        
        records, errors = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records), "errors": errors}
        summary["errors"] += errors
    
    logger.info(f"Pipeline summary: {summary}")
    log_summary(f"Pipeline completed: {summary}")
    return summary

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise


2025-04-21 15:47:04,802 - ERROR - Failed to load metadata: name 'load_fund_metadata' is not defined


In [None]:
#this was the last code before we swapt to a "new" and things broke

In [23]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
import statsmodels.api as sm
import time
import sys
from contextlib import contextmanager
from functools import wraps
from sqlalchemy import text


# Section 1: Configuration and Logging
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 5600,
    "batch_insert_size": 10000,
    "max_workers_cpu": 16,  # Optimized for 16-core i9 Ultra
    "batch_size": 100,  # Process 100 funds per batch
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(CONNECTION_STRING, pool_size=20, max_overflow=10)

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS_CPU = CONFIG["max_workers_cpu"]
BATCH_SIZE = CONFIG["batch_size"]

# Logging setup
SUMMARY_LOG = "factor_attribution_summary.log"
logging.basicConfig(
    level=logging.WARNING,  # WARNING for production, DEBUG for targeted sections
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()

def log_summary(message):
    with open(SUMMARY_LOG, 'a') as f:
        f.write(f"{datetime.now()}: {message}\n")

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("USA", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous"),
        "Energy Sector Equity": ("USA", "Energy Sector Equity"),
        "Equity Miscellaneous": ("USA", "Equity Miscellaneous"),
        "Financials Sector Equity": ("USA", "Financials Sector Equity"),
        "Healthcare Sector Equity": ("USA", "Healthcare Sector Equity"),
        "Consumer Goods & Services Sector Equity": ("USA", "Consumer Goods & Services Sector Equity"),
        "Communications Sector Equity": ("USA", "Communications Sector Equity"),
        "Industrials Sector Equity": ("USA", "Industrials Sector Equity"),
        "Other Sector Equity": ("USA", "Other Sector Equity"),
        "Real Estate Sector Equity": ("USA", "Real Estate Sector Equity"),
        "Precious Metals Sector Equity": ("USA", "Precious Metals Sector Equity"),
        "Technology Sector Equity": ("USA", "Technology Sector Equity"),
        "Utilities Sector Equity": ("USA", "Utilities Sector Equity"),
        "Natural Resources Sector Equity": ("USA", "Natural Resources Sector Equity"),
        "Infrastructure Sector Equity": ("USA", "Infrastructure Sector Equity"),
        "Trading Tools": ("USA", "Trading Tools"),
        "Asia ex-Japan Equity": ("International", "Asia ex-Japan Equity"),
        "Australia & New Zealand Equity": ("International", "Australia & New Zealand Equity"),
        "Canadian Equity Large Cap": ("International", "Canadian Equity Large Cap"),
        "Europe Equity Mid/Small Cap": ("International", "Europe Equity Mid/Small Cap"),
        "Greater China Equity": ("International", "Greater China Equity"),
        "India Equity": ("International", "India Equity"),
        "Mexico Equity": ("International", "Mexico Equity"),
        "Korea Equity": ("International", "Korea Equity"),
        "Latin America Equity": ("International", "Latin America Equity"),
        "UK Equity Large Cap": ("International", "UK Equity Large Cap"),
        "Thailand Equity": ("International", "Thailand Equity"),
        "Convertibles": ("USA", "Convertibles"),
        "Fixed Income Miscellaneous": ("USA", "Fixed Income Miscellaneous"),
        "Allocation Miscellaneous": ("Global", "Allocation Miscellaneous")
    }
    return mapping.get(category, ("USA", "Unknown"))

@contextmanager
def database_transaction():
    with engine.connect() as connection:
        transaction = connection.begin()
        try:
            yield connection
            transaction.commit()
        except Exception as e:
            transaction.rollback()
            logger.error(f"Transaction failed: {e}")
            raise

def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 3: Data Loading
FACTOR_CACHE = {}


#Section4: Load Factors from Database
def load_DB_Factors(engine, region: str, factor_list: list) -> pd.DataFrame:
    """
    Loads factor returns from the correct database tables for a given region and factor list.

    Parameters:
        engine: SQLAlchemy engine
        region: 'Global', 'USA', or 'International'
        factor_list: List of factor names to retrieve

    Returns:
        DataFrame with columns: ['Date', 'Factor', 'Value']
    """
    import pandas as pd

    queries = []
    for factor in factor_list:
        query = f"""
            SELECT
                date AS Date,
                '{factor}' AS Factor,
                value AS Value
            FROM aqr_century_factors
            WHERE region = :region
              AND factor = :factor
        """
        queries.append((query, {"region": region, "factor": factor}))

    all_data = []
    with engine.begin() as conn:
        for sql_text, params in queries:
            df = pd.read_sql(sql_text, conn, params=params)
            if not df.empty:
                all_data.append(df)

    if not all_data:
        print(f"⚠️ No Century factor data returned for region: {region}")
        return pd.DataFrame()

    combined = pd.concat(all_data, ignore_index=True)
    combined["Date"] = pd.to_datetime(combined["Date"])
    return combined.sort_values(["Date", "Factor"]).reset_index(drop=True)


@timer
def load_century_factors(engine, factors, portfolio, region):
    import pandas as pd
    import sqlalchemy as sa
    from datetime import datetime

    placeholders = ",".join(["?"] * len(factors))
    query = f"""
        SELECT factor, portfolio, region, date, value
        FROM aqr_century_factors
        WHERE factor IN ({placeholders})
          AND portfolio LIKE ?
          AND region = ?
    """

    # Bind factors, like-pattern for portfolio, and region
    bind_params = factors + [f"{portfolio}%", region]

    with engine.begin() as conn:
        df = pd.read_sql_query(sa.text(query), conn, params=bind_params)

    if df.empty:
        print(f"⚠️ No data loaded from aqr_century_factors for: "
              f"region={region}, portfolio LIKE {portfolio}%, factors={factors}")
        return pd.DataFrame()

    df["date"] = pd.to_datetime(df["date"])
    df = df.pivot_table(index="date", columns="factor", values="value").sort_index()

    return df

@timer
def load_fund_metadata():
    query = """
    SELECT 
        f.SymbolCUSIP, 
        f.Region, 
        f.YC_Global_Category_ID, 
        c.Global_Category_Name,
        f.YC_Category_ID,
        y.Category_Name,
        f.CWA_Broad_Category_ID,
        b.CWA_Broad_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
    LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    df = pd.read_sql(query, engine)
    logger.info(f"Loaded metadata for {len(df)} funds")
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    if df["CWA_Broad_Category_Name"].isnull().all():
        logger.warning("CWA_Broad_Category_Name missing; Equity regressions 5-7 will be skipped")
    return df.dropna(subset=["Region", "FactorProfile"])

@timer
def load_fund_returns(fund_ids=None):
    query = """
    SELECT SymbolCUSIP, Date, ReturnValue
    FROM Fund_Returns_Timeseries
    WHERE Metric = '1 Month Return' AND ReturnValue IS NOT NULL AND Date IS NOT NULL
    """
    if fund_ids:
        fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
        if not fund_ids:
            logger.warning("No valid SymbolCUSIP provided")
            return pd.DataFrame()
        fund_ids_str = ",".join([f"'{fid}'" for fid in fund_ids])
        query += f" AND SymbolCUSIP IN ({fund_ids_str})"
    
    logger.debug(f"Executing query: {query}")
    chunks = []
    for chunk in pd.read_sql_query(query, engine, parse_dates=["Date"], chunksize=CHUNK_SIZE):
        logger.debug(f"Loaded chunk of {len(chunk)} rows")
        chunks.append(chunk)
    df = pd.concat(chunks) if chunks else pd.DataFrame()
    if df.empty:
        logger.warning(f"No returns data loaded for SymbolCUSIP: {fund_ids}")
        return pd.DataFrame()
    
    logger.info(f"Loaded returns for {len(df['SymbolCUSIP'].unique())} funds")
    pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
    pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
    pivoted.index = pivoted.index.drop_duplicates()
    pivoted = pivoted.asfreq('ME')
    for col in pivoted.columns:
        if not pd.api.types.is_numeric_dtype(pivoted[col]):
            logger.warning(f"Non-numeric returns for {col}; converting")
            pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
    return pivoted

@timer
def load_commodity_factors():
    query = """
        SELECT date AS Date, 
               excess_return_eqwt, 
               excess_spot_return_eqwt, 
               ir_adjusted_carry_eqwt, 
               spot_return_eqwt, 
               carry_eqwt, 
               excess_return_long_short, 
               excess_spot_return_long_short, 
               ir_adjusted_carry_long_short, 
               aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql(query, engine, parse_dates=['Date'])
    if df.empty:
        logger.warning("No commodity factors loaded")
        return pd.DataFrame()
    pivoted_df = df.set_index("Date")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

@timer
def load_fixed_income_factors(factor_list):
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({factor_in_clause})
    """
    logger.debug(f"Executing query: {query}")
    df = pd.read_sql_query(query, engine, parse_dates=["Date"])
    if df.empty:
        logger.warning(f"No fixed income factors for {factor_list}")
        return pd.DataFrame()
    pivoted_df = df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")
    pivoted_df.index = pd.to_datetime(pivoted_df.index) + pd.offsets.MonthEnd(0)
    pivoted_df.index = pivoted_df.index.drop_duplicates()
    pivoted_df = pivoted_df.asfreq('ME')
    return pivoted_df

# Section 4: Regression Processing
def run_rolling_regression(symbol, returns, factor_data, regression_type, regression_name, window_months):
    records = []
    months_needed = window_months
    
    returns = returns.sort_index()
    factor_data = factor_data.sort_index()
    
    available_dates = returns.index.intersection(factor_data.index)
    if len(available_dates) < months_needed:
        logger.warning(f"Fund {symbol} ({regression_name}, window={window_months}m) skipped: {len(available_dates)} months available")
        return records
    
    for end_date in available_dates[months_needed-1::1]:
        start_date = end_date - pd.offsets.MonthEnd(months_needed)
        if start_date < available_dates[0]:
            continue
        
        window_returns = returns.loc[start_date:end_date]
        window_factors = factor_data.loc[start_date:end_date]
        
        window_returns = window_returns.reindex(window_factors.index)
        
        if len(window_returns.dropna()) < months_needed * 0.8 or len(window_factors.dropna()) < months_needed * 0.8:
            logger.debug(f"Skipping regression for {symbol} ({regression_name}, window={window_months}m, end_date={end_date}): insufficient data")
            continue
        
        try:
            if regression_type == "OLS":
                X = sm.add_constant(window_factors)
                model = sm.OLS(window_returns, X, missing='drop').fit()
                coefficients = model.params.to_dict()
                tvalues = model.tvalues.to_dict()
                pvalues = model.pvalues.to_dict()
                r_squared = model.rsquared
                record = {
                    "SymbolCUSIP": symbol,
                    "RegressionName": regression_name,
                    "Window": f"{window_months}m",
                    "EndDate": end_date,
                    "R_Squared": r_squared
                }
                for factor in coefficients:
                    record[f"{factor}_beta"] = coefficients[factor]
                    record[f"{factor}_tvalue"] = tvalues.get(factor, None)
                    record[f"{factor}_pvalue"] = pvalues.get(factor, None)
                records.append(record)
        except Exception as e:
            logger.warning(f"Regression failed for {symbol} ({regression_name}, window={window_months}m, end_date={end_date}): {str(e)}")
            continue
    
    if not records:
        logger.info(f"No regressions completed for {symbol} ({regression_name}, window={window_months}m)")
    return records

# Section 5: Fund Processing
category_to_regressions = {
    "Energy Sector Equity": "Equity_USA",
    "Equity Miscellaneous": "Equity_USA",
    "Financials Sector Equity": "Equity_USA",
    "Healthcare Sector Equity": "Equity_USA",
    "Consumer Goods & Services Sector Equity": "Equity_USA",
    "Communications Sector Equity": "Equity_USA",
    "Industrials Sector Equity": "Equity_USA",
    "Other Sector Equity": "Equity_USA",
    "Real Estate Sector Equity": "Equity_USA",
    "Precious Metals Sector Equity": "Equity_USA",
    "Technology Sector Equity": "Equity_USA",
    "Utilities Sector Equity": "Equity_USA",
    "US Equity Large Cap Blend": "Equity_USA",
    "US Equity Large Cap Growth": "Equity_USA",
    "US Equity Large Cap Value": "Equity_USA",
    "US Equity Mid Cap": "Equity_USA",
    "US Equity Small Cap": "Equity_USA",
    "Options Trading": "Equity_USA",
    "Natural Resources Sector Equity": "Equity_USA",
    "Infrastructure Sector Equity": "Equity_USA",
    "Asia ex-Japan Equity": "Equity_Intl",
    "Australia & New Zealand Equity": "Equity_Intl",
    "Canadian Equity Large Cap": "Equity_Intl",
    "Europe Equity Large Cap": "Equity_Intl",
    "Europe Equity Mid/Small Cap": "Equity_Intl",
    "Greater China Equity": "Equity_Intl",
    "India Equity": "Equity_Intl",
    "Mexico Equity": "Equity_Intl",
    "Japan Equity": "Equity_Intl",
    "Korea Equity": "Equity_Intl",
    "Latin America Equity": "Equity_Intl",
    "UK Equity Large Cap": "Equity_Intl",
    "Thailand Equity": "Equity_Intl",
    "Global Emerging Markets Equity": "Equity_Global",
    "Global Equity Large Cap": "Equity_Global",
    "Global Equity Mid/Small Cap": "Equity_Global",
    "Global Fixed Income": "Fixed_Income",
    "Convertibles": "Fixed_Income",
    "Emerging Markets Fixed Income": "Fixed_Income",
    "Fixed Income Miscellaneous": "Fixed_Income",
    "US Fixed Income": "Fixed_Income",
    "US Municipal Fixed Income": "Fixed_Income",
    "Aggressive Allocation": "Allocation",
    "Allocation Miscellaneous": "Allocation",
    "Cautious Allocation": "Allocation",
    "Flexible Allocation": "Allocation",
    "Moderate Allocation": "Allocation",
    "Alternative Miscellaneous": "Alternative",
    "Long/Short Equity": "Alternative",
    "Market Neutral": "Alternative",
    "Multialternative": "Alternative",
    "Commodities Broad Basket": "Commodity",
    "Commodities Specified": "Commodity"
}

regression_sets = {
    "Equity_USA": [
        ("Equity_USA_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB'], "USA", None),
        ("Equity_USA_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "US Stock Selection"),
        ("Equity_USA_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "All Macro"),
        ("Equity_USA_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "USA", "Equity indices"),
        ("Equity_USA_5", ['MKT', 'BAB', 'TSM-FI', 'TSM-FX', 'excess_return_eqwt'], "USA", None),
        ("Equity_USA_6", ['MKT', 'SMB', 'BAB', 'TSM-Com'], "USA", None),
        ("Equity_USA_7", ['MKT', 'HML_Devil', 'QMJ', 'UMD', 'SMB', 'BAB', 'TSM-Com', 'TSM-FI', 'TSM-FX'], "USA", None)
    ],
    "Equity_Intl": [
        ("Equity_Intl_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Intl", None),
        ("Equity_Intl_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Intl Stock Selection"),
        ("Equity_Intl_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "All Macro"),
        ("Equity_Intl_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "International", "Equity indices")
    ],
    "Equity_Global": [
        ("Equity_Global_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'TSM-EQ', 'BAB'], "Global", None),
        ("Equity_Global_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Equity_Global_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Equity_Global_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Fixed_Income": [
        ("FI_1", ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY', 'TSM-FI', 'TSM-FX'], "Global", None),
        ("FI_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Fixed income"),
        ("FI_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("FI_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Allocation": [
        ("Allocation_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI'], "Global", None),
        ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Alternative": [
        ("Alternative_1", ['MKT', 'HML_Devil', 'QMJ', 'SMB', 'UMD', 'BAB', 'TSM-EQ', 'TSM-FI', 'TSM-Com', 'TSM-FX'], "Global", None),
        ("Allocation_2", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Stock Selection"),
        ("Allocation_3", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "All Macro"),
        ("Allocation_4", ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value'], "Global", "Equity indices")
    ],
    "Commodity": [
        ("Commodity_1", ['excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt', 'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short', 'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango'], "Global", None)
    ]
}

def process_fund(fund_data):
    records = []
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    
    if returns.empty:
        logger.warning(f"No valid returns for {symbol}")
        return records
    
    regression_category = category_to_regressions.get(category, "Allocation")
    if regression_category == "Allocation" and category not in category_to_regressions:
        logger.warning(f"Unmapped category {category} for {symbol}; using Allocation")
    
    CENTURY_FACTORS = ['Carry', 'Defensive', 'Market', 'Momentum', 'Multi-Style', 'Value']
    factor_data_cache = {}
    
    for reg_name, factors, region, portfolio_base in regression_sets[regression_category]:
        try:
            logger.debug(f"Loading factors for {symbol} ({reg_name}): factors {factors}, region {region}, portfolio_base {portfolio_base}")
            
            if reg_name.startswith("Commodity"):
                factor_data = load_commodity_factors()
            elif reg_name.startswith("FI"):
                fi_factors = [f for f in factors if f in ['TERM_Int', 'TERM_Long', 'CREDIT', 'CREDIT_HY']]
                century_factors = [f for f in factors if f in CENTURY_FACTORS]
                other_factors = [f for f in factors if f not in fi_factors and f not in century_factors]
                
                factor_data_parts = []
                if century_factors:
                    if not portfolio_base:
                        logger.warning(f"Skipping century factors {century_factors} for {reg_name} ({symbol}): portfolio_base missing")
                    else:
                        factor_data_parts.append(load_century_factors(engine, century_factors, portfolio_base, region))
                if other_factors:
                    factor_data_parts.append(load_DB_Factors(engine, region, other_factors))
                if fi_factors:
                    factor_data_parts.append(load_fixed_income_factors(fi_factors))
                
                factor_data = pd.concat([df for df in factor_data_parts if not df.empty], axis=1).dropna()
            else:
                century_factors = [f for f in factors if f in CENTURY_FACTORS]
                other_factors = [f for f in factors if f not in CENTURY_FACTORS]
                
                factor_data_parts = []
                if century_factors:
                    if not portfolio_base:
                        logger.warning(f"Skipping century factors {century_factors} for {reg_name} ({symbol}): portfolio_base missing")
                    else:
                        factor_data_parts.append(load_century_factors(engine, century_factors, portfolio_base, region))
                if other_factors:
                    factor_data_parts.append(load_DB_Factors(engine, region, other_factors))
                
                factor_data = pd.concat([df for df in factor_data_parts if not df.empty], axis=1).dropna()
            
            if not isinstance(factor_data, pd.DataFrame):
                logger.error(f"Invalid factor data type for {reg_name} ({symbol}): {type(factor_data)}")
                continue
            if factor_data.empty or factor_data.shape[1] == 0:
                logger.warning(f"Empty factor data for {reg_name} ({symbol}): factors {factors}")
                continue
            logger.debug(f"Factor data for {reg_name} ({symbol}): shape {factor_data.shape}, columns {factor_data.columns.tolist()}")
            factor_data = factor_data[factor_data.index >= '2015-01-01']
            factor_data_cache[(reg_name, region)] = factor_data
        except Exception as e:
            logger.warning(f"Error loading factors for {symbol} ({reg_name}): {str(e)}")
            continue
    
    for reg_name, factors, region, portfolio_base in regression_sets[regression_category]:
        factor_data = factor_data_cache.get((reg_name, region))
        if factor_data is None:
            continue
        for window in ROLLING_PERIODS:
            records.extend(run_rolling_regression(symbol, returns, factor_data, "OLS", reg_name, window))
    
    logger.info(f"Generated {len(records)} regression records for {symbol}")
    return records

def process_region(region, fund_data_list):
    records = []
    errors = 0
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    
    if SAMPLE_DRY_RUN:
        # Sequential processing for debugging
        for fund_data in tqdm(fund_data_list, total=len(fund_data_list), desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(process_fund(fund_data))
            except Exception as e:
                logger.error(f"Error processing {fund_data['SymbolCUSIP']}: {str(e)}")
                errors += 1
    else:
        # Parallel processing for production
        for i in range(0, len(fund_data_list), BATCH_SIZE):
            batch = fund_data_list[i:i + BATCH_SIZE]
            with ProcessPoolExecutor(max_workers=MAX_WORKERS_CPU) as executor:
                future_to_fund = {
                    executor.submit(process_fund, fund_data): fund_data["SymbolCUSIP"]
                    for fund_data in batch
                }
                for future in tqdm(future_to_fund, total=len(batch), desc=f"Processing {region} batch", file=sys.stdout):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        logger.error(f"Error processing {future_to_fund[future]}: {str(e)}")
                        errors += 1
    
    logger.info(f"Region {region} generated {len(records)} total records with {errors} errors")
    log_summary(f"Region {region}: {len(fund_data_list)} funds, {len(records)} records, {errors} errors")
    if not DRY_RUN:
        insert_batch(records)
    
    return records, errors

# Section 6: Database Output
def insert_batch(records):
    if DRY_RUN:
        logger.info(f"Dry run: Would insert {len(records)} records")
        return
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records to database")
    except Exception as e:
        logger.error(f"Error inserting batch: {e}")
        raise

# Section 7: Main Pipeline
@timer
def main():
    logger.info("Starting main pipeline")
    log_summary("Pipeline started")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Failed to load metadata: {e}")
        log_summary(f"Error: Failed to load metadata: {e}")
        return {"error": str(e)}
    
    regions = fund_meta["Region"].unique()
    logger.info(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    log_summary(f"Total funds: {len(fund_meta)}, Regions: {regions}")
    
    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds")
        log_summary(f"Sampled {len(fund_ids)} funds")
    
    summary = {"total_funds": len(fund_ids), "regions": {}, "errors": 0}
    for region in sorted(set(regions) - {'Unknown'} | {'USA'}):
        region_fund_ids = fund_meta[fund_meta["Region"] == region]["SymbolCUSIP"].tolist()
        if not region_fund_ids:
            logger.warning(f"No SymbolCUSIP found for region {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        if SAMPLE_DRY_RUN:
            region_fund_ids = random.sample(region_fund_ids, min(SAMPLE_SIZE, len(region_fund_ids)))
            logger.info(f"Sampled {len(region_fund_ids)} SymbolCUSIP for {region}")
        
        try:
            returns = load_fund_returns(region_fund_ids)
        except Exception as e:
            logger.error(f"Failed to load returns for {region}: {e}")
            log_summary(f"Error: Failed to load returns for {region}: {e}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 1}
            summary["errors"] += 1
            continue
        
        region_funds = [
            {
                "SymbolCUSIP": row["SymbolCUSIP"],
                "Global_Category_Name": row["Global_Category_Name"],
                "CWA_Broad_Category_Name": row.get("CWA_Broad_Category_Name", None),
                "returns": returns[row["SymbolCUSIP"]].to_dict()
            }
            for _, row in fund_meta.iterrows() if row["SymbolCUSIP"] in returns.columns
        ]
        if not region_funds:
            logger.warning(f"No valid returns data for {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue
        
        records, errors = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records), "errors": errors}
        summary["errors"] += errors
    
    logger.info(f"Pipeline summary: {summary}")
    log_summary(f"Pipeline completed: {summary}")
    return summary

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

[SQL: 
            SELECT
                date AS Date,
                'MKT' AS Factor,
                value AS Value
            FROM aqr_century_factors
            WHERE region = :region
              AND factor = :factor
        ]
[parameters: {'region': 'Global', 'factor': 'MKT'}]
(Background on this error at: https://sqlalche.me/e/20/f405)
[SQL: 
            SELECT
                date AS Date,
                'MKT' AS Factor,
                value AS Value
            FROM aqr_century_factors
            WHERE region = :region
              AND factor = :factor
        ]
[parameters: {'region': 'Global', 'factor': 'MKT'}]
(Background on this error at: https://sqlalche.me/e/20/f405)
[SQL: 
            SELECT
                date AS Date,
                'MKT' AS Factor,
                value AS Value
            FROM aqr_century_factors
            WHERE region = :region
              AND factor = :factor
        ]
[parameters: {'region': 'Global', 'factor': 'MKT'}]
(Background

In [None]:
# AQRR Attribution Pipeline - Modular Factor Loading Version
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
import statsmodels.api as sm
import time
import sys
from contextlib import contextmanager
from functools import wraps

# Section 1: Configuration and Logging
CONFIG = {
    "database": {
        "server": "JULIANS_LAPTOP\\SQLEXPRESS",
        "database": "CWA_Fund_Database",
        "driver": "ODBC Driver 18 for SQL Server"
    },
    "return_metric": "1 Month Return",
    "rolling_periods": [12, 24, 36, 48, 60],
    "dry_run": True,
    "sample_dry_run": True,
    "sample_size": 10,
    "chunk_size": 5600,
    "batch_insert_size": 10000,
    "max_workers_cpu": 16,
    "batch_size": 100,
}

CONNECTION_STRING = (
    f"mssql+pyodbc://{CONFIG['database']['server']}/{CONFIG['database']['database']}"
    f"?driver={CONFIG['database']['driver']}&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(CONNECTION_STRING, pool_size=20, max_overflow=10)

RETURN_METRIC = CONFIG["return_metric"]
ROLLING_PERIODS = CONFIG["rolling_periods"]
DRY_RUN = CONFIG["dry_run"]
SAMPLE_DRY_RUN = CONFIG["sample_dry_run"]
SAMPLE_SIZE = CONFIG["sample_size"]
CHUNK_SIZE = CONFIG["chunk_size"]
BATCH_INSERT_SIZE = CONFIG["batch_insert_size"]
MAX_WORKERS_CPU = CONFIG["max_workers_cpu"]
BATCH_SIZE = CONFIG["batch_size"]

logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("factor_attribution.log"),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger()
SUMMARY_LOG = "factor_attribution_summary.log"

def log_summary(message):
    with open(SUMMARY_LOG, 'a') as f:
        f.write(f"{datetime.now()}: {message}\n")

@contextmanager
def database_transaction():
    with engine.connect() as connection:
        transaction = connection.begin()
        try:
            yield connection
            transaction.commit()
        except Exception as e:
            transaction.rollback()
            logger.error(f"Transaction failed: {e}")
            raise

def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logger.debug(f"Starting {func.__name__}")
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper

# Section 2: Factor Loaders
@timer
def load_db_factors_standard(engine, region: str, factors: list):
    placeholders = ', '.join([f":factor{i}" for i in range(len(factors))])
    params = {f"factor{i}": f for i, f in enumerate(factors)}
    params["region"] = region
    query = f"""
        SELECT date AS Date, factor AS Factor, value AS Value
        FROM Factor_Returns
        WHERE region = :region AND factor IN ({placeholders})
    """
    df = pd.read_sql_query(text(query), engine, params=params, parse_dates=["Date"])
    if df.empty:
        logger.warning(f"No standard factors for region={region}, factors={factors}")
        return pd.DataFrame()
    return df.pivot(index="Date", columns="Factor", values="Value")

@timer
def load_century_factors(engine, factors: list, portfolio: str, region: str):
    placeholders = ', '.join([f":factor{i}" for i in range(len(factors))])
    params = {f"factor{i}": f for i, f in enumerate(factors)}
    params["portfolio"] = f"{portfolio}%"
    params["region"] = region
    query = f"""
        SELECT date, factor, value
        FROM aqr_century_factors
        WHERE region = :region AND portfolio LIKE :portfolio AND factor IN ({placeholders})
    """
    df = pd.read_sql_query(text(query), engine, params=params, parse_dates=["date"])
    if df.empty:
        logger.warning(f"No century factors for {portfolio} ({region})")
        return pd.DataFrame()
    return df.pivot(index="date", columns="factor", values="value")

@timer
def load_fixed_income_factors(engine, factors: list):
    placeholders = ', '.join([f":factor{i}" for i in range(len(factors))])
    params = {f"factor{i}": f for i, f in enumerate(factors)}
    query = f"""
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
        WHERE Factor_Name IN ({placeholders})
    """
    df = pd.read_sql_query(text(query), engine, params=params, parse_dates=["Date"])
    if df.empty:
        logger.warning(f"No fixed income factors for: {factors}")
        return pd.DataFrame()
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

@timer
def load_commodity_factors(engine):
    query = """
        SELECT date AS Date,
               excess_return_eqwt, excess_spot_return_eqwt,
               ir_adjusted_carry_eqwt, spot_return_eqwt, carry_eqwt,
               excess_return_long_short, excess_spot_return_long_short,
               ir_adjusted_carry_long_short, aggregate_backwardation_contango
        FROM aqr_cmdty_factors
    """
    df = pd.read_sql_query(query, engine, parse_dates=["Date"])
    if df.empty:
        logger.warning("No commodity factors loaded")
        return pd.DataFrame()
    return df.set_index("Date")
# ... [PREVIOUS CONTENT ABOVE UNCHANGED] ...

    return df.set_index("Date")

# Section 3: Fund Metadata and Returns
@timer
def load_fund_metadata():
    query = """
        SELECT 
            f.SymbolCUSIP, 
            f.Region, 
            f.YC_Global_Category_ID, 
            c.Global_Category_Name,
            f.YC_Category_ID,
            y.Category_Name,
            f.CWA_Broad_Category_ID,
            b.CWA_Broad_Category_Name
        FROM Funds_to_Screen f
        JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
        JOIN YC_Category_List y ON f.YC_Category_ID = y.ID
        LEFT JOIN CWA_Broad_Category_List b ON f.CWA_Broad_Category_ID = b.ID
    """
    df = pd.read_sql(query, engine)
    logger.info(f"Loaded metadata for {len(df)} funds")
    return df.dropna(subset=["SymbolCUSIP", "Global_Category_Name"])

@timer
def load_fund_returns(fund_ids=None):
    base_query = """
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE Metric = :metric AND ReturnValue IS NOT NULL AND Date IS NOT NULL
    """
    params = {"metric": RETURN_METRIC}

    if fund_ids:
        fund_ids = [fid for fid in fund_ids if isinstance(fid, str) and fid.strip()]
        if not fund_ids:
            logger.warning("No valid SymbolCUSIP provided")
            return pd.DataFrame()

        symbol_placeholders = ','.join([f":id{i}" for i in range(len(fund_ids))])
        symbol_dict = {f"id{i}": fid for i, fid in enumerate(fund_ids)}
        params.update(symbol_dict)

        query = base_query + f" AND SymbolCUSIP IN ({symbol_placeholders})"
    else:
        query = base_query

    chunks = []
    for chunk in pd.read_sql_query(text(query), engine, params=params, parse_dates=["Date"], chunksize=CHUNK_SIZE):
        chunks.append(chunk)

    df = pd.concat(chunks) if chunks else pd.DataFrame()
    if df.empty:
        logger.warning("No returns data loaded")
        return pd.DataFrame()

    pivoted = df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")
    pivoted.index = pd.to_datetime(pivoted.index) + pd.offsets.MonthEnd(0)
    pivoted.index = pivoted.index.drop_duplicates()
    pivoted = pivoted.asfreq('ME')
    for col in pivoted.columns:
        if not pd.api.types.is_numeric_dtype(pivoted[col]):
            pivoted[col] = pd.to_numeric(pivoted[col], errors='coerce')
    return pivoted

# Section 4: Fund Processing Logic

def run_rolling_regression(symbol, returns, factor_data, regression_type, regression_name, window_months):
    records = []
    returns = returns.sort_index()
    factor_data = factor_data.sort_index()
    aligned_dates = returns.index.intersection(factor_data.index)
    if len(aligned_dates) < window_months:
        return records

    for end_date in aligned_dates[window_months - 1:]:
        start_date = end_date - pd.offsets.MonthEnd(window_months)
        window_returns = returns.loc[start_date:end_date].dropna()
        window_factors = factor_data.loc[start_date:end_date].dropna()
        if len(window_returns) < window_months * 0.8 or len(window_factors) < window_months * 0.8:
            continue
        try:
            X = sm.add_constant(window_factors)
            model = sm.OLS(window_returns, X, missing='drop').fit()
            rec = {
                "SymbolCUSIP": symbol,
                "RegressionName": regression_name,
                "Window": f"{window_months}m",
                "EndDate": end_date,
                "R_Squared": model.rsquared
            }
            for var in model.params.index:
                rec[f"{var}_beta"] = model.params[var]
                rec[f"{var}_tvalue"] = model.tvalues.get(var)
                rec[f"{var}_pvalue"] = model.pvalues.get(var)
            records.append(rec)
        except Exception as e:
            logger.warning(f"Regression failed for {symbol} ({regression_name}): {e}")
            continue
    return records

@timer
def process_fund(fund_data):
    symbol = fund_data["SymbolCUSIP"]
    category = fund_data["Global_Category_Name"]
    returns = pd.Series(fund_data["returns"]).dropna()
    if returns.empty:
        logger.warning(f"No returns for {symbol}")
        return []

    category_map = {
        "Equity_USA": [...],
        "Equity_Intl": [...],
        "Equity_Global": [...],
        "Fixed_Income": [...],
        "Allocation": [...],
        "Alternative": [...],
        "Commodity": [...]
    }  # Define or import from a config if external

    regression_category = category_to_regressions.get(category, "Allocation")
    reg_set = regression_sets.get(regression_category, [])
    results = []

    for reg_name, factors, region, portfolio in reg_set:
        factor_df_parts = []
        try:
            if reg_name.startswith("Commodity"):
                factor_df_parts.append(load_commodity_factors(engine))
            else:
                if any(f in factors for f in ["TERM_Int", "TERM_Long", "CREDIT", "CREDIT_HY"]):
                    fi_factors = [f for f in factors if f.startswith("TERM") or f.startswith("CREDIT")]
                    factor_df_parts.append(load_fixed_income_factors(engine, fi_factors))
                if any(f in factors for f in ["Carry", "Defensive", "Market", "Momentum", "Multi-Style", "Value"]):
                    if portfolio:
                        cen_factors = [f for f in factors if f in ["Carry", "Defensive", "Market", "Momentum", "Multi-Style", "Value"]]
                        factor_df_parts.append(load_century_factors(engine, cen_factors, portfolio, region))
                other_factors = [f for f in factors if f not in ["Carry", "Defensive", "Market", "Momentum", "Multi-Style", "Value"] and not f.startswith("TERM") and not f.startswith("CREDIT")]
                if other_factors:
                    factor_df_parts.append(load_db_factors_standard(engine, region, other_factors))
            if not factor_df_parts:
                continue
            factor_df = pd.concat([df for df in factor_df_parts if not df.empty], axis=1).dropna()
            if factor_df.empty:
                continue
            for window in ROLLING_PERIODS:
                results.extend(run_rolling_regression(symbol, returns, factor_df, "OLS", reg_name, window))
        except Exception as e:
            logger.warning(f"Error loading factors for {symbol} ({reg_name}): {e}")
            continue
    return results


# Section 5: Region Processing and DB Insert

def process_region(region, fund_data_list):
    records = []
    errors = 0
    logger.info(f"Processing {len(fund_data_list)} funds in {region}")
    if SAMPLE_DRY_RUN:
        for fund_data in tqdm(fund_data_list, desc=f"Processing {region}", file=sys.stdout):
            try:
                records.extend(process_fund(fund_data))
            except Exception as e:
                logger.error(f"Error processing {fund_data['SymbolCUSIP']}: {e}")
                errors += 1
    else:
        for i in range(0, len(fund_data_list), BATCH_SIZE):
            batch = fund_data_list[i:i+BATCH_SIZE]
            with ProcessPoolExecutor(max_workers=MAX_WORKERS_CPU) as executor:
                futures = {executor.submit(process_fund, f): f["SymbolCUSIP"] for f in batch}
                for future in tqdm(futures, total=len(futures), desc=f"{region} batch", file=sys.stdout):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        logger.error(f"Error processing {futures[future]}: {e}")
                        errors += 1
    logger.info(f"{region}: {len(records)} records with {errors} errors")
    log_summary(f"{region}: {len(fund_data_list)} funds, {len(records)} records, {errors} errors")
    if not DRY_RUN:
        insert_batch(records)
    return records, errors

def insert_batch(records):
    if DRY_RUN:
        logger.info(f"Dry run: {len(records)} records would be inserted")
        return
    try:
        df = pd.DataFrame(records)
        with database_transaction() as connection:
            df.to_sql("AQRR_Factor_Attribution", con=connection, if_exists="append", index=False, method="multi")
        logger.info(f"Inserted {len(df)} records into database")
    except Exception as e:
        logger.error(f"Batch insert failed: {e}")
        raise

# Section 6: Main Pipeline
@timer
def main():
    logger.info("Starting AQRR Attribution Pipeline")
    log_summary("Pipeline started")
    try:
        fund_meta = load_fund_metadata()
    except Exception as e:
        logger.error(f"Metadata load failed: {e}")
        log_summary(f"Error loading metadata: {e}")
        return {"error": str(e)}

    fund_ids = fund_meta["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        fund_ids = random.sample(fund_ids, min(SAMPLE_SIZE, len(fund_ids)))
        logger.info(f"Sampled {len(fund_ids)} funds for dry run")
        log_summary(f"Sampled {len(fund_ids)} funds")

    regions = fund_meta["Region"].unique()
    summary = {"total_funds": len(fund_ids), "regions": {}, "errors": 0}

    for region in sorted(set(regions) - {"Unknown"} | {"USA"}):
        region_funds_meta = fund_meta[fund_meta["Region"] == region]
        region_fund_ids = region_funds_meta["SymbolCUSIP"].tolist()
        if SAMPLE_DRY_RUN:
            region_fund_ids = random.sample(region_fund_ids, min(SAMPLE_SIZE, len(region_fund_ids)))

        try:
            returns = load_fund_returns(region_fund_ids)
        except Exception as e:
            logger.error(f"Returns load failed for {region}: {e}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 1}
            summary["errors"] += 1
            continue

        region_funds = [
            {
                "SymbolCUSIP": row["SymbolCUSIP"],
                "Global_Category_Name": row["Global_Category_Name"],
                "returns": returns[row["SymbolCUSIP"]] if row["SymbolCUSIP"] in returns.columns else pd.Series()
            }
            for _, row in region_funds_meta.iterrows()
        ]

        if not region_funds:
            logger.warning(f"No valid return data for {region}")
            summary["regions"][region] = {"funds_processed": 0, "records": 0, "errors": 0}
            continue

        records, errors = process_region(region, region_funds)
        summary["regions"][region] = {"funds_processed": len(region_funds), "records": len(records), "errors": errors}
        summary["errors"] += errors

    logger.info(f"Pipeline complete: {summary}")
    log_summary(f"Pipeline complete: {summary}")
    return summary

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        raise


Processing Global: 100%|██████████| 1208/1208 [00:18<00:00, 64.41it/s]
