In [None]:
# R initializer code

In [1]:
def initialize_r_environment():
    import os

    # 🔒 Set environment variables BEFORE importing anything from rpy2
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'

    # Now import RPy2 safely
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri

    # Suppress console spam
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Print test R version
    try:
        print("🔁 R Version:", ro.r('R.version.string')[0])
    except Exception as e:
        print("❌ Could not connect to R:", e)
        return None

    # Load required packages
    try:
        aqrr = importr('aqrr')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
    except Exception as e:
        print("❌ Failed to load 'aqrr':", e)
        return None

    # Enable pandas ↔ R dataframe conversion
    pandas2ri.activate()

    return aqrr


In [2]:
aqrr = initialize_r_environment()

🔁 R Version: R version 4.4.3 (2025-02-28 ucrt)
✅ Loaded R package: aqrr
📦 AQRR Functions Available: [':=', 'aqr_bab_daily', 'aqr_bab_monthly', 'aqr_commodities_long_run', 'aqr_credit_risk_premium', 'aqr_factor_premia_monthly'] ...


In [None]:
#  The code below is to run Factor regressions on funds

In [None]:
#Version 1

In [None]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 100
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000

# Section 2: Data Source Initialization
def initialize_r_environment():
    import os
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
    try:
        version = ro.r('R.version.string')[0]
        print(f"🔁 R Version: {version}")
        logging.info(f"R initialized successfully: {version}")
    except (OSError, ValueError) as e:
        print(f"❌ Could not connect to R: {e}")
        logging.warning(f"Could not connect to R: {e}")
        return None
    try:
        aqrr = importr('aqrr')
        ro.r('library(dplyr)')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
        logging.info("AQRR package loaded successfully")
    except ImportError as e:
        print(f"❌ Failed to load 'aqrr': {e}")
        logging.warning(f"Failed to load 'aqrr': {e}")
        return None
    pandas2ri.activate()
    return aqrr

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_aqrr_factors(region="USA"):
    from rpy2.robjects import r as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter
    print(f"📦 Loading AQRR factor data for region: {region}")
    ro('library(dplyr)')
    ro(f"""
    mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, mkt = value)
    smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, smb = value)
    hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, hml = value)
    umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, umd = value)
    qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, qmj = value)
    bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, bab = value)
    """)
    def fix_date(r_obj):
        with localconverter(pandas2ri.converter):
            df = pandas2ri.rpy2py(r_obj)
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        return df[df['date'] >= pd.to_datetime("2015-01-31")].reset_index(drop=True)
    factors = (
        fix_date(ro['mkt']).merge(fix_date(ro['smb']), on='date')
            .merge(fix_date(ro['hml']), on='date')
            .merge(fix_date(ro['umd']), on='date')
            .merge(fix_date(ro['qmj']), on='date')
            .merge(fix_date(ro['bab']), on='date')
            .rename(columns={"date": "Date"})
            .sort_values("Date")
            .reset_index(drop=True)
    )
    print(f"✅ Loaded AQRR factors: {region} | Shape: {factors.shape}")
    return factors

def load_db_factors(factor_list, region="Global", table="aqr_factors", portfolio_filter=None):
    """Load factors from the specified DB table."""
    query = f"""
        SELECT date AS Date, factor_symbol AS Factor, value AS Value
        FROM {table}
        WHERE region = :region
        AND factor_symbol IN ({','.join([f"'{f}'" for f in factor_list])})
    """
    if portfolio_filter:
        query += f" AND portfolio IN ({','.join([f"'{p}'" for p in portfolio_filter])})"
    df = pd.read_sql(query, engine, params={'region': region}, parse_dates=['Date'])
    df = df.pivot(index="Date", columns="Factor", values="Value")
    rename_map = {'MKT': 'mkt', 'SMB': 'smb', 'HML-D': 'hml', 'UMD': 'umd', 'QMJ': 'qmj', 'BAB': 'bab', 'TSM': 'tsm'}
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})
    logging.info(f"Loaded factors from {table} | Shape: {df.shape}")
    return df

def load_century_factors(asset_class, factor_list, region="Global"):
    """Load Century paper factors from aqr_century_factors."""
    query = f"""
        SELECT date AS Date, portfolio AS Factor, value AS Value
        FROM aqr_century_factors
        WHERE asset_class = :asset_class
        AND region = :region
        AND portfolio IN ({','.join([f"'{f}'" for f in factor_list])})
    """
    df = pd.read_sql(query, engine, params={'asset_class': asset_class, 'region': region}, parse_dates=['Date'])
    df = df.pivot(index="Date", columns="Factor", values="Value")
    logging.info(f"Loaded Century factors for {asset_class} | Shape: {df.shape}")
    return df

def load_fixed_income_factors():
    """Placeholder for PortfolioVisualizer fixed income factors."""
    # Assuming TERM, DEF, CRED, LIQ - replace with actual DB query if available
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def merge_all_factors(factors1, factors2):
    factors1.index = pd.to_datetime(factors1.index)
    factors2.index = pd.to_datetime(factors2.index)
    merged = factors1.merge(factors2, how="left", left_index=True, right_index=True)
    return merged

def compare_and_select_factors(region, r_factors, db_factors):
    r_max = r_factors['Date'].max() if not r_factors.empty else pd.Timestamp("1900-01-01")
    db_max = db_factors.index.max() if not db_factors.empty else pd.Timestamp("1900-01-01")
    print(f"📅 R max date: {r_max.date()} | DB max date: {db_max.date()}")
    if r_max > db_max:
        print("✅ Using R-based AQRR factors (more recent)")
        return r_factors.set_index("Date")
    else:
        print("✅ Using DB-based AQRR factors")
        return db_factors

# Section 4: Rolling Regression Functions
def precheck_rolling_periods(returns, factors, rolling_periods):
    returns = returns.dropna()
    factors = factors.dropna()
    min_date = max(returns.index.min(), factors.index.min())
    max_date = min(returns.index.max(), factors.index.max())
    viable_periods = [w for w in rolling_periods if min_date + relativedelta(months=w) <= max_date]
    return viable_periods

def run_rolling_regression(fund, returns, factors, regression_type):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    ran_any = False
    
    viable_periods = precheck_rolling_periods(returns, factors, ROLLING_PERIODS)
    if not viable_periods:
        print(f"⚠️ No viable rolling periods for {fund}")
        return results
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window or y.isnull().any() or X.isnull().any().any():
                continue
            try:
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                diagnostics = {
                    'dw': durbin_watson(model.resid),
                    'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
                }
                is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
                reg_type = "Robust" if is_robust else "OLS"
                if is_robust:
                    model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})
                
                for factor in X.columns:
                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": model.params.get(factor, np.nan),
                        "P_Value": model.pvalues.get(factor, np.nan),
                        "T_Stat": model.tvalues.get(factor, np.nan),
                        "Standard_Error": model.bse.get(factor, np.nan),
                        "CI_Lower": model.conf_int().loc[factor][0] if factor in model.params else np.nan,
                        "CI_Upper": model.conf_int().loc[factor][1] if factor in model.params else np.nan,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1],
                        "Autocorrelation_Flag": diagnostics['dw'] < 1.5,
                        "Heteroskedasticity_Flag": diagnostics['bp_pval'] < 0.05,
                        "Regression_Type": regression_type
                    })
                ran_any = True
            except (np.linalg.LinAlgError, ValueError):
                continue
    print(f"{'✅' if ran_any else '⚠️'} {'Ran' if ran_any else 'Skipped'} {regression_type} for {fund}")
    return results

# Section 5: Main Processing Pipeline
def initialize_data_sources():
    print("🔧 Initializing R interface...")
    aqrr = initialize_r_environment()
    use_r = aqrr is not None
    if not use_r:
        logging.warning("R unavailable, using DB factors only")
        print("⚠️ R unavailable, using DB factors only.")
    return use_r

def process_region(region, fund_subset, use_r):
    from concurrent.futures import ProcessPoolExecutor
    
    funds = fund_subset["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        funds = random.sample(funds, min(SAMPLE_SIZE, len(funds)))
        logging.info(f"Sample dry run: Processing {len(funds)} funds from {region}")
        print(f"ℹ️ Sample dry run: Processing {len(funds)} funds")
    
    profiles = fund_subset.set_index("SymbolCUSIP")["FactorProfile"].to_dict()
    
    for i in range(0, len(funds), CHUNK_SIZE):
        chunk = funds[i:i + CHUNK_SIZE]
        fund_returns = load_fund_returns(chunk)
        records = []
        
        with ProcessPoolExecutor() as executor:
            futures = {}
            for fund in fund_returns.columns:
                profile = profiles.get(fund)
                region_map = {'USA': 'US', 'Global Ex USA': 'Intl', 'Global': 'Global'}
                db_region = region_map.get(region, 'Global')
                
                if profile in ["US Equity Large Cap Blend", "US Equity Large Cap Growth", "US Equity Large Cap Value", "US Equity Mid Cap", "US Equity Small Cap"]:
                    # US Equities
                    factors1 = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], 'US', portfolio_filter=['Global', 'Equities'])
                    factors1['mkt-rf'] = factors1['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors1[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], "US_AQRR")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Stock Selection", ["US Stock Selection Value", "US Stock Selection Momentum", "US Stock Selection Defensive", "US Stock Selection Multi-Style"], "US"), "US_Century_Stock")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Equity Indices", ["Equity Indices Value", "Equity Indices Momentum", "Equity Indices Defensive", "Equity Indices Carry", "Equity Indices Multi-Style"], "US"), "US_Century_Equity")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Macro", ["All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style"], "Global"), "US_Century_Macro")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("All Asset Classes", ["All Asset Class Value", "All Asset Class Momentum", "All Asset Class Defensive", "All Asset Class Carry", "All Asset Class Multi-Style"], "Global"), "US_Century_All")] = fund
                
                elif profile in ["Global Equity Large Cap", "Global Equity Mid/Small Cap", "Global Emerging Markets Equity"]:
                    # Global Equities
                    factors1 = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], 'Global', portfolio_filter=['Global', 'Equities'])
                    factors1['mkt-rf'] = factors1['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors1[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], "Global_AQRR")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Stock Selection", ["All Stock Selection Value", "All Stock Selection Momentum", "All Stock Selection Defensive", "All Stock Selection Multi-Style"], "Global"), "Global_Century_Stock")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Equity Indices", ["Equity Indices Value", "Equity Indices Momentum", "Equity Indices Defensive", "Equity Indices Carry", "Equity Indices Multi-Style"], "Global"), "Global_Century_Equity")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Macro", ["All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style"], "Global"), "Global_Century_Macro")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("All Asset Classes", ["All Asset Class Value", "All Asset Class Momentum", "All Asset Class Defensive", "All Asset Class Carry", "All Asset Class Multi-Style"], "Global"), "Global_Century_All")] = fund
                
                elif profile in ["Europe Equity Large Cap", "Asia Equity", "Japan Equity", "Emerging Markets Fixed Income"]:
                    # International Equities
                    factors1 = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], 'Global Ex USA', portfolio_filter=['Global', 'Equities'])
                    factors1['mkt-rf'] = factors1['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors1[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], "Intl_AQRR")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Stock Selection", ["International Stock Selection Value", "International Stock Selection Momentum", "International Stock Selection Defensive", "International Stock Selection Multi-Style"], "Intl"), "Intl_Century_Stock")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Equity Indices", ["Equity Indices Value", "Equity Indices Momentum", "Equity Indices Defensive", "Equity Indices Carry", "Equity Indices Multi-Style"], "Global"), "Intl_Century_Equity")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Macro", ["All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style"], "Global"), "Intl_Century_Macro")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("All Asset Classes", ["All Asset Class Value", "All Asset Class Momentum", "All Asset Class Defensive", "All Asset Class Carry", "All Asset Class Multi-Style"], "Global"), "Intl_Century_All")] = fund
                
                elif profile in ["US Fixed Income", "US Municipal Fixed Income", "Global Fixed Income"]:
                    # Fixed Income
                    fi_factors = load_fixed_income_factors()  # Placeholder for PortfolioVisualizer
                    tsm_fi = load_db_factors(['TSM'], 'Global', portfolio_filter=['Fixed Income'])
                    factors1 = pd.concat([fi_factors, tsm_fi], axis=1)
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors1, "FI_AQRR")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Fixed Income", ["Fixed Income Value", "Fixed Income Momentum", "Fixed Income Defensive", "Fixed Income Multi-Style"], "Global"), "FI_Century_FI")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Macro", ["All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style"], "Global"), "FI_Century_Macro")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("All Asset Classes", ["All Asset Class Value", "All Asset Class Momentum", "All Asset Class Defensive", "All Asset Class Carry", "All Asset Class Multi-Style"], "Global"), "FI_Century_All")] = fund
                
                elif profile in ["Flexible Allocation", "Aggressive Allocation", "Moderate Allocation", "Cautious Allocation"]:
                    # Asset Allocation
                    factors1 = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], 'Global', portfolio_filter=['Global', 'Equities', 'Fixed Income'])
                    factors1['mkt-rf'] = factors1['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                    fi_factors = load_fixed_income_factors()
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors1[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], "AA_AQRR")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], fi_factors, "AA_PV")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], pd.concat([factors1[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], fi_factors], axis=1), "AA_Combined")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Stock Selection", ["All Stock Selection Value", "All Stock Selection Momentum", "All Stock Selection Defensive", "All Stock Selection Multi-Style"], "Global"), "AA_Century_Stock")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Fixed Income", ["Fixed Income Value", "Fixed Income Momentum", "Fixed Income Defensive", "Fixed Income Multi-Style"], "Global"), "AA_Century_FI")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Equity Indices", ["Equity Indices Value", "Equity Indices Momentum", "Equity Indices Defensive", "Equity Indices Carry", "Equity Indices Multi-Style"], "Global"), "AA_Century_Equity")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Macro", ["All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style"], "Global"), "AA_Century_Macro")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("All Asset Classes", ["All Asset Class Value", "All Asset Class Momentum", "All Asset Class Defensive", "All Asset Class Carry", "All Asset Class Multi-Style"], "Global"), "AA_Century_All")] = fund
                
                elif profile in ["Commodities Broad Basket", "Commodities Specified"]:
                    # Commodities (simplified, no split yet)
                    com_factors = load_db_factors(['COM', 'TSM'], 'Global', portfolio_filter=['Excess return of equal-weight commodities portfolio', 'Commodities'])
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], com_factors, "COM_General")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Commodity", ["Commodity Value", "Commodity Momentum", "Commodity Carry", "Commodity Multi-style"], "Global"), "COM_Century_Com")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Macro", ["All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style"], "Global"), "COM_Century_Macro")] = fund
                
                elif profile in ["Options Trading", "Multialternative", "Market Neutral", "Long/Short Equity", "Alternative Miscellaneous"]:
                    # Alternative Assets
                    factors1 = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], 'Global', portfolio_filter=['Global', 'Equities', 'Fixed Income', 'Commodities', 'Currencies'])
                    factors1['mkt-rf'] = factors1['mkt'] - load_db_factors(['RF'], 'Global')['rf']
                    fi_factors = load_fixed_income_factors()
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors1[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], "Alt_AQRR")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], fi_factors, "Alt_PV")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], pd.concat([factors1[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], fi_factors], axis=1), "Alt_Combined")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors1[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], "Alt_Commodity_Split")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Stock Selection", ["All Stock Selection Value", "All Stock Selection Momentum", "All Stock Selection Defensive", "All Stock Selection Multi-Style"], "Global"), "Alt_Century_Stock")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Fixed Income", ["Fixed Income Value", "Fixed Income Momentum", "Fixed Income Defensive", "Fixed Income Multi-Style"], "Global"), "Alt_Century_FI")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Equity Indices", ["Equity Indices Value", "Equity Indices Momentum", "Equity Indices Defensive", "Equity Indices Carry", "Equity Indices Multi-Style"], "Global"), "Alt_Century_Equity")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Macro", ["All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style"], "Global"), "Alt_Century_Macro")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Commodity", ["Commodity Value", "Commodity Momentum", "Commodity Carry", "Commodity Multi-style"], "Global"), "Alt_Century_Com")] = fund
                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], load_century_factors("Currencies", ["Currency Value", "Currency Momentum", "Currency Carry", "Currency Multi-style"], "Global"), "Alt_Century_Currency")] = fund
                
            for future in tqdm(futures, desc=f"🔁 Region: {region}"):
                try:
                    records.extend(future.result())
                except concurrent.futures.TimeoutError as e:
                    print(f"⚠️ Error in {futures[future]}: Timeout - {e}")
                except Exception as e:
                    print(f"⚠️ Error in {futures[future]}: Unexpected error - {type(e).__name__}: {e}")
        
        if records:
            if not DRY_RUN:
                insert_batch(records)
            else:
                logging.info(f"Dry run: Skipped writing {len(records)} records for chunk {i//CHUNK_SIZE + 1}")
                print(f"ℹ️ Dry run: Would have written {len(records)} records")

def main():
    logging.basicConfig(level=logging.INFO)
    use_r = initialize_data_sources()
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset, use_r)

# Section 6: Database Output
def insert_batch(records):
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch_size = len(df.iloc[i:i + BATCH_INSERT_SIZE])
            logging.info(f"Dry run: Skipped batch insert of {batch_size} records")

if __name__ == "__main__":
    main()

In [None]:
# Version 1
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 100
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 15  # Optimized for 16-core i9-185H

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("Global", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

def initialize_r_environment():
    """Initialize R environment for AQRR factor loading, returning None if unavailable."""
    import os
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'
    
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
    
    try:
        version = ro.r('R.version.string')[0]
        print(f"🔁 R Version: {version}")
        logging.info(f"R initialized successfully: {version}")
    except (OSError, ValueError) as e:
        print(f"❌ Could not connect to R: {e}")
        logging.warning(f"Could not connect to R: {e}")
        return None
    
    try:
        aqrr = importr('aqrr')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
        logging.info("AQRR package loaded successfully")
    except ImportError as e:
        print(f"❌ Failed to load 'aqrr': {e}")
        logging.warning(f"Failed to load 'aqrr': {e}")
        return None
    
    pandas2ri.activate()
    return aqrr

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    return pd.read_sql(query, engine, parse_dates=["Date"]).pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_factors(region, category):
    region_map = {"USA": "US", "Global": "Global", "International": "Global Ex USA"}
    db_region = region_map.get(region, "Global")
    
    if "Equity" in category:
        factors = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], db_region, portfolio_filter=['Global', 'Equities'])
        factors['mkt-rf'] = factors['mkt'] - load_db_factors(['RF'], 'Global')['rf']
        return factors[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']]
    elif "Fixed Income" in category:
        fi_factors = load_fixed_income_factors()
        tsm_fi = load_db_factors(['TSM'], 'Global', portfolio_filter=['Fixed Income'])
        return pd.concat([fi_factors, tsm_fi], axis=1)
    elif "Commodities" in category:
        return load_db_factors(['COM', 'TSM'], 'Global', portfolio_filter=['Excess return of equal-weight commodities portfolio', 'Commodities'])
    elif "Allocation" in category or "Alternative" in category:
        factors = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], db_region, portfolio_filter=['Global', 'Equities', 'Fixed Income'])
        factors['mkt-rf'] = factors['mkt'] - load_db_factors(['RF'], 'Global')['rf']
        fi_factors = load_fixed_income_factors()
        return pd.concat([factors[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']], fi_factors], axis=1)
    return pd.DataFrame()  # Default empty for unhandled categories

def load_db_factors(factor_list, region="Global", table="aqr_factors", portfolio_filter=None):
    query = f"""
        SELECT date AS Date, factor_symbol AS Factor, value AS Value
        FROM {table}
        WHERE region = :region
        AND factor_symbol IN ({','.join([f"'{f}'" for f in factor_list])})
    """
    if portfolio_filter:
        query += f" AND portfolio IN ({','.join([f"'{p}'" for p in portfolio_filter])})"
    df = pd.read_sql(query, engine, params={'region': region}, parse_dates=['Date'])
    return df.pivot(index="Date", columns="Factor", values="Value").rename(columns={'MKT': 'mkt', 'SMB': 'smb', 'HML-D': 'hml', 'UMD': 'umd', 'QMJ': 'qmj', 'BAB': 'bab', 'TSM': 'tsm'})

def load_fixed_income_factors():
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    return pd.read_sql(query, engine, parse_dates=["Date"]).pivot(index="Date", columns="Factor_Name", values="ReturnValue")

# Section 4: Rolling Regression Functions
def run_rolling_regression_python(fund, returns, factors, regression_type):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window or y.isnull().any() or X.isnull().any().any():
                continue
            X_const = add_constant(X)
            model = OLS(y, X_const).fit()
            for factor in X.columns:
                results.append({
                    "SymbolCUSIP": fund,
                    "MonthEndDate": end_date,
                    "RollPeriod": f"{window}m",
                    "Factor_Name": factor,
                    "Coefficient": model.params.get(factor, np.nan),
                    "P_Value": model.pvalues.get(factor, np.nan),
                    "Regression_Type": regression_type
                })
    return results

def run_rolling_regression_r(fund, returns, factors, regression_type):
    with pandas2ri.localconverter(pandas2ri.converter):
        r_returns = pandas2ri.py2rpy(returns)
        r_factors = pandas2ri.py2rpy(factors)
    ro.r.assign("returns", r_returns)
    ro.r.assign("factors", r_factors)
    ro.r("""
    library(dplyr)
    results <- list()
    for (w in c(12, 24, 36, 48, 60)) {
        for (i in (w+1):nrow(returns)) {
            fit <- lm(returns[(i-w+1):i] ~ ., data=factors[(i-w+1):i,])
            coefs <- summary(fit)$coefficients
            results[[length(results)+1]] <- data.frame(
                RollPeriod = paste0(w, "m"),
                Factor_Name = rownames(coefs)[-1],
                Coefficient = coefs[-1, "Estimate"],
                P_Value = coefs[-1, "Pr(>|t|)"]
            )
        }
    }
    results <- bind_rows(results)
    """)
    results = pandas2ri.rpy2py(ro.r["results"])
    results["SymbolCUSIP"] = fund
    results["Regression_Type"] = regression_type
    return results.to_dict("records")

def test_regression_speed(fund, returns, factors):
    print(f"Testing regression speed for {fund}...")
    start_time = time.time()
    python_results = run_rolling_regression_python(fund, returns, factors, "Python_OLS")
    python_time = time.time() - start_time
    print(f"Python time: {python_time:.2f} seconds | Results: {len(python_results)}")
    
    start_time = time.time()
    r_results = run_rolling_regression_r(fund, returns, factors, "R_OLS")
    r_time = time.time() - start_time
    print(f"R time: {r_time:.2f} seconds | Results: {len(r_results)}")
    
    return "R" if r_time < python_time else "Python"

# Section 5: Main Processing Pipeline
def process_region(region, fund_subset, use_r, regression_func):
    categories = fund_subset["FactorProfile"].unique()
    records = []
    
    for category in categories:
        cat_subset = fund_subset[fund_subset["FactorProfile"] == category]
        funds = cat_subset["SymbolCUSIP"].tolist()
        if SAMPLE_DRY_RUN:
            funds = random.sample(funds, min(SAMPLE_SIZE, len(funds)))
            print(f"ℹ️ Sample dry run: Processing {len(funds)} funds in {region}/{category}")
        
        # Load fund returns and factors once per region/category
        fund_returns = load_fund_returns(funds)
        factors = load_factors(region, category)
        
        with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(regression_func, fund, fund_returns[fund], factors, f"{region}_{category}_OLS"): fund for fund in fund_returns.columns}
            for future in tqdm(futures, desc=f"🔁 {region}/{category}"):
                try:
                    records.extend(future.result())
                except Exception as e:
                    print(f"⚠️ Error in {futures[future]}: {e}")
    
    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            print(f"ℹ️ Dry run: Would have written {len(records)} records for {region}")

def main():
    logging.basicConfig(level=logging.INFO)
    use_r = initialize_r_environment()
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")
    
    # Test R vs Python on a sample
    sample_funds = fund_meta["SymbolCUSIP"].sample(1).tolist()
    sample_returns = load_fund_returns(sample_funds)
    sample_factors = load_factors("USA", "US Equity Large Cap Blend")  # Default test case
    best_method = test_regression_speed(sample_funds[0], sample_returns[sample_funds[0]], sample_factors)
    regression_func = run_rolling_regression_r if best_method == "R" and use_r is not None else run_rolling_regression_python
    print(f"Using {best_method} for regressions")
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset, use_r, regression_func)

# Section 6: Database Output
def insert_batch(records):
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        print(f"ℹ️ Dry run: Skipped writing {len(df)} records")

if __name__ == "__main__":
    main()

In [None]:
#version 2

In [6]:
# Version 3.3
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
import time
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

RETURN_METRIC = "1 Month Return"
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 100
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000
MAX_WORKERS = 15  # Optimized for 16-core i9-185H

# Section 2: Helper Functions
def category_to_region(category):
    mapping = {
        "US Equity Large Cap Blend": ("USA", "US Equity Large Cap Blend"),
        "US Equity Large Cap Growth": ("USA", "US Equity Large Cap Growth"),
        "US Equity Large Cap Value": ("USA", "US Equity Large Cap Value"),
        "US Equity Mid Cap": ("USA", "US Equity Mid Cap"),
        "US Equity Small Cap": ("USA", "US Equity Small Cap"),
        "Global Equity Large Cap": ("Global", "Global Equity Large Cap"),
        "Global Equity Mid/Small Cap": ("Global", "Global Equity Mid/Small Cap"),
        "Global Emerging Markets Equity": ("Global", "Global Emerging Markets Equity"),
        "Europe Equity Large Cap": ("International", "Europe Equity Large Cap"),
        "Asia Equity": ("International", "Asia Equity"),
        "Japan Equity": ("International", "Japan Equity"),
        "Emerging Markets Fixed Income": ("International", "Emerging Markets Fixed Income"),
        "US Fixed Income": ("USA", "US Fixed Income"),
        "US Municipal Fixed Income": ("USA", "US Municipal Fixed Income"),
        "Global Fixed Income": ("Global", "Global Fixed Income"),
        "Flexible Allocation": ("Global", "Flexible Allocation"),
        "Aggressive Allocation": ("Global", "Aggressive Allocation"),
        "Moderate Allocation": ("Global", "Moderate Allocation"),
        "Cautious Allocation": ("Global", "Cautious Allocation"),
        "Commodities Broad Basket": ("Global", "Commodities Broad Basket"),
        "Commodities Specified": ("Global", "Commodities Specified"),
        "Options Trading": ("Global", "Options Trading"),
        "Multialternative": ("Global", "Multialternative"),
        "Market Neutral": ("Global", "Market Neutral"),
        "Long/Short Equity": ("Global", "Long/Short Equity"),
        "Alternative Miscellaneous": ("Global", "Alternative Miscellaneous")
    }
    return mapping.get(category, ("Unknown", "Unknown"))

def initialize_r_environment():
    """Initialize R environment for AQRR factor loading, returning None if unavailable."""
    import os
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'
    
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
    
    try:
        version = ro.r('R.version.string')[0]
        print(f"🔁 R Version: {version}")
        logging.info(f"R initialized successfully: {version}")
    except (OSError, ValueError) as e:
        print(f"❌ Could not connect to R: {e}")
        logging.warning(f"Could not connect to R: {e}")
        return None
    
    try:
        aqrr = importr('aqrr')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
        logging.info("AQRR package loaded successfully")
    except ImportError as e:
        print(f"❌ Failed to load 'aqrr': {e}")
        logging.warning(f"Failed to load 'aqrr': {e}")
        return None
    
    pandas2ri.activate()
    return aqrr

# Section 3: Data Loading Functions
def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    return pd.read_sql(query, engine, parse_dates=["Date"]).pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_aqrr_factors(region="USA", aqrr=None):
    if aqrr is None:
        return pd.DataFrame()
    region_map = {"USA": "US", "Global": "Global", "International": "Global Ex USA"}
    r_region = region_map.get(region, "Global")
    print(f"📦 Loading AQRR factor data for region: {r_region}")
    ro.r('library(dplyr)')
    ro.r(f"""
    mkt <- aqr_mkt_monthly() %>% filter(name == '{r_region}') %>%
      mutate(date = as.character(date)) %>% select(date, mkt = value)
    smb <- aqr_smb_monthly() %>% filter(name == '{r_region}') %>%
      mutate(date = as.character(date)) %>% select(date, smb = value)
    hml <- aqr_hml_ff_monthly() %>% filter(name == '{r_region}') %>%
      mutate(date = as.character(date)) %>% select(date, hml = value)
    umd <- aqr_umd_monthly() %>% filter(name == '{r_region}') %>%
      mutate(date = as.character(date)) %>% select(date, umd = value)
    qmj <- aqr_qmj_monthly() %>% filter(name == '{r_region}') %>%
      mutate(date = as.character(date)) %>% select(date, qmj = value)
    bab <- aqr_bab_monthly() %>% filter(name == '{r_region}') %>%
      mutate(date = as.character(date)) %>% select(date, bab = value)
    """)
    def fix_date(r_obj):
        with localconverter(pandas2ri.converter):
            df = pandas2ri.rpy2py(r_obj)
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        return df[df['date'] >= pd.to_datetime("2015-01-31")].reset_index(drop=True)
    factors = (
        fix_date(ro.r['mkt']).merge(fix_date(ro.r['smb']), on='date')
            .merge(fix_date(ro.r['hml']), on='date')
            .merge(fix_date(ro.r['umd']), on='date')
            .merge(fix_date(ro.r['qmj']), on='date')
            .merge(fix_date(ro.r['bab']), on='date')
            .rename(columns={"date": "Date"})
            .sort_values("Date")
            .reset_index(drop=True)
    )
    print(f"✅ Loaded AQRR factors: {r_region} | Shape: {factors.shape}")
    return factors

def load_db_factors(factor_list, region="Global", table="aqr_factors", portfolio_filter=None):
    # Use ? placeholder for PyODBC compatibility
    factor_in_clause = ','.join([f"'{f}'" for f in factor_list])
    portfolio_in_clause = ','.join([f"'{p}'" for p in portfolio_filter]) if portfolio_filter else ''
    query = f"""
        SELECT date AS Date, factor_symbol AS Factor, value AS Value
        FROM {table}
        WHERE region = ?
        AND factor_symbol IN ({factor_in_clause})
    """
    if portfolio_filter:
        query += f" AND portfolio IN ({portfolio_in_clause})"
    print(f"Executing query: {query} with param: {region}")  # Debug
    df = pd.read_sql(query, engine, params=(region,), parse_dates=['Date'])
    return df.pivot(index="Date", columns="Factor", values="Value").rename(columns={'MKT': 'mkt', 'SMB': 'smb', 'HML-D': 'hml', 'UMD': 'umd', 'QMJ': 'qmj', 'BAB': 'bab', 'TSM': 'tsm'})

def load_fixed_income_factors():
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    return pd.read_sql(query, engine, parse_dates=["Date"]).pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def compare_and_select_factors(region, r_factors, db_factors):
    r_max = r_factors['Date'].max() if not r_factors.empty else pd.Timestamp("1900-01-01")
    db_max = db_factors.index.max() if not db_factors.empty else pd.Timestamp("1900-01-01")
    print(f"📅 R max date: {r_max.date()} | DB max date: {db_max.date()}")
    if r_max > db_max:
        print(f"✅ Using R-based AQRR factors for {region} (more recent)")
        return r_factors.set_index("Date")
    else:
        print(f"✅ Using DB-based AQRR factors for {region}")
        return db_factors

def load_factors(region, category, aqrr=None):
    region_map = {"USA": "US", "Global": "Global", "International": "Global Ex USA"}
    db_region = region_map.get(region, "Global")
    
    if "Equity" in category:
        db_factors = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], db_region, portfolio_filter=['Global', 'Equities'])
        db_factors['mkt-rf'] = db_factors['mkt'] - load_db_factors(['RF'], 'Global')['rf']
        r_factors = load_aqrr_factors(region, aqrr) if aqrr else pd.DataFrame()
        factors = compare_and_select_factors(region, r_factors, db_factors[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']])
        return factors
    elif "Fixed Income" in category:
        fi_factors = load_fixed_income_factors()
        tsm_fi = load_db_factors(['TSM'], 'Global', portfolio_filter=['Fixed Income'])
        return pd.concat([fi_factors, tsm_fi], axis=1)
    elif "Commodities" in category:
        return load_db_factors(['COM', 'TSM'], 'Global', portfolio_filter=['Excess return of equal-weight commodities portfolio', 'Commodities'])
    elif "Allocation" in category or "Alternative" in category:
        db_factors = load_db_factors(['MKT', 'SMB', 'HML-D', 'QMJ', 'UMD', 'BAB', 'TSM'], db_region, portfolio_filter=['Global', 'Equities', 'Fixed Income'])
        db_factors['mkt-rf'] = db_factors['mkt'] - load_db_factors(['RF'], 'Global')['rf']
        r_factors = load_aqrr_factors(region, aqrr) if aqrr else pd.DataFrame()
        factors = compare_and_select_factors(region, r_factors, db_factors[['mkt-rf', 'smb', 'hml', 'qmj', 'umd', 'bab', 'tsm']])
        fi_factors = load_fixed_income_factors()
        return pd.concat([factors, fi_factors], axis=1)
    return pd.DataFrame()

# Section 4: Rolling Regression Functions
def run_rolling_regression_python(fund, returns, factors, regression_type):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    viable_periods = [w for w in ROLLING_PERIODS if (returns.index.max() - relativedelta(months=w)) >= returns.index.min()]
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window or y.isnull().any() or X.isnull().any().any():
                continue
            X_const = add_constant(X)
            model = OLS(y, X_const).fit()
            for factor in X.columns:
                results.append({
                    "SymbolCUSIP": fund,
                    "MonthEndDate": end_date,
                    "RollPeriod": f"{window}m",
                    "Factor_Name": factor,
                    "Coefficient": model.params.get(factor, np.nan),
                    "P_Value": model.pvalues.get(factor, np.nan),
                    "Regression_Type": regression_type
                })
    return results

def run_rolling_regression_r(fund, returns, factors, regression_type):
    with pandas2ri.localconverter(pandas2ri.converter):
        r_returns = pandas2ri.py2rpy(returns)
        r_factors = pandas2ri.py2rpy(factors)
    ro.r.assign("returns", r_returns)
    ro.r.assign("factors", r_factors)
    ro.r("""
    library(dplyr)
    results <- list()
    for (w in c(12, 24, 36, 48, 60)) {
        for (i in (w+1):nrow(returns)) {
            fit <- lm(returns[(i-w+1):i] ~ ., data=factors[(i-w+1):i,])
            coefs <- summary(fit)$coefficients
            results[[length(results)+1]] <- data.frame(
                RollPeriod = paste0(w, "m"),
                Factor_Name = rownames(coefs)[-1],
                Coefficient = coefs[-1, "Estimate"],
                P_Value = coefs[-1, "Pr(>|t|)"]
            )
        }
    }
    results <- bind_rows(results)
    """)
    results = pandas2ri.rpy2py(ro.r["results"])
    results["SymbolCUSIP"] = fund
    results["Regression_Type"] = regression_type
    return results.to_dict("records")

def test_regression_speed(fund, returns, factors):
    print(f"Testing regression speed for {fund}...")
    start_time = time.time()
    python_results = run_rolling_regression_python(fund, returns, factors, "Python_OLS")
    python_time = time.time() - start_time
    print(f"Python time: {python_time:.2f} seconds | Results: {len(python_results)}")
    
    start_time = time.time()
    try:
        r_results = run_rolling_regression_r(fund, returns, factors, "R_OLS")
        r_time = time.time() - start_time
        print(f"R time: {r_time:.2f} seconds | Results: {len(r_results)}")
        return "R" if r_time < python_time else "Python"
    except Exception as e:
        print(f"❌ R regression failed: {e}, falling back to Python")
        return "Python"

# Section 5: Main Processing Pipeline
def process_region(region, fund_subset, aqrr, regression_func):
    categories = fund_subset["FactorProfile"].unique()
    records = []
    
    for category in categories:
        cat_subset = fund_subset[fund_subset["FactorProfile"] == category]
        funds = cat_subset["SymbolCUSIP"].tolist()
        if SAMPLE_DRY_RUN:
            funds = random.sample(funds, min(SAMPLE_SIZE, len(funds)))
            print(f"ℹ️ Sample dry run: Processing {len(funds)} funds in {region}/{category}")
        
        fund_returns = load_fund_returns(funds)
        factors = load_factors(region, category, aqrr)
        
        with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(regression_func, fund, fund_returns[fund], factors, f"{region}_{category}_OLS"): fund for fund in fund_returns.columns}
            for future in tqdm(futures, desc=f"🔁 {region}/{category}"):
                try:
                    records.extend(future.result())
                except Exception as e:
                    print(f"⚠️ Error in {futures[future]}: {e}")
    
    if records:
        if not DRY_RUN:
            insert_batch(records)
        else:
            print(f"ℹ️ Dry run: Would have written {len(records)} records for {region}")

def main():
    logging.basicConfig(level=logging.INFO)
    aqrr = initialize_r_environment()
    use_r = aqrr is not None
    if not use_r:
        print("⚠️ R initialization failed, using DB factors only")
    
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")
    
    # Test R vs Python on a sample
    sample_funds = fund_meta["SymbolCUSIP"].sample(1).tolist()
    sample_returns = load_fund_returns(sample_funds)
    sample_factors = load_factors("USA", "US Equity Large Cap Blend", aqrr)
    best_method = test_regression_speed(sample_funds[0], sample_returns[sample_funds[0]], sample_factors)
    regression_func = run_rolling_regression_r if best_method == "R" and use_r else run_rolling_regression_python
    print(f"Using {best_method} for regressions")
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset, aqrr, regression_func)

# Section 6: Database Output
def insert_batch(records):
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        print(f"ℹ️ Dry run: Skipped writing {len(df)} records")

if __name__ == "__main__":
    main()

INFO:root:R initialized successfully: R version 4.4.3 (2025-02-28 ucrt)


🔁 R Version: R version 4.4.3 (2025-02-28 ucrt)


INFO:root:AQRR package loaded successfully


✅ Loaded R package: aqrr
📦 AQRR Functions Available: [':=', 'aqr_bab_daily', 'aqr_bab_monthly', 'aqr_commodities_long_run', 'aqr_credit_risk_premium', 'aqr_factor_premia_monthly'] ...
🧠 Total mapped funds: 5584
📍 Regions detected: ['Global' 'Unknown' 'USA' 'International']

Executing query: 
        SELECT date AS Date, factor_symbol AS Factor, value AS Value
        FROM aqr_factors
        WHERE region = ?
        AND factor_symbol IN ('MKT','SMB','HML-D','QMJ','UMD','BAB','TSM')
     AND portfolio IN ('Global','Equities') with param: US


KeyError: 'mkt'

In [3]:
aqrr = initialize_r_environment()


🔁 R Version: R version 4.4.3 (2025-02-28 ucrt)
✅ Loaded R package: aqrr
📦 AQRR Functions Available: [':=', 'aqr_bab_daily', 'aqr_bab_monthly', 'aqr_commodities_long_run', 'aqr_credit_risk_premium', 'aqr_factor_premia_monthly'] ...
