In [None]:
# The script below populates the fixed income factor data

In [None]:
#Section 1: Setup
import pandas as pd
from sqlalchemy import create_engine, text

connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

#Section 2: Define fixed income factors
factors = {
    'TERM':         ('VGLT', '^BBUTB13MTR'),
    'TERM_Int':     ('IEF', '^BBUTB13MTR'),
    'TERM_Long':    ('VGLT', 'IEF'),
    'CREDIT':       ('^BBUSCOTR', 'VGLT'),
    'CREDIT_HY':    ('^BBUSCOHYTR', 'IEF')
}
symbols = set(s for pair in factors.values() for s in pair)

#Section 3: Load all relevant data from both tables
symbol_str = "', '".join(symbols)
query = f"""
    SELECT Symbol, Date, ReturnValue FROM (
        SELECT SymbolCUSIP AS Symbol, Date, ReturnValue 
        FROM dbo.Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ('{symbol_str}')
        UNION ALL
        SELECT Benchmark_Symbol AS Symbol, Date, ReturnValue 
        FROM dbo.Benchmark_Returns_Timeseries
        WHERE Benchmark_Symbol IN ('{symbol_str}')
    ) AS combined
"""
all_returns = pd.read_sql(query, engine, parse_dates=['Date'])
returns_wide = all_returns.pivot_table(index='Date', columns='Symbol', values='ReturnValue')

#Section 4: Load existing factor-date combos to avoid duplication
existing_query = """
    SELECT Factor_Name, Date FROM dbo.Fixed_Income_Factor_Returns
"""
existing_df = pd.read_sql(existing_query, engine, parse_dates=['Date'])
existing_pairs = set(zip(existing_df['Factor_Name'], existing_df['Date']))

#Section 5: Calculate factors
factor_dfs = []
for factor_name, (s1, s2) in factors.items():
    if s1 in returns_wide.columns and s2 in returns_wide.columns:
        df = returns_wide[[s1, s2]].dropna().copy()
        df['ReturnValue'] = df[s1] - df[s2]
        df['Factor_Name'] = factor_name
        df['Source_1'] = s1
        df['Source_2'] = s2
        df['Notes'] = f"{s1} minus {s2}"
        df['Date'] = df.index
        df = df[['Factor_Name', 'Date', 'ReturnValue', 'Source_1', 'Source_2', 'Notes']]
        # Filter out rows already in the DB
        df = df[~df.set_index(['Factor_Name', 'Date']).index.isin(existing_pairs)]
        factor_dfs.append(df.reset_index(drop=True))

#Section 6: Combine and insert into SQL
if factor_dfs:
    final_df = pd.concat(factor_dfs, ignore_index=True)
    final_df.to_sql("Fixed_Income_Factor_Returns", engine, if_exists="append", index=False)
    print(f"✅ Inserted {len(final_df)} new rows into Fixed_Income_Factor_Returns.")
else:
    print("✅ No new factor data to insert — database is up to date.")


In [None]:
# R integration Set up

In [None]:
# STEP 1: Load R Home To not get fatal errors

In [1]:
import os
os.environ['R_HOME'] = r'C:\Program Files\R\R-4.4.3'  # Use raw string (r) to handle backslashes
import rpy2.robjects as ro
print(ro.r('R.version.string'))


[1] "R version 4.4.3 (2025-02-28 ucrt)"



In [None]:
# STEP 2: Disable JIT

In [2]:
import os
os.environ['R_JIT_ENABLED'] = '0'  # Disable JIT
import rpy2.robjects as ro
print(ro.r('R.version.string'))

[1] "R version 4.4.3 (2025-02-28 ucrt)"



In [None]:
# STEP 3: Test connection to R

In [3]:
import rpy2.robjects as ro
print(ro.r('R.version.string'))  # Test basic R connection

[1] "R version 4.4.3 (2025-02-28 ucrt)"



In [None]:
# STEP 4: Test connection to AQRR

In [4]:
import os
os.environ['R_HOME'] = r'C:\Program Files\R\R-4.4.3'
import rpy2.robjects as ro
from rpy2.robjects.packages import importr

aqrr = importr('aqrr')
funcs = ro.r('ls("package:aqrr")')
print(funcs)

 [1] ":="                        "aqr_bab_daily"            
 [3] "aqr_bab_monthly"           "aqr_commodities_long_run" 
 [5] "aqr_credit_risk_premium"   "aqr_factor_premia_monthly"
 [7] "aqr_hml_devil_daily"       "aqr_hml_devil_monthly"    
 [9] "aqr_hml_ff_daily"          "aqr_hml_ff_monthly"       
[11] "aqr_mkt_daily"             "aqr_mkt_monthly"          
[13] "aqr_momentum_monthly"      "aqr_qmj_daily"            
[15] "aqr_qmj_monthly"           "aqr_smb_daily"            
[17] "aqr_smb_monthly"           "aqr_umd_daily"            
[19] "aqr_umd_monthly"           "as_label"                 
[21] "as_name"                   "enquo"                    
[23] "enquos"                   



In [None]:
# Code below was to test the issues around date's from R vs Python & test it 
# There was coruption in how it came over, this code below helped fixed it
# Then we verified this fix via an R tupple, then below this incorporated it into a funciton

In [None]:
import os
from datetime import datetime, timedelta
import pandas as pd

# ✅ R + PATH setup
os.environ['R_HOME'] = r"C:\\Program Files\\R\\R-4.4.3"
os.environ['PATH'] += r";C:\\Program Files\\R\\R-4.4.3\\bin\\x64"

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
import rpy2.rinterface_lib.callbacks
import logging

# ✅ Suppress R[write to console]:
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# ✅ Activate R ↔ pandas conversion
pandas2ri.activate()

# ✅ Load R libraries
aqrr = importr('aqrr')
dplyr = importr('dplyr')

# ✅ Define cutoff date
cutoff = datetime.today() - timedelta(days=3 * 365)
print("📅 Using cutoff date:", cutoff.date())

# ✅ Fix date (handles character or datetime)
def fix_date_column(df):
    if pd.api.types.is_string_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'])
    return df[df['date'] >= cutoff].reset_index(drop=True)

# ✅ Step 1: Pull AQRR data in R, force date to character
ro.r("""
mkt <- aqr_mkt_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, mkt = value)

smb <- aqr_smb_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, smb = value)

hml <- aqr_hml_ff_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, hml = value)

umd <- aqr_umd_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, umd = value)

qmj <- aqr_qmj_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, qmj = value)

bab <- aqr_bab_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, bab = value)
""")

# ✅ Step 2: Convert each to pandas + fix dates
mkt_df = fix_date_column(pandas2ri.rpy2py(ro.r['mkt']))
smb_df = fix_date_column(pandas2ri.rpy2py(ro.r['smb']))
hml_df = fix_date_column(pandas2ri.rpy2py(ro.r['hml']))
umd_df = fix_date_column(pandas2ri.rpy2py(ro.r['umd']))
qmj_df = fix_date_column(pandas2ri.rpy2py(ro.r['qmj']))
bab_df = fix_date_column(pandas2ri.rpy2py(ro.r['bab']))

# ✅ Step 3: Merge all on date
factors = mkt_df \
    .merge(smb_df, on='date', how='inner') \
    .merge(hml_df, on='date', how='inner') \
    .merge(umd_df, on='date', how='inner') \
    .merge(qmj_df, on='date', how='inner') \
    .merge(bab_df, on='date', how='inner')

# ✅ Step 4: Final preview
print("\n✅ Final AQRR Factors (USA, Last 3 Years):")
print(factors.head())
print(factors.tail())
print(f"\n✅ Shape: {factors.shape}")


In [None]:
# Code below was to test the issues around date's from R vs Python & test it as a funciton
# There was coruption in how it came over, this code below helped fixed it
#    # Helper to clean and filter
#    def fix_date(df):
#        df['date'] = pd.to_datetime(df['date'])
#        return df[df['date'] >= cutoff].reset_index(drop=True)

In [None]:
def load_aqrr_factors(region="USA", years=3):
    import os
    from datetime import datetime, timedelta
    import pandas as pd
    from rpy2.robjects import r as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    import rpy2.rinterface_lib.callbacks
    import logging

    # R paths
    os.environ['R_HOME'] = r"C:\\Program Files\\R\\R-4.4.3"
    os.environ['PATH'] += r";C:\\Program Files\\R\\R-4.4.3\\bin\\x64"

    # Suppress R logging
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Load R libs
    aqrr = importr('aqrr')
    dplyr = importr('dplyr')
    pandas2ri.activate()

    # Calculate cutoff
    cutoff = datetime.today() - timedelta(days=years * 365)
    print(f"📅 Cutoff date: {cutoff.date()} for region: {region}")

    # R: Load and filter factors
    ro(f"""
    mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, mkt = value)
    smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, smb = value)
    hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, hml = value)
    umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, umd = value)
    qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, qmj = value)
    bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, bab = value)
    """)

    # Helper to clean and filter
    def fix_date(df):
        df['date'] = pd.to_datetime(df['date'])
        return df[df['date'] >= cutoff].reset_index(drop=True)

    # Pull into Python
    mkt_df = fix_date(pandas2ri.rpy2py(ro['mkt']))
    smb_df = fix_date(pandas2ri.rpy2py(ro['smb']))
    hml_df = fix_date(pandas2ri.rpy2py(ro['hml']))
    umd_df = fix_date(pandas2ri.rpy2py(ro['umd']))
    qmj_df = fix_date(pandas2ri.rpy2py(ro['qmj']))
    bab_df = fix_date(pandas2ri.rpy2py(ro['bab']))

    # Merge
    factors = mkt_df \
        .merge(smb_df, on='date') \
        .merge(hml_df, on='date') \
        .merge(umd_df, on='date') \
        .merge(qmj_df, on='date') \
        .merge(bab_df, on='date')

    print(f"✅ Loaded AQRR factors: {region} | Shape: {factors.shape}")
    return factors


In [None]:
# R initializer code

In [1]:
def initialize_r_environment():
    import os

    # 🔒 Set environment variables BEFORE importing anything from rpy2
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'

    # Now import RPy2 safely
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri

    # Suppress console spam
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Print test R version
    try:
        print("🔁 R Version:", ro.r('R.version.string')[0])
    except Exception as e:
        print("❌ Could not connect to R:", e)
        return None

    # Load required packages
    try:
        aqrr = importr('aqrr')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
    except Exception as e:
        print("❌ Failed to load 'aqrr':", e)
        return None

    # Enable pandas ↔ R dataframe conversion
    pandas2ri.activate()

    return aqrr


In [2]:
aqrr = initialize_r_environment()

🔁 R Version: R version 4.4.3 (2025-02-28 ucrt)
✅ Loaded R package: aqrr
📦 AQRR Functions Available: [':=', 'aqr_bab_daily', 'aqr_bab_monthly', 'aqr_commodities_long_run', 'aqr_credit_risk_premium', 'aqr_factor_premia_monthly'] ...


In [None]:
# First attempt at full code for regressions

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

#Section1: Configuration
import logging
from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
rpy2_logger.setLevel(logging.ERROR)

RETURN_METRIC = "1 Month Return"

connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
CHUNK_SIZE = 200  # Number of funds per chunk
BATCH_INSERT_SIZE = 2000

# Updated region mapping
category_to_region = {
    # --- US Equity ---
    "US Equity Large Cap Blend": ("USA", "equity"),
    "US Equity Large Cap Growth": ("USA", "equity"),
    "US Equity Large Cap Value": ("USA", "equity"),
    "US Equity Mid Cap": ("USA", "equity"),
    "US Equity Small Cap": ("USA", "equity"),

    # --- US Fixed Income ---
    "US Fixed Income": ("USA", "fixed_income"),
    "US Municipal Fixed Income": ("USA", "fixed_income"),

    # --- US Sector Equity ---
    "Communications Sector Equity": ("USA", "equity"),
    "Consumer Goods & Services Sector Equity": ("USA", "equity"),
    "Energy Sector Equity": ("USA", "equity"),
    "Financials Sector Equity": ("USA", "equity"),
    "Healthcare Sector Equity": ("USA", "equity"),
    "Industrials Sector Equity": ("USA", "equity"),
    "Infrastructure Sector Equity": ("USA", "equity"),
    "Other Sector Equity": ("USA", "equity"),
    "Precious Metals Sector Equity": ("USA", "equity"),
    "Technology Sector Equity": ("USA", "equity"),
    "Utilities Sector Equity": ("USA", "equity"),
    "Real Estate Sector Equity": ("USA", "equity"),
    "Natural Resources Sector Equity": ("USA", "equity"),

    # --- Alternatives / Hybrids ---
    "Options Trading": ("USA", "equity"),
    "Multialternative": ("USA", "both"),
    "Market Neutral": ("USA", "both"),
    "Long/Short Equity": ("USA", "both"),
    "Alternative Miscellaneous": ("USA", "both"),
    "Allocation Miscellaneous": ("USA", "both"),
    "Fixed Income Miscellaneous": ("USA", "fixed_income"),
    "Equity Miscellaneous": ("USA", "equity"),
    "Convertibles": ("USA", "both"),

    # --- Global & Allocation ---
    "Flexible Allocation": ("Global", "both"),
    "Aggressive Allocation": ("Global", "both"),
    "Moderate Allocation": ("Global", "both"),
    "Cautious Allocation": ("Global", "both"),
    "Global Fixed Income": ("Global", "fixed_income"),
    "Global Equity Large Cap": ("Global", "equity"),
    "Global Equity Mid/Small Cap": ("Global", "equity"),
    "Global Emerging Markets Equity": ("Global", "equity"),

    # --- Global Ex-USA ---
    "Europe Equity Large Cap": ("Global Ex USA", "equity"),
    "Europe Equity Mid/Small Cap": ("Global Ex USA", "equity"),
    "Asia Equity": ("Global Ex USA", "equity"),
    "Asia ex-Japan Equity": ("Global Ex USA", "equity"),
    "India Equity": ("Global Ex USA", "equity"),
    "Latin America Equity": ("Global Ex USA", "equity"),
    "Japan Equity": ("Global Ex USA", "equity"),
    "Korea Equity": ("Global Ex USA", "equity"),
    "Thailand Equity": ("Global Ex USA", "equity"),
    "Mexico Equity": ("Global Ex USA", "equity"),
    "Australia & New Zealand Equity": ("Global Ex USA", "equity"),
    "Greater China Equity": ("Global Ex USA", "equity"),
    "UK Equity Large Cap": ("Global Ex USA", "equity"),
    "Emerging Markets Fixed Income": ("Global Ex USA", "fixed_income"),
    "Canadian Equity Large Cap": ("Global Ex USA", "equity"),

    # --- Excluded or Unused ---
    "Commodities Broad Basket": (None, None),
    "Commodities Specified": (None, None),
    "Target Date": (None, None),
    "Target Date 2021-2045": (None, None),
    "Target Date 2046+": (None, None),
    "Trading Tools": (None, None),
    "Currency": (None, None),
    "Uncategorized": (None, None),
}

# Section 1.5 Load the R environment
def initialize_r_environment():
    import os

    # 🔒 Set environment variables BEFORE importing anything from rpy2
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'

    # Now import RPy2 safely
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri

    # Suppress console spam
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Print test R version
    try:
        print("🔁 R Version:", ro.r('R.version.string')[0])
    except Exception as e:
        print("❌ Could not connect to R:", e)
        return None

    # Load required packages
    try:
        aqrr = importr('aqrr')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
    except Exception as e:
        print("❌ Failed to load 'aqrr':", e)
        return None

    # Enable pandas ↔ R dataframe conversion
    pandas2ri.activate()

    return aqrr

#Section2: Load Fund Metadata and Region Mapping
def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)

    def determine_profile(cat):
        if cat in category_to_region:
            region, use_fi = category_to_region[cat]
            if region is None:
                return pd.Series([None, None, None])
            if use_fi:
                return pd.Series([region, "fixed_income" if "Fixed Income" in cat else "both"])
            else:
                return pd.Series([region, "equity"])
        return pd.Series([None, None])

    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])


#Section3: Load Return Time Series
def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

#Section4: Load AQRR Factor Data
def load_aqrr_factors(region="USA"):
    import os
    import pandas as pd
    from rpy2.robjects import r as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter
    import rpy2.rinterface_lib.callbacks
    import logging

    # R environment setup
    os.environ['R_HOME'] = r"C:\\Program Files\\R\\R-4.4.3"
    os.environ['PATH'] += r";C:\\Program Files\\R\\R-4.4.3\\bin\\x64"

    # Suppress R logging
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Import R packages
    aqrr = importr('aqrr')
    dplyr = importr('dplyr')

    print(f"📦 Loading AQRR factor data for region: {region}")

    # Pull factor data from R
    ro(f"""
    mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, mkt = value)
    smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, smb = value)
    hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, hml = value)
    umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, umd = value)
    qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, qmj = value)
    bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, bab = value)
    """)

    # Helper function to convert and filter each factor
    def fix_date(r_obj):
        with localconverter(pandas2ri.converter):
            df = pandas2ri.rpy2py(r_obj)
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        return df[df['date'] >= pd.to_datetime("2015-01-31")].reset_index(drop=True)

    # Pull and process each
    mkt_df = fix_date(ro['mkt'])
    smb_df = fix_date(ro['smb'])
    hml_df = fix_date(ro['hml'])
    umd_df = fix_date(ro['umd'])
    qmj_df = fix_date(ro['qmj'])
    bab_df = fix_date(ro['bab'])

    # Merge and clean
    factors = (
        mkt_df.merge(smb_df, on='date')
              .merge(hml_df, on='date')
              .merge(umd_df, on='date')
              .merge(qmj_df, on='date')
              .merge(bab_df, on='date')
              .rename(columns={"date": "Date"})
              .sort_values("Date")
              .reset_index(drop=True)
    )

    print(f"✅ Loaded AQRR factors: {region} | Shape: {factors.shape} | Date range: {factors['Date'].min().date()} → {factors['Date'].max().date()}")
    return factors


#Section5: Load Fixed Income Factor Data
def load_fixed_income_factors():
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

#Section6: Merge Factors
def merge_all_factors(equity_df, fixed_income_df):
    equity_df["Date"] = pd.to_datetime(equity_df["Date"])
    fixed_income_df.index = pd.to_datetime(fixed_income_df.index)
    merged = equity_df.merge(fixed_income_df, how="left", left_on="Date", right_index=True)

    # Set date as index and drop any remaining 'Date' column just in case
    merged = merged.set_index("Date")
    return merged

#Section7: Perform Rolling Regression
def run_rolling_regression(fund, returns, factors):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    for window in ROLLING_PERIODS:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]

            # Align on index
            X, y = X.align(y, join="inner", axis=0)


            try:
                y = y.astype(float)
                X = factors.loc[start_date:end_date].copy()

                if "Date" in X.columns:
                    X = X.drop(columns=["Date"])

                X = X.astype(float)

                if y.isnull().any() or X.isnull().any().any():
                    continue

                X_const = add_constant(X)
                model = OLS(y, X_const).fit()

                diagnostics = {
                    'dw': durbin_watson(model.resid),
                    'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
                }

                is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
                reg_type = "Robust" if is_robust else "OLS"

                if is_robust:
                    model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

                for factor in X.columns:
                    coeff = model.params.get(factor, np.nan)
                    pval = model.pvalues.get(factor, np.nan)
                    tstat = model.tvalues.get(factor, np.nan)
                    stderr = model.bse.get(factor, np.nan)
                    ci_low, ci_upp = model.conf_int().loc[factor] if factor in model.params else (np.nan, np.nan)

                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": coeff,
                        "P_Value": pval,
                        "T_Stat": tstat,
                        "Standard_Error": stderr,
                        "CI_Lower": ci_low,
                        "CI_Upper": ci_upp,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1],
                        "Autocorrelation_Flag": diagnostics['dw'] < 1.5,
                        "Heteroskedasticity_Flag": diagnostics['bp_pval'] < 0.05,
                        "Regression_Type": reg_type
                    })

            except Exception as e:
                print(f"⚠️  {fund} | {start_date.date()} to {end_date.date()} | {type(e).__name__}: {e}")
                continue

    return results

#Section8: Main Batch Driver
def main():
    print("🔧 Initializing R interface...")
    aqrr = initialize_r_environment()

    if aqrr is None:
        print("❌ R not initialized. Exiting.")
        return
    print("✅ R environment is ready.\n")

    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()

    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")

    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        equity_factors = load_aqrr_factors(region)
        fixed_income_factors = load_fixed_income_factors()

        funds = fund_subset["SymbolCUSIP"].tolist()
        profiles = fund_subset.set_index("SymbolCUSIP")["FactorProfile"].to_dict()

        for i in range(0, len(funds), CHUNK_SIZE):
            chunk = funds[i:i + CHUNK_SIZE]
            fund_returns = load_fund_returns(chunk)
            records = []

            with ThreadPoolExecutor() as executor:
                futures = {}
                for fund in fund_returns.columns:
                    profile = profiles.get(fund)

                    # Select correct factor set per fund
                    if profile == "equity":
                        factors = equity_factors.copy()
                    elif profile == "fixed_income":
                        factors = fixed_income_factors.copy()
                    elif profile == "both":
                        factors = merge_all_factors(equity_factors, fixed_income_factors)
                    else:
                        continue  # skip fund if no profile

                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors)] = fund

                for future in tqdm(futures, desc=f"🔁 Region: {region}"):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        print(f"⚠️ Error in {futures[future]}: {type(e).__name__}: {e}")

            if not DRY_RUN and records:
                insert_batch(records)


#Section9: Insert to Database
def insert_batch(records):
    df = pd.DataFrame(records)
    for i in range(0, len(df), BATCH_INSERT_SIZE):
        df.iloc[i:i+BATCH_INSERT_SIZE].to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)

if __name__ == "__main__":
    main()


📦 Loading AQRR factor data for region: Global
✅ Loaded AQRR factors: Global | Shape: (101, 7) | Date range: 2015-01-31 → 2023-05-31
⚠️  ABIYX | 2015-02-28 to 2016-01-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ACIOX | 2015-02-28 to 2016-01-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ACIOX | 2015-03-29 to 2016-02-29 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ACIOX | 2015-04-30 to 2016-03-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ACIOX | 2015-05-30 to 2016-04-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ACIOX | 2015-06-30 to 2016-05-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ACIOX | 2015-07-30 to 2016-06-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ACIOX | 2015-08-31 to 2016-07-31 | TypeError: '<' not supported between instances of 'i

  0%|                                                                                          | 0/200 [00:00<?, ?it/s]

⚠️  ARDBX | 2015-12-30 to 2016-11-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AGOX | 2023-04-30 to 2024-03-31 | ValueError: The indices for endog and exog are not aligned


⚠️  APDJX | 2016-02-29 to 2017-01-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  BEXIX | 2015-02-28 to 2016-01-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'

⚠️  AIMOX | 2021-12-30 to 2022-11-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AIMOX | 2022-01-31 to 2022-12-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  AVXC | 2017-05-30 to 2018-04-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  BEMIX | 2015-03-29 to 2016-02-29 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  AWSYX | 2016-05-30 to 2017-04-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AVSE | 2016-

  0%|▍                                                                              | 1/200 [01:41<5:35:23, 101.12s/it]


⚠️  BEMIX | 2021-03-28 to 2023-02-28 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  AVSE | 2017-04-30 to 2021-03-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  BGVIX | 2017-09-30 to 2021-08-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'

⚠️  BAFLX | 2022-10-30 to 2024-09-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  AWSYX | 2016-05-30 to 2019-04-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'

⚠️  ACIOX | 2019-09-30 to 2024-08-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  APDKX | 2017-04-30 to 2020-03-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'

⚠️  AVDE | 2016-04-30 to 2019-03-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  BEXIX | 2022-08-31 to 2024-07-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️ 

  1%|▊                                                                               | 2/200 [01:54<2:42:45, 49.32s/it]

⚠️  BICPX | 2022-02-28 to 2024-01-31 | ValueError: The indices for endog and exog are not aligned
⚠️  BGVIX | 2020-11-30 to 2024-10-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AVDE | 2018-10-30 to 2021-09-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  APDGX | 2019-03-28 to 2022-02-28 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AVEM | 2015-06-30 to 2018-05-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AVEM | 2015-07-30 to 2018-06-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  BEMIX | 2023-02-28 to 2025-01-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  AVSE | 2020-08-31 to 2024-07-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  BEXIX | 2017-02-28 to 2020-01-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  BAFLX | 2016-10-3

  3%|██▍                                                                             | 6/200 [02:38<1:25:25, 26.42s/it]

⚠️  BEMIX | 2015-09-30 to 2019-08-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  AWSYX | 2018-05-30 to 2022-04-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  BEXIX | 2017-10-30 to 2021-09-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  BEXIX | 2017-11-30 to 2021-10-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'



⚠️  AVDE | 2019-11-30 to 2023-10-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  BAFLX | 2016-03-29 to 2020-02-29 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ARDBX | 2017-01-31 to 2020-12-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ARDBX | 2017-02-28 to 2021-01-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  APDJX | 2018-07-30 to 2022-06-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️




⚠️  AVDE | 2019-12-30 to 2023-11-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  CBSE | 2022-06-30 to 2023-05-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AZEMX | 2016-09-30 to 2020-08-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  ARDBX | 2017-03-28 to 2021-02-28 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'


⚠️  AVEM | 2015-04-30 to 2019-03-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'⚠️  APDJX | 2018-09-30 to 2022-08-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AWSYX | 2018-08-31 to 2022-07-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AWSYX | 2018-09-30 to 2022-08-31 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️  AWSYX | 2018-10-30 to 2022-09-30 | TypeError: '<' not supported between instances of 'int' and 'Timestamp'
⚠️ 

In [None]:
# version 2

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

#Section1: Configuration
import logging
from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
rpy2_logger.setLevel(logging.ERROR)

RETURN_METRIC = "1 Month Return"

connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
CHUNK_SIZE = 200  # Number of funds per chunk
BATCH_INSERT_SIZE = 2000

# Updated region mapping
category_to_region = {
    # --- US Equity ---
    "US Equity Large Cap Blend": ("USA", "equity"),
    "US Equity Large Cap Growth": ("USA", "equity"),
    "US Equity Large Cap Value": ("USA", "equity"),
    "US Equity Mid Cap": ("USA", "equity"),
    "US Equity Small Cap": ("USA", "equity"),

    # --- US Fixed Income ---
    "US Fixed Income": ("USA", "fixed_income"),
    "US Municipal Fixed Income": ("USA", "fixed_income"),

    # --- US Sector Equity ---
    "Communications Sector Equity": ("USA", "equity"),
    "Consumer Goods & Services Sector Equity": ("USA", "equity"),
    "Energy Sector Equity": ("USA", "equity"),
    "Financials Sector Equity": ("USA", "equity"),
    "Healthcare Sector Equity": ("USA", "equity"),
    "Industrials Sector Equity": ("USA", "equity"),
    "Infrastructure Sector Equity": ("USA", "equity"),
    "Other Sector Equity": ("USA", "equity"),
    "Precious Metals Sector Equity": ("USA", "equity"),
    "Technology Sector Equity": ("USA", "equity"),
    "Utilities Sector Equity": ("USA", "equity"),
    "Real Estate Sector Equity": ("USA", "equity"),
    "Natural Resources Sector Equity": ("USA", "equity"),

    # --- Alternatives / Hybrids ---
    "Options Trading": ("USA", "equity"),
    "Multialternative": ("USA", "both"),
    "Market Neutral": ("USA", "both"),
    "Long/Short Equity": ("USA", "both"),
    "Alternative Miscellaneous": ("USA", "both"),
    "Allocation Miscellaneous": ("USA", "both"),
    "Fixed Income Miscellaneous": ("USA", "fixed_income"),
    "Equity Miscellaneous": ("USA", "equity"),
    "Convertibles": ("USA", "both"),

    # --- Global & Allocation ---
    "Flexible Allocation": ("Global", "both"),
    "Aggressive Allocation": ("Global", "both"),
    "Moderate Allocation": ("Global", "both"),
    "Cautious Allocation": ("Global", "both"),
    "Global Fixed Income": ("Global", "fixed_income"),
    "Global Equity Large Cap": ("Global", "equity"),
    "Global Equity Mid/Small Cap": ("Global", "equity"),
    "Global Emerging Markets Equity": ("Global", "equity"),

    # --- Global Ex-USA ---
    "Europe Equity Large Cap": ("Global Ex USA", "equity"),
    "Europe Equity Mid/Small Cap": ("Global Ex USA", "equity"),
    "Asia Equity": ("Global Ex USA", "equity"),
    "Asia ex-Japan Equity": ("Global Ex USA", "equity"),
    "India Equity": ("Global Ex USA", "equity"),
    "Latin America Equity": ("Global Ex USA", "equity"),
    "Japan Equity": ("Global Ex USA", "equity"),
    "Korea Equity": ("Global Ex USA", "equity"),
    "Thailand Equity": ("Global Ex USA", "equity"),
    "Mexico Equity": ("Global Ex USA", "equity"),
    "Australia & New Zealand Equity": ("Global Ex USA", "equity"),
    "Greater China Equity": ("Global Ex USA", "equity"),
    "UK Equity Large Cap": ("Global Ex USA", "equity"),
    "Emerging Markets Fixed Income": ("Global Ex USA", "fixed_income"),
    "Canadian Equity Large Cap": ("Global Ex USA", "equity"),

    # --- Excluded or Unused ---
    "Commodities Broad Basket": (None, None),
    "Commodities Specified": (None, None),
    "Target Date": (None, None),
    "Target Date 2021-2045": (None, None),
    "Target Date 2046+": (None, None),
    "Trading Tools": (None, None),
    "Currency": (None, None),
    "Uncategorized": (None, None),
}

# Section 1.5 Load the R environment
def initialize_r_environment():
    import os

    # 🔒 Set environment variables BEFORE importing anything from rpy2
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'

    # Now import RPy2 safely
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri

    # Suppress console spam
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Print test R version
    try:
        print("🔁 R Version:", ro.r('R.version.string')[0])
    except Exception as e:
        print("❌ Could not connect to R:", e)
        return None

    # Load required packages
    try:
        aqrr = importr('aqrr')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
    except Exception as e:
        print("❌ Failed to load 'aqrr':", e)
        return None

    # Enable pandas ↔ R dataframe conversion
    pandas2ri.activate()

    return aqrr

#Section2: Load Fund Metadata and Region Mapping
def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)

    def determine_profile(cat):
        if cat in category_to_region:
            region, use_fi = category_to_region[cat]
            if region is None:
                return pd.Series([None, None, None])
            if use_fi:
                return pd.Series([region, "fixed_income" if "Fixed Income" in cat else "both"])
            else:
                return pd.Series([region, "equity"])
        return pd.Series([None, None])

    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])


#Section3: Load Return Time Series
def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

#Section4: Load AQRR Factor Data
def load_aqrr_factors(region="USA"):
    import os
    import pandas as pd
    from rpy2.robjects import r as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter
    import rpy2.rinterface_lib.callbacks
    import logging

    # R environment setup
    os.environ['R_HOME'] = r"C:\\Program Files\\R\\R-4.4.3"
    os.environ['PATH'] += r";C:\\Program Files\\R\\R-4.4.3\\bin\\x64"

    # Suppress R logging
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Import R packages
    aqrr = importr('aqrr')
    dplyr = importr('dplyr')

    print(f"📦 Loading AQRR factor data for region: {region}")

    # Pull factor data from R
    ro(f"""
    mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, mkt = value)
    smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, smb = value)
    hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, hml = value)
    umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, umd = value)
    qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, qmj = value)
    bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, bab = value)
    """)

    # Helper function to convert and filter each factor
    def fix_date(r_obj):
        with localconverter(pandas2ri.converter):
            df = pandas2ri.rpy2py(r_obj)
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        return df[df['date'] >= pd.to_datetime("2015-01-31")].reset_index(drop=True)

    # Pull and process each
    mkt_df = fix_date(ro['mkt'])
    smb_df = fix_date(ro['smb'])
    hml_df = fix_date(ro['hml'])
    umd_df = fix_date(ro['umd'])
    qmj_df = fix_date(ro['qmj'])
    bab_df = fix_date(ro['bab'])

    # Merge and clean
    factors = (
        mkt_df.merge(smb_df, on='date')
              .merge(hml_df, on='date')
              .merge(umd_df, on='date')
              .merge(qmj_df, on='date')
              .merge(bab_df, on='date')
              .rename(columns={"date": "Date"})
              .sort_values("Date")
              .reset_index(drop=True)
    )

    print(f"✅ Loaded AQRR factors: {region} | Shape: {factors.shape} | Date range: {factors['Date'].min().date()} → {factors['Date'].max().date()}")
    return factors


#Section5: Load Fixed Income Factor Data
def load_fixed_income_factors():
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

#Section6: Merge Factors
def merge_all_factors(equity_df, fixed_income_df):
    equity_df["Date"] = pd.to_datetime(equity_df["Date"])
    fixed_income_df.index = pd.to_datetime(fixed_income_df.index)
    merged = equity_df.merge(fixed_income_df, how="left", left_on="Date", right_index=True)

    # Set date as index and drop any remaining 'Date' column just in case
    merged = merged.set_index("Date")
    return merged

#Section7: Perform Rolling Regression
def run_rolling_regression(fund, returns, factors):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    ran_any = False

    for window in ROLLING_PERIODS:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]

            # Align X and y to the same dates
            X, y = X.align(y, join="inner", axis=0)

            try:
                if y.isnull().any() or X.isnull().any().any():
                    continue

                X_const = add_constant(X)
                model = OLS(y, X_const).fit()

                diagnostics = {
                    'dw': durbin_watson(model.resid),
                    'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
                }

                is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
                reg_type = "Robust" if is_robust else "OLS"

                if is_robust:
                    model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

                for factor in X.columns:
                    coeff = model.params.get(factor, np.nan)
                    pval = model.pvalues.get(factor, np.nan)
                    tstat = model.tvalues.get(factor, np.nan)
                    stderr = model.bse.get(factor, np.nan)
                    ci_low, ci_upp = model.conf_int().loc[factor] if factor in model.params else (np.nan, np.nan)

                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": coeff,
                        "P_Value": pval,
                        "T_Stat": tstat,
                        "Standard_Error": stderr,
                        "CI_Lower": ci_low,
                        "CI_Upper": ci_upp,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1],
                        "Autocorrelation_Flag": diagnostics['dw'] < 1.5,
                        "Heteroskedasticity_Flag": diagnostics['bp_pval'] < 0.05,
                        "Regression_Type": reg_type
                    })
                    ran_any = True

            except Exception as e:
                print(f"⚠️  {fund} | {start_date.date()} to {end_date.date()} | {type(e).__name__}: {e}")
                continue

    print(f"{'✅' if ran_any else '⚠️'} {'Ran' if ran_any else 'Skipped'} regressions for {fund}")
    return results


#Section8: Main Batch Driver
def main():
    print("🔧 Initializing R interface...")
    aqrr = initialize_r_environment()

    if aqrr is None:
        print("❌ R not initialized. Exiting.")
        return
    print("✅ R environment is ready.\n")

    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()

    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")

    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        equity_factors = load_aqrr_factors(region)
        fixed_income_factors = load_fixed_income_factors()

        funds = fund_subset["SymbolCUSIP"].tolist()
        profiles = fund_subset.set_index("SymbolCUSIP")["FactorProfile"].to_dict()

        for i in range(0, len(funds), CHUNK_SIZE):
            chunk = funds[i:i + CHUNK_SIZE]
            fund_returns = load_fund_returns(chunk)
            records = []

            with ThreadPoolExecutor() as executor:
                futures = {}
                for fund in fund_returns.columns:
                    profile = profiles.get(fund)

                    # Select correct factor set per fund
                    if profile == "equity":
                        factors = equity_factors.copy()
                    elif profile == "fixed_income":
                        factors = fixed_income_factors.copy()
                    elif profile == "both":
                        factors = merge_all_factors(equity_factors, fixed_income_factors)
                    else:
                        continue  # skip fund if no profile

                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors)] = fund

                for future in tqdm(futures, desc=f"🔁 Region: {region}"):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        print(f"⚠️ Error in {futures[future]}: {type(e).__name__}: {e}")

            if not DRY_RUN and records:
                insert_batch(records)


#Section9: Insert to Database
def insert_batch(records):
    df = pd.DataFrame(records)
    for i in range(0, len(df), BATCH_INSERT_SIZE):
        df.iloc[i:i+BATCH_INSERT_SIZE].to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)

if __name__ == "__main__":
    main()


🔧 Initializing R interface...
🔁 R Version: R version 4.4.3 (2025-02-28 ucrt)
✅ Loaded R package: aqrr
📦 AQRR Functions Available: [':=', 'aqr_bab_daily', 'aqr_bab_monthly', 'aqr_commodities_long_run', 'aqr_credit_risk_premium', 'aqr_factor_premia_monthly'] ...
✅ R environment is ready.

🧠 Total mapped funds: 5500
📍 Regions detected: ['Global' 'USA' 'Global Ex USA']

📦 Loading AQRR factor data for region: Global
✅ Loaded AQRR factors: Global | Shape: (101, 7) | Date range: 2015-01-31 → 2023-05-31
⚠️  ABIYX | 2015-02-28 to 2016-01-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  ABIYX | 2015-03-29 to 2016-02-29 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  ABIYX | 2015-04-30 to 2016-03-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  ABIYX | 2015-05-30 to 2016-04-30 | ValueError: Pandas data cast to numpy dtype of object. Che

🔁 Region: Global:   0%|                                                                       | 0/200 [00:00<?, ?it/s]

⚠️  APDJX | 2021-06-30 to 2023-05-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).⚠️  AVXC | 2020-03-28 to 2022-02-28 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  ACIOX | 2017-11-30 to 2020-10-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  AVDE | 2020-09-30 to 2022-08-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  AVXC | 2020-04-30 to 2022-03-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  AVXC | 2020-05-30 to 2022-04-30 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  BBEM | 2018-01-31 to 2019-12-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  AVSD | 2020-12-30 to 2022-11-30 | ValueError: Pandas data cast t

🔁 Region: Global:   0%|▎                                                              | 1/200 [00:15<50:50, 15.33s/it]

⚠️  BAFLX | 2017-09-30 to 2022-08-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).⚠️  ARDBX | 2016-08-31 to 2021-07-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  BEMIX | 2020-05-30 to 2024-04-30 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  APDKX | 2016-08-31 to 2021-07-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  APDJX | 2015-12-30 to 2020-11-30 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  APDGX | 2018-05-30 to 2023-04-30 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  AZEMX | 2015-02-28 to 2020-01-31 | ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
⚠️  ARDBX | 2016-09-30 to 2021-08-31 | ValueError: Pandas data 

In [None]:
# Version 3: Reads AQR link to excel file and AQRR table to find most current data set

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

#Section1: Configuration
import logging
from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
rpy2_logger.setLevel(logging.ERROR)

connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# Configuarble data
RETURN_METRIC = "1 Month Return"
AQRR_EXCEL_PATH = "C:/path/to/your/Betting Against Beta Equity Factors Monthly.xlsx" # Excel fallback AQRR file path
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
CHUNK_SIZE = 200  # Number of funds per chunk
BATCH_INSERT_SIZE = 2000

# Updated region mapping
category_to_region = {
    # --- US Equity ---
    "US Equity Large Cap Blend": ("USA", "equity"),
    "US Equity Large Cap Growth": ("USA", "equity"),
    "US Equity Large Cap Value": ("USA", "equity"),
    "US Equity Mid Cap": ("USA", "equity"),
    "US Equity Small Cap": ("USA", "equity"),

    # --- US Fixed Income ---
    "US Fixed Income": ("USA", "fixed_income"),
    "US Municipal Fixed Income": ("USA", "fixed_income"),

    # --- US Sector Equity ---
    "Communications Sector Equity": ("USA", "equity"),
    "Consumer Goods & Services Sector Equity": ("USA", "equity"),
    "Energy Sector Equity": ("USA", "equity"),
    "Financials Sector Equity": ("USA", "equity"),
    "Healthcare Sector Equity": ("USA", "equity"),
    "Industrials Sector Equity": ("USA", "equity"),
    "Infrastructure Sector Equity": ("USA", "equity"),
    "Other Sector Equity": ("USA", "equity"),
    "Precious Metals Sector Equity": ("USA", "equity"),
    "Technology Sector Equity": ("USA", "equity"),
    "Utilities Sector Equity": ("USA", "equity"),
    "Real Estate Sector Equity": ("USA", "equity"),
    "Natural Resources Sector Equity": ("USA", "equity"),

    # --- Alternatives / Hybrids ---
    "Options Trading": ("USA", "equity"),
    "Multialternative": ("USA", "both"),
    "Market Neutral": ("USA", "both"),
    "Long/Short Equity": ("USA", "both"),
    "Alternative Miscellaneous": ("USA", "both"),
    "Allocation Miscellaneous": ("USA", "both"),
    "Fixed Income Miscellaneous": ("USA", "fixed_income"),
    "Equity Miscellaneous": ("USA", "equity"),
    "Convertibles": ("USA", "both"),

    # --- Global & Allocation ---
    "Flexible Allocation": ("Global", "both"),
    "Aggressive Allocation": ("Global", "both"),
    "Moderate Allocation": ("Global", "both"),
    "Cautious Allocation": ("Global", "both"),
    "Global Fixed Income": ("Global", "fixed_income"),
    "Global Equity Large Cap": ("Global", "equity"),
    "Global Equity Mid/Small Cap": ("Global", "equity"),
    "Global Emerging Markets Equity": ("Global", "equity"),

    # --- Global Ex-USA ---
    "Europe Equity Large Cap": ("Global Ex USA", "equity"),
    "Europe Equity Mid/Small Cap": ("Global Ex USA", "equity"),
    "Asia Equity": ("Global Ex USA", "equity"),
    "Asia ex-Japan Equity": ("Global Ex USA", "equity"),
    "India Equity": ("Global Ex USA", "equity"),
    "Latin America Equity": ("Global Ex USA", "equity"),
    "Japan Equity": ("Global Ex USA", "equity"),
    "Korea Equity": ("Global Ex USA", "equity"),
    "Thailand Equity": ("Global Ex USA", "equity"),
    "Mexico Equity": ("Global Ex USA", "equity"),
    "Australia & New Zealand Equity": ("Global Ex USA", "equity"),
    "Greater China Equity": ("Global Ex USA", "equity"),
    "UK Equity Large Cap": ("Global Ex USA", "equity"),
    "Emerging Markets Fixed Income": ("Global Ex USA", "fixed_income"),
    "Canadian Equity Large Cap": ("Global Ex USA", "equity"),

    # --- Excluded or Unused ---
    "Commodities Broad Basket": (None, None),
    "Commodities Specified": (None, None),
    "Target Date": (None, None),
    "Target Date 2021-2045": (None, None),
    "Target Date 2046+": (None, None),
    "Trading Tools": (None, None),
    "Currency": (None, None),
    "Uncategorized": (None, None),
}

# Section 1.5 Load the R environment
def initialize_r_environment():
    import os

    # 🔒 Set environment variables BEFORE importing anything from rpy2
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'

    # Now import RPy2 safely
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri

    # Suppress console spam
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Print test R version
    try:
        print("🔁 R Version:", ro.r('R.version.string')[0])
    except Exception as e:
        print("❌ Could not connect to R:", e)
        return None

    # Load required packages
    try:
        aqrr = importr('aqrr')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
    except Exception as e:
        print("❌ Failed to load 'aqrr':", e)
        return None

    # Enable pandas ↔ R dataframe conversion
    pandas2ri.activate()

    return aqrr

#Section2: Load Fund Metadata and Region Mapping
def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])

#Section3: Load Return Time Series
def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

#Section4: Load AQRR Factor Data
def load_aqrr_factors(region="USA"):
    from rpy2.robjects import r as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter

    print(f"📦 Loading AQRR factor data for region: {region}")

    # Pull factor data from R
    ro(f"""
    mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, mkt = value)
    smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, smb = value)
    hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, hml = value)
    umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, umd = value)
    qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, qmj = value)
    bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, bab = value)
    """)

    # Helper function to convert and filter each factor
    def fix_date(r_obj):
        with localconverter(pandas2ri.converter):
            df = pandas2ri.rpy2py(r_obj)
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        return df[df['date'] >= pd.to_datetime("2015-01-31")].reset_index(drop=True)

    # Pull and process each
    mkt_df = fix_date(ro['mkt'])
    smb_df = fix_date(ro['smb'])
    hml_df = fix_date(ro['hml'])
    umd_df = fix_date(ro['umd'])
    qmj_df = fix_date(ro['qmj'])
    bab_df = fix_date(ro['bab'])

    # Merge and clean
    factors = (
        mkt_df.merge(smb_df, on='date')
              .merge(hml_df, on='date')
              .merge(umd_df, on='date')
              .merge(qmj_df, on='date')
              .merge(bab_df, on='date')
              .rename(columns={"date": "Date"})
              .sort_values("Date")
              .reset_index(drop=True)
    )

    print(f"✅ Loaded AQRR factors: {region} | Shape: {factors.shape} | Date range: {factors['Date'].min().date()} → {factors['Date'].max().date()}")
    return factors

#Section4.1: Load AQRR Factor Data from Excel
def load_aqrr_excel_factors(filepath):
    factor_map = {
        "BAB Factor": "bab",
        "QMJ Factor": "qmj",
        "UMD Factor": "umd",
        "HML FF Factor": "hml",
        "SMB Factor": "smb",
        "MKT FF Factor": "mkt"
    }

    all_dfs = {}
    xls = pd.ExcelFile(filepath)
    for sheet, factor in factor_map.items():
        if sheet in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet, skiprows=range(0, 6))
            df.columns = ['Date', factor]
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
            df = df.dropna(subset=['Date']).sort_values('Date')
            all_dfs[factor] = df

    merged = all_dfs['mkt']
    for f in ['smb', 'hml', 'umd', 'qmj', 'bab']:
        merged = pd.merge(merged, all_dfs[f], on='Date', how='inner')

    merged = merged.reset_index(drop=True)
    return merged
#Section4.2: Compare R vs Excel Factor Data and Use Latest
def compare_and_select_factors(region, r_factors, excel_factors):
    r_max = r_factors['Date'].max() if not r_factors.empty else pd.Timestamp("1900-01-01")
    e_max = excel_factors['Date'].max() if not excel_factors.empty else pd.Timestamp("1900-01-01")

    print(f"📅 R max date: {r_max.date()} | Excel max date: {e_max.date()}")
    if e_max > r_max:
        print("✅ Using Excel-based AQRR factors (more recent)")
        return excel_factors
    else:
        print("✅ Using R-based AQRR factors")
        return r_factors

#Section5: Load Fixed Income Factor Data
def load_fixed_income_factors():
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

#Section6: Merge Factors
def merge_all_factors(equity_df, fixed_income_df):
    equity_df["Date"] = pd.to_datetime(equity_df["Date"])
    fixed_income_df.index = pd.to_datetime(fixed_income_df.index)
    merged = equity_df.merge(fixed_income_df, how="left", left_on="Date", right_index=True)

    # Set date as index and drop any remaining 'Date' column just in case
    merged = merged.set_index("Date")
    return merged

#Section7: Perform Rolling Regression
def run_rolling_regression(fund, returns, factors):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    ran_any = False

    for window in ROLLING_PERIODS:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]

            # Align X and y to the same dates
            X, y = X.align(y, join="inner", axis=0)

            try:
                if y.isnull().any() or X.isnull().any().any():
                    continue

                X_const = add_constant(X)
                model = OLS(y, X_const).fit()

                diagnostics = {
                    'dw': durbin_watson(model.resid),
                    'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
                }

                is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
                reg_type = "Robust" if is_robust else "OLS"

                if is_robust:
                    model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

                for factor in X.columns:
                    coeff = model.params.get(factor, np.nan)
                    pval = model.pvalues.get(factor, np.nan)
                    tstat = model.tvalues.get(factor, np.nan)
                    stderr = model.bse.get(factor, np.nan)
                    ci_low, ci_upp = model.conf_int().loc[factor] if factor in model.params else (np.nan, np.nan)

                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": coeff,
                        "P_Value": pval,
                        "T_Stat": tstat,
                        "Standard_Error": stderr,
                        "CI_Lower": ci_low,
                        "CI_Upper": ci_upp,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1],
                        "Autocorrelation_Flag": diagnostics['dw'] < 1.5,
                        "Heteroskedasticity_Flag": diagnostics['bp_pval'] < 0.05,
                        "Regression_Type": reg_type
                    })
                    ran_any = True

            except Exception as e:
                print(f"⚠️  {fund} | {start_date.date()} to {end_date.date()} | {type(e).__name__}: {e}")
                continue

    print(f"{'✅' if ran_any else '⚠️'} {'Ran' if ran_any else 'Skipped'} regressions for {fund}")
    return results


#Section8: Main Batch Driver
def main():
    print("🔧 Initializing R interface...")
    aqrr = initialize_r_environment()

    if aqrr is None:
        print("❌ R not initialized. Exiting.")
        return
    print("✅ R environment is ready.\n")

    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()

    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")

    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        r_factors = load_aqrr_factors(region)
        excel_factors = load_aqrr_excel_factors(AQRR_EXCEL_PATH)
        equity_factors = compare_and_select_factors(region, r_factors, excel_factors)

        fixed_income_factors = load_fixed_income_factors()

        funds = fund_subset["SymbolCUSIP"].tolist()
        profiles = fund_subset.set_index("SymbolCUSIP")["FactorProfile"].to_dict()

        for i in range(0, len(funds), CHUNK_SIZE):
            chunk = funds[i:i + CHUNK_SIZE]
            fund_returns = load_fund_returns(chunk)
            records = []

            with ThreadPoolExecutor() as executor:
                futures = {}
                for fund in fund_returns.columns:
                    profile = profiles.get(fund)

                    # Select correct factor set per fund
                    if profile == "equity":
                        factors = equity_factors.copy()
                    elif profile == "fixed_income":
                        factors = fixed_income_factors.copy()
                    elif profile == "both":
                        factors = merge_all_factors(equity_factors, fixed_income_factors)
                    else:
                        continue  # skip fund if no profile

                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors)] = fund

                for future in tqdm(futures, desc=f"🔁 Region: {region}"):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        print(f"⚠️ Error in {futures[future]}: {type(e).__name__}: {e}")

            if not DRY_RUN and records:
                insert_batch(records)


#Section9: Insert to Database
def insert_batch(records):
    df = pd.DataFrame(records)
    for i in range(0, len(df), BATCH_INSERT_SIZE):
        df.iloc[i:i+BATCH_INSERT_SIZE].to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)

if __name__ == "__main__":
    main()


In [None]:
# Version 4, Grok redo given Chat gpt lies

In [2]:
import pandas as pd
import numpy as np
import os
import random
import logging
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

# Section 1: Configuration and Setup
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# Configurable data
RETURN_METRIC = "1 Month Return"
AQRR_EXCEL_URL = "https://www.aqr.com/-/media/AQR/Documents/Insights/Data-Sets/Betting-Against-Beta-Equity-Factors-Monthly.xlsx"
AQRR_EXCEL_PATH = os.path.join(os.getcwd(), "Betting_Against_Beta_Equity_Factors_Monthly.xlsx")
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
SAMPLE_DRY_RUN = True
SAMPLE_SIZE = 100
CHUNK_SIZE = 5600
BATCH_INSERT_SIZE = 10000

# Fund category to region mapping
category_to_region = {
    "US Equity Large Cap Blend": ("USA", "equity"),
    "US Equity Large Cap Growth": ("USA", "equity"),
    "US Equity Large Cap Value": ("USA", "equity"),
    "US Equity Mid Cap": ("USA", "equity"),
    "US Equity Small Cap": ("USA", "equity"),
    "US Fixed Income": ("USA", "fixed_income"),
    "US Municipal Fixed Income": ("USA", "fixed_income"),
    "Communications Sector Equity": ("USA", "equity"),
    "Consumer Goods & Services Sector Equity": ("USA", "equity"),
    "Energy Sector Equity": ("USA", "equity"),
    "Financials Sector Equity": ("USA", "equity"),
    "Healthcare Sector Equity": ("USA", "equity"),
    "Industrials Sector Equity": ("USA", "equity"),
    "Infrastructure Sector Equity": ("USA", "equity"),
    "Other Sector Equity": ("USA", "equity"),
    "Precious Metals Sector Equity": ("USA", "equity"),
    "Technology Sector Equity": ("USA", "equity"),
    "Utilities Sector Equity": ("USA", "equity"),
    "Real Estate Sector Equity": ("USA", "equity"),
    "Natural Resources Sector Equity": ("USA", "equity"),
    "Options Trading": ("USA", "equity"),
    "Multialternative": ("USA", "both"),
    "Market Neutral": ("USA", "both"),
    "Long/Short Equity": ("USA", "both"),
    "Alternative Miscellaneous": ("USA", "both"),
    "Allocation Miscellaneous": ("USA", "both"),
    "Fixed Income Miscellaneous": ("USA", "fixed_income"),
    "Equity Miscellaneous": ("USA", "equity"),
    "Convertibles": ("USA", "both"),
    "Flexible Allocation": ("Global", "both"),
    "Aggressive Allocation": ("Global", "both"),
    "Moderate Allocation": ("Global", "both"),
    "Cautious Allocation": ("Global", "both"),
    "Global Fixed Income": ("Global", "fixed_income"),
    "Global Equity Large Cap": ("Global", "equity"),
    "Global Equity Mid/Small Cap": ("Global", "equity"),
    "Global Emerging Markets Equity": ("Global", "equity"),
    "Europe Equity Large Cap": ("Global Ex USA", "equity"),
    "Europe Equity Mid/Small Cap": ("Global Ex USA", "equity"),
    "Asia Equity": ("Global Ex USA", "equity"),
    "Asia ex-Japan Equity": ("Global Ex USA", "equity"),
    "India Equity": ("Global Ex USA", "equity"),
    "Latin America Equity": ("Global Ex USA", "equity"),
    "Japan Equity": ("Global Ex USA", "equity"),
    "Korea Equity": ("Global Ex USA", "equity"),
    "Thailand Equity": ("Global Ex USA", "equity"),
    "Mexico Equity": ("Global Ex USA", "equity"),
    "Australia & New Zealand Equity": ("Global Ex USA", "equity"),
    "Greater China Equity": ("Global Ex USA", "equity"),
    "UK Equity Large Cap": ("Global Ex USA", "equity"),
    "Emerging Markets Fixed Income": ("Global Ex USA", "fixed_income"),
    "Canadian Equity Large Cap": ("Global Ex USA", "equity"),
    "Commodities Broad Basket": (None, None),
    "Commodities Specified": (None, None),
    "Target Date": (None, None),
    "Target Date 2021-2045": (None, None),
    "Target Date 2046+": (None, None),
    "Trading Tools": (None, None),
    "Currency": (None, None),
    "Uncategorized": (None, None),
}

# Section 2: Data Source Initialization
def initialize_r_environment():
    """Initialize R environment for AQRR factor loading, returning None if unavailable."""
    import os
    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'
    
    import logging
    import rpy2.rinterface_lib.callbacks
    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
    
    try:
        version = ro.r('R.version.string')[0]
        print(f"🔁 R Version: {version}")
        logging.info(f"R initialized successfully: {version}")
    except (OSError, ValueError) as e:
        print(f"❌ Could not connect to R: {e}")
        logging.warning(f"Could not connect to R: {e}")
        return None
    
    try:
        aqrr = importr('aqrr')
        print("✅ Loaded R package: aqrr")
        funcs = ro.r('ls("package:aqrr")')
        print(f"📦 AQRR Functions Available: {list(funcs)[:6]} ...")
        logging.info("AQRR package loaded successfully")
    except ImportError as e:
        print(f"❌ Failed to load 'aqrr': {e}")
        logging.warning(f"Failed to load 'aqrr': {e}")
        return None
    
    pandas2ri.activate()
    return aqrr

# Section 3: Data Loading Functions
def load_fund_metadata():
    """Load fund metadata from the SQL database and map to regions and profiles."""
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])

def load_fund_returns(fund_ids):
    """Load 1-month return time series for specified funds from the SQL database."""
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

def load_aqrr_factors(region="USA"):
    """Load AQRR factor data from R for a specified region."""
    from rpy2.robjects import r as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter

    print(f"📦 Loading AQRR factor data for region: {region}")
    # Load dplyr for pipes and data manipulation functions
    ro('library(dplyr)')
    ro(f"""
    mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, mkt = value)
    smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, smb = value)
    hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, hml = value)
    umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, umd = value)
    qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, qmj = value)
    bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, bab = value)
    """)

    def fix_date(r_obj):
        with localconverter(pandas2ri.converter):
            df = pandas2ri.rpy2py(r_obj)
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        return df[df['date'] >= pd.to_datetime("2015-01-31")].reset_index(drop=True)

    mkt_df = fix_date(ro['mkt'])
    smb_df = fix_date(ro['smb'])
    hml_df = fix_date(ro['hml'])
    umd_df = fix_date(ro['umd'])
    qmj_df = fix_date(ro['qmj'])
    bab_df = fix_date(ro['bab'])

    factors = (
        mkt_df.merge(smb_df, on='date')
              .merge(hml_df, on='date')
              .merge(umd_df, on='date')
              .merge(qmj_df, on='date')
              .merge(bab_df, on='date')
              .rename(columns={"date": "Date"})
              .sort_values("Date")
              .reset_index(drop=True)
    )

    print(f"✅ Loaded AQRR factors: {region} | Shape: {factors.shape} | Date range: {factors['Date'].min().date()} → {factors['Date'].max().date()}")
    return factors

def load_aqrr_excel_factors(filepath):
    """Load AQRR factor data from an Excel file, downloading from URL if needed."""
    import requests
    
    if not os.path.isfile(filepath):
        try:
            response = requests.get(AQRR_EXCEL_URL, stream=True)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                f.write(response.content)
            logging.info(f"Downloaded AQRR Excel file to {filepath}")
        except requests.RequestException as e:
            logging.error(f"Failed to download AQRR Excel from {AQRR_EXCEL_URL}: {e}")
            raise
    
    factor_map = {
        "BAB Factor": "bab",
        "QMJ Factor": "qmj",
        "UMD Factor": "umd",
        "HML FF Factor": "hml",
        "SMB Factor": "smb",
        "MKT FF Factor": "mkt"
    }

    all_dfs = {}
    xls = pd.ExcelFile(filepath)
    for sheet, factor in factor_map.items():
        if sheet in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet, skiprows=range(0, 6))
            df.columns = ['Date', factor]
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
            df = df.dropna(subset=['Date']).sort_values('Date')
            all_dfs[factor] = df

    merged = all_dfs['mkt']
    for f in ['smb', 'hml', 'umd', 'qmj', 'bab']:
        merged = pd.merge(merged, all_dfs[f], on='Date', how='inner')

    merged = merged.reset_index(drop=True)
    return merged

def compare_and_select_factors(region, r_factors, excel_factors):
    """Compare R and Excel AQRR factors and select the most recent."""
    r_max = r_factors['Date'].max() if not r_factors.empty else pd.Timestamp("1900-01-01")
    e_max = excel_factors['Date'].max() if not excel_factors.empty else pd.Timestamp("1900-01-01")

    print(f"📅 R max date: {r_max.date()} | Excel max date: {e_max.date()}")
    if e_max > r_max:
        print("✅ Using Excel-based AQRR factors (more recent)")
        return excel_factors
    else:
        print("✅ Using R-based AQRR factors")
        return r_factors

def load_fixed_income_factors():
    """Load fixed income factor returns from the SQL database."""
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

def merge_all_factors(equity_df, fixed_income_df):
    """Merge equity and fixed income factor DataFrames by date."""
    equity_df["Date"] = pd.to_datetime(equity_df["Date"])
    fixed_income_df.index = pd.to_datetime(fixed_income_df.index)
    merged = equity_df.merge(fixed_income_df, how="left", left_on="Date", right_index=True)
    merged = merged.set_index("Date")
    return merged

# Section 4: Rolling Regression Functions
def precheck_rolling_periods(returns, factors, rolling_periods):
    """Precheck data to determine viable rolling periods based on available data."""
    returns = returns.dropna()
    factors = factors.dropna()
    min_date = max(returns.index.min(), factors.index.min())
    max_date = min(returns.index.max(), factors.index.max())
    viable_periods = []
    
    for window in rolling_periods:
        earliest_start = min_date + relativedelta(months=window)
        if earliest_start <= max_date:
            viable_periods.append(window)
    
    return viable_periods

def run_single_regression(y, X):
    """Run a single regression with diagnostics and return results."""
    try:
        X_const = add_constant(X)
        model = OLS(y, X_const).fit()
        
        diagnostics = {
            'dw': durbin_watson(model.resid),
            'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
        }
        is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
        reg_type = "Robust" if is_robust else "OLS"
        
        if is_robust:
            model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})
        
        return model, diagnostics, reg_type
    except np.linalg.LinAlgError:
        print(f"⚠️ Singular matrix error in regression")
        return None, None, None
    except Exception as e:
        print(f"⚠️ Unexpected error in regression: {type(e).__name__}: {e}")
        return None, None, None

def run_rolling_regression(fund, returns, factors):
    """Perform rolling regressions for a fund over viable periods."""
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    ran_any = False
    
    viable_periods = precheck_rolling_periods(returns, factors, ROLLING_PERIODS)
    if not viable_periods:
        print(f"⚠️ No viable rolling periods for {fund}")
        return results
    
    for window in viable_periods:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            
            X, y = X.align(y, join="inner", axis=0)
            if len(y) < window:
                continue
            
            try:
                if y.isnull().any() or X.isnull().any().any():
                    continue
                
                X_const = add_constant(X)
                model = OLS(y, X_const).fit()
                
                diagnostics = {
                    'dw': durbin_watson(model.resid),
                    'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
                }
                
                is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
                reg_type = "Robust" if is_robust else "OLS"
                
                if is_robust:
                    model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})
                
                for factor in X.columns:
                    coeff = model.params.get(factor, np.nan)
                    pval = model.pvalues.get(factor, np.nan)
                    tstat = model.tvalues.get(factor, np.nan)
                    stderr = model.bse.get(factor, np.nan)
                    ci_low, ci_upp = model.conf_int().loc[factor] if factor in model.params else (np.nan, np.nan)
                    
                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": coeff,
                        "P_Value": pval,
                        "T_Stat": tstat,
                        "Standard_Error": stderr,
                        "CI_Lower": ci_low,
                        "CI_Upper": ci_upp,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1],
                        "Autocorrelation_Flag": diagnostics['dw'] < 1.5,
                        "Heteroskedasticity_Flag": diagnostics['bp_pval'] < 0.05,
                        "Regression_Type": reg_type
                    })
                ran_any = True
            
            except np.linalg.LinAlgError as e:
                print(f"⚠️ {fund} | {start_date.date()} to {end_date.date()} | Matrix error: {e}")
                continue
            except ValueError as e:
                print(f"⚠️ {fund} | {start_date.date()} to {end_date.date()} | Data error: {e}")
                continue
    
    print(f"{'✅' if ran_any else '⚠️'} {'Ran' if ran_any else 'Skipped'} regressions for {fund}")
    return results

# Section 5: Main Processing Pipeline
def initialize_data_sources():
    """Initialize R and return AQRR availability."""
    print("🔧 Initializing R interface...")
    aqrr = initialize_r_environment()
    use_r = aqrr is not None
    if not use_r:
        logging.warning("R unavailable, falling back to Excel AQRR factors")
        print("⚠️ R unavailable, falling back to Excel AQRR factors.")
    return use_r

def process_region(region, fund_subset, use_r):
    """Process funds for a given region with parallel regression execution."""
    from concurrent.futures import ProcessPoolExecutor
    
    funds = fund_subset["SymbolCUSIP"].tolist()
    if SAMPLE_DRY_RUN:
        funds = random.sample(funds, min(SAMPLE_SIZE, len(funds)))
        logging.info(f"Sample dry run: Processing {len(funds)} funds from {region}")
        print(f"ℹ️ Sample dry run: Processing {len(funds)} funds")
    
    profiles = fund_subset.set_index("SymbolCUSIP")["FactorProfile"].to_dict()
    
    for i in range(0, len(funds), CHUNK_SIZE):
        chunk = funds[i:i + CHUNK_SIZE]
        fund_returns = load_fund_returns(chunk)
        records = []
        
        with ProcessPoolExecutor() as executor:
            futures = {}
            for fund in fund_returns.columns:
                profile = profiles.get(fund)
                if profile == "equity":
                    r_factors = pd.DataFrame()
                    if use_r:
                        r_factors = load_aqrr_factors(region)
                    excel_factors = load_aqrr_excel_factors(AQRR_EXCEL_PATH)
                    factors = compare_and_select_factors(region, r_factors, excel_factors) if use_r else excel_factors
                elif profile == "fixed_income":
                    factors = load_fixed_income_factors()
                elif profile == "both":
                    r_factors = pd.DataFrame()
                    if use_r:
                        r_factors = load_aqrr_factors(region)
                    excel_factors = load_aqrr_excel_factors(AQRR_EXCEL_PATH)
                    equity_factors = compare_and_select_factors(region, r_factors, excel_factors) if use_r else excel_factors
                    factors = merge_all_factors(equity_factors, load_fixed_income_factors())
                else:
                    continue
                futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors)] = fund
            
            for future in tqdm(futures, desc=f"🔁 Region: {region}"):
                try:
                    records.extend(future.result())
                except concurrent.futures.TimeoutError as e:
                    print(f"⚠️ Error in {futures[future]}: Timeout - {e}")
                except Exception as e:
                    print(f"⚠️ Error in {futures[future]}: Unexpected error - {type(e).__name__}: {e}")
        
        if records:
            if not DRY_RUN:
                insert_batch(records)
            else:
                logging.info(f"Dry run: Skipped writing {len(records)} records for chunk {i//CHUNK_SIZE + 1}")
                print(f"ℹ️ Dry run: Would have written {len(records)} records")

def main():
    """Main driver to orchestrate fund processing."""
    logging.basicConfig(level=logging.INFO)
    
    use_r = initialize_data_sources()
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    
    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")
    
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        process_region(region, fund_subset, use_r)

# Section 6: Database Output
def insert_batch(records):
    """Insert regression results into the SQL database in batches."""
    df = pd.DataFrame(records)
    if not DRY_RUN:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch = df.iloc[i:i + BATCH_INSERT_SIZE]
            batch.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)
    else:
        for i in range(0, len(df), BATCH_INSERT_SIZE):
            batch_size = len(df.iloc[i:i + BATCH_INSERT_SIZE])
            logging.info(f"Dry run: Skipped batch insert of {batch_size} records")

if __name__ == "__main__":
    main()

INFO:root:R initialized successfully: R version 4.4.3 (2025-02-28 ucrt)


🔧 Initializing R interface...
🔁 R Version: R version 4.4.3 (2025-02-28 ucrt)


INFO:root:AQRR package loaded successfully


✅ Loaded R package: aqrr
📦 AQRR Functions Available: [':=', 'aqr_bab_daily', 'aqr_bab_monthly', 'aqr_commodities_long_run', 'aqr_credit_risk_premium', 'aqr_factor_premia_monthly'] ...


INFO:root:Sample dry run: Processing 100 funds from Global


🧠 Total mapped funds: 5500
📍 Regions detected: ['Global' 'USA' 'Global Ex USA']

ℹ️ Sample dry run: Processing 100 funds
📦 Loading AQRR factor data for region: Global
✅ Loaded AQRR factors: Global | Shape: (101, 7) | Date range: 2015-01-31 → 2023-05-31


KeyError: 'mkt'

In [None]:

from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm
import os
import logging

# Section 1: Configuration
RETURN_METRIC = "1 Month Return"

connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)
ROLLING_PERIODS = [12, 24, 36, 48, 60]
DRY_RUN = True
CHUNK_SIZE = 200
BATCH_INSERT_SIZE = 2000

category_to_region = { ... }  # Keep your existing dictionary unchanged here

# Section 1.5: Initialize R

def initialize_r_environment():
    import rpy2.robjects as ro
    import rpy2.rinterface_lib.callbacks
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri

    os.environ['R_HOME'] = r'C:\\Program Files\\R\\R-4.4.3'
    os.environ['R_JIT_ENABLED'] = '0'
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    try:
        print("🔁 R Version:", ro.r('R.version.string')[0])
        aqrr = importr('aqrr')
        dplyr = importr('dplyr')
        pandas2ri.activate()
        print("✅ R environment is ready.")
        return True
    except Exception as e:
        print("❌ Failed to initialize R environment:", e)
        return False

# Section 2: Fund Metadata

def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "FactorProfile"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region", "FactorProfile"])

# Section 3: Returns

def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

# Section 4: Hybrid AQRR Factor Loader

def load_aqrr_factor_from_r(factor_func: str, region: str):
    from rpy2.robjects import r as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter
    ro(f"""
    library(dplyr)
    factor_data <- {factor_func}() %>%
        filter(name == '{region}') %>%
        select(date, value) %>%
        mutate(date = as.character(date))
    """)
    with localconverter(pandas2ri.converter):
        df = pandas2ri.rpy2py(ro["factor_data"])
    df.columns = ["date", "value"]
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    return df.dropna()

def load_aqrr_factor_from_excel(file_path: str, sheet_name: str, col_name: str):
    df = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=5)
    df.columns = df.columns.str.lower().str.strip()
    df = df.rename(columns={df.columns[0]: "date", df.columns[1]: col_name})
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    return df[["date", col_name]].dropna()

def get_latest_aqrr_factors(region: str, excel_path: str):
    factor_sources = {
        "mkt": ("MKT", "aqr_mkt_monthly"),
        "smb": ("SMB", "aqr_smb_monthly"),
        "hml": ("HML FF", "aqr_hml_ff_monthly"),
        "umd": ("UMD", "aqr_umd_monthly"),
        "qmj": ("QMJ", "aqr_qmj_monthly"),
        "bab": ("BAB Factors", "aqr_bab_monthly")
    }

    final_df = None
    for factor, (excel_sheet, r_func) in factor_sources.items():
        try:
            excel_df = load_aqrr_factor_from_excel(excel_path, excel_sheet, factor)
            r_df = load_aqrr_factor_from_r(r_func, region)

            excel_latest = excel_df["date"].max()
            r_latest = r_df["date"].max()

            chosen_df = excel_df if excel_latest >= r_latest else r_df.rename(columns={"value": factor})
            source = "Excel" if chosen_df is excel_df else "R"
            print(f"🧠 {factor.upper()} → Using {source} (Latest: {chosen_df['date'].max().date()})")

            if final_df is None:
                final_df = chosen_df
            else:
                final_df = pd.merge(final_df, chosen_df, on="date", how="outer")

        except Exception as e:
            print(f"⚠️ Failed to load factor '{factor}' from one or both sources: {e}")

    if final_df is not None:
        final_df = final_df.sort_values("date").dropna().set_index("date")
        last_month = pd.to_datetime(datetime.today().replace(day=1)) - pd.DateOffset(days=1)
        final_df = final_df[final_df.index <= last_month]
        print(f"✅ Final AQRR merged factor DataFrame: {final_df.shape} | {final_df.index.min().date()} → {final_df.index.max().date()}")

    return final_df

#Section5: Load Fixed Income Factor Data
def load_fixed_income_factors():
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

#Section6: Merge Factors
def merge_all_factors(equity_df, fixed_income_df):
    equity_df["Date"] = pd.to_datetime(equity_df["Date"])
    fixed_income_df.index = pd.to_datetime(fixed_income_df.index)
    merged = equity_df.merge(fixed_income_df, how="left", left_on="Date", right_index=True)

    # Set date as index and drop any remaining 'Date' column just in case
    merged = merged.set_index("Date")
    return merged

#Section7: Perform Rolling Regression
def run_rolling_regression(fund, returns, factors):
    results = []
    returns.index = pd.to_datetime(returns.index)
    factors.index = pd.to_datetime(factors.index)
    ran_any = False

    for window in ROLLING_PERIODS:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]

            # Align X and y to the same dates
            X, y = X.align(y, join="inner", axis=0)

            try:
                if y.isnull().any() or X.isnull().any().any():
                    continue

                X_const = add_constant(X)
                model = OLS(y, X_const).fit()

                diagnostics = {
                    'dw': durbin_watson(model.resid),
                    'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
                }

                is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
                reg_type = "Robust" if is_robust else "OLS"

                if is_robust:
                    model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

                for factor in X.columns:
                    coeff = model.params.get(factor, np.nan)
                    pval = model.pvalues.get(factor, np.nan)
                    tstat = model.tvalues.get(factor, np.nan)
                    stderr = model.bse.get(factor, np.nan)
                    ci_low, ci_upp = model.conf_int().loc[factor] if factor in model.params else (np.nan, np.nan)

                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": coeff,
                        "P_Value": pval,
                        "T_Stat": tstat,
                        "Standard_Error": stderr,
                        "CI_Lower": ci_low,
                        "CI_Upper": ci_upp,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1],
                        "Autocorrelation_Flag": diagnostics['dw'] < 1.5,
                        "Heteroskedasticity_Flag": diagnostics['bp_pval'] < 0.05,
                        "Regression_Type": reg_type
                    })
                    ran_any = True

            except Exception as e:
                print(f"⚠️  {fund} | {start_date.date()} to {end_date.date()} | {type(e).__name__}: {e}")
                continue

    print(f"{'✅' if ran_any else '⚠️'} {'Ran' if ran_any else 'Skipped'} regressions for {fund}")
    return results


#Section8: Main Batch Driver
def main():
    print("🔧 Initializing R interface...")
    aqrr = initialize_r_environment()

    if aqrr is None:
        print("❌ R not initialized. Exiting.")
        return
    print("✅ R environment is ready.\n")

    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()

    print(f"🧠 Total mapped funds: {len(fund_meta)}")
    print(f"📍 Regions detected: {regions}\n")

    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        equity_factors = load_aqrr_factors(region)
        fixed_income_factors = load_fixed_income_factors()

        funds = fund_subset["SymbolCUSIP"].tolist()
        profiles = fund_subset.set_index("SymbolCUSIP")["FactorProfile"].to_dict()

        for i in range(0, len(funds), CHUNK_SIZE):
            chunk = funds[i:i + CHUNK_SIZE]
            fund_returns = load_fund_returns(chunk)
            records = []

            with ThreadPoolExecutor() as executor:
                futures = {}
                for fund in fund_returns.columns:
                    profile = profiles.get(fund)

                    # Select correct factor set per fund
                    if profile == "equity":
                        factors = equity_factors.copy()
                    elif profile == "fixed_income":
                        factors = fixed_income_factors.copy()
                    elif profile == "both":
                        factors = merge_all_factors(equity_factors, fixed_income_factors)
                    else:
                        continue  # skip fund if no profile

                    futures[executor.submit(run_rolling_regression, fund, fund_returns[fund], factors)] = fund

                for future in tqdm(futures, desc=f"🔁 Region: {region}"):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        print(f"⚠️ Error in {futures[future]}: {type(e).__name__}: {e}")

            if not DRY_RUN and records:
                insert_batch(records)


#Section9: Insert to Database
def insert_batch(records):
    df = pd.DataFrame(records)
    for i in range(0, len(df), BATCH_INSERT_SIZE):
        df.iloc[i:i+BATCH_INSERT_SIZE].to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)

if __name__ == "__main__":
    main()

