In [None]:
# The script below populates the fixed income factor data

In [4]:
#Section 1: Setup
import pandas as pd
from sqlalchemy import create_engine, text

connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

#Section 2: Define fixed income factors
factors = {
    'TERM':         ('VGLT', '^BBUTB13MTR'),
    'TERM_Int':     ('IEF', '^BBUTB13MTR'),
    'TERM_Long':    ('VGLT', 'IEF'),
    'CREDIT':       ('^BBUSCOTR', 'VGLT'),
    'CREDIT_HY':    ('^BBUSCOHYTR', 'IEF')
}
symbols = set(s for pair in factors.values() for s in pair)

#Section 3: Load all relevant data from both tables
symbol_str = "', '".join(symbols)
query = f"""
    SELECT Symbol, Date, ReturnValue FROM (
        SELECT SymbolCUSIP AS Symbol, Date, ReturnValue 
        FROM dbo.Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ('{symbol_str}')
        UNION ALL
        SELECT Benchmark_Symbol AS Symbol, Date, ReturnValue 
        FROM dbo.Benchmark_Returns_Timeseries
        WHERE Benchmark_Symbol IN ('{symbol_str}')
    ) AS combined
"""
all_returns = pd.read_sql(query, engine, parse_dates=['Date'])
returns_wide = all_returns.pivot_table(index='Date', columns='Symbol', values='ReturnValue')

#Section 4: Load existing factor-date combos to avoid duplication
existing_query = """
    SELECT Factor_Name, Date FROM dbo.Fixed_Income_Factor_Returns
"""
existing_df = pd.read_sql(existing_query, engine, parse_dates=['Date'])
existing_pairs = set(zip(existing_df['Factor_Name'], existing_df['Date']))

#Section 5: Calculate factors
factor_dfs = []
for factor_name, (s1, s2) in factors.items():
    if s1 in returns_wide.columns and s2 in returns_wide.columns:
        df = returns_wide[[s1, s2]].dropna().copy()
        df['ReturnValue'] = df[s1] - df[s2]
        df['Factor_Name'] = factor_name
        df['Source_1'] = s1
        df['Source_2'] = s2
        df['Notes'] = f"{s1} minus {s2}"
        df['Date'] = df.index
        df = df[['Factor_Name', 'Date', 'ReturnValue', 'Source_1', 'Source_2', 'Notes']]
        # Filter out rows already in the DB
        df = df[~df.set_index(['Factor_Name', 'Date']).index.isin(existing_pairs)]
        factor_dfs.append(df.reset_index(drop=True))

#Section 6: Combine and insert into SQL
if factor_dfs:
    final_df = pd.concat(factor_dfs, ignore_index=True)
    final_df.to_sql("Fixed_Income_Factor_Returns", engine, if_exists="append", index=False)
    print(f"✅ Inserted {len(final_df)} new rows into Fixed_Income_Factor_Returns.")
else:
    print("✅ No new factor data to insert — database is up to date.")


✅ Inserted 605 new rows into Fixed_Income_Factor_Returns.


In [None]:
# test of the AQRR database

In [None]:
# STEP 1: Load R Home To not get fatal errors

In [1]:
import os
os.environ['R_HOME'] = r'C:\Program Files\R\R-4.4.3'  # Use raw string (r) to handle backslashes
import rpy2.robjects as ro
print(ro.r('R.version.string'))


[1] "R version 4.4.3 (2025-02-28 ucrt)"



In [None]:
# STEP 2: Disable JIT

In [2]:
import os
os.environ['R_JIT_ENABLED'] = '0'  # Disable JIT
import rpy2.robjects as ro
print(ro.r('R.version.string'))

[1] "R version 4.4.3 (2025-02-28 ucrt)"



In [None]:
# STEP 3: Test connection to R

In [5]:
import rpy2.robjects as ro
print(ro.r('R.version.string'))  # Test basic R connection

['R version 4.4.3 (2025-02-28 ucrt)']


In [None]:
# STEP 4: Test connection to AQRR

In [6]:
import os
os.environ['R_HOME'] = r'C:\Program Files\R\R-4.4.3'
import rpy2.robjects as ro
from rpy2.robjects.packages import importr

aqrr = importr('aqrr')
funcs = ro.r('ls("package:aqrr")')
print(funcs)

[':=' 'aqr_bab_daily' 'aqr_bab_monthly' 'aqr_commodities_long_run'
 'aqr_credit_risk_premium' 'aqr_factor_premia_monthly'
 'aqr_hml_devil_daily' 'aqr_hml_devil_monthly' 'aqr_hml_ff_daily'
 'aqr_hml_ff_monthly' 'aqr_mkt_daily' 'aqr_mkt_monthly'
 'aqr_momentum_monthly' 'aqr_qmj_daily' 'aqr_qmj_monthly' 'aqr_smb_daily'
 'aqr_smb_monthly' 'aqr_umd_daily' 'aqr_umd_monthly' 'as_label' 'as_name'
 'enquo' 'enquos']


In [3]:
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()
aqrr = importr("aqrr")
dplyr = importr("dplyr")

# Run MKT test
r('mkt <- aqr_mkt_monthly() %>% dplyr::filter(name == "USA") %>% dplyr::select(date, mkt_excess = value)')
mkt = pandas2ri.rpy2py(r['mkt'])
print(mkt.head())


R[write to console]: 
R[write to console]: -
R[write to console]: 
R[write to console]: /
                                                                              
R[write to console]: 
R[write to console]: 
R[write to console]: /
                                                                              
R[write to console]: 
R[write to console]: 
R[write to console]: -
                                                                              
R[write to console]: 


      date  mkt_excess
1 -15860.0    0.028335
2 -15829.0    0.026245
3 -15799.0    0.003287
4 -15768.0   -0.031103
5 -15738.0    0.024357


In [3]:
import pandas as pd
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()
aqrr = importr("aqrr")
dplyr = importr("dplyr")

def get_aqrr_factors(region="USA"):
    # 1. Call AQRR download functions in R
    r(f"mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>% select(date, mkt_excess = value)")
    r(f"smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>% select(date, smb = value)")
    r(f"hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>% select(date, hml = value)")
    r(f"umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>% select(date, umd = value)")
    r(f"qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>% select(date, qmj = value)")
    r(f"bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>% select(date, bab = value)")

    # 2. Merge all by date in R
    r("""
        factors <- Reduce(function(x, y) full_join(x, y, by = "date"), 
                          list(mkt, smb, hml, umd, qmj, bab))
    """)

    # 3. Convert to pandas DataFrame
    factors_df = pandas2ri.rpy2py(r['factors'])
    factors_df = factors_df.sort_values("date").reset_index(drop=True)
    
    return factors_df


OSError: cannot load library 'C:\Program Files\R\R-4.4.1\bin\x64\R.dll': error 0x7e

In [None]:
import pandas as pd
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr

# Activate conversion
pandas2ri.activate()

# Load AQRR and dplyr
aqrr = importr("aqrr")
dplyr = importr("dplyr")

# Test region
region = "USA"
print(f"⏳ Loading MKT factor for region: {region}")

# Call just MKT factor with filtering and renaming
r(f"""
mkt <- aqr_mkt_monthly() %>%
  dplyr::filter(name == '{region}') %>%
  dplyr::select(date, mkt_excess = value)
""")

# Pull from R to pandas
mkt_df = pandas2ri.rpy2py(r['mkt'])

# Show results
print("✅ Retrieved MKT factor:")
print(mkt_df.head())


In [None]:
import pandas as pd
from datetime import datetime, timedelta
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr

# Setup
pandas2ri.activate()
aqrr = importr("aqrr")
dplyr = importr("dplyr")
lubridate = importr("lubridate")

# Calculate cutoff date for last 3 years
cutoff_date = (datetime.today() - timedelta(days=3*365)).strftime("%Y-%m-%d")

# ⏳ Filter and select USA MKT data for last 3 years
r(f"""
mkt <- aqr_mkt_monthly() %>%
  dplyr::filter(name == "USA" & date >= as.Date('{cutoff_date}')) %>%
  dplyr::select(date, mkt_excess = value)
""")

# Pull into Python
mkt_df = pandas2ri.rpy2py(r['mkt'])
print("✅ MKT data for USA (last 3 years):")
print(mkt_df.tail())


In [None]:
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
import pandas as pd
from datetime import datetime, timedelta

# Fix for R 4.4+ JIT issue
r("compiler::enableJIT(0)")

# Enable pandas ↔ R conversion
pandas2ri.activate()

# Load R libraries
aqrr = importr("aqrr")
dplyr = importr("dplyr")

# Set date range
cutoff_date = (datetime.today() - timedelta(days=3*365)).strftime("%Y-%m-%d")

# Run MKT factor query
r(f"""
mkt <- aqr_mkt_monthly() %>%
  dplyr::filter(name == "USA" & date >= as.Date('{cutoff_date}')) %>%
  dplyr::select(date, mkt_excess = value)
""")

# Pull to pandas
mkt_df = pandas2ri.rpy2py(r['mkt'])
print(mkt_df.tail())



In [4]:
import rpy2.robjects as ro
print(ro.r('R.version.string'))  # Test basic R connection

['R version 4.4.3 (2025-02-28 ucrt)']


In [4]:
import os
os.environ['R_HOME'] = r'C:\Program Files\R\R-4.4.3'
import rpy2.robjects as ro
from rpy2.robjects.packages import importr

aqrr = importr('aqrr')
funcs = ro.r('ls("package:aqrr")')
print(funcs)

 [1] ":="                        "aqr_bab_daily"            
 [3] "aqr_bab_monthly"           "aqr_commodities_long_run" 
 [5] "aqr_credit_risk_premium"   "aqr_factor_premia_monthly"
 [7] "aqr_hml_devil_daily"       "aqr_hml_devil_monthly"    
 [9] "aqr_hml_ff_daily"          "aqr_hml_ff_monthly"       
[11] "aqr_mkt_daily"             "aqr_mkt_monthly"          
[13] "aqr_momentum_monthly"      "aqr_qmj_daily"            
[15] "aqr_qmj_monthly"           "aqr_smb_daily"            
[17] "aqr_smb_monthly"           "aqr_umd_daily"            
[19] "aqr_umd_monthly"           "as_label"                 
[21] "as_name"                   "enquo"                    
[23] "enquos"                   



In [6]:
import os
from datetime import datetime, timedelta
import pandas as pd

# ✅ Step 1: Ensure R path is set correctly
os.environ['R_HOME'] = r"C:\Program Files\R\R-4.4.3"
os.environ['PATH'] += r";C:\Program Files\R\R-4.4.3\bin\x64"

# ✅ Step 2: Import rpy2
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

# ✅ Step 3: Enable pandas <-> R conversion
pandas2ri.activate()

# ✅ Step 4: Import needed R packages
aqrr = importr('aqrr')
dplyr = importr('dplyr')

# ✅ Step 5: Build R filter string
cutoff_date = (datetime.today() - timedelta(days=3*365)).strftime("%Y-%m-%d")
ro.r(f"""
    library(aqrr)
    library(dplyr)
    mkt <- aqr_mkt_monthly() %>%
        filter(name == 'USA' & date >= as.Date('{cutoff_date}')) %>%
        mutate(date = as.Date(date)) %>%
        select(date, mkt_excess = value)
""")


# ✅ Step 6: Bring it into pandas
mkt_df = pandas2ri.rpy2py(ro.r['mkt'])

# ✅ Step 7: Display result
print("✅ MKT factor (USA, last 3 years):")
print(mkt_df.head())
print(mkt_df.tail())


R[write to console]: 
R[write to console]: -
R[write to console]: 
R[write to console]: /
                                                                              
R[write to console]: 
R[write to console]: 
R[write to console]: /
                                                                              
R[write to console]: 
R[write to console]: 
R[write to console]: -
                                                                              
R[write to console]: 


✅ MKT factor (USA, last 3 years):
      date  mkt_excess
1  19112.0   -0.090538
2  19143.0   -0.003493
3  19173.0   -0.086723
4  19204.0    0.092145
5  19235.0   -0.037716
       date  mkt_excess
10  19388.0    0.068904
11  19416.0   -0.027004
12  19447.0    0.018459
13  19477.0    0.004732
14  19508.0   -0.003810


In [11]:
import os
from datetime import datetime, timedelta
import pandas as pd

# ✅ Step 1: Configure R path
os.environ['R_HOME'] = r"C:\\Program Files\\R\\R-4.4.3"
os.environ['PATH'] += r";C:\\Program Files\\R\\R-4.4.3\\bin\\x64"

# ✅ Step 2: rpy2 setup
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()

# ✅ Step 3: Load packages
aqrr = importr('aqrr')
dplyr = importr('dplyr')

# ✅ Step 4: Cutoff date for last 3 years
cutoff_date = (datetime.today() - timedelta(days=3 * 365)).strftime("%Y-%m-%d")
print("Using cutoff date:", cutoff_date)

# ✅ Step 5: Pull raw MKT data for USA (date will come as serial)
ro.r(f"""
    mkt <- aqr_mkt_monthly() %>%
        dplyr::filter(name == "USA") %>%
        dplyr::select(date, mkt_excess = value)
""")

# ✅ Step 6: Convert to pandas
mkt_df = pandas2ri.rpy2py(ro.r['mkt'])

# ✅ Step 7: Convert Excel serial to datetime
mkt_df['date'] = mkt_df['date'].apply(lambda x: datetime(1899, 12, 30) + timedelta(days=int(x)))

# ✅ Step 8: Filter in Python to last 3 years
cutoff = datetime.today() - timedelta(days=3*365)
mkt_df = mkt_df[mkt_df['date'] >= cutoff].reset_index(drop=True)

# ✅ Step 9: Show result
print("✅ Cleaned MKT factor (USA, last 3 years):")
print(mkt_df.head())
print(mkt_df.tail())


Using cutoff date: 2022-04-03


R[write to console]: 
R[write to console]: -
R[write to console]: 
R[write to console]: /
                                                                              
R[write to console]: 
R[write to console]: 
R[write to console]: /
                                                                              
R[write to console]: 
R[write to console]: 
R[write to console]: -
                                                                              
R[write to console]: 


✅ Cleaned MKT factor (USA, last 3 years):
Empty DataFrame
Columns: [date, mkt_excess]
Index: []
Empty DataFrame
Columns: [date, mkt_excess]
Index: []


In [15]:
import os
from datetime import datetime, timedelta
import pandas as pd

# ✅ R + PATH setup
os.environ['R_HOME'] = r"C:\\Program Files\\R\\R-4.4.3"
os.environ['PATH'] += r";C:\\Program Files\\R\\R-4.4.3\\bin\\x64"

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
import rpy2.rinterface_lib.callbacks
import logging

# ✅ Suppress R console output (like "R[write to console]:")
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# ✅ Activate pandas ↔ R DataFrame bridge
pandas2ri.activate()

# ✅ Load required R packages
aqrr = importr('aqrr')
dplyr = importr('dplyr')

# ✅ Set cutoff date for last 3 years
cutoff = datetime.today() - timedelta(days=3 * 365)
print("📅 Using cutoff date:", cutoff.date())

# ✅ Safe date fixer (detects Excel-style float vs datetime)
def fix_excel_date(df):
    if pd.api.types.is_float_dtype(df['date']):
        df['date'] = df['date'].apply(lambda d: datetime(1899, 12, 30) + timedelta(days=int(d)))
    return df[df['date'] >= cutoff].reset_index(drop=True)

# ✅ Step 1: Load all 6 AQRR factors from R
ro.r("""
mkt <- aqr_mkt_monthly() %>% filter(name == "USA") %>% select(date, mkt = value)
smb <- aqr_smb_monthly() %>% filter(name == "USA") %>% select(date, smb = value)
hml <- aqr_hml_ff_monthly() %>% filter(name == "USA") %>% select(date, hml = value)
umd <- aqr_umd_monthly() %>% filter(name == "USA") %>% select(date, umd = value)
qmj <- aqr_qmj_monthly() %>% filter(name == "USA") %>% select(date, qmj = value)
bab <- aqr_bab_monthly() %>% filter(name == "USA") %>% select(date, bab = value)
""")
# DEBUG: See raw R > pandas transfer
raw_mkt_df = pandas2ri.rpy2py(ro.r['mkt'])
print("\n🧪 Raw MKT from R > pandas (first 5):")
print(raw_mkt_df.head())
print("🧪 dtypes:")
print(raw_mkt_df.dtypes)

# ✅ Step 2: Convert to pandas & fix dates
mkt_df = fix_excel_date(pandas2ri.rpy2py(ro.r['mkt']))
smb_df = fix_excel_date(pandas2ri.rpy2py(ro.r['smb']))
hml_df = fix_excel_date(pandas2ri.rpy2py(ro.r['hml']))
umd_df = fix_excel_date(pandas2ri.rpy2py(ro.r['umd']))
qmj_df = fix_excel_date(pandas2ri.rpy2py(ro.r['qmj']))
bab_df = fix_excel_date(pandas2ri.rpy2py(ro.r['bab']))

# ✅ Step 3: Merge all on date
factors = mkt_df \
    .merge(smb_df, on='date', how='inner') \
    .merge(hml_df, on='date', how='inner') \
    .merge(umd_df, on='date', how='inner') \
    .merge(qmj_df, on='date', how='inner') \
    .merge(bab_df, on='date', how='inner')

# ✅ Step 4: Final preview
print("\n✅ Final AQRR Factors (USA, Last 3 Years):")
print(factors.head())
print(factors.tail())
print(f"\n✅ Shape: {factors.shape}")


📅 Using cutoff date: 2022-04-03

🧪 Raw MKT from R > pandas (first 5):
      date       mkt
1 -15860.0  0.028335
2 -15829.0  0.026245
3 -15799.0  0.003287
4 -15768.0 -0.031103
5 -15738.0  0.024357
🧪 dtypes:
date    float64
mkt     float64
dtype: object

✅ Final AQRR Factors (USA, Last 3 Years):
Empty DataFrame
Columns: [date, mkt, smb, hml, umd, qmj, bab]
Index: []
Empty DataFrame
Columns: [date, mkt, smb, hml, umd, qmj, bab]
Index: []

✅ Shape: (0, 7)


In [None]:
# Code below was to test the issues around date's from R vs Python & test it 
# There was coruption in how it came over, this code below helped fixed it
# Then we verified this fix via an R tupple, then below this incorporated it into a funciton

In [16]:
import os
from datetime import datetime, timedelta
import pandas as pd

# ✅ R + PATH setup
os.environ['R_HOME'] = r"C:\\Program Files\\R\\R-4.4.3"
os.environ['PATH'] += r";C:\\Program Files\\R\\R-4.4.3\\bin\\x64"

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
import rpy2.rinterface_lib.callbacks
import logging

# ✅ Suppress R[write to console]:
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# ✅ Activate R ↔ pandas conversion
pandas2ri.activate()

# ✅ Load R libraries
aqrr = importr('aqrr')
dplyr = importr('dplyr')

# ✅ Define cutoff date
cutoff = datetime.today() - timedelta(days=3 * 365)
print("📅 Using cutoff date:", cutoff.date())

# ✅ Fix date (handles character or datetime)
def fix_date_column(df):
    if pd.api.types.is_string_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'])
    return df[df['date'] >= cutoff].reset_index(drop=True)

# ✅ Step 1: Pull AQRR data in R, force date to character
ro.r("""
mkt <- aqr_mkt_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, mkt = value)

smb <- aqr_smb_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, smb = value)

hml <- aqr_hml_ff_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, hml = value)

umd <- aqr_umd_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, umd = value)

qmj <- aqr_qmj_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, qmj = value)

bab <- aqr_bab_monthly() %>% filter(name == "USA") %>% 
  mutate(date = as.character(date)) %>% select(date, bab = value)
""")

# ✅ Step 2: Convert each to pandas + fix dates
mkt_df = fix_date_column(pandas2ri.rpy2py(ro.r['mkt']))
smb_df = fix_date_column(pandas2ri.rpy2py(ro.r['smb']))
hml_df = fix_date_column(pandas2ri.rpy2py(ro.r['hml']))
umd_df = fix_date_column(pandas2ri.rpy2py(ro.r['umd']))
qmj_df = fix_date_column(pandas2ri.rpy2py(ro.r['qmj']))
bab_df = fix_date_column(pandas2ri.rpy2py(ro.r['bab']))

# ✅ Step 3: Merge all on date
factors = mkt_df \
    .merge(smb_df, on='date', how='inner') \
    .merge(hml_df, on='date', how='inner') \
    .merge(umd_df, on='date', how='inner') \
    .merge(qmj_df, on='date', how='inner') \
    .merge(bab_df, on='date', how='inner')

# ✅ Step 4: Final preview
print("\n✅ Final AQRR Factors (USA, Last 3 Years):")
print(factors.head())
print(factors.tail())
print(f"\n✅ Shape: {factors.shape}")


📅 Using cutoff date: 2022-04-03

✅ Final AQRR Factors (USA, Last 3 Years):
        date       mkt       smb       hml       umd       qmj       bab
0 2022-04-30 -0.090538 -0.011782  0.065451  0.050946  0.048957  0.015034
1 2022-05-31 -0.003493 -0.025232  0.075750  0.019250  0.034398  0.000300
2 2022-06-30 -0.086723  0.015858 -0.053612  0.007132  0.048585  0.019509
3 2022-07-31  0.092145  0.010569 -0.053493 -0.046330 -0.037926 -0.024594
4 2022-08-31 -0.037716  0.014321  0.009722  0.026841 -0.033810 -0.018017
         date       mkt       smb       hml       umd       qmj       bab
9  2023-01-31  0.068904  0.047574 -0.037542 -0.155380 -0.080619 -0.045561
10 2023-02-28 -0.027004  0.003444 -0.001986  0.014203  0.027468  0.007468
11 2023-03-31  0.018459 -0.051877 -0.072046 -0.017036  0.028151 -0.023685
12 2023-04-30  0.004732 -0.037933  0.008794  0.021689  0.010236  0.000746
13 2023-05-31 -0.003810  0.020872 -0.086065  0.008928 -0.042434 -0.053512

✅ Shape: (14, 7)


In [None]:
# Code below was to test the issues around date's from R vs Python & test it as a funciton
# There was coruption in how it came over, this code below helped fixed it
#    # Helper to clean and filter
#    def fix_date(df):
#        df['date'] = pd.to_datetime(df['date'])
#        return df[df['date'] >= cutoff].reset_index(drop=True)

In [2]:
def load_aqrr_factors(region="USA", years=3):
    import os
    from datetime import datetime, timedelta
    import pandas as pd
    from rpy2.robjects import r as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    import rpy2.rinterface_lib.callbacks
    import logging

    # R paths
    os.environ['R_HOME'] = r"C:\\Program Files\\R\\R-4.4.3"
    os.environ['PATH'] += r";C:\\Program Files\\R\\R-4.4.3\\bin\\x64"

    # Suppress R logging
    rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    # Load R libs
    aqrr = importr('aqrr')
    dplyr = importr('dplyr')
    pandas2ri.activate()

    # Calculate cutoff
    cutoff = datetime.today() - timedelta(days=years * 365)
    print(f"📅 Cutoff date: {cutoff.date()} for region: {region}")

    # R: Load and filter factors
    ro(f"""
    mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, mkt = value)
    smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, smb = value)
    hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, hml = value)
    umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, umd = value)
    qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, qmj = value)
    bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>%
      mutate(date = as.character(date)) %>% select(date, bab = value)
    """)

    # Helper to clean and filter
    def fix_date(df):
        df['date'] = pd.to_datetime(df['date'])
        return df[df['date'] >= cutoff].reset_index(drop=True)

    # Pull into Python
    mkt_df = fix_date(pandas2ri.rpy2py(ro['mkt']))
    smb_df = fix_date(pandas2ri.rpy2py(ro['smb']))
    hml_df = fix_date(pandas2ri.rpy2py(ro['hml']))
    umd_df = fix_date(pandas2ri.rpy2py(ro['umd']))
    qmj_df = fix_date(pandas2ri.rpy2py(ro['qmj']))
    bab_df = fix_date(pandas2ri.rpy2py(ro['bab']))

    # Merge
    factors = mkt_df \
        .merge(smb_df, on='date') \
        .merge(hml_df, on='date') \
        .merge(umd_df, on='date') \
        .merge(qmj_df, on='date') \
        .merge(bab_df, on='date')

    print(f"✅ Loaded AQRR factors: {region} | Shape: {factors.shape}")
    return factors


In [None]:
df = load_aqrr_factors(region="USA", years=3)
print(df.head())


In [None]:
# First attempt at full code for regressions

In [None]:
#Section 1: Imports and Config
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sqlalchemy import create_engine, text
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
import datetime

# Enable R data conversion
pandas2ri.activate()

# Import R AQRR library
aqrr = importr("aqrr")

# DB Connection
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes"
    "&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# Config
ROLLING_PERIODS = [12, 24, 36, 48, 60]
DRY_RUN = True
MAX_WORKERS = 8
BATCH_SIZE = 200
INSERT_CHUNK_SIZE = 3000

# Sample region mapping (expand as needed)
category_to_region = {
    # USA
    "US Equity Large Cap Blend": "USA",
    "US Equity Large Cap Growth": "USA",
    "US Equity Large Cap Value": "USA",
    "US Equity Mid Cap": "USA",
    "US Equity Small Cap": "USA",
    "US Fixed Income": "USA",
    "US Municipal Fixed Income": "USA",
    "Communications Sector Equity": "USA",
    "Consumer Goods & Services Sector Equity": "USA",
    "Energy Sector Equity": "USA",
    "Financials Sector Equity": "USA",
    "Healthcare Sector Equity": "USA",
    "Industrials Sector Equity": "USA",
    "Infrastructure Sector Equity": "USA",
    "Other Sector Equity": "USA",
    "Precious Metals Sector Equity": "USA",
    "Technology Sector Equity": "USA",
    "Utilities Sector Equity": "USA",
    "Real Estate Sector Equity": "USA",
    "Natural Resources Sector Equity": "USA",
    "Options Trading": "USA",
    "Multialternative": "USA",
    "Market Neutral": "USA",
    "Long/Short Equity": "USA",
    "Alternative Miscellaneous": "USA",
    "Allocation Miscellaneous": "USA",
    "Fixed Income Miscellaneous": "USA",
    "Equity Miscellaneous": "USA",
    "Convertibles": "USA",

    # Global
    "Global Equity Large Cap": "Global",
    "Global Equity Mid/Small Cap": "Global",
    "Global Fixed Income": "Global",
    "Global Emerging Markets Equity": "Global",
    "Flexible Allocation": "Global",
    "Aggressive Allocation": "Global",
    "Moderate Allocation": "Global",
    "Cautious Allocation": "Global",
    "Convertibles": "USA",

    # Global Ex USA
    "Europe Equity Large Cap": "Global Ex USA",
    "Europe Equity Mid/Small Cap": "Global Ex USA",
    "Asia Equity": "Global Ex USA",
    "Asia ex-Japan Equity": "Global Ex USA",
    "India Equity": "Global Ex USA",
    "Latin America Equity": "Global Ex USA",
    "Japan Equity": "Global Ex USA",
    "Korea Equity": "Global Ex USA",
    "Thailand Equity": "Global Ex USA",
    "Mexico Equity": "Global Ex USA",
    "Australia & New Zealand Equity": "Global Ex USA",
    "Greater China Equity": "Global Ex USA",
    "UK Equity Large Cap": "Global Ex USA",
    "Emerging Markets Fixed Income": "Global Ex USA",
    "Canadian Equity Large Cap": "Global Ex USA",

    # Skip / None
    "Commodities Broad Basket": None,
    "Commodities Specified": None,
    "Target Date": None,
    "Target Date 2021-2045": None,
    "Target Date 2046+": None,
    "Trading Tools": None,
    "Currency": None,
    "Uncategorized": None,
}


#Section 2: Load Funds & Classification
def load_classified_funds():
    query = """
        SELECT f.SymbolCUSIP, f.Fund_Name, g.Global_Category_Name
        FROM Funds_to_Screen f
        JOIN YC_Global_Category_List g
            ON f.YC_Global_Category_ID = g.ID
    """
    df = pd.read_sql(query, engine)
    df["AQRR_Region"] = df["Global_Category_Name"].map(category_to_region)
    return df.dropna(subset=["AQRR_Region"]).reset_index(drop=True)

#Section 3: Load Fund Returns (per batch)
def load_fund_returns(symbols):
    symbol_str = ", ".join([f"'{s}'" for s in symbols])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({symbol_str})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

#Section 4: Load AQRR Factors
def load_aqrr_factors(region):
    # Pull and merge from R
    r(f"""
        mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>% select(date, mkt = value)
        smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>% select(date, smb = value)
        hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>% select(date, hml = value)
        umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>% select(date, umd = value)
        qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>% select(date, qmj = value)
        bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>% select(date, bab = value)
        factors <- Reduce(function(x, y) full_join(x, y, by = 'date'), list(mkt, smb, hml, umd, qmj, bab))
    """)
    df = pandas2ri.rpy2py(r['factors'])
    df['date'] = pd.to_datetime(df['date'])
    return df.set_index('date')

#Section 5: Regression Helper
def run_regression(fund_series, factor_df):
    results = []
    for window in ROLLING_PERIODS:
        if len(fund_series) < window:
            continue
        y = fund_series[-window:]
        x = factor_df.loc[y.index].dropna()
        y = y.loc[x.index]
        if len(x) < window: continue

        x = sm.add_constant(x)
        model = sm.OLS(y, x).fit()

        for factor in factor_df.columns:
            ci_low, ci_high = model.conf_int().loc[factor]
            results.append({
                "RollPeriod": window,
                "Factor_Name": factor,
                "Coefficient": model.params[factor],
                "P_Value": model.pvalues[factor],
                "T_Stat": model.tvalues[factor],
                "Standard_Error": model.bse[factor],
                "CI_Lower": ci_low,
                "CI_Upper": ci_high,
                "Adj_R2": model.rsquared_adj,
                "Correlation": np.corrcoef(x[factor], y)[0,1],
                "Autocorrelation_Flag": False,  # To implement
                "Heteroskedasticity_Flag": False,  # To implement
                "Regression_Type": "OLS"
            })
    return results

#Section 6: Process Fund Batch
def process_fund_batch(fund_batch, fund_returns, factor_df):
    rows = []
    for _, row in fund_batch.iterrows():
        symbol = row.SymbolCUSIP
        series = fund_returns.get(symbol)
        if series is None: continue
        reg_results = run_regression(series.dropna(), factor_df)
        for r in reg_results:
            r.update({"SymbolCUSIP": symbol, "MonthEndDate": series.dropna().index[-1]})
            rows.append(r)
    return rows

#Section 7: Main Driver
def main():
    all_funds = load_classified_funds()
    batches = [all_funds[i:i+BATCH_SIZE] for i in range(0, len(all_funds), BATCH_SIZE)]
    all_results = []

    for batch in tqdm(batches, desc="Processing Batches"):
        region = batch["AQRR_Region"].iloc[0]  # Assume batch shares region
        factor_df = load_aqrr_factors(region)
        fund_returns = load_fund_returns(batch.SymbolCUSIP.tolist())

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = [executor.submit(process_fund_batch, pd.DataFrame([row]), fund_returns, factor_df)
                       for _, row in batch.iterrows()]
            for future in futures:
                all_results.extend(future.result())

    if DRY_RUN:
        print(f"\n🔍 Dry run complete. Total results: {len(all_results):,}")
        return

    # Insert in chunks
    for i in range(0, len(all_results), INSERT_CHUNK_SIZE):
        chunk = pd.DataFrame(all_results[i:i+INSERT_CHUNK_SIZE])
        chunk.to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False, method="multi")

    print(f"✅ Inserted {len(all_results):,} rows into AQRR_Factor_Attribution.")

if __name__ == "__main__":
    main()

In [22]:
import pandas as pd
import numpy as np
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

#Section1: Configuration
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
CHUNK_SIZE = 200  # Number of funds per chunk
BATCH_INSERT_SIZE = 2000

# Updated region mapping
category_to_region = {
    "US Equity Large Cap Blend": ("USA", False),
    "US Equity Large Cap Growth": ("USA", False),
    "US Equity Large Cap Value": ("USA", False),
    "US Equity Mid Cap": ("USA", False),
    "US Equity Small Cap": ("USA", False),
    "US Fixed Income": ("USA", True),
    "US Municipal Fixed Income": ("USA", True),
    "Communications Sector Equity": ("USA", False),
    "Consumer Goods & Services Sector Equity": ("USA", False),
    "Energy Sector Equity": ("USA", False),
    "Financials Sector Equity": ("USA", False),
    "Healthcare Sector Equity": ("USA", False),
    "Industrials Sector Equity": ("USA", False),
    "Infrastructure Sector Equity": ("USA", False),
    "Other Sector Equity": ("USA", False),
    "Precious Metals Sector Equity": ("USA", False),
    "Technology Sector Equity": ("USA", False),
    "Utilities Sector Equity": ("USA", False),
    "Real Estate Sector Equity": ("USA", False),
    "Natural Resources Sector Equity": ("USA", False),
    "Options Trading": ("USA", False),
    "Multialternative": ("USA", False),
    "Market Neutral": ("USA", False),
    "Long/Short Equity": ("USA", False),
    "Alternative Miscellaneous": ("USA", False),
    "Allocation Miscellaneous": ("USA", True),
    "Fixed Income Miscellaneous": ("USA", True),
    "Equity Miscellaneous": ("USA", False),
    "Convertibles": ("USA", False),

    "Global Equity Large Cap": ("Global", False),
    "Global Equity Mid/Small Cap": ("Global", False),
    "Global Fixed Income": ("Global", True),
    "Global Emerging Markets Equity": ("Global", False),
    "Flexible Allocation": ("Global", True),
    "Aggressive Allocation": ("Global", True),
    "Moderate Allocation": ("Global", True),
    "Cautious Allocation": ("Global", True),

    "Europe Equity Large Cap": ("Global Ex USA", False),
    "Europe Equity Mid/Small Cap": ("Global Ex USA", False),
    "Asia Equity": ("Global Ex USA", False),
    "Asia ex-Japan Equity": ("Global Ex USA", False),
    "India Equity": ("Global Ex USA", False),
    "Latin America Equity": ("Global Ex USA", False),
    "Japan Equity": ("Global Ex USA", False),
    "Korea Equity": ("Global Ex USA", False),
    "Thailand Equity": ("Global Ex USA", False),
    "Mexico Equity": ("Global Ex USA", False),
    "Australia & New Zealand Equity": ("Global Ex USA", False),
    "Greater China Equity": ("Global Ex USA", False),
    "UK Equity Large Cap": ("Global Ex USA", False),
    "Emerging Markets Fixed Income": ("Global Ex USA", True),
    "Canadian Equity Large Cap": ("Global Ex USA", False),

    "Commodities Broad Basket": (None, False),
    "Commodities Specified": (None, False),
    "Target Date": (None, False),
    "Target Date 2021-2045": (None, False),
    "Target Date 2046+": (None, False),
    "Trading Tools": (None, False),
    "Currency": (None, False),
    "Uncategorized": (None, False),
}

#Section2: Load Fund Metadata and Region Mapping
def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "UseFixedIncome"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    return df.dropna(subset=["Region"])

#Section3: Load Return Time Series
def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

#Section4: Load AQRR Factor Data
#Section4: Load AQRR Factor Data
def load_aqrr_factors(region):
    from rpy2.robjects import r, pandas2ri
    from rpy2.robjects.packages import importr
    pandas2ri.activate()
    aqrr = importr("aqrr")
    dplyr = importr("dplyr")
    base = importr("base")

    r(f"""
        suppressMessages(library(aqrr))
        mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>% select(date, mkt = value)
        smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>% select(date, smb = value)
        hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>% select(date, hml = value)
        umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>% select(date, umd = value)
        qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>% select(date, qmj = value)
        bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>% select(date, bab = value)
        factors <- Reduce(function(x, y) full_join(x, y, by = "date"), list(mkt, smb, hml, umd, qmj, bab))
    """)

    from datetime import datetime
    df = pandas2ri.rpy2py(r["factors"])
    df = df.rename(columns={"date": "Date"})
    
    # Convert to datetime safely
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    # Database start of data
    df = df[df["Date"] >= pd.to_datetime("2015-01-31")]


    df = df.sort_values("Date").reset_index(drop=True)
    return df


#Section5: Load Fixed Income Factor Data
def load_fixed_income_factors():
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

#Section6: Merge Factors
def merge_all_factors(equity_df, fixed_income_df):
    equity_df["Date"] = pd.to_datetime(equity_df["Date"])
    fixed_income_df.index = pd.to_datetime(fixed_income_df.index)
    return equity_df.merge(fixed_income_df, how="left", left_on="Date", right_index=True)

#Section7: Perform Rolling Regression
def run_rolling_regression(fund, returns, factors):
    results = []
    for window in ROLLING_PERIODS:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window-1)
            y = returns.loc[start_date:end_date]
            X = factors.loc[start_date:end_date]
            if y.isnull().any() or X.isnull().any().any():
                continue
            X_const = add_constant(X)
            model = OLS(y, X_const).fit()
            diagnostics = {
                'dw': durbin_watson(model.resid),
                'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
            }
            is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
            reg_type = "Robust" if is_robust else "OLS"
            if is_robust:
                model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags':1})
            for factor in X.columns:
                coeff = model.params.get(factor, np.nan)
                pval = model.pvalues.get(factor, np.nan)
                tstat = model.tvalues.get(factor, np.nan)
                stderr = model.bse.get(factor, np.nan)
                ci_low, ci_upp = model.conf_int().loc[factor] if factor in model.params else (np.nan, np.nan)
                results.append({
                    "SymbolCUSIP": fund,
                    "MonthEndDate": end_date,
                    "RollPeriod": f"{window}m",
                    "Factor_Name": factor,
                    "Coefficient": coeff,
                    "P_Value": pval,
                    "T_Stat": tstat,
                    "Standard_Error": stderr,
                    "CI_Lower": ci_low,
                    "CI_Upper": ci_upp,
                    "Adj_R2": model.rsquared_adj,
                    "Correlation": np.corrcoef(y, model.fittedvalues)[0,1],
                    "Autocorrelation_Flag": diagnostics['dw'] < 1.5,
                    "Heteroskedasticity_Flag": diagnostics['bp_pval'] < 0.05,
                    "Regression_Type": reg_type
                })
    return results

#Section8: Main Batch Driver
def main():
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        equity_factors = load_aqrr_factors(region)
        use_fi = fund_subset["UseFixedIncome"].any()
        fixed_income_factors = load_fixed_income_factors() if use_fi else pd.DataFrame()
        all_factors = merge_all_factors(equity_factors, fixed_income_factors) if not fixed_income_factors.empty else equity_factors
        funds = fund_subset["SymbolCUSIP"].tolist()
        for i in range(0, len(funds), CHUNK_SIZE):
            chunk = funds[i:i+CHUNK_SIZE]
            fund_returns = load_fund_returns(chunk)
            records = []
            with ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(run_rolling_regression, fund, fund_returns[fund], all_factors): fund
                    for fund in fund_returns.columns
                }
                for future in tqdm(futures):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        print(f"⚠️ Error in {futures[future]}: {e}")
            if not DRY_RUN and records:
                insert_batch(records)

#Section9: Insert to Database
def insert_batch(records):
    df = pd.DataFrame(records)
    for i in range(0, len(df), BATCH_INSERT_SIZE):
        df.iloc[i:i+BATCH_INSERT_SIZE].to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)

if __name__ == "__main__":
    main()


ValueError: You are trying to merge on float64 and datetime64[ns] columns for key 'Date'. If you wish to proceed you should use pd.concat

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine, text
from tqdm import tqdm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_breusch_godfrey
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

#Section1: Configuration
import logging
from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
rpy2_logger.setLevel(logging.ERROR)

RETURN_METRIC = "1 Month Return"

connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)
ROLLING_PERIODS = [12, 24, 36, 48, 60]  # in months
DRY_RUN = True
CHUNK_SIZE = 200  # Number of funds per chunk
BATCH_INSERT_SIZE = 2000

# Updated region mapping
category_to_region = {
    "US Equity Large Cap Blend": ("USA", False),
    "US Equity Large Cap Growth": ("USA", False),
    "US Equity Large Cap Value": ("USA", False),
    "US Equity Mid Cap": ("USA", False),
    "US Equity Small Cap": ("USA", False),
    "US Fixed Income": ("USA", True),
    "US Municipal Fixed Income": ("USA", True),
    "Communications Sector Equity": ("USA", False),
    "Consumer Goods & Services Sector Equity": ("USA", False),
    "Energy Sector Equity": ("USA", False),
    "Financials Sector Equity": ("USA", False),
    "Healthcare Sector Equity": ("USA", False),
    "Industrials Sector Equity": ("USA", False),
    "Infrastructure Sector Equity": ("USA", False),
    "Other Sector Equity": ("USA", False),
    "Precious Metals Sector Equity": ("USA", False),
    "Technology Sector Equity": ("USA", False),
    "Utilities Sector Equity": ("USA", False),
    "Real Estate Sector Equity": ("USA", False),
    "Natural Resources Sector Equity": ("USA", False),
    "Options Trading": ("USA", False),
    "Multialternative": ("USA", False),
    "Market Neutral": ("USA", False),
    "Long/Short Equity": ("USA", False),
    "Alternative Miscellaneous": ("USA", False),
    "Allocation Miscellaneous": ("USA", True),
    "Fixed Income Miscellaneous": ("USA", True),
    "Equity Miscellaneous": ("USA", False),
    "Convertibles": ("USA", False),

    "Global Equity Large Cap": ("Global", False),
    "Global Equity Mid/Small Cap": ("Global", False),
    "Global Fixed Income": ("Global", True),
    "Global Emerging Markets Equity": ("Global", False),
    "Flexible Allocation": ("Global", True),
    "Aggressive Allocation": ("Global", True),
    "Moderate Allocation": ("Global", True),
    "Cautious Allocation": ("Global", True),

    "Europe Equity Large Cap": ("Global Ex USA", False),
    "Europe Equity Mid/Small Cap": ("Global Ex USA", False),
    "Asia Equity": ("Global Ex USA", False),
    "Asia ex-Japan Equity": ("Global Ex USA", False),
    "India Equity": ("Global Ex USA", False),
    "Latin America Equity": ("Global Ex USA", False),
    "Japan Equity": ("Global Ex USA", False),
    "Korea Equity": ("Global Ex USA", False),
    "Thailand Equity": ("Global Ex USA", False),
    "Mexico Equity": ("Global Ex USA", False),
    "Australia & New Zealand Equity": ("Global Ex USA", False),
    "Greater China Equity": ("Global Ex USA", False),
    "UK Equity Large Cap": ("Global Ex USA", False),
    "Emerging Markets Fixed Income": ("Global Ex USA", True),
    "Canadian Equity Large Cap": ("Global Ex USA", False),

    "Commodities Broad Basket": (None, False),
    "Commodities Specified": (None, False),
    "Target Date": (None, False),
    "Target Date 2021-2045": (None, False),
    "Target Date 2046+": (None, False),
    "Trading Tools": (None, False),
    "Currency": (None, False),
    "Uncategorized": (None, False),
}

#Section2: Load Fund Metadata and Region Mapping
def load_fund_metadata():
    query = """
    SELECT f.SymbolCUSIP, f.YC_Global_Category_ID, c.Global_Category_Name
    FROM Funds_to_Screen f
    JOIN YC_Global_Category_List c ON f.YC_Global_Category_ID = c.ID
    """
    df = pd.read_sql(query, engine)
    df[["Region", "UseFixedIncome"]] = df["Global_Category_Name"].map(category_to_region).apply(pd.Series)
    
    missing = df[df["Region"].isna()]["Global_Category_Name"].unique()
    if len(missing) > 0:
        print("⚠️ Missing category_to_region mappings for:")
        for cat in sorted(missing):
            print(f" - {cat}")
    
    return df.dropna(subset=["Region"])

#Section3: Load Return Time Series
def load_fund_returns(fund_ids):
    placeholders = ",".join([f"'{fid}'" for fid in fund_ids])
    query = f"""
        SELECT SymbolCUSIP, Date, ReturnValue
        FROM Fund_Returns_Timeseries
        WHERE SymbolCUSIP IN ({placeholders})
        AND Metric = '{RETURN_METRIC}'
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="SymbolCUSIP", values="ReturnValue")

#Section4: Load AQRR Factor Data
def load_aqrr_factors(region):
    from rpy2.robjects import r, pandas2ri
    from rpy2.robjects.conversion import localconverter
    from rpy2.robjects.packages import importr
    import pandas as pd

    aqrr = importr("aqrr")
    dplyr = importr("dplyr")
    base = importr("base")

    r(f"""
        suppressMessages(library(aqrr))
        mkt <- aqr_mkt_monthly() %>% filter(name == '{region}') %>% select(date, mkt = value)
        smb <- aqr_smb_monthly() %>% filter(name == '{region}') %>% select(date, smb = value)
        hml <- aqr_hml_ff_monthly() %>% filter(name == '{region}') %>% select(date, hml = value)
        umd <- aqr_umd_monthly() %>% filter(name == '{region}') %>% select(date, umd = value)
        qmj <- aqr_qmj_monthly() %>% filter(name == '{region}') %>% select(date, qmj = value)
        bab <- aqr_bab_monthly() %>% filter(name == '{region}') %>% select(date, bab = value)
        factors <- Reduce(function(x, y) full_join(x, y, by = "date"), list(mkt, smb, hml, umd, qmj, bab))
    """)

    with localconverter(pandas2ri.converter):
        df = pandas2ri.rpy2py(r["factors"])

    df = df.rename(columns={"date": "Date"})
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df = df[df["Date"] >= pd.to_datetime("2015-01-31")]
    df = df.sort_values("Date").reset_index(drop=True)

    return df

#Section5: Load Fixed Income Factor Data
def load_fixed_income_factors():
    query = """
        SELECT Date, Factor_Name, ReturnValue
        FROM Fixed_Income_Factor_Returns
    """
    df = pd.read_sql(query, engine, parse_dates=["Date"])
    return df.pivot(index="Date", columns="Factor_Name", values="ReturnValue")

#Section6: Merge Factors
def merge_all_factors(equity_df, fixed_income_df):
    equity_df["Date"] = pd.to_datetime(equity_df["Date"])
    fixed_income_df.index = pd.to_datetime(fixed_income_df.index)
    merged = equity_df.merge(fixed_income_df, how="left", left_on="Date", right_index=True)

    # Set date as index and drop any remaining 'Date' column just in case
    merged = merged.set_index("Date")
    return merged

#Section7: Perform Rolling Regression
def run_rolling_regression(fund, returns, factors):
    results = []
    for window in ROLLING_PERIODS:
        start = returns.index.min() + relativedelta(months=window)
        for end_date in returns.loc[returns.index >= start].index:
            start_date = end_date - relativedelta(months=window - 1)
            y = returns.loc[start_date:end_date]

            try:
                y = y.astype(float)
                X = factors.loc[start_date:end_date].copy()

                if "Date" in X.columns:
                    X = X.drop(columns=["Date"])

                X = X.astype(float)

                if y.isnull().any() or X.isnull().any().any():
                    continue

                X_const = add_constant(X)
                model = OLS(y, X_const).fit()

                diagnostics = {
                    'dw': durbin_watson(model.resid),
                    'bp_pval': het_breuschpagan(model.resid, model.model.exog)[1]
                }

                is_robust = diagnostics['dw'] < 1.5 or diagnostics['bp_pval'] < 0.05
                reg_type = "Robust" if is_robust else "OLS"

                if is_robust:
                    model = sm.OLS(y, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

                for factor in X.columns:
                    coeff = model.params.get(factor, np.nan)
                    pval = model.pvalues.get(factor, np.nan)
                    tstat = model.tvalues.get(factor, np.nan)
                    stderr = model.bse.get(factor, np.nan)
                    ci_low, ci_upp = model.conf_int().loc[factor] if factor in model.params else (np.nan, np.nan)

                    results.append({
                        "SymbolCUSIP": fund,
                        "MonthEndDate": end_date,
                        "RollPeriod": f"{window}m",
                        "Factor_Name": factor,
                        "Coefficient": coeff,
                        "P_Value": pval,
                        "T_Stat": tstat,
                        "Standard_Error": stderr,
                        "CI_Lower": ci_low,
                        "CI_Upper": ci_upp,
                        "Adj_R2": model.rsquared_adj,
                        "Correlation": np.corrcoef(y, model.fittedvalues)[0, 1],
                        "Autocorrelation_Flag": diagnostics['dw'] < 1.5,
                        "Heteroskedasticity_Flag": diagnostics['bp_pval'] < 0.05,
                        "Regression_Type": reg_type
                    })

            except Exception as e:
                print(f"⚠️ Regression error for {fund} [{start_date.date()} to {end_date.date()}]: {e}")
                print("X types:\n", X.dtypes if 'X' in locals() else 'X not defined')
                print("X head:\n", X.head() if 'X' in locals() else 'X not defined')
                print("Y head:\n", y.head() if 'y' in locals() else 'y not defined')
                continue

    return results

#Section8: Main Batch Driver
def main():
    fund_meta = load_fund_metadata()
    regions = fund_meta["Region"].unique()
    for region in regions:
        fund_subset = fund_meta[fund_meta["Region"] == region]
        equity_factors = load_aqrr_factors(region)
        use_fi = fund_subset["UseFixedIncome"].any()
        fixed_income_factors = load_fixed_income_factors() if use_fi else pd.DataFrame()
        all_factors = merge_all_factors(equity_factors, fixed_income_factors) if not fixed_income_factors.empty else equity_factors
        funds = fund_subset["SymbolCUSIP"].tolist()
        for i in range(0, len(funds), CHUNK_SIZE):
            chunk = funds[i:i+CHUNK_SIZE]
            fund_returns = load_fund_returns(chunk)
            records = []
            with ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(run_rolling_regression, fund, fund_returns[fund], all_factors): fund
                    for fund in fund_returns.columns
                }
                for future in tqdm(futures):
                    try:
                        records.extend(future.result())
                    except Exception as e:
                        print(f"⚠️ Error in {futures[future]}: {e}")
            if not DRY_RUN and records:
                insert_batch(records)

#Section9: Insert to Database
def insert_batch(records):
    df = pd.DataFrame(records)
    for i in range(0, len(df), BATCH_INSERT_SIZE):
        df.iloc[i:i+BATCH_INSERT_SIZE].to_sql("AQRR_Factor_Attribution", engine, if_exists="append", index=False)

if __name__ == "__main__":
    main()


⚠️ Missing category_to_region mappings for:
 - Commodities Broad Basket
 - Commodities Specified
 - Trading Tools
