In [None]:
#These scripts are to load AQR Factors

In [None]:
#This script loads from all papers, but new links need to be provided, and if their format changes
#then headers would need to be adjusted etc.  It checks for current data as well.

In [22]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging
import os

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')])
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine("mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes")

# File configuration
data_files = {
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)"
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)"
    },
    "TSM": {
        "path": r"C:\Users\JulianHeron\Downloads\Momentum Indices Monthly (1).xlsx",
        "header": 1,  # Corrected to 1 as per your instruction (row 2 in Excel)
        "sheet": 1,   # Sheet 2 (0-based index)
        "paper": "Time Series Momentum (Moskowitz et al., 2012)"
    },
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "factor_symbol": "BAB", "factor_name": "Betting Against Beta"},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "factor_symbol": "MKT", "factor_name": "Market"},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "factor_symbol": "SMB", "factor_name": "Small Minus Big"},
            "HML_FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "factor_symbol": "HML-FF", "factor_name": "High Minus Low (Fama-French)"},
            "HML_Devil": {"sheet": 7, "header": 18, "paper": "AQR Factors", "factor_symbol": "HML-Devil", "factor_name": "High Minus Low (AQR)"},
            "UMD": {"sheet": 8, "header": 18, "paper": "Fama-French Factors", "factor_symbol": "UMD", "factor_name": "Up Minus Down"},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "factor_symbol": "ME", "factor_name": "Market Equity"},
            "RF": {"sheet": 10, "header": 18, "paper": "Fama-French Factors", "factor_symbol": "RF", "factor_name": "Risk-Free Rate"}
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "header": 10,
        "paper": "Commodities Paper (TBD)"
    }
}

def process_century_data(file_config):
    logger.info(f"Processing {file_config['path']}...")
    try:
        df = pd.read_excel(file_config['path'], sheet_name=0, header=file_config['header'])
        logger.info(f"Raw data sample: {df.head().to_string()}")
        df = df.rename(columns={df.columns[0]: 'date'})
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        for col in df.columns[1:]:
            df[col] = pd.to_numeric(df[col].replace('', '0%').str.rstrip('%') if isinstance(df[col].iloc[0], str) else df[col], errors='coerce') / 100
        df_long = df.melt(id_vars=['date'], var_name='portfolio', value_name='value')
        df_long = df_long.dropna(subset=['value', 'date'])
        df_long['factor'] = df_long['portfolio'].str.extract('(Value|Momentum|Carry|Defensive|Multi-style|Market)')
        df_long['asset_class'] = df_long['portfolio'].str.extract('(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)')
        df_long['region'] = df_long['portfolio'].apply(lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global')
        df_long['associated_paper'] = file_config['paper']
        df_long = df_long[['factor', 'portfolio', 'asset_class', 'region', 'date', 'value', 'associated_paper']]

        with engine.connect() as connection:
            existing = pd.read_sql("SELECT factor, portfolio, date FROM aqr_century_factors", connection)
            existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
            existing_keys = set(tuple(row) for row in existing[['factor', 'portfolio', 'date']].values)
            logger.info(f"Found {len(existing_keys)} existing keys in aqr_century_factors.")

        df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
        df_long['key'] = df_long.apply(lambda row: (row['factor'], row['portfolio'], row['date_str']), axis=1)
        df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
        logger.info(f"New rows to load: {len(df_new)}")

        if not df_new.empty:
            df_new.to_sql('aqr_century_factors', engine, if_exists='append', index=False,
                          dtype={'factor': VARCHAR(50), 'portfolio': VARCHAR(100), 'asset_class': VARCHAR(50),
                                 'region': VARCHAR(50), 'date': DATE, 'value': DECIMAL(15, 6), 'associated_paper': VARCHAR(150)})
            logger.info(f"Loaded {len(df_new)} rows into aqr_century_factors.")
        else:
            logger.info("No new rows to load into aqr_century_factors.")
    except Exception as e:
        logger.error(f"Error processing {file_config['path']}: {str(e)}")

def process_aqr_data(file_config, factor_symbol, factor_name, has_portfolios=False, regions=None, sheet=0):
    logger.info(f"Processing {file_config['path']} (sheet {sheet})...")
    try:
        df = pd.read_excel(file_config['path'], sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw data sample: {df.head().to_string()}")
        if factor_symbol == "COM":
            df.columns = ['date'] + df.columns[1:].tolist()
        else:
            df = df.rename(columns={df.columns[0]: 'date'})
        df['date'] = pd.to_datetime(df['date'], format='%b-%y' if factor_symbol == "TSM" else None, errors='coerce')
        # Limit TSM to 4 columns: "date", "U.S. Large Cap", "U.S. Small Cap", "International"
        if factor_symbol == "TSM" and len(df.columns) > 4:
            df = df.iloc[:, :4]
            logger.info(f"Trimmed TSM to 4 columns: {df.columns.tolist()}")
        for col in df.columns[1:]:
            df[col] = pd.to_numeric(df[col].replace('', '0%').str.rstrip('%') if isinstance(df[col].iloc[0], str) else df[col], errors='coerce') / 100
        df_long = df.melt(id_vars=['date'], var_name='portfolio', value_name='value')
        df_long = df_long.dropna(subset=['value', 'date'])

        if has_portfolios:  # For QMJ, TSM, Commodities
            df_long['factor_symbol'] = factor_symbol
            df_long['factor_name'] = factor_name
            df_long['portfolio'] = df_long['portfolio']
            df_long['region'] = df_long['portfolio'].apply(
                lambda x: 'US' if 'U.S.' in str(x) or 'USA' in str(x) else 'Intl' if 'International' in str(x) else 'Global'
            ) if factor_symbol == "TSM" else "Global"
        else:  # For BAB, MKT, SMB, etc.
            df_long['factor_symbol'] = factor_symbol
            df_long['factor_name'] = factor_name
            df_long['portfolio'] = factor_symbol
            df_long['region'] = df_long['portfolio'].apply(
                lambda x: 'US' if 'USA' in str(x) else 'Global' if 'Global' in str(x) else 'Intl'
            )
            if regions:
                df_long = df_long[df_long['region'].isin(regions)]

        df_long['associated_paper'] = file_config['paper']
        df_long = df_long[['factor_symbol', 'factor_name', 'portfolio', 'region', 'date', 'value', 'associated_paper']]

        with engine.connect() as connection:
            existing = pd.read_sql("SELECT factor_symbol, portfolio, region, date FROM aqr_factors", connection)
            existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
            existing_keys = set(tuple(row) for row in existing[['factor_symbol', 'portfolio', 'region', 'date']].values)
            logger.info(f"Found {len(existing_keys)} existing keys in aqr_factors.")

        df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
        df_long['key'] = df_long.apply(lambda row: (row['factor_symbol'], row['portfolio'], row['region'], row['date_str']), axis=1)
        df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
        logger.info(f"New rows to load: {len(df_new)}")

        if not df_new.empty:
            df_new.to_sql('aqr_factors', engine, if_exists='append', index=False,
                          dtype={'factor_symbol': VARCHAR(20), 'factor_name': VARCHAR(100), 'portfolio': VARCHAR(100),
                                 'region': VARCHAR(50), 'date': DATE, 'value': DECIMAL(15, 6), 'associated_paper': VARCHAR(150)})
            logger.info(f"Loaded {len(df_new)} rows into aqr_factors.")
        else:
            logger.info("No new rows to load into aqr_factors.")
    except Exception as e:
        logger.error(f"Error processing {file_config['path']}: {str(e)}")

def process_multi_sheet_data(file_config):
    for factor, config in file_config['sheets'].items():
        logger.info(f"Processing sheet {factor} from {file_config['path']}...")
        try:
            df = pd.read_excel(file_config['path'], sheet_name=config['sheet'], header=config['header'])
            logger.info(f"Raw data sample for {factor}: {df.head().to_string()}")
            df = df.rename(columns={df.columns[0]: 'date'})
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            for col in df.columns[1:]:
                df[col] = pd.to_numeric(df[col].replace('', '0%').str.rstrip('%') if isinstance(df[col].iloc[0], str) else df[col], errors='coerce') / 100
            df_long = df.melt(id_vars=['date'], var_name='portfolio', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])

            if factor == "RF":  # Special case for RF
                df_long['factor_symbol'] = config['factor_symbol']
                df_long['factor_name'] = config['factor_name']
                df_long['portfolio'] = config['factor_symbol']
                df_long['region'] = 'Global'
            else:  # BAB, MKT, SMB, etc.
                df_long['factor_symbol'] = config['factor_symbol']
                df_long['factor_name'] = config['factor_name']
                df_long['portfolio'] = config['factor_symbol']
                df_long['region'] = df_long['portfolio'].apply(
                    lambda x: 'US' if 'USA' in str(x) else 'Global' if 'Global' in str(x) else 'Intl'
                )
                df_long = df_long[df_long['region'].isin(["USA", "Global", "Global Ex USA"])]

            df_long['associated_paper'] = config['paper']
            df_long = df_long[['factor_symbol', 'factor_name', 'portfolio', 'region', 'date', 'value', 'associated_paper']]

            with engine.connect() as connection:
                existing = pd.read_sql("SELECT factor_symbol, portfolio, region, date FROM aqr_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor_symbol', 'portfolio', 'region', 'date']].values)
                logger.info(f"Found {len(existing_keys)} existing keys in aqr_factors for {factor}.")

            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(lambda row: (row['factor_symbol'], row['portfolio'], row['region'], row['date_str']), axis=1)
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            logger.info(f"New rows to load for {factor}: {len(df_new)}")

            if not df_new.empty:
                df_new.to_sql('aqr_factors', engine, if_exists='append', index=False,
                              dtype={'factor_symbol': VARCHAR(20), 'factor_name': VARCHAR(100), 'portfolio': VARCHAR(100),
                                     'region': VARCHAR(50), 'date': DATE, 'value': DECIMAL(15, 6), 'associated_paper': VARCHAR(150)})
                logger.info(f"Loaded {len(df_new)} rows for {factor} into aqr_factors.")
            else:
                logger.info(f"No new rows to load for {factor} into aqr_factors.")
        except Exception as e:
            logger.error(f"Error processing sheet {factor} in {file_config['path']}: {str(e)}")

# Process all files
process_century_data(data_files["Century"])
process_aqr_data(data_files["QMJ"], "QMJ", "Quality Minus Junk", has_portfolios=True)
process_aqr_data(data_files["TSM"], "TSM", "Momentum", has_portfolios=True, sheet=data_files["TSM"]["sheet"])
process_multi_sheet_data(data_files["BAB_multi"])
process_aqr_data(data_files["COM"], "COM", "Commodities", has_portfolios=True)

logger.info("All data processing complete!")

2025-04-03 20:37:19,493 - INFO - Processing C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx...
2025-04-03 20:37:20,127 - INFO - Raw data sample:         Date  US Stock Selection Value  US Stock Selection Momentum  US Stock Selection Defensive  US Stock Selection Multi-style  Intl Stock Selection Value  Intl Stock Selection Momentum  Intl Stock Selection Defensive  Intl Stock Selection Multi-style  Equity indices Value  Equity indices Momentum  Equity indices Carry  Equity indices Defensive  Equity indices Multi-style  Fixed income Value  Fixed income Momentum  Fixed income Carry  Fixed income Defensive  Fixed income Multi-style  Currencies Value  Currencies Momentum  Currencies Carry  Currencies Multi-style  Commodities Value  Commodities Momentum  Commodities Carry  Commodities Multi-style  All Stock Selection Value  All Stock Selection Momentum  All Stock Selection Defensive  All Stock Selection Multi-style  All Macro Value  All Macro Momentum  All Macro Carr

In [None]:
# Updated Loading script to include TSM

In [2]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection string
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# Toggle for dataset selection
load_qmj = False  # Set to True for QMJ
load_tsmom = True  # Set to True for TSMOM
if load_qmj:
    filepath = r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx"
    paper = "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)"
    portfolio_columns = [
        "P1 (low quality)", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10 (high quality)", "P10-P1"
    ]
    header_row = 18  # Header row for QMJ dataset
    def classify_portfolio(portfolio_name):
        if portfolio_name == "P10-P1":
            factor = "QMJ"
            asset_class = "Stock Selection"
            region = "Global"
        elif portfolio_name == "P1 (low quality)":
            factor = "QMJ-P1"
            asset_class = "Stock Selection"
            region = "Global"
        elif portfolio_name == "P10 (high quality)":
            factor = "QMJ-P10"
            asset_class = "Stock Selection"
            region = "Global"
        elif portfolio_name in ["P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9"]:
            factor = f"QMJ-{portfolio_name}"
            asset_class = "Stock Selection"
            region = "Global"
        else:
            raise ValueError(f"Unrecognized portfolio name: {portfolio_name}")
        return factor, asset_class, region
elif load_tsmom:
    filepath = r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx"
    paper = "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)"
    portfolio_columns = [
        "TSMOM", "TSMOM^CM", "TSMOM^EQ", "TSMOM^FI", "TSMOM^FX"
    ]
    header_row = 17  # Header row for TSMOM dataset (row 18 in Excel, 0-based index 17)
    def classify_portfolio(portfolio_name):
        # Map column names to desired factor names
        factor_map = {
            "TSMOM": "TSMOM",
            "TSMOM^CM": "TSM-Commodities",
            "TSMOM^EQ": "TSM-Equity",
            "TSMOM^FI": "TSM-Fixed-Income",
            "TSMOM^FX": "TSM-Currency"
        }
        factor = factor_map.get(portfolio_name, portfolio_name)
        
        # Define asset class and region based on portfolio
        if portfolio_name == "TSMOM":
            asset_class = "Multi-Asset"
            region = "Global"
        elif portfolio_name == "TSMOM^CM":
            asset_class = "Commodities"
            region = "Global"
        elif portfolio_name == "TSMOM^EQ":
            asset_class = "Equity"
            region = "Global"
        elif portfolio_name == "TSMOM^FI":
            asset_class = "Fixed Income"
            region = "Global"
        elif portfolio_name == "TSMOM^FX":
            asset_class = "Currencies"
            region = "Global"
        else:
            raise ValueError(f"Unrecognized portfolio name: {portfolio_name}")
        return factor, asset_class, region
else:
    filepath = r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx"
    paper = "Century of Factor Premia Monthly-Ilmanen et al. (2021)"
    portfolio_columns = [
        "US Stock Selection Value", "US Stock Selection Momentum", "US Stock Selection Defensive", "US Stock Selection Multi-style",
        "Intl Stock Selection Value", "Intl Stock Selection Momentum", "Intl Stock Selection Defensive", "Intl Stock Selection Multi-style",
        "Equity indices Value", "Equity indices Momentum", "Equity indices Carry", "Equity indices Defensive", "Equity indices Multi-style",
        "Fixed income Value", "Fixed income Momentum", "Fixed income Carry", "Fixed income Defensive", "Fixed income Multi-style",
        "Currencies Value", "Currencies Momentum", "Currencies Carry", "Currencies Multi-style",
        "Commodities Value", "Commodities Momentum", "Commodities Carry", "Commodities Multi-style",
        "All Stock Selection Value", "All Stock Selection Momentum", "All Stock Selection Defensive", "All Stock Selection Multi-style",
        "All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style",
        "All asset classes Value", "All asset classes Momentum", "All asset classes Carry", "All asset classes Defensive", "All asset classes Multi-style",
        "Equity indices Market", "Fixed income Market", "Commodities Market", "All Macro Market"
    ]
    header_row = 18  # Header row for Century dataset
    def classify_portfolio(portfolio_name):
        # Use the full portfolio name as the factor to ensure uniqueness
        factor = portfolio_name  # e.g., "US Stock Selection Value"
        if "US Stock Selection" in portfolio_name:
            asset_class = "Stock Selection"
            region = "US"
        elif "Intl Stock Selection" in portfolio_name:
            asset_class = "Stock Selection"
            region = "Intl"
        elif "All Stock Selection" in portfolio_name:
            asset_class = "All Stock Selection"
            region = "Global"
        elif "Equity indices" in portfolio_name:
            asset_class = "Equity Indices"
            region = "Global"
        elif "Fixed income" in portfolio_name:
            asset_class = "Fixed Income"
            region = None
        elif "Currencies" in portfolio_name:
            asset_class = "Currencies"
            region = None
        elif "Commodities" in portfolio_name:
            asset_class = "Commodities"
            region = None
        elif "All Macro" in portfolio_name:
            asset_class = "All Macro"
            region = "Global"
        elif "All asset classes" in portfolio_name:
            asset_class = "All Asset Classes"
            region = "Global"
        else:
            raise ValueError(f"Unrecognized asset class in portfolio name: {portfolio_name}")
        return factor, asset_class, region

# Process the Excel file
logger.info(f"Processing {filepath}...")

# Verify table state (optional pre-check)
with engine.connect() as connection:
    row_count = connection.execute(text("SELECT COUNT(*) FROM factor_returns")).scalar()
    if row_count > 0:
        logger.info(f"Table contains {row_count} rows. Will filter out duplicates before loading.")
    else:
        logger.info(f"Table is empty with {row_count} rows.")

try:
    # Read the Excel file with the specified header row
    df = pd.read_excel(filepath, sheet_name=0, header=header_row)
    logger.info(f"Read {len(df)} rows with columns: {list(df.columns)}")

    # Log a sample of the data to debug
    logger.info(f"First 5 rows sample: {df.head().to_string()}")

    # Rename first column to 'date'
    df = df.rename(columns={df.columns[0]: 'date'})

    # Convert date column to datetime
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Filter out invalid dates
    df = df[df['date'].notna()]
    logger.info(f"After filtering invalid dates, {len(df)} rows remain.")

    # Filter to only known portfolio columns (strictly enforce 39 columns for Century, 11 for QMJ, 5 for TSMOM)
    valid_columns = ['date'] + portfolio_columns
    available_columns = [col for col in df.columns if col in valid_columns]
    expected_count = 39 if not load_qmj and not load_tsmom else (5 if load_tsmom else 11)
    if len(available_columns) - 1 != expected_count:  # -1 for 'date'
        missing = [col for col in valid_columns if col not in df.columns]
        extra = [col for col in df.columns if col not in valid_columns]
        logger.warning(f"Expected {expected_count} portfolios, found {len(available_columns) - 1}. Missing: {missing}, Extra: {extra}")
    df = df[available_columns]
    logger.info(f"Filtered to {len(df.columns) - 1} portfolio columns: {available_columns[1:]}")

    # Melt to long format
    df_long = df.melt(id_vars=['date'], var_name='portfolio_name', value_name='value')
    logger.info(f"Melted to {len(df_long)} rows.")

    # Remove rows with missing values
    df_long = df_long.dropna(subset=['value'])
    logger.info(f"After dropping rows with missing values, {len(df_long)} rows remain.")

    # Apply classification
    df_long[['factor', 'asset_class', 'region']] = df_long['portfolio_name'].apply(
        lambda x: pd.Series(classify_portfolio(x))
    )

    # Add associated paper
    df_long['associated_paper'] = paper

    # Ensure column names match table
    df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]

    # Drop duplicates within the new data
    df_long = df_long.drop_duplicates(subset=['factor', 'date', 'associated_paper', 'asset_class'])
    logger.info(f"After dropping duplicates within new data, {len(df_long)} rows remain.")

    # Fetch existing keys from the table
    with engine.connect() as connection:
        existing_keys = pd.read_sql(
            text("SELECT factor, date, associated_paper, asset_class FROM factor_returns"),
            connection
        )
        existing_keys['date'] = pd.to_datetime(existing_keys['date']).dt.tz_localize(None)  # Remove timezone
        # Convert to string for consistent comparison
        existing_keys['date_str'] = existing_keys['date'].dt.strftime('%Y-%m-%d')
        existing_keys_set = set(
            (row['factor'], row['date_str'], row['associated_paper'], row['asset_class'])
            for _, row in existing_keys.iterrows()
        )
        logger.info(f"Found {len(existing_keys_set)} existing keys in the table.")
        # Log a sample of existing keys for debugging
        logger.info(f"Sample of existing keys: {list(existing_keys_set)[:5]}")

    # Filter out rows that already exist in the table
    df_long['date'] = df_long['date'].dt.tz_localize(None)  # Remove timezone
    df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
    df_long['key'] = df_long.apply(
        lambda row: (row['factor'], row['date_str'], row['associated_paper'], row['asset_class']), axis=1
    )
    # Log a sample of new keys for debugging
    logger.info(f"Sample of new keys: {list(df_long['key'].head())}")
    df_new = df_long[~df_long['key'].isin(existing_keys_set)].drop(columns=['date_str', 'key'])
    logger.info(f"After filtering out existing keys, {len(df_new)} new rows to load.")

    # Log a sample of new data to load
    if not df_new.empty:
        logger.info(f"Sample of new data to load: {df_new.head().to_string()}")
    else:
        logger.info("No new data to load after filtering.")

    # Load only new rows into MSSQL database with batching
    if not df_new.empty:
        df_new.to_sql(
            'factor_returns',
            engine,
            if_exists='append',
            index=False,
            dtype={
                'factor': VARCHAR(50),
                'date': DATE,
                'associated_paper': VARCHAR(100),
                'asset_class': VARCHAR(50),
                'value': DECIMAL(15, 6),
                'region': VARCHAR(50)
            },
            chunksize=1000
        )
        logger.info(f"Successfully loaded {len(df_new)} new rows into database.")
    else:
        logger.info("No new rows to load after filtering duplicates.")

    # Verify total rows
    with engine.connect() as connection:
        total_rows = connection.execute(text("SELECT COUNT(*) FROM factor_returns")).scalar()
        logger.info(f"Total rows in factor_returns: {total_rows}")

except Exception as e:
    logger.error(f"Error loading data: {str(e)}")

logger.info("Processing complete!")

2025-04-06 16:08:52,643 - INFO - Processing C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx...
2025-04-06 16:08:52,653 - INFO - Table contains 4537 rows. Will filter out duplicates before loading.
2025-04-06 16:08:52,860 - INFO - Read 481 rows with columns: ['Unnamed: 0', 'TSMOM', 'TSMOM^CM', 'TSMOM^EQ', 'TSMOM^FI', 'TSMOM^FX']
2025-04-06 16:08:52,865 - INFO - First 5 rows sample:   Unnamed: 0     TSMOM  TSMOM^CM  TSMOM^EQ  TSMOM^FI  TSMOM^FX
0 1985-01-31  0.043066 -0.014042  0.153376 -0.015625  0.056041
1 1985-02-28  0.038128  0.047449  0.043061 -0.193815  0.099316
2 1985-03-29 -0.052719 -0.083491  0.032803  0.071045 -0.117085
3 1985-04-30  0.039634  0.060051  0.024413  0.047500  0.019504
4 1985-05-31  0.063918  0.045516  0.121216  0.144524  0.020146
2025-04-06 16:08:52,868 - INFO - After filtering invalid dates, 481 rows remain.
2025-04-06 16:08:52,878 - INFO - Filtered to 5 portfolio columns: ['TSMOM', 'TSMOM^CM', 'TSMOM^EQ', 'TSMOM^FI', 'TSMOM^FX']
2025-04-

In [None]:
# This is the Quality Factor Loading QMJ

In [12]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection string
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# File path
filepath = r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx"

# Associated paper
paper = "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)"

# Known portfolio columns (11 total, updated to match file)
portfolio_columns = [
    "P1 (low quality)", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10 (high quality)", "P10-P1"
]

# Mapping function to classify portfolios
def classify_portfolio(portfolio_name):
    if portfolio_name == "P10-P1":
        factor = "QMJ"  # Factor value for Quality Minus Junk
        asset_class = "Stock Selection"
        region = "Global"
    elif portfolio_name == "P1 (low quality)":
        factor = "QMJ-P1"
        asset_class = "Stock Selection"
        region = "Global"
    elif portfolio_name == "P10 (high quality)":
        factor = "QMJ-P10"
        asset_class = "Stock Selection"
        region = "Global"
    elif portfolio_name in ["P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9"]:
        factor = f"QMJ-{portfolio_name}"  # e.g., QMJ-P2, QMJ-P3, ..., QMJ-P9
        asset_class = "Stock Selection"
        region = "Global"
    else:
        raise ValueError(f"Unrecognized portfolio name: {portfolio_name}")

    return factor, asset_class, region

# Process the Excel file
logger.info(f"Processing {filepath}...")

try:
    # Read the Excel file with header=18
    df = pd.read_excel(filepath, sheet_name=0, header=18)
    logger.info(f"Read {len(df)} rows with columns: {list(df.columns)}")

    # Rename first column to 'date'
    df = df.rename(columns={df.columns[0]: 'date'})

    # Convert date column to datetime
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Filter out invalid dates
    df = df[df['date'].notna()]
    logger.info(f"After filtering invalid dates, {len(df)} rows remain.")

    # Filter to only known portfolio columns
    valid_columns = ['date'] + portfolio_columns
    df = df[valid_columns]
    missing = [col for col in portfolio_columns if col not in df.columns]
    if missing:
        logger.warning(f"Missing columns: {missing}")
    extra = [col for col in df.columns if col not in valid_columns]
    if extra:
        logger.warning(f"Extra columns ignored: {extra}")
    logger.info(f"Filtered to {len(df.columns) - 1} portfolio columns: {valid_columns[1:]}")

    # Melt to long format
    df_long = df.melt(id_vars=['date'], var_name='portfolio_name', value_name='value')
    logger.info(f"Melted to {len(df_long)} rows.")

    # Remove rows with missing values
    df_long = df_long.dropna(subset=['value'])
    logger.info(f"After dropping rows with missing values, {len(df_long)} rows remain.")

    # Apply classification
    df_long[['factor', 'asset_class', 'region']] = df_long['portfolio_name'].apply(
        lambda x: pd.Series(classify_portfolio(x))
    )

    # Add associated paper
    df_long['associated_paper'] = paper

    # Ensure column names match table
    df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]

    # Drop duplicates based on primary key
    df_long = df_long.drop_duplicates(subset=['factor', 'date', 'associated_paper', 'asset_class'])
    logger.info(f"After dropping duplicates, {len(df_long)} rows remain.")

    # Load into MSSQL database
    df_long.to_sql(
        'factor_returns',
        engine,
        if_exists='append',
        index=False,
        dtype={
            'factor': VARCHAR(50),
            'date': DATE,
            'associated_paper': VARCHAR(100),
            'asset_class': VARCHAR(50),
            'value': DECIMAL(15, 6),
            'region': VARCHAR(50)
        }
    )
    logger.info(f"Successfully loaded {len(df_long)} rows into database.")

except Exception as e:
    logger.error(f"Error loading data: {str(e)}")

logger.info("Processing complete!")

2025-04-03 18:20:12,111 - INFO - Processing C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx...
2025-04-03 18:20:12,487 - INFO - Read 809 rows with columns: ['DATE', 'P1 (low quality)', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10 (high quality)', 'P10-P1', 'P1 (low quality).1', 'P2.1', 'P3.1', 'P4.1', 'P5.1', 'P6.1', 'P7.1', 'P8.1', 'P9.1', 'P10 (high quality).1', 'P10-P1.1']
2025-04-03 18:20:12,493 - INFO - After filtering invalid dates, 809 rows remain.
2025-04-03 18:20:12,495 - INFO - Filtered to 11 portfolio columns: ['P1 (low quality)', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10 (high quality)', 'P10-P1']
2025-04-03 18:20:12,501 - INFO - Melted to 8899 rows.
2025-04-03 18:20:12,505 - INFO - After dropping rows with missing values, 8899 rows remain.
2025-04-03 18:20:13,391 - INFO - After dropping duplicates, 8899 rows remain.
2025-04-03 18:20:14,209 - INFO - Successfully loaded 8899 rows into database.
2025-04-03 18:20:14

In [None]:
# New consolidated load factors script

In [7]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 19, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 19, "paper": "Fama-French Factors", "columns": {"USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 19, "paper": "Fama-French Factors", "columns": {"USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 19, "paper": "Fama-French Factors", "columns": {"USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 19, "paper": "The Devil's in HML's Details", "columns": {"USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 19, "paper": "On Persistence in Mutual Fund Performance", "columns": {"USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 19, "paper": "AQR Factors", "columns": {"USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 19,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with single header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excess_spot_return_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            df = df.rename(columns={df.columns[0]: 'date'})
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                df = df[[col for col in df.columns if col in valid_cols]]
                df.columns = [file_config['columns'].get(col, col) for col in df.columns]
            
            df_long = df.melt(id_vars=['date'], var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                )
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'] else "Fixed Income"
                df_long['region'] = df_long['factor'].apply(
                    lambda x: "USA" if 'USA' in df.columns[df.columns.get_loc(x)] or x == "RF"
                    else "Global" if 'Global' in df.columns[df.columns.get_loc(x)]
                    else "Intl"
                )
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets
for key, config in data_files.items():
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    elif key == "COM":
        process_factors(config, sheet=config['sheet'], is_com=True)
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")

2025-04-10 17:23:29,797 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:23:32,388 - INFO - Raw columns: ['12/31/1930', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', -0.000557986425086, 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29']
2025-04-10 17:23:32,419 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): "The following id_vars or value_vars are not present in the DataFrame: ['date']"
2025-04-10 17:23:32,421 - INFO -

In [8]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 19,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with correct header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excess_spot_return_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            # Rename date column based on actual header
            date_col = next((col for col in df.columns if 'DATE' in col.upper() or 'Date' in col), df.columns[0])
            df = df.rename(columns={date_col: 'date'})
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                df = df[[col for col in df.columns if col in valid_cols]]
                df.columns = [file_config['columns'].get(col, col) for col in df.columns]
            
            df_long = df.melt(id_vars=['date'], var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                )
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                df_long['region'] = df_long.apply(
                    lambda row: "USA" if 'USA' in df.columns[df.columns.get_loc(row['factor'])] or row['factor'] == "RF"
                    else "Global" if 'Global' in df.columns[df.columns.get_loc(row['factor'])]
                    else "Intl", axis=1
                )
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets
for key, config in data_files.items():
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    elif key == "COM":
        process_factors(config, sheet=config['sheet'], is_com=True)
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")

2025-04-10 17:25:57,534 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:25:59,914 - INFO - Raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 17:25:59,930 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): "The following id_vars or value_vars are not present in the DataFrame: ['date']"
2025-04-10 17:25:59,931 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 4)...
2025-04-10 17:26:01,416 - INFO - Raw colu

In [14]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 19,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with correct header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excessЗапрос спот_возврат_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            # Identify date column
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                df.columns[0]
            )
            
            # Special handling for Quality Minus Junk due to header mismatch
            if 'Quality Minus Junk' in file_path:
                expected_cols = list(file_config['columns'].keys())
                if not any(col in df.columns for col in expected_cols[1:]):  # Exclude DATE
                    logger.warning("Expected columns not found in Quality Minus Junk file. Assigning columns manually.")
                    df.columns = [
                        'DATE', 'P1 (low quality)', 'P2', 'P3', 'P4', 'P5', 
                        'P6', 'P7', 'P8', 'P9', 'P10 (high quality)', 'P10-P1'
                    ]
            
            # Filter and rename columns
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = []
                for col in df.columns:
                    if col == date_col:
                        new_columns.append('date')
                    else:
                        new_columns.append(file_config['columns'].get(col, col))
                df.columns = new_columns
            else:
                df = df.rename(columns={date_col: 'date'})
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            # Melt the DataFrame
            df_long = df.melt(id_vars=['date'], var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            
            # Assign asset class and region
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                )[0]
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config.get('columns', {}).values() else "Fixed Income"
                region_map = {}
                for col, new_name in file_config.get('columns', {}).items():
                    if col == 'DATE':
                        continue
                    if col == 'USA':
                        region_map[new_name] = 'USA'
                    elif col == 'Global':
                        region_map[new_name] = 'Global'
                    elif col == 'Global Ex USA':
                        region_map[new_name] = 'Intl'
                    else:
                        region_map[new_name] = 'Intl'
                region_map['RF'] = 'USA'
                df_long['region'] = df_long['factor'].map(region_map)
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(
                lambda row: (
                    row['factor'],
                    row['date_str'],
                    row['associated_paper'] if row['associated_paper'] else 'None',
                    row['asset_class'] if row['asset_class'] else 'None'
                ), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets
for key, config in data_files.items():
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    elif key == "COM":
        process_factors(config, sheet=config['sheet'], is_com=True)
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")


2025-04-10 17:31:55,474 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:31:56,894 - INFO - Raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 17:31:56,896 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-10 17:31:56,918 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): 'path'
2025-04-10 17:31:56,920 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 4)...
2025-04-10 17:31:57,618 - IN

In [15]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with correct header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excess_spot_return_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            # Identify date column
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                df.columns[0]
            )
            
            # Filter and rename columns
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = []
                for col in df.columns:
                    if col == date_col:
                        new_columns.append('date')
                    else:
                        new_columns.append(file_config['columns'].get(col, col))
                df.columns = new_columns
            else:
                df = df.rename(columns={date_col: 'date'})
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            # Melt the DataFrame
            df_long = df.melt(id_vars=['date'], var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            
            # Assign asset class and region
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                )
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                region_map = {}
                for col, factor in file_config['columns'].items():
                    if col == 'DATE':
                        continue
                    if col == 'USA':
                        region_map[factor] = 'USA'
                    elif col == 'Global':
                        region_map[factor] = 'Global'
                    elif col == 'Global Ex USA':
                        region_map[factor] = 'Intl'
                region_map['RF'] = 'USA'
                df_long['region'] = df_long['factor'].map(region_map)
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets
for key, config in data_files.items():
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    elif key == "COM":
        process_factors(config, sheet=config['sheet'], is_com=True)
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")

2025-04-10 17:33:07,210 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:33:08,533 - INFO - Raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 17:33:08,536 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-10 17:33:08,547 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): 'path'
2025-04-10 17:33:08,549 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 4)...
2025-04-10 17:33:09,167 - IN

In [16]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with correct header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excess_spot_return_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            # Identify date column
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                df.columns[0]
            )
            
            # Filter and rename columns
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = []
                for col in df.columns:
                    if col == date_col:
                        new_columns.append('date')
                    else:
                        new_columns.append(file_config['columns'].get(col, col))
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            # Melt the DataFrame
            df_long = df.melt(id_vars=['date'], var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.info(f"Rows after melting: {len(df_long)}")
            
            # Assign asset class and region
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')  # Default to Equity if no match
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                region_map = {}
                for col, factor in file_config['columns'].items():
                    if col == 'DATE':
                        continue
                    if col == 'USA':
                        region_map[factor] = 'USA'
                    elif col == 'Global':
                        region_map[factor] = 'Global'
                    elif col == 'Global Ex USA':
                        region_map[factor] = 'Intl'
                region_map['RF'] = 'USA'
                df_long['region'] = df_long['factor'].map(region_map)
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                # Handle NULL asset_class in existing data
                existing['asset_class'] = existing['asset_class'].fillna('Unknown')
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['asset_class'] = df_long['asset_class'].fillna('Unknown')  # Ensure no NULLs
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets
for key, config in data_files.items():
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    elif key == "COM":
        process_factors(config, sheet=config['sheet'], is_com=True)
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")

2025-04-10 17:35:29,053 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:35:30,378 - INFO - Raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 17:35:30,380 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-10 17:35:30,392 - INFO - Rows after melting: 2039
2025-04-10 17:35:30,394 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): 'path'
2025-04-10 17:35:30,395 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Fact

In [17]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with correct header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excess_spot_return_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            # Identify date column
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                df.columns[0]
            )
            
            # Filter and rename columns
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = []
                for col in df.columns:
                    if col == date_col:
                        new_columns.append('date')
                    else:
                        new_columns.append(file_config['columns'].get(col, col))
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            # Melt the DataFrame
            df_long = df.melt(id_vars=['date'], var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.info(f"Rows after melting: {len(df_long)}")
            
            # Assign asset class and region
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                # Create region mapping based on original column names
                region_map = {}
                orig_cols = list(file_config['columns'].keys())
                for i, col in enumerate(df.columns):
                    if col == 'date':
                        continue
                    orig_col = list(file_config['columns'].keys())[i]  # Match by position
                    factor = file_config['columns'][orig_col]
                    if orig_col == 'USA':
                        region_map[factor + '_USA'] = 'USA'
                    elif orig_col == 'Global':
                        region_map[factor + '_Global'] = 'Global'
                    elif orig_col == 'Global Ex USA':
                        region_map[factor + '_Intl'] = 'Intl'
                    elif orig_col == 'Risk Free Rate':
                        region_map['RF'] = 'USA'
                # Apply region based on factor and original column
                df_long['region'] = df_long.apply(
                    lambda row: next((reg for key, reg in region_map.items() if row['factor'] in key), 'Unknown'), axis=1
                )
                # Adjust factor to avoid duplicates
                df_long['factor'] = df_long.apply(
                    lambda row: f"{row['factor']}_{row['region']}" if row['factor'] != 'RF' else 'RF', axis=1
                )
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                existing['asset_class'] = existing['asset_class'].fillna('Unknown')
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['asset_class'] = df_long['asset_class'].fillna('Unknown')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets, skipping Century and COM since they're complete
for key, config in data_files.items():
    if key in ["Century", "COM"]:
        logger.info(f"Skipping {key} as data is already fully loaded.")
        continue
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")

2025-04-10 17:39:12,908 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:39:14,416 - INFO - Raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 17:39:14,419 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-10 17:39:14,432 - INFO - Rows after melting: 2039
2025-04-10 17:39:14,434 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): 'path'
2025-04-10 17:39:14,436 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Fact

In [18]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with correct header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excess_spot_return_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            # Identify date column
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                df.columns[0]
            )
            
            # Filter and rename columns
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = []
                for col in df.columns:
                    if col == date_col:
                        new_columns.append('date')
                    else:
                        new_columns.append(file_config['columns'].get(col, col))
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            # Melt the DataFrame with value_vars to track original columns
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.info(f"Rows after melting: {len(df_long)}")
            
            # Assign asset class and region
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                # Map regions based on original column names
                region_map = {
                    'USA': 'USA',
                    'Global': 'Global',
                    'Global Ex USA': 'Intl',
                    'Risk Free Rate': 'USA'
                }
                df_long['region'] = df_long['factor'].map(
                    lambda x: region_map.get(next((k for k, v in file_config['columns'].items() if v == x and k != 'DATE'), ''), 'Unknown')
                )
                # Append region to factor to avoid duplicates
                df_long['factor'] = df_long.apply(
                    lambda row: f"{row['factor']}_{row['region']}" if row['factor'] != 'RF' and row['region'] != 'Unknown' else row['factor'], axis=1
                )
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                existing['asset_class'] = existing['asset_class'].fillna('Unknown')
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['asset_class'] = df_long['asset_class'].fillna('Unknown')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets, skipping Century and COM since they're complete
for key, config in data_files.items():
    if key in ["Century", "COM"]:
        logger.info(f"Skipping {key} as data is already fully loaded.")
        continue
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")

2025-04-10 17:42:27,129 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:42:28,463 - INFO - Raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 17:42:28,466 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-10 17:42:28,477 - INFO - Rows after melting: 2039
2025-04-10 17:42:28,479 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): 'path'
2025-04-10 17:42:28,480 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Fact

In [19]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with correct header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excess_spot_return_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            # Identify date column
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                df.columns[0]
            )
            
            # Filter and rename columns, keeping original column names for region mapping
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                orig_cols = df.columns.tolist()
                new_columns = []
                for col in df.columns:
                    if col == date_col:
                        new_columns.append('date')
                    else:
                        new_columns.append(file_config['columns'].get(col, col))
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            # Melt the DataFrame with value_vars to track original columns
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.info(f"Rows after melting: {len(df_long)}")
            
            # Assign asset class and region
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                # Map regions based on original column names
                region_map = {}
                factor_map = {}
                for orig_col, factor in file_config['columns'].items():
                    if orig_col == 'DATE':
                        continue
                    if orig_col == 'USA':
                        region_map[factor] = 'USA'
                    elif orig_col == 'Global':
                        region_map[factor] = 'Global'
                    elif orig_col == 'Global Ex USA':
                        region_map[factor] = 'Intl'
                    elif orig_col == 'Risk Free Rate':
                        region_map[factor] = 'USA'
                    factor_map[orig_col] = factor
                
                # Assign region and adjust factor
                df_long['region'] = df_long['factor'].map(lambda x: region_map.get(x, 'Unknown'))
                df_long['factor'] = df_long.apply(
                    lambda row: f"{row['factor']}_{row['region']}" if row['factor'] != 'RF' and row['region'] != 'Unknown' else row['factor'], axis=1
                )
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                existing['asset_class'] = existing['asset_class'].fillna('Unknown')
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['asset_class'] = df_long['asset_class'].fillna('Unknown')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets, skipping Century and COM since they're complete
for key, config in data_files.items():
    if key in ["Century", "COM"]:
        logger.info(f"Skipping {key} as data is already fully loaded.")
        continue
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")

2025-04-10 17:44:59,653 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:45:00,932 - INFO - Raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 17:45:00,934 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-10 17:45:00,946 - INFO - Rows after melting: 2039
2025-04-10 17:45:00,947 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): 'path'
2025-04-10 17:45:00,948 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Fact

In [22]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
from sqlalchemy.sql import text
import logging

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations (Lines 20-149)
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run"
    }
}

# Line 150
def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet})...")
    try:
        # Read Excel with correct header row
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Raw columns: {list(df.columns)}")
        
        if is_com:
            df.columns = [
                'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                'state_backwardation_contango', 'state_inflation'
            ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'aqr_cmdty_factors',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'date': DATE,
                        'excess_return_eqwt': DECIMAL(15, 6),
                        'excess_spot_return_eqwt': DECIMAL(15, 6),
                        'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                        'spot_return_eqwt': DECIMAL(15, 6),
                        'carry_eqwt': DECIMAL(15, 6),
                        'excess_return_long_short': DECIMAL(15, 6),
                        'excess_spot_return_long_short': DECIMAL(15, 6),
                        'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                        'aggregate_backwardation_contango': DECIMAL(15, 6),
                        'state_backwardation_contango': VARCHAR(50),
                        'state_inflation': VARCHAR(50),
                        'associated_paper': VARCHAR(100)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors.")
            else:
                logger.info("No new rows to load into aqr_cmdty_factors.")
        else:
            # Identify date column (Line 209)
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                df.columns[0]
            )
            
            # Filter and rename columns, keeping track of original columns (Line 214)
            orig_cols = df.columns.tolist()
            if 'columns' in file_config:
                valid_cols = list(file_config['columns'].keys())
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = []
                for col in df.columns:
                    if col == date_col:
                        new_columns.append('date')
                    else:
                        new_columns.append(file_config['columns'].get(col, col))
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates (Line 236)
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            # Melt the DataFrame with value_vars to track original columns (Line 239)
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.info(f"Rows after melting: {len(df_long)}")
            
            # Assign asset class and region (Line 246)
            if 'TSMOM' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_config['path']:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_config['path']:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi (Line 260)
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                # Map regions based on original column names before melting (Lines 262-272)
                region_map = {}
                for i, col in enumerate(orig_cols):
                    if col == date_col:
                        continue
                    factor = file_config['columns'].get(col, col)
                    if col == 'USA':
                        region_map[factor] = 'USA'
                    elif col == 'Global':
                        region_map[factor] = 'Global'
                    elif col == 'Global Ex USA':
                        region_map[factor] = 'Intl'
                    elif col == 'Risk Free Rate':
                        region_map[factor] = 'USA'
                # Assign region and adjust factor (Lines 274-278)
                df_long['region'] = df_long['factor'].map(region_map)
                df_long['factor'] = df_long.apply(
                    lambda row: f"{row['factor']}_{row['region']}" if row['factor'] != 'RF' and row['region'] is not None else row['factor'], axis=1
                )
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, asset_class FROM factor_returns",
                    connection
                )
                existing['asset_class'] = existing['asset_class'].fillna('Unknown')
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['asset_class'] = df_long['asset_class'].fillna('Unknown')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                df_new.to_sql(
                    'factor_returns',
                    engine,
                    if_exists='append',
                    index=False,
                    dtype={
                        'factor': VARCHAR(50),
                        'date': DATE,
                        'associated_paper': VARCHAR(100),
                        'asset_class': VARCHAR(50),
                        'value': DECIMAL(15, 6),
                        'region': VARCHAR(50)
                    }
                )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet}).")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet}).")
            
    except Exception as e:  # Line 366
        logger.error(f"Error processing {file_path} (sheet {sheet}): {str(e)}")

# Process all datasets, skipping Century and COM since they're complete (Line 369)
for key, config in data_files.items():
    if key in ["Century", "COM"]:
        logger.info(f"Skipping {key} as data is already fully loaded.")
        continue
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        process_factors(config, sheet=config['sheet'])

logger.info("All data processing complete!")

2025-04-10 17:56:45,191 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0)...
2025-04-10 17:56:46,503 - INFO - Raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 17:56:46,505 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-10 17:56:46,518 - INFO - Rows after melting: 2039
2025-04-10 17:56:46,519 - ERROR - Error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): 'path'
2025-04-10 17:56:46,521 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Fact

In [23]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Logging setup - Use DEBUG for detailed output during development
logging.basicConfig(
    level=logging.DEBUG,  # Changed to DEBUG for more visibility
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": 0,
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run",
        "columns": {  # Added for flexibility
            "Date": "date",
            "Excess Return (Eq Wt)": "excess_return_eqwt",
            "Excess Spot Return (Eq Wt)": "excess_spot_return_eqwt",
            "IR-Adjusted Carry (Eq Wt)": "ir_adjusted_carry_eqwt",
            "Spot Return (Eq Wt)": "spot_return_eqwt",
            "Carry (Eq Wt)": "carry_eqwt",
            "Excess Return (Long-Short)": "excess_return_long_short",
            "Excess Spot Return (Long-Short)": "excess_spot_return_long_short",
            "IR-Adjusted Carry (Long-Short)": "ir_adjusted_carry_long_short",
            "Aggregate Backwardation/Contango": "aggregate_backwardation_contango",
            "State Backwardation/Contango": "state_backwardation_contango",
            "State Inflation": "state_inflation"
        }
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet}) with is_com={is_com}")
    try:
        # Read Excel
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Read {len(df)} rows with raw columns: {list(df.columns)}")
        
        if is_com:
            # Commodity processing
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                missing_cols = [col for col in expected_cols if col not in df.columns]
                if missing_cols:
                    logger.warning(f"Missing columns in {file_path} (sheet {sheet}): {missing_cols}")
                df = df[[col for col in df.columns if col in expected_cols]]
                df.columns = [file_config['columns'][col] for col in df.columns]
            else:
                logger.warning(f"No column mapping provided for {file_path} (sheet {sheet}). Using default commodity columns.")
                df.columns = [
                    'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                    'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                    'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                    'state_backwardation_contango', 'state_inflation'
                ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper']
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            with engine.connect() as connection:
                # Optimize query by limiting date range
                min_date = df['date'].min().strftime('%Y-%m-%d')
                existing = pd.read_sql(f"SELECT date FROM aqr_cmdty_factors WHERE date >= '{min_date}'", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                # Batch write for large datasets
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'aqr_cmdty_factors',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'date': DATE,
                            'excess_return_eqwt': DECIMAL(15, 6),
                            'excess_spot_return_eqwt': DECIMAL(15, 6),
                            'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                            'spot_return_eqwt': DECIMAL(15, 6),
                            'carry_eqwt': DECIMAL(15, 6),
                            'excess_return_long_short': DECIMAL(15, 6),
                            'excess_spot_return_long_short': DECIMAL(15, 6),
                            'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                            'aggregate_backwardation_contango': DECIMAL(15, 6),
                            'state_backwardation_contango': VARCHAR(50),
                            'state_inflation': VARCHAR(50),
                            'associated_paper': VARCHAR(100)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into aqr_cmdty_factors from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df['date'].min()} to {df['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
        else:
            # Factor processing
            # Find date column
            date_col = None
            for col in df.columns:
                if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col):
                    date_col = col
                    break
            if date_col is None:
                logger.warning(f"No date column found in {file_path} (sheet {sheet}). Using first column.")
                date_col = df.columns[0]
            
            # Validate and rename columns
            orig_cols = df.columns.tolist()
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                valid_cols = [col for col in expected_cols if col in df.columns]
                missing_cols = [col for col in expected_cols if col not in df.columns]
                if missing_cols:
                    logger.warning(f"Missing columns in {file_path} (sheet {sheet}): {missing_cols}")
                if not valid_cols:
                    logger.error(f"No valid columns found in {file_path} (sheet {sheet}). Skipping.")
                    return
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = ['date' if col == date_col else file_config['columns'].get(col, col) for col in df.columns]
                df.columns = new_columns
            else:
                logger.warning(f"No column mapping provided for {file_path} (sheet {sheet}). Using raw columns.")
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            invalid_dates = df['date'].isna().sum()
            if invalid_dates > 0:
                logger.warning(f"Dropped {invalid_dates} rows due to invalid dates in {file_path} (sheet {sheet})")
            df = df.dropna(subset=['date'])
            
            # Melt DataFrame
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.debug(f"Rows after melting and dropping NA: {len(df_long)}")
            
            # Assign asset class and region
            file_path_for_check = parent_path if parent_path else file_config.get('path', '')
            if 'TSMOM' in file_path_for_check:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_path_for_check:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_path_for_check:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                region_map = {}
                for i, col in enumerate(orig_cols):
                    if col == date_col:
                        continue
                    factor = file_config['columns'].get(col, col)
                    if col == 'USA':
                        region_map[factor] = 'USA'
                    elif col == 'Global':
                        region_map[factor] = 'Global'
                    elif col == 'Global Ex USA':
                        region_map[factor] = 'Intl'
                    elif col == 'Risk Free Rate':
                        region_map[factor] = 'USA'
                df_long['region'] = df_long['factor'].map(region_map)
                df_long['factor'] = df_long.apply(
                    lambda row: f"{row['factor']}_{row['region']}" if row['factor'] != 'RF' and row['region'] is not None else row['factor'], axis=1
                )
            
            df_long['associated_paper'] = file_config['paper']
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                # Optimize query by limiting date range
                min_date = df_long['date'].min().strftime('%Y-%m-%d')
                existing = pd.read_sql(
                    f"SELECT factor, date, associated_paper, asset_class FROM factor_returns WHERE date >= '{min_date}'",
                    connection
                )
                existing['asset_class'] = existing['asset_class'].fillna('Unknown')
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['asset_class'] = df_long['asset_class'].fillna('Unknown')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'] if row['associated_paper'] else 'None', row['asset_class']), axis=1
            )
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                # Batch write for large datasets
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'factor_returns',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'factor': VARCHAR(50),
                            'date': DATE,
                            'associated_paper': VARCHAR(100),
                            'asset_class': VARCHAR(50),
                            'value': DECIMAL(15, 6),
                            'region': VARCHAR(50)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df_long['date'].min()} to {df_long['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
            
    except FileNotFoundError as e:
        logger.error(f"File not found: {file_path} (sheet {sheet}): {str(e)}")
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing Excel file {file_path} (sheet {sheet}): {str(e)}")
    except KeyError as e:
        logger.error(f"Key error processing {file_path} (sheet {sheet}): {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error processing {file_path} (sheet {sheet}): {type(e).__name__}: {str(e)}")

# Main loop - Process all datasets, no skips
for key, config in data_files.items():
    logger.info(f"Processing dataset: {key}")
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        is_com = (key == "COM")
        process_factors(config, sheet=config['sheet'], is_com=is_com)

logger.info("All data processing complete!")

2025-04-10 18:04:50,153 - INFO - Processing dataset: BAB_multi
2025-04-10 18:04:50,155 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0) with is_com=False
2025-04-10 18:04:51,730 - INFO - Read 1129 rows with raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-10 18:04:51,733 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-10 18:04:52,682 - ERROR - Unexpected error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0): IntegrityError: (pyodbc.IntegrityError) ('23000', "[23000] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Violatio

In [24]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Logging setup - Ensure DEBUG level is active
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # Explicitly set to DEBUG

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations (unchanged except for clarity)
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": 0, "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": 4, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": 5, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": 6, "header": 18, "paper": "Fama-French Factors", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": 7, "header": 18, "paper": "The Devil's in HML's Details", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": 8, "header": 18, "paper": "On Persistence in Mutual Fund Performance", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": 9, "header": 18, "paper": "AQR Factors", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": 10, "header": 18, "paper": None, "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    # ... (other datasets unchanged)
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet}) with is_com={is_com}")
    try:
        # Read Excel
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Read {len(df)} rows with raw columns: {list(df.columns)}")
        
        if is_com:
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                missing_cols = [col for col in expected_cols if col not in df.columns]
                if missing_cols:
                    logger.warning(f"Missing columns in {file_path} (sheet {sheet}): {missing_cols}")
                df = df[[col for col in df.columns if col in expected_cols]]
                df.columns = [file_config['columns'][col] for col in df.columns]
            else:
                logger.warning(f"No column mapping provided for {file_path} (sheet {sheet}). Using default commodity columns.")
                df.columns = [
                    'date', 'excess_return_eqwt', 'excess_spot_return_eqwt', 'ir_adjusted_carry_eqwt',
                    'spot_return_eqwt', 'carry_eqwt', 'excess_return_long_short', 'excess_spot_return_long_short',
                    'ir_adjusted_carry_long_short', 'aggregate_backwardation_contango', 
                    'state_backwardation_contango', 'state_inflation'
                ]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            with engine.connect() as connection:
                min_date = df['date'].min().strftime('%Y-%m-%d')
                existing = pd.read_sql(f"SELECT date FROM aqr_cmdty_factors WHERE date >= '{min_date}'", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
                logger.debug(f"Existing commodity dates sample (first 5): {list(existing_keys)[:5]}")
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'aqr_cmdty_factors',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'date': DATE,
                            'excess_return_eqwt': DECIMAL(15, 6),
                            'excess_spot_return_eqwt': DECIMAL(15, 6),
                            'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                            'spot_return_eqwt': DECIMAL(15, 6),
                            'carry_eqwt': DECIMAL(15, 6),
                            'excess_return_long_short': DECIMAL(15, 6),
                            'excess_spot_return_long_short': DECIMAL(15, 6),
                            'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                            'aggregate_backwardation_contango': DECIMAL(15, 6),
                            'state_backwardation_contango': VARCHAR(50),
                            'state_inflation': VARCHAR(50),
                            'associated_paper': VARCHAR(100)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into aqr_cmdty_factors from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df['date'].min()} to {df['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
        else:
            # Find date column
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                None
            )
            if date_col is None:
                logger.warning(f"No date column found in {file_path} (sheet {sheet}). Using first column.")
                date_col = df.columns[0]
            
            # Validate and rename columns
            orig_cols = df.columns.tolist()
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                valid_cols = [col for col in expected_cols if col in df.columns]
                missing_cols = [col for col in expected_cols if col not in df.columns]
                if missing_cols:
                    logger.warning(f"Missing columns in {file_path} (sheet {sheet}): {missing_cols}")
                if not valid_cols:
                    logger.error(f"No valid columns found in {file_path} (sheet {sheet}). Skipping.")
                    return
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = ['date' if col == date_col else file_config['columns'].get(col, col) for col in df.columns]
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            # Parse dates
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            invalid_dates = df['date'].isna().sum()
            if invalid_dates > 0:
                logger.warning(f"Dropped {invalid_dates} rows due to invalid dates in {file_path} (sheet {sheet})")
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            # Melt DataFrame
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.debug(f"Rows after melting and dropping NA: {len(df_long)}")
            
            # Assign asset class and region
            file_path_for_check = parent_path if parent_path else file_config.get('path', '')
            if 'TSMOM' in file_path_for_check:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_path_for_check:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(
                    lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global"
                )
            elif 'Century' in file_path_for_check:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')
                df_long['region'] = df_long['factor'].apply(
                    lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global'
                )
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                region_map = {}
                for col in orig_cols:
                    if col == date_col:
                        continue
                    factor = file_config['columns'].get(col, col)
                    if col == 'USA':
                        region_map[factor] = 'USA'
                    elif col == 'Global':
                        region_map[factor] = 'Global'
                    elif col == 'Global Ex USA':
                        region_map[factor] = 'Intl'
                    elif col == 'Risk Free Rate':
                        region_map[factor] = 'USA'
                df_long['region'] = df_long['factor'].map(region_map)
                df_long['factor'] = df_long.apply(
                    lambda row: f"{row['factor']}_{row['region']}" if row['factor'] != 'RF' and row['region'] is not None else row['factor'], axis=1
                )
            
            df_long['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df_long = df_long[['factor', 'date', 'associated_paper', 'asset_class', 'value', 'region']]
            
            with engine.connect() as connection:
                min_date = df_long['date'].min().strftime('%Y-%m-%d')
                existing = pd.read_sql(
                    f"SELECT factor, date, associated_paper, asset_class FROM factor_returns WHERE date >= '{min_date}'",
                    connection
                )
                existing['asset_class'] = existing['asset_class'].fillna('Unknown')
                existing['associated_paper'] = existing['associated_paper'].fillna('None')
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'asset_class']].values)
                logger.debug(f"Existing keys sample (first 5): {list(existing_keys)[:5]}")
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['asset_class'] = df_long['asset_class'].fillna('Unknown')
            df_long['associated_paper'] = df_long['associated_paper'].fillna('None')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'], row['asset_class']), axis=1
            )
            logger.debug(f"New keys sample (first 5): {df_long['key'].head().tolist()}")
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'factor_returns',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'factor': VARCHAR(50),
                            'date': DATE,
                            'associated_paper': VARCHAR(100),
                            'asset_class': VARCHAR(50),
                            'value': DECIMAL(15, 6),
                            'region': VARCHAR(50)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df_long['date'].min()} to {df_long['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
            
    except FileNotFoundError as e:
        logger.error(f"File not found: {file_path} (sheet {sheet}): {str(e)}")
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing Excel file {file_path} (sheet {sheet}): {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error processing {file_path} (sheet {sheet}): {type(e).__name__}: {str(e)}")
        raise  # Re-raise to see full stack trace during debugging

# Main loop
for key, config in data_files.items():
    logger.info(f"Processing dataset: {key}")
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        is_com = (key == "COM")
        process_factors(config, sheet=config['sheet'], is_com=is_com)

logger.info("All data processing complete!")

2025-04-11 11:22:26,871 - INFO - Processing dataset: BAB_multi
2025-04-11 11:22:26,872 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 0) with is_com=False
2025-04-11 11:22:28,277 - INFO - Read 1129 rows with raw columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-11 11:22:28,279 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-11 11:22:28,290 - DEBUG - After dropping NA dates, 1129 rows remain
2025-04-11 11:22:28,296 - DEBUG - Rows after melting and dropping NA: 2039
2025-04-11 11:22:29,135 - DEBUG - Existing keys sample (first 5): [('TSM-Com_Unknown', '2016-10-31', 'Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)', 'Equity'), ('QMJ-P3

IntegrityError: (pyodbc.IntegrityError) ('23000', "[23000] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Violation of PRIMARY KEY constraint 'PK__factor_r__C3CBC7054E79A6D5'. Cannot insert duplicate key in object 'dbo.factor_returns'. The duplicate key value is (BAB_Intl, 2018-03-31, Betting Against Beta (Frazzini and Pedersen, 2014)). (2627) (SQLExecDirectW); [23000] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]The statement has been terminated. (3621)")
[SQL: INSERT INTO factor_returns (factor, date, associated_paper, asset_class, value, region) VALUES (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ... 4663 characters truncated ... , (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?)]
[parameters: ('BAB_Intl', datetime.datetime(2018, 3, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.024726359719057713, 'Intl', 'BAB_Intl', datetime.datetime(2018, 4, 30, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.0007799553393408837, 'Intl', 'BAB_Intl', datetime.datetime(2018, 5, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.0061733720885607715, 'Intl', 'BAB_Intl', datetime.datetime(2018, 6, 30, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.023681918622375155, 'Intl', 'BAB_Intl', datetime.datetime(2018, 7, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', -0.003681432428082633, 'Intl', 'BAB_Intl', datetime.datetime(2018, 8, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', -0.012114392481366903, 'Intl', 'BAB_Intl', datetime.datetime(2018, 9, 30, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', -0.006209095953888497, 'Intl', 'BAB_Intl', datetime.datetime(2018, 10, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.02111187283759784, 'Intl', 'BAB_Intl', datetime.datetime(2018, 11, 30, 0, 0) ... 1376 parameters truncated ... -2.5060403776764988e-05, 'Intl', 'BAB_Intl', datetime.datetime(2024, 5, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.002565263878111521, 'Intl', 'BAB_Intl', datetime.datetime(2024, 6, 30, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.012043546603209901, 'Intl', 'BAB_Intl', datetime.datetime(2024, 7, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.013925485845601635, 'Intl', 'BAB_Intl', datetime.datetime(2024, 8, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.02046397011764839, 'Intl', 'BAB_Intl', datetime.datetime(2024, 9, 30, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', -0.0019636642634532978, 'Intl', 'BAB_Intl', datetime.datetime(2024, 10, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', 0.004500638512225897, 'Intl', 'BAB_Intl', datetime.datetime(2024, 11, 30, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', -0.0030851874094016255, 'Intl', 'BAB_Intl', datetime.datetime(2024, 12, 31, 0, 0), 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Equity', -0.011472252667659768, 'Intl')]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [25]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations updated with sheet names and headers
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": "BAB Factors", "header": 19, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "factor_name": "Bet Against Beta", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": "MKT", "header": 19, "paper": "Fama-French Factors", "factor_name": "Market", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": "SMB", "header": 19, "paper": "Fama-French Factors", "factor_name": "Small Minus Big", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": "HML FF", "header": 19, "paper": "Fama-French Factors", "factor_name": "High Minus Low - Fama French", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": "HML Devil", "header": 19, "paper": "The Devil's in HML's Details", "factor_name": "High Minus Low - Devil/AQR", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": "UMD", "header": 19, "paper": "On Persistence in Mutual Fund Performance", "factor_name": "Up Minus Down", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": "ME(t-1)", "header": 19, "paper": "AQR Factors", "factor_name": "Market Value of Equity", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": "RF", "header": 18, "paper": None, "factor_name": "Risk Free", "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": "TSMOM Factors",
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": "10 Portfolios Formed on Quality",
        "header": 19,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run",
        "columns": {
            "Date": "date",
            "Excess return of equal-weight commodities portfolio": "excess_return_eqwt",
            "Excess spot return of equal-weight commodities portfolio": "excess_spot_return_eqwt",
            "Interest rate adjusted carry of equal-weight commodities portfolio": "ir_adjusted_carry_eqwt",
            "Spot return of equal-weight commodities portfolio": "spot_return_eqwt",
            "Carry of equal-weight commodities portfolio": "carry_eqwt",
            "Excess return of long/short commodities portfolio": "excess_return_long_short",
            "Excess spot return of long/short commodities portfolio": "excess_spot_return_long_short",
            "Interest rate adjusted carry of long/short commodities portfolio": "ir_adjusted_carry_long_short",
            "Aggregate backwardation/contango": "aggregate_backwardation_contango",
            "State of backwardation/contango": "state_backwardation_contango",
            "State of inflation": "state_inflation"
        }
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet}) with is_com={is_com}")
    try:
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Read {len(df)} rows with raw columns: {list(df.columns)}")
        
        if is_com:
            expected_cols = list(file_config['columns'].keys())
            missing_cols = [col for col in expected_cols if col not in df.columns]
            if missing_cols:
                logger.warning(f"Missing columns in {file_path} (sheet {sheet}): {missing_cols}")
            df = df[[col for col in df.columns if col in expected_cols]]
            df.columns = [file_config['columns'][col] for col in df.columns]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
                logger.debug(f"Existing commodity dates sample (first 5): {list(existing_keys)[:5]}")
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'aqr_cmdty_factors',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'date': DATE,
                            'excess_return_eqwt': DECIMAL(15, 6),
                            'excess_spot_return_eqwt': DECIMAL(15, 6),
                            'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                            'spot_return_eqwt': DECIMAL(15, 6),
                            'carry_eqwt': DECIMAL(15, 6),
                            'excess_return_long_short': DECIMAL(15, 6),
                            'excess_spot_return_long_short': DECIMAL(15, 6),
                            'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                            'aggregate_backwardation_contango': DECIMAL(15, 6),
                            'state_backwardation_contango': VARCHAR(50),
                            'state_inflation': VARCHAR(50),
                            'associated_paper': VARCHAR(100)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into aqr_cmdty_factors from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df['date'].min()} to {df['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
        else:
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                None
            )
            if date_col is None:
                logger.warning(f"No date column found in {file_path} (sheet {sheet}). Using first column.")
                date_col = df.columns[0]
            
            orig_cols = df.columns.tolist()
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                valid_cols = [col for col in expected_cols if col in df.columns]
                missing_cols = [col for col in expected_cols if col not in df.columns]
                if missing_cols:
                    logger.warning(f"Missing columns in {file_path} (sheet {sheet}): {missing_cols}")
                if not valid_cols:
                    logger.error(f"No valid columns found in {file_path} (sheet {sheet}). Skipping.")
                    return
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = ['date' if col == date_col else file_config['columns'].get(col, col) for col in df.columns]
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            invalid_dates = df['date'].isna().sum()
            if invalid_dates > 0:
                logger.warning(f"Dropped {invalid_dates} rows due to invalid dates in {file_path} (sheet {sheet})")
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor_symbol', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.debug(f"Rows after melting and dropping NA: {len(df_long)}")
            
            # Assign columns based on table design
            file_path_for_check = parent_path if parent_path else file_config.get('path', '')
            if 'TSMOM' in file_path_for_check:
                df_long['factor_symbol'] = df_long['factor_symbol']  # Keep as is (TSM-MA, etc.)
                df_long['portfolio'] = 'TSMOM'  # Generic portfolio name for TSMOM factors
                df_long['region'] = "Global"
                df_long['factor_name'] = df_long['factor_symbol'].map({
                    "TSM-MA": "Time Series Momentum Multi-Asset",
                    "TSM-Com": "Time Series Momentum Commodities",
                    "TSM-EQ": "Time Series Momentum Equity",
                    "TSM-FI": "Time Series Momentum Fixed Income",
                    "TSM-FX": "Time Series Momentum Currencies"
                })
            elif 'Quality Minus Junk' in file_path_for_check:
                df_long['factor_symbol'] = df_long['factor_symbol'].apply(lambda x: "QMJ" if x == "QMJ" else x.split('-')[0])
                df_long['portfolio'] = df_long['factor_symbol'].apply(lambda x: x if x != "QMJ" else "P10-P1")
                df_long['region'] = df_long['date'].apply(lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global")
                df_long['factor_name'] = df_long['factor_symbol'].apply(lambda x: "Quality Minus Junk" if x == "QMJ" else f"Quality Portfolio {x.split('-')[1]}")
            elif 'Century' in file_path_for_check:
                df_long['factor_symbol'] = df_long['factor_symbol'].apply(lambda x: x.split('-')[0])
                df_long['portfolio'] = df_long['factor_symbol'].apply(lambda x: x)  # Use factor as portfolio for simplicity
                df_long['region'] = df_long['factor_symbol'].apply(lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global')
                df_long['factor_name'] = df_long['factor_symbol'].map({
                    "HML": "High Minus Low", "UMD": "Up Minus Down", "BAB": "Bet Against Beta",
                    "MKT": "Market", "Carry": "Carry", "Multi-style": "Multi-style"
                }).fillna(df_long['factor_symbol'])
            else:  # BAB_multi
                df_long['factor_symbol'] = df_long['factor_symbol']  # Keep base factor (e.g., "BAB")
                df_long['portfolio'] = file_config.get('sheet', 'Unknown')  # Use sheet name as portfolio
                region_map = {}
                for col in orig_cols:
                    if col == date_col:
                        continue
                    factor = file_config['columns'].get(col, col)
                    if col == 'USA':
                        region_map[factor] = 'USA'
                    elif col == 'Global':
                        region_map[factor] = 'Global'
                    elif col == 'Global Ex USA':
                        region_map[factor] = 'Intl'
                    elif col == 'Risk Free Rate':
                        region_map[factor] = 'USA'
                df_long['region'] = df_long['factor_symbol'].map(region_map)
                df_long['factor_name'] = file_config.get('factor_name', df_long['factor_symbol'])
            
            df_long['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df_long = df_long[['factor_symbol', 'factor_name', 'portfolio', 'region', 'date', 'value', 'associated_paper']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor_symbol, portfolio, region, date FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor_symbol', 'portfolio', 'region', 'date']].values)
                logger.debug(f"Existing keys sample (first 5): {list(existing_keys)[:5]}")
                logger.debug(f"Total existing keys: {len(existing_keys)}")
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor_symbol'], row['portfolio'], row['region'], row['date_str']), axis=1
            )
            logger.debug(f"New keys sample (first 5): {df_long['key'].head().tolist()}")
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str', 'key'])
            
            if not df_new.empty:
                logger.debug(f"New rows to insert: {len(df_new)}. Sample (first 5): {df_new[['factor_symbol', 'portfolio', 'region', 'date']].head().to_dict('records')}")
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'factor_returns',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'factor_symbol': VARCHAR(20),
                            'factor_name': VARCHAR(100),
                            'portfolio': VARCHAR(100),
                            'region': VARCHAR(50),
                            'date': DATE,
                            'value': DECIMAL(15, 6),
                            'associated_paper': VARCHAR(150)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df_long['date'].min()} to {df_long['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
            
    except FileNotFoundError as e:
        logger.error(f"File not found: {file_path} (sheet {sheet}): {str(e)}")
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing Excel file {file_path} (sheet {sheet}): {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error processing {file_path} (sheet {sheet}): {type(e).__name__}: {str(e)}")

# Main loop
for key, config in data_files.items():
    logger.info(f"Processing dataset: {key}")
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        is_com = (key == "COM")
        process_factors(config, sheet=config['sheet'], is_com=is_com)

logger.info("All data processing complete!")

2025-04-11 11:56:49,654 - INFO - Processing dataset: BAB_multi
2025-04-11 11:56:49,656 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet BAB Factors) with is_com=False
2025-04-11 11:56:51,046 - INFO - Read 1128 rows with raw columns: ['12/31/1930', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', -0.000557986425086, 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29']
2025-04-11 11:56:51,049 - ERROR - No valid columns found in C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet 

In [26]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {"sheet": "BAB Factors", "header": 18, "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)", "factor_name": "Bet Against Beta", "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}},
            "MKT": {"sheet": "MKT", "header": 18, "paper": "Fama-French Factors", "factor_name": "Market", "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}},
            "SMB": {"sheet": "SMB", "header": 18, "paper": "Fama-French Factors", "factor_name": "Small Minus Big", "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}},
            "HML-FF": {"sheet": "HML FF", "header": 18, "paper": "Fama-French Factors", "factor_name": "High Minus Low - Fama French", "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}},
            "HML-D": {"sheet": "HML Devil", "header": 18, "paper": "The Devil's in HML's Details", "factor_name": "High Minus Low - Devil/AQR", "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}},
            "UMD": {"sheet": "UMD", "header": 18, "paper": "On Persistence in Mutual Fund Performance", "factor_name": "Up Minus Down", "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}},
            "ME": {"sheet": "ME(t-1)", "header": 18, "paper": "AQR Factors", "factor_name": "Market Value of Equity", "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}},
            "RF": {"sheet": "RF", "header": 17, "paper": None, "factor_name": "Risk Free", "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}}
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": "TSMOM Factors",
        "header": 17,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "columns": {
            "Unnamed: 0": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": "10 Portfolios Formed on Quality",
        "header": 18,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "header": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "columns": {
            "Date": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "header": 10,
        "paper": "Commodities for the Long Run",
        "columns": {
            "Unnamed: 0": "date",
            "Excess return of equal-weight commodities portfolio": "excess_return_eqwt",
            "Excess spot return of equal-weight commodities portfolio": "excess_spot_return_eqwt",
            "Interest rate adjusted carry of equal-weight commodities portfolio": "ir_adjusted_carry_eqwt",
            "Spot return of equal-weight commodities portfolio": "spot_return_eqwt",
            "Carry of equal-weight commodities portfolio": "carry_eqwt",
            "Excess return of long/short commodities portfolio": "excess_return_long_short",
            "Excess spot return of long/short commodities portfolio": "excess_spot_return_long_short",
            "Interest rate adjusted carry of long/short commodities portfolio": "ir_adjusted_carry_long_short",
            "Aggregate backwardation/contango": "aggregate_backwardation_contango",
            "State of backwardation/contango": "state_backwardation_contango",
            "State of inflation": "state_inflation"
        }
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet}) with is_com={is_com}")
    try:
        df = pd.read_excel(file_path, sheet_name=sheet, header=file_config['header'])
        logger.info(f"Read {len(df)} rows with raw columns: {list(df.columns)}")
        
        if is_com:
            expected_cols = list(file_config['columns'].keys())
            missing_cols = [col for col in expected_cols if col not in df.columns]
            if missing_cols:
                logger.warning(f"Missing columns in {file_path} (sheet {sheet}): {missing_cols}")
            df = df[[col for col in df.columns if col in expected_cols]]
            df.columns = [file_config['columns'][col] for col in df.columns]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
                logger.debug(f"Existing commodity dates sample (first 5): {list(existing_keys)[:5]}")
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'aqr_cmdty_factors',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'date': DATE,
                            'excess_return_eqwt': DECIMAL(15, 6),
                            'excess_spot_return_eqwt': DECIMAL(15, 6),
                            'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                            'spot_return_eqwt': DECIMAL(15, 6),
                            'carry_eqwt': DECIMAL(15, 6),
                            'excess_return_long_short': DECIMAL(15, 6),
                            'excess_spot_return_long_short': DECIMAL(15, 6),
                            'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                            'aggregate_backwardation_contango': DECIMAL(15, 6),
                            'state_backwardation_contango': VARCHAR(50),
                            'state_inflation': VARCHAR(50),
                            'associated_paper': VARCHAR(100)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into aqr_cmdty_factors from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df['date'].min()} to {df['date机床.max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
        else:
            date_col = next(
                (col for col in df.columns if isinstance(col, str) and ('DATE' in col.upper() or 'Date' in col)),
                None
            )
            if date_col is None:
                logger.warning(f"No date column found in {file_path} (sheet {sheet}). Using first column.")
                date_col = df.columns[0]
            
            orig_cols = df.columns.tolist()
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                valid_cols = [col for col in expected_cols if col in df.columns]
                missing_cols = [col for col in expected_cols if col not in df.columns]
                if missing_cols:
                    logger.warning(f"Missing columns in {file_path} (sheet {sheet}): {missing_cols}")
                if not valid_cols:
                    logger.error(f"No valid columns found in {file_path} (sheet {sheet}). Skipping.")
                    return
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[[col for col in df.columns if col in valid_cols]]
                new_columns = ['date' if col == date_col else file_config['columns'].get(col, col) for col in df.columns]
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            invalid_dates = df['date'].isna().sum()
            if invalid_dates > 0:
                logger.warning(f"Dropped {invalid_dates} rows due to invalid dates in {file_path} (sheet {sheet})")
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor_symbol', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.debug(f"Rows after melting and dropping NA: {len(df_long)}")
            
            file_path_for_check = parent_path if parent_path else file_config.get('path', '')
            if 'TSMOM' in file_path_for_check:
                df_long['factor_symbol'] = df_long['factor_symbol']
                df_long['portfolio'] = 'TSMOM'
                df_long['region'] = "Global"
                df_long['factor_name'] = df_long['factor_symbol'].map({
                    "TSM-MA": "Time Series Momentum Multi-Asset",
                    "TSM-Com": "Time Series Momentum Commodities",
                    "TSM-EQ": "Time Series Momentum Equity",
                    "TSM-FI": "Time Series Momentum Fixed Income",
                    "TSM-FX": "Time Series Momentum Currencies"
                })
            elif 'Quality Minus Junk' in file_path_for_check:
                df_long['factor_symbol'] = df_long['factor_symbol'].apply(lambda x: "QMJ" if x == "QMJ" else x.split('-')[0])
                df_long['portfolio'] = df_long['factor_symbol'].apply(lambda x: x if x != "QMJ" else "P10-P1")
                df_long['region'] = df_long['date'].apply(lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global")
                df_long['factor_name'] = df_long['factor_symbol'].apply(lambda x: "Quality Minus Junk" if x == "QMJ" else f"Quality Portfolio {x.split('-')[1]}")
            elif 'Century' in file_path_for_check:
                df_long['factor_symbol'] = df_long['factor_symbol'].apply(lambda x: x.split('-')[0])
                df_long['portfolio'] = df_long['factor_symbol'].apply(lambda x: x)
                df_long['region'] = df_long['factor_symbol'].apply(lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global')
                df_long['factor_name'] = df_long['factor_symbol'].map({
                    "HML": "High Minus Low", "UMD": "Up Minus Down", "BAB": "Bet Against Beta",
                    "MKT": "Market", "Carry": "Carry", "Multi-style": "Multi-style"
                }).fillna(df_long['factor_symbol'])
            else:  # BAB_multi
                df_long['factor_symbol'] = df_long['factor_symbol']
                df_long['portfolio'] = file_config.get('sheet', 'Unknown')
                region_map = {}
                for col in orig_cols:
                    if col == date_col:
                        continue
                    factor = file_config['columns'].get(col, col)
                    if col == 'USA':
                        region_map[factor] = 'USA'
                    elif col == 'Global':
                        region_map[factor] = 'Global'
                    elif col == 'Global Ex USA':
                        region_map[factor] = 'Intl'
                    elif col == 'Risk Free Rate':
                        region_map[factor] = 'USA'
                df_long['region'] = df_long['factor_symbol'].map(region_map)
                df_long['factor_name'] = file_config.get('factor_name', df_long['factor_symbol'])
            
            df_long['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df_long = df_long[['factor_symbol', 'factor_name', 'portfolio', 'region', 'date', 'value', 'associated_paper']]
            
            with engine.connect() as connection:
                # Update query to match existing table columns
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, region FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                # Adjust key to match old table for now
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'region']].values)
                logger.debug(f"Existing keys sample (first 5): {list(existing_keys)[:5]}")
                logger.debug(f"Total existing keys: {len(existing_keys)}")
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            # Temporarily align with old table structure
            df_long['factor'] = df_long['factor_symbol']  # Map to old 'factor' column
            df_new = df_long[~df_long.apply(lambda row: (row['factor'], row['date_str'], row['associated_paper'], row['region']), axis=1).isin(existing_keys)]
            df_new = df_new.drop(columns=['date_str', 'factor_symbol']).rename(columns={'factor': 'factor_symbol'})
            
            if not df_new.empty:
                logger.debug(f"New rows to insert: {len(df_new)}. Sample (first 5): {df_new[['factor_symbol', 'portfolio', 'region', 'date']].head().to_dict('records')}")
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'factor_returns',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'factor_symbol': VARCHAR(20),
                            'factor_name': VARCHAR(100),
                            'portfolio': VARCHAR(100),
                            'region': VARCHAR(50),
                            'date': DATE,
                            'value': DECIMAL(15, 6),
                            'associated_paper': VARCHAR(150)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df_long['date'].min()} to {df_long['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
            
    except FileNotFoundError as e:
        logger.error(f"File not found: {file_path} (sheet {sheet}): {str(e)}")
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing Excel file {file_path} (sheet {sheet}): {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error processing {file_path} (sheet {sheet}): {type(e).__name__}: {str(e)}")

# Main loop
for key, config in data_files.items():
    logger.info(f"Processing dataset: {key}")
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        is_com = (key == "COM")
        process_factors(config, sheet=config['sheet'], is_com=is_com)

logger.info("All data processing complete!")

SyntaxError: unterminated string literal (detected at line 179) (2007911644.py, line 179)

In [27]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations with explicit headers
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {
                "sheet": "BAB Factors",
                "start_row": 19,  # Data starts on row 20 (0-based 19)
                "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}
            },
            "MKT": {
                "sheet": "MKT",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}
            },
            "SMB": {
                "sheet": "SMB",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}
            },
            "HML-FF": {
                "sheet": "HML FF",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}
            },
            "HML-D": {
                "sheet": "HML Devil",
                "start_row": 19,
                "paper": "The Devil's in HML's Details",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}
            },
            "UMD": {
                "sheet": "UMD",
                "start_row": 19,
                "paper": "On Persistence in Mutual Fund Performance",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}
            },
            "ME": {
                "sheet": "ME(t-1)",
                "start_row": 19,
                "paper": "AQR Factors",
                "headers": ["DATE", "AUT", "HKG", "ESP", "GBR", "ITA", "DEU", "DNK", "NZL", "NLD", "USA", "PRT", "BEL", "ISR", "GRC", "NOR", "SGP", "CHE", "IRL", "CAN", "FIN", "JPN", "SWE", "FRA", "AUS", "Global Ex USA", "Global", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}
            },
            "RF": {
                "sheet": "RF",
                "start_row": 18,
                "paper": None,
                "headers": ["DATE", "Risk Free Rate"],
                "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}
            }
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": "TSMOM Factors",
        "start_row": 18,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "headers": ["DATE", "TSMOM", "TSMOM^CM", "TSMOM^EQ", "TSMOM^FI", "TSMOM^FX"],
        "columns": {
            "DATE": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": "10 Portfolios Formed on Quality",
        "start_row": 19,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "headers": ["DATE", "P1 (low quality)", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10 (high quality)", "P10-P1"],
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "start_row": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "headers": ["DATE", "US Stock Selection Value", "US Stock Selection Momentum", "US Stock Selection Defensive", "US Stock Selection Multi-style", "Intl Stock Selection Value", "Intl Stock Selection Momentum", "Intl Stock Selection Defensive", "Intl Stock Selection Multi-style", "Equity indices Value", "Equity indices Momentum", "Equity indices Carry", "Equity indices Defensive", "Equity indices Multi-style", "Fixed income Value", "Fixed income Momentum", "Fixed income Carry", "Fixed income Defensive", "Fixed income Multi-style", "Currencies Value", "Currencies Momentum", "Currencies Carry", "Currencies Multi-style", "Commodities Value", "Commodities Momentum", "Commodities Carry", "Commodities Multi-style", "All Stock Selection Value", "All Stock Selection Momentum", "All Stock Selection Defensive", "All Stock Selection Multi-style", "All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style", "All asset classes Value", "All asset classes Momentum", "All asset classes Carry", "All asset classes Defensive", "All asset classes Multi-style", "Equity indices Market", "Fixed income Market", "Commodities Market", "All Macro Market"],
        "columns": {
            "DATE": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "start_row": 11,
        "paper": "Commodities for the Long Run",
        "headers": ["DATE", "Excess return of equal-weight commodities portfolio", "Excess spot return of equal-weight commodities portfolio", "Interest rate adjusted carry of equal-weight commodities portfolio", "Spot return of equal-weight commodities portfolio", "Carry of equal-weight commodities portfolio", "Excess return of long/short commodities portfolio", "Excess spot return of long/short commodities portfolio", "Interest rate adjusted carry of long/short commodities portfolio", "Aggregate backwardation/contango", "State of backwardation/contango", "State of inflation"],
        "columns": {
            "DATE": "date",
            "Excess return of equal-weight commodities portfolio": "excess_return_eqwt",
            "Excess spot return of equal-weight commodities portfolio": "excess_spot_return_eqwt",
            "Interest rate adjusted carry of equal-weight commodities portfolio": "ir_adjusted_carry_eqwt",
            "Spot return of equal-weight commodities portfolio": "spot_return_eqwt",
            "Carry of equal-weight commodities portfolio": "carry_eqwt",
            "Excess return of long/short commodities portfolio": "excess_return_long_short",
            "Excess spot return of long/short commodities portfolio": "excess_spot_return_long_short",
            "Interest rate adjusted carry of long/short commodities portfolio": "ir_adjusted_carry_long_short",
            "Aggregate backwardation/contango": "aggregate_backwardation_contango",
            "State of backwardation/contango": "state_backwardation_contango",
            "State of inflation": "state_inflation"
        }
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet}) with is_com={is_com}")
    try:
        # Read Excel without headers, then set them
        df = pd.read_excel(file_path, sheet_name=sheet, header=None, skiprows=file_config['start_row'])
        df.columns = file_config['headers']
        logger.info(f"Read {len(df)} rows with set columns: {list(df.columns)}")
        
        if is_com:
            expected_cols = list(file_config['columns'].keys())
            df = df[expected_cols]
            df.columns = [file_config['columns'][col] for col in df.columns]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
                logger.debug(f"Existing commodity dates sample (first 5): {list(existing_keys)[:5]}")
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'aqr_cmdty_factors',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'date': DATE,
                            'excess_return_eqwt': DECIMAL(15, 6),
                            'excess_spot_return_eqwt': DECIMAL(15, 6),
                            'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                            'spot_return_eqwt': DECIMAL(15, 6),
                            'carry_eqwt': DECIMAL(15, 6),
                            'excess_return_long_short': DECIMAL(15, 6),
                            'excess_spot_return_long_short': DECIMAL(15, 6),
                            'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                            'aggregate_backwardation_contango': DECIMAL(15, 6),
                            'state_backwardation_contango': VARCHAR(50),
                            'state_inflation': VARCHAR(50),
                            'associated_paper': VARCHAR(100)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into aqr_cmdty_factors from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df['date'].min()} to {df['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
        else:
            date_col = 'DATE'
            orig_cols = df.columns.tolist()
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                valid_cols = [col for col in expected_cols if col in df.columns]
                if not valid_cols:
                    logger.error(f"No valid columns found in {file_path} (sheet {sheet}). Skipping.")
                    return
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[valid_cols]
                new_columns = ['date' if col == date_col else file_config['columns'].get(col, col) for col in df.columns]
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            invalid_dates = df['date'].isna().sum()
            if invalid_dates > 0:
                logger.warning(f"Dropped {invalid_dates} rows due to invalid dates in {file_path} (sheet {sheet})")
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.debug(f"Rows after melting and dropping NA: {len(df_long)}")
            
            file_path_for_check = parent_path if parent_path else file_config.get('path', '')
            if 'TSMOM' in file_path_for_check:
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
                df_long['region'] = "Global"
            elif 'Quality Minus Junk' in file_path_for_check:
                df_long['asset_class'] = "Equity"
                df_long['region'] = df_long['date'].apply(lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global")
            elif 'Century' in file_path_for_check:
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')
                df_long['region'] = df_long['factor'].apply(lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global')
            else:  # BAB_multi
                df_long['asset_class'] = "Equity" if 'Risk Free Rate' not in file_config['columns'].values() else "Fixed Income"
                region_map = {
                    "BAB": {"USA": "USA", "Global": "Global", "Global Ex USA": "Intl"},
                    "MKT": {"USA": "USA", "Global": "Global", "Global Ex USA": "Intl"},
                    "SMB": {"USA": "USA", "Global": "Global", "Global Ex USA": "Intl"},
                    "HML-FF": {"USA": "USA", "Global": "Global", "Global Ex USA": "Intl"},
                    "HML-D": {"USA": "USA", "Global": "Global", "Global Ex USA": "Intl"},
                    "UMD": {"USA": "USA", "Global": "Global", "Global Ex USA": "Intl"},
                    "ME": {"USA": "USA", "Global": "Global", "Global Ex USA": "Intl"},
                    "RF": {"Risk Free Rate": "USA"}
                }.get(df_long['factor'].iloc[0], {})
                df_long['region'] = df_long['factor'].map(region_map)
            
            df_long['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df_long = df_long[['factor', 'date', 'associated_paper', 'value', 'region', 'asset_class']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, region FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'region']].values)
                logger.debug(f"Existing keys sample (first 5): {list(existing_keys)[:5]}")
                logger.debug(f"Total existing keys: {len(existing_keys)}")
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'], row['region']), axis=1
            )
            logger.debug(f"New keys sample (first 5): {df_long['key'].head().tolist()}")
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                logger.debug(f"New rows to insert: {len(df_new)}. Sample (first 5): {df_new[['factor', 'date', 'region']].head().to_dict('records')}")
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'factor_returns',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'factor': VARCHAR(50),
                            'date': DATE,
                            'associated_paper': VARCHAR(100),
                            'value': DECIMAL(15, 6),
                            'region': VARCHAR(50),
                            'asset_class': VARCHAR(50)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df_long['date'].min()} to {df_long['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
            
    except FileNotFoundError as e:
        logger.error(f"File not found: {file_path} (sheet {sheet}): {str(e)}")
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing Excel file {file_path} (sheet {sheet}): {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error processing {file_path} (sheet {sheet}): {type(e).__name__}: {str(e)}")

# Main loop
for key, config in data_files.items():
    logger.info(f"Processing dataset: {key}")
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        is_com = (key == "COM")
        process_factors(config, sheet=config['sheet'], is_com=is_com)

logger.info("All data processing complete!")

2025-04-11 12:33:07,576 - INFO - Processing dataset: BAB_multi
2025-04-11 12:33:07,577 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet BAB Factors) with is_com=False
2025-04-11 12:33:08,183 - INFO - Read 1129 rows with set columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-11 12:33:08,187 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-11 12:33:08,205 - DEBUG - After dropping NA dates, 1129 rows remain
2025-04-11 12:33:08,216 - DEBUG - Rows after melting and dropping NA: 2039
2025-04-11 12:33:10,854 - DEBUG - Existing keys sample (first 5): [('BAB_Intl', '2006-06-30', 'Betting Against Beta (Frazzini and Pedersen, 2014)', 'Intl'), ('UMD-COM', '1

In [28]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations with explicit headers
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {
                "sheet": "BAB Factors",
                "start_row": 19,
                "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}
            },
            "MKT": {
                "sheet": "MKT",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}
            },
            "SMB": {
                "sheet": "SMB",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}
            },
            "HML-FF": {
                "sheet": "HML FF",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}
            },
            "HML-D": {
                "sheet": "HML Devil",
                "start_row": 19,
                "paper": "The Devil's in HML's Details",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}
            },
            "UMD": {
                "sheet": "UMD",
                "start_row": 19,
                "paper": "On Persistence in Mutual Fund Performance",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}
            },
            "ME": {
                "sheet": "ME(t-1)",
                "start_row": 19,
                "paper": "AQR Factors",
                "headers": ["DATE", "AUT", "HKG", "ESP", "GBR", "ITA", "DEU", "DNK", "NZL", "NLD", "USA", "PRT", "BEL", "ISR", "GRC", "NOR", "SGP", "CHE", "IRL", "CAN", "FIN", "JPN", "SWE", "FRA", "AUS", "Global Ex USA", "Global", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}
            },
            "RF": {
                "sheet": "RF",
                "start_row": 18,
                "paper": None,
                "headers": ["DATE", "Risk Free Rate"],
                "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}
            }
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": "TSMOM Factors",
        "start_row": 18,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "headers": ["DATE", "TSMOM", "TSMOM^CM", "TSMOM^EQ", "TSMOM^FI", "TSMOM^FX"],
        "columns": {
            "DATE": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": "10 Portfolios Formed on Quality",
        "start_row": 19,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "headers": ["DATE", "P1 (low quality)", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10 (high quality)", "P10-P1", "P1 (low quality).1", "P2.1", "P3.1", "P4.1", "P5.1", "P6.1", "P7.1", "P8.1", "P9.1", "P10 (high quality).1", "P10-P1.1"],
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ-P1-LQ",
            "P2": "QMJ-P2",
            "P3": "QMJ-P3",
            "P4": "QMJ-P4",
            "P5": "QMJ-P5",
            "P6": "QMJ-P6",
            "P7": "QMJ-P7",
            "P8": "QMJ-P8",
            "P9": "QMJ-P9",
            "P10 (high quality)": "QMJ-P10-HQ",
            "P10-P1": "QMJ",
            "P1 (low quality).1": "QMJ-P1-LQ",
            "P2.1": "QMJ-P2",
            "P3.1": "QMJ-P3",
            "P4.1": "QMJ-P4",
            "P5.1": "QMJ-P5",
            "P6.1": "QMJ-P6",
            "P7.1": "QMJ-P7",
            "P8.1": "QMJ-P8",
            "P9.1": "QMJ-P9",
            "P10 (high quality).1": "QMJ-P10-HQ",
            "P10-P1.1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "start_row": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "headers": ["DATE", "US Stock Selection Value", "US Stock Selection Momentum", "US Stock Selection Defensive", "US Stock Selection Multi-style", "Intl Stock Selection Value", "Intl Stock Selection Momentum", "Intl Stock Selection Defensive", "Intl Stock Selection Multi-style", "Equity indices Value", "Equity indices Momentum", "Equity indices Carry", "Equity indices Defensive", "Equity indices Multi-style", "Fixed income Value", "Fixed income Momentum", "Fixed income Carry", "Fixed income Defensive", "Fixed income Multi-style", "Currencies Value", "Currencies Momentum", "Currencies Carry", "Currencies Multi-style", "Commodities Value", "Commodities Momentum", "Commodities Carry", "Commodities Multi-style", "All Stock Selection Value", "All Stock Selection Momentum", "All Stock Selection Defensive", "All Stock Selection Multi-style", "All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style", "All asset classes Value", "All asset classes Momentum", "All asset classes Carry", "All asset classes Defensive", "All asset classes Multi-style", "Equity indices Market", "Fixed income Market", "Commodities Market", "All Macro Market"],
        "columns": {
            "DATE": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "start_row": 11,
        "paper": "Commodities for the Long Run",
        "headers": ["DATE", "Excess return of equal-weight commodities portfolio", "Excess spot return of equal-weight commodities portfolio", "Interest rate adjusted carry of equal-weight commodities portfolio", "Spot return of equal-weight commodities portfolio", "Carry of equal-weight commodities portfolio", "Excess return of long/short commodities portfolio", "Excess spot return of long/short commodities portfolio", "Interest rate adjusted carry of long/short commodities portfolio", "Aggregate backwardation/contango", "State of backwardation/contango", "State of inflation"],
        "columns": {
            "DATE": "date",
            "Excess return of equal-weight commodities portfolio": "excess_return_eqwt",
            "Excess spot return of equal-weight commodities portfolio": "excess_spot_return_eqwt",
            "Interest rate adjusted carry of equal-weight commodities portfolio": "ir_adjusted_carry_eqwt",
            "Spot return of equal-weight commodities portfolio": "spot_return_eqwt",
            "Carry of equal-weight commodities portfolio": "carry_eqwt",
            "Excess return of long/short commodities portfolio": "excess_return_long_short",
            "Excess spot return of long/short commodities portfolio": "excess_spot_return_long_short",
            "Interest rate adjusted carry of long/short commodities portfolio": "ir_adjusted_carry_long_short",
            "Aggregate backwardation/contango": "aggregate_backwardation_contango",
            "State of backwardation/contango": "state_backwardation_contango",
            "State of inflation": "state_inflation"
        }
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet}) with is_com={is_com}")
    try:
        # Read Excel without headers, then set them
        df = pd.read_excel(file_path, sheet_name=sheet, header=None, skiprows=file_config['start_row'])
        df.columns = file_config['headers']
        logger.info(f"Read {len(df)} rows with set columns: {list(df.columns)}")
        
        if is_com:
            expected_cols = list(file_config['columns'].keys())
            df = df[expected_cols]
            df.columns = [file_config['columns'][col] for col in df.columns]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
                logger.debug(f"Existing commodity dates sample (first 5): {list(existing_keys)[:5]}")
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'aqr_cmdty_factors',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'date': DATE,
                            'excess_return_eqwt': DECIMAL(15, 6),
                            'excess_spot_return_eqwt': DECIMAL(15, 6),
                            'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                            'spot_return_eqwt': DECIMAL(15, 6),
                            'carry_eqwt': DECIMAL(15, 6),
                            'excess_return_long_short': DECIMAL(15, 6),
                            'excess_spot_return_long_short': DECIMAL(15, 6),
                            'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                            'aggregate_backwardation_contango': DECIMAL(15, 6),
                            'state_backwardation_contango': VARCHAR(50),
                            'state_inflation': VARCHAR(50),
                            'associated_paper': VARCHAR(100)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into aqr_cmdty_factors from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df['date'].min()} to {df['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
        else:
            date_col = 'DATE'
            orig_cols = df.columns.tolist()
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                valid_cols = [col for col in expected_cols if col in df.columns]
                if not valid_cols:
                    logger.error(f"No valid columns found in {file_path} (sheet {sheet}). Skipping.")
                    return
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[valid_cols]
                new_columns = ['date' if col == date_col else file_config['columns'].get(col, col) for col in df.columns]
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            invalid_dates = df['date'].isna().sum()
            if invalid_dates > 0:
                logger.warning(f"Dropped {invalid_dates} rows due to invalid dates in {file_path} (sheet {sheet})")
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            # Assign region before melting
            file_path_for_check = parent_path if parent_path else file_config.get('path', '')
            if 'TSMOM' in file_path_for_check:
                df['region'] = "Global"
                df['asset_class'] = df.columns.map({
                    "date": None,
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                }).fillna("Unknown")
            elif 'Quality Minus Junk' in file_path_for_check:
                df['region'] = df['date'].apply(lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global")
                df['asset_class'] = "Equity"
            elif 'Century' in file_path_for_check:
                df['region'] = df.columns.map(lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global').fillna("Global")
                df['asset_class'] = df.columns.str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                )[0].fillna('Equity')
            else:  # BAB_multi
                df['region'] = df.columns.map({
                    "DATE": None,
                    "USA": "USA",
                    "Global": "Global",
                    "Global Ex USA": "Intl",
                    "Risk Free Rate": "USA"
                }).fillna(None)
                df['asset_class'] = "Equity" if "Risk Free Rate" not in df.columns else "Fixed Income"
            
            value_vars = [col for col in df.columns if col != 'date' and col not in ['region', 'asset_class']]
            df_long = df.melt(id_vars=['date', 'region', 'asset_class'], value_vars=value_vars, var_name='factor', value_name='value')
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.debug(f"Rows after melting and dropping NA: {len(df_long)}")
            
            df_long['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df_long = df_long[['factor', 'date', 'associated_paper', 'value', 'region', 'asset_class']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, region FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'region']].values)
                logger.debug(f"Existing keys sample (first 5): {list(existing_keys)[:5]}")
                logger.debug(f"Total existing keys: {len(existing_keys)}")
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'], row['region'] if pd.notna(row['region']) else 'Unknown'), axis=1
            )
            logger.debug(f"New keys sample (first 5): {df_long['key'].head().tolist()}")
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['key', 'date_str'])
            
            if not df_new.empty:
                logger.debug(f"New rows to insert: {len(df_new)}. Sample (first 5): {df_new[['factor', 'date', 'region']].head().to_dict('records')}")
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'factor_returns',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'factor': VARCHAR(50),
                            'date': DATE,
                            'associated_paper': VARCHAR(100),
                            'value': DECIMAL(15, 6),
                            'region': VARCHAR(50),
                            'asset_class': VARCHAR(50)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df_long['date'].min()} to {df_long['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
            
    except FileNotFoundError as e:
        logger.error(f"File not found: {file_path} (sheet {sheet}): {str(e)}")
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing Excel file {file_path} (sheet {sheet}): {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error processing {file_path} (sheet {sheet}): {type(e).__name__}: {str(e)}")

# Main loop
for key, config in data_files.items():
    logger.info(f"Processing dataset: {key}")
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        is_com = (key == "COM")
        process_factors(config, sheet=config['sheet'], is_com=is_com)

logger.info("All data processing complete!")

2025-04-11 12:51:40,263 - INFO - Processing dataset: BAB_multi
2025-04-11 12:51:40,265 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet BAB Factors) with is_com=False
2025-04-11 12:51:40,572 - INFO - Read 1129 rows with set columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-11 12:51:40,575 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-11 12:51:40,586 - DEBUG - After dropping NA dates, 1129 rows remain
2025-04-11 12:51:40,588 - ERROR - Unexpected error processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet BAB Factors): ValueError: Length of val

In [29]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR, DATE, DECIMAL
import logging

# Logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler('load_aqr_data.log')]
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Database connection
engine = create_engine(
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

# File configurations with explicit headers
data_files = {
    "BAB_multi": {
        "path": r"C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx",
        "sheets": {
            "BAB": {
                "sheet": "BAB Factors",
                "start_row": 19,
                "paper": "Betting Against Beta (Frazzini and Pedersen, 2014)",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "BAB", "Global": "BAB", "Global Ex USA": "BAB"}
            },
            "MKT": {
                "sheet": "MKT",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "MKT", "Global": "MKT", "Global Ex USA": "MKT"}
            },
            "SMB": {
                "sheet": "SMB",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "SMB", "Global": "SMB", "Global Ex USA": "SMB"}
            },
            "HML-FF": {
                "sheet": "HML FF",
                "start_row": 19,
                "paper": "Fama-French Factors",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "HML-FF", "Global": "HML-FF", "Global Ex USA": "HML-FF"}
            },
            "HML-D": {
                "sheet": "HML Devil",
                "start_row": 19,
                "paper": "The Devil's in HML's Details",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "HML-D", "Global": "HML-D", "Global Ex USA": "HML-D"}
            },
            "UMD": {
                "sheet": "UMD",
                "start_row": 19,
                "paper": "On Persistence in Mutual Fund Performance",
                "headers": ["DATE", "AUS", "AUT", "BEL", "CAN", "CHE", "DEU", "DNK", "ESP", "FIN", "FRA", "GBR", "GRC", "HKG", "IRL", "ISR", "ITA", "JPN", "NLD", "NOR", "NZL", "PRT", "SGP", "SWE", "USA", "Global", "Global Ex USA", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "UMD", "Global": "UMD", "Global Ex USA": "UMD"}
            },
            "ME": {
                "sheet": "ME(t-1)",
                "start_row": 19,
                "paper": "AQR Factors",
                "headers": ["DATE", "AUT", "HKG", "ESP", "GBR", "ITA", "DEU", "DNK", "NZL", "NLD", "USA", "PRT", "BEL", "ISR", "GRC", "NOR", "SGP", "CHE", "IRL", "CAN", "FIN", "JPN", "SWE", "FRA", "AUS", "Global Ex USA", "Global", "Europe", "North America", "Pacific"],
                "columns": {"DATE": "DATE", "USA": "ME", "Global": "ME", "Global Ex USA": "ME"}
            },
            "RF": {
                "sheet": "RF",
                "start_row": 18,
                "paper": None,
                "headers": ["DATE", "Risk Free Rate"],
                "columns": {"DATE": "DATE", "Risk Free Rate": "RF"}
            }
        }
    },
    "TSMOM": {
        "path": r"C:\Users\JulianHeron\Downloads\Time Series Momentum Factors Monthly.xlsx",
        "sheet": "TSMOM Factors",
        "start_row": 18,
        "paper": "Time Series Momentum (Moskowitz, Ooi, and Pedersen, 2012)",
        "headers": ["DATE", "TSMOM", "TSMOM^CM", "TSMOM^EQ", "TSMOM^FI", "TSMOM^FX"],
        "columns": {
            "DATE": "DATE",
            "TSMOM": "TSM-MA",
            "TSMOM^CM": "TSM-Com",
            "TSMOM^EQ": "TSM-EQ",
            "TSMOM^FI": "TSM-FI",
            "TSMOM^FX": "TSM-FX"
        }
    },
    "QMJ": {
        "path": r"C:\Users\JulianHeron\Downloads\Quality Minus Junk 10 QualitySorted Portfolios Monthly.xlsx",
        "sheet": "10 Portfolios Formed on Quality",
        "start_row": 19,
        "paper": "Quality Minus Junk (Asness, Frazzini and Pedersen, 2014)",
        "headers": ["DATE", "P1 (low quality)", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10 (high quality)", "P10-P1", "P1 (low quality).1", "P2.1", "P3.1", "P4.1", "P5.1", "P6.1", "P7.1", "P8.1", "P9.1", "P10 (high quality).1", "P10-P1.1"],
        "columns": {
            "DATE": "DATE",
            "P1 (low quality)": "QMJ_P1",
            "P2": "QMJ_P2",
            "P3": "QMJ_P3",
            "P4": "QMJ_P4",
            "P5": "QMJ_P5",
            "P6": "QMJ_P6",
            "P7": "QMJ_P7",
            "P8": "QMJ_P8",
            "P9": "QMJ_P9",
            "P10 (high quality)": "QMJ_P10",
            "P10-P1": "QMJ",
            "P1 (low quality).1": "QMJ_P1",
            "P2.1": "QMJ_P2",
            "P3.1": "QMJ_P3",
            "P4.1": "QMJ_P4",
            "P5.1": "QMJ_P5",
            "P6.1": "QMJ_P6",
            "P7.1": "QMJ_P7",
            "P8.1": "QMJ_P8",
            "P9.1": "QMJ_P9",
            "P10 (high quality).1": "QMJ_P10",
            "P10-P1.1": "QMJ"
        }
    },
    "Century": {
        "path": r"C:\Users\JulianHeron\Downloads\Century of Factor Premia Monthly (1).xlsx",
        "sheet": 0,
        "start_row": 18,
        "paper": "Century of Factor Premia Monthly-Ilmanen et al. (2021)",
        "headers": ["DATE", "US Stock Selection Value", "US Stock Selection Momentum", "US Stock Selection Defensive", "US Stock Selection Multi-style", "Intl Stock Selection Value", "Intl Stock Selection Momentum", "Intl Stock Selection Defensive", "Intl Stock Selection Multi-style", "Equity indices Value", "Equity indices Momentum", "Equity indices Carry", "Equity indices Defensive", "Equity indices Multi-style", "Fixed income Value", "Fixed income Momentum", "Fixed income Carry", "Fixed income Defensive", "Fixed income Multi-style", "Currencies Value", "Currencies Momentum", "Currencies Carry", "Currencies Multi-style", "Commodities Value", "Commodities Momentum", "Commodities Carry", "Commodities Multi-style", "All Stock Selection Value", "All Stock Selection Momentum", "All Stock Selection Defensive", "All Stock Selection Multi-style", "All Macro Value", "All Macro Momentum", "All Macro Carry", "All Macro Defensive", "All Macro Multi-style", "All asset classes Value", "All asset classes Momentum", "All asset classes Carry", "All asset classes Defensive", "All asset classes Multi-style", "Equity indices Market", "Fixed income Market", "Commodities Market", "All Macro Market"],
        "columns": {
            "DATE": "DATE",
            "US Stock Selection Value": "HML-FF-US", "US Stock Selection Momentum": "UMD-US", 
            "US Stock Selection Defensive": "BAB-US", "US Stock Selection Multi-style": "Multi-style-US",
            "Intl Stock Selection Value": "HML-FF-Intl", "Intl Stock Selection Momentum": "UMD-Intl", 
            "Intl Stock Selection Defensive": "BAB-Intl", "Intl Stock Selection Multi-style": "Multi-style-Intl",
            "Equity indices Value": "HML-Equity", "Equity indices Momentum": "UMD-Equity", 
            "Equity indices Carry": "Carry-Equity", "Equity indices Defensive": "BAB-Equity", 
            "Equity indices Multi-style": "Multi-style-Equity",
            "Fixed income Value": "HML-FI", "Fixed income Momentum": "UMD-FI", 
            "Fixed income Carry": "Carry-FI", "Fixed income Defensive": "BAB-FI", 
            "Fixed income Multi-style": "Multi-style-FI",
            "Currencies Value": "HML-FX", "Currencies Momentum": "UMD-FX", 
            "Currencies Carry": "Carry-FX", "Currencies Multi-style": "Multi-style-FX",
            "Commodities Value": "HML-COM", "Commodities Momentum": "UMD-COM", 
            "Commodities Carry": "Carry-COM", "Commodities Multi-style": "Multi-style-COM",
            "All Stock Selection Value": "HML-All-SS", "All Stock Selection Momentum": "UMD-All-SS", 
            "All Stock Selection Defensive": "BAB-All-SS", "All Stock Selection Multi-style": "Multi-style-All-SS",
            "All Macro Value": "HML-All-Macro", "All Macro Momentum": "UMD-All-Macro", 
            "All Macro Carry": "Carry-All-Macro", "All Macro Defensive": "BAB-All-Macro", 
            "All Macro Multi-style": "Multi-style-All-Macro",
            "All asset classes Value": "HML-All", "All asset classes Momentum": "UMD-All", 
            "All asset classes Carry": "Carry-All", "All asset classes Defensive": "BAB-All", 
            "All asset classes Multi-style": "Multi-style-All",
            "Equity indices Market": "MKT-Equity", "Fixed income Market": "MKT-FI", 
            "Commodities Market": "MKT-COM", "All Macro Market": "MKT-All-Macro"
        }
    },
    "COM": {
        "path": r"C:\Users\JulianHeron\Downloads\Commodities for the Long Run Index Level Data Monthly.xlsx",
        "sheet": 0,
        "start_row": 11,
        "paper": "Commodities for the Long Run",
        "headers": ["DATE", "Excess return of equal-weight commodities portfolio", "Excess spot return of equal-weight commodities portfolio", "Interest rate adjusted carry of equal-weight commodities portfolio", "Spot return of equal-weight commodities portfolio", "Carry of equal-weight commodities portfolio", "Excess return of long/short commodities portfolio", "Excess spot return of long/short commodities portfolio", "Interest rate adjusted carry of long/short commodities portfolio", "Aggregate backwardation/contango", "State of backwardation/contango", "State of inflation"],
        "columns": {
            "DATE": "date",
            "Excess return of equal-weight commodities portfolio": "excess_return_eqwt",
            "Excess spot return of equal-weight commodities portfolio": "excess_spot_return_eqwt",
            "Interest rate adjusted carry of equal-weight commodities portfolio": "ir_adjusted_carry_eqwt",
            "Spot return of equal-weight commodities portfolio": "spot_return_eqwt",
            "Carry of equal-weight commodities portfolio": "carry_eqwt",
            "Excess return of long/short commodities portfolio": "excess_return_long_short",
            "Excess spot return of long/short commodities portfolio": "excess_spot_return_long_short",
            "Interest rate adjusted carry of long/short commodities portfolio": "ir_adjusted_carry_long_short",
            "Aggregate backwardation/contango": "aggregate_backwardation_contango",
            "State of backwardation/contango": "state_backwardation_contango",
            "State of inflation": "state_inflation"
        }
    }
}

def process_factors(file_config, sheet=0, is_com=False, parent_path=None):
    file_path = parent_path if parent_path else file_config['path']
    logger.info(f"Processing {file_path} (sheet {sheet}) with is_com={is_com}")
    try:
        # Read Excel without headers, then set them
        df = pd.read_excel(file_path, sheet_name=sheet, header=None, skiprows=file_config['start_row'])
        df.columns = file_config['headers']
        logger.info(f"Read {len(df)} rows with set columns: {list(df.columns)}")
        
        if is_com:
            expected_cols = list(file_config['columns'].keys())
            df = df[expected_cols]
            df.columns = [file_config['columns'][col] for col in df.columns]
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            with engine.connect() as connection:
                existing = pd.read_sql("SELECT date FROM aqr_cmdty_factors", connection)
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(existing['date'])
                logger.debug(f"Existing commodity dates sample (first 5): {list(existing_keys)[:5]}")
            
            df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
            df_new = df[~df['date_str'].isin(existing_keys)].drop(columns=['date_str'])
            
            if not df_new.empty:
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'aqr_cmdty_factors',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'date': DATE,
                            'excess_return_eqwt': DECIMAL(15, 6),
                            'excess_spot_return_eqwt': DECIMAL(15, 6),
                            'ir_adjusted_carry_eqwt': DECIMAL(15, 6),
                            'spot_return_eqwt': DECIMAL(15, 6),
                            'carry_eqwt': DECIMAL(15, 6),
                            'excess_return_long_short': DECIMAL(15, 6),
                            'excess_spot_return_long_short': DECIMAL(15, 6),
                            'ir_adjusted_carry_long_short': DECIMAL(15, 6),
                            'aggregate_backwardation_contango': DECIMAL(15, 6),
                            'state_backwardation_contango': VARCHAR(50),
                            'state_inflation': VARCHAR(50),
                            'associated_paper': VARCHAR(100)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into aqr_cmdty_factors from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into aqr_cmdty_factors from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df['date'].min()} to {df['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
        else:
            date_col = 'DATE'
            orig_cols = df.columns.tolist()
            if 'columns' in file_config:
                expected_cols = list(file_config['columns'].keys())
                valid_cols = [col for col in expected_cols if col in df.columns]
                if not valid_cols:
                    logger.error(f"No valid columns found in {file_path} (sheet {sheet}). Skipping.")
                    return
                if date_col not in valid_cols:
                    valid_cols.append(date_col)
                df = df[valid_cols]
                new_columns = ['date' if col == date_col else file_config['columns'].get(col, col) for col in df.columns]
                df.columns = new_columns
            
            logger.info(f"Columns after renaming: {list(df.columns)}")
            
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            invalid_dates = df['date'].isna().sum()
            if invalid_dates > 0:
                logger.warning(f"Dropped {invalid_dates} rows due to invalid dates in {file_path} (sheet {sheet})")
            df = df.dropna(subset=['date'])
            logger.debug(f"After dropping NA dates, {len(df)} rows remain")
            
            # Melt the DataFrame
            value_vars = [col for col in df.columns if col != 'date']
            df_long = df.melt(id_vars=['date'], value_vars=value_vars, var_name='factor', value_name='value', col_level=0)
            df_long = df_long.dropna(subset=['value', 'date'])
            logger.debug(f"Rows after melting and dropping NA: {len(df_long)}")
            
            # Assign region and asset_class post-melt
            file_path_for_check = parent_path if parent_path else file_config.get('path', '')
            if 'TSMOM' in file_path_for_check:
                df_long['region'] = "Global"
                df_long['asset_class'] = df_long['factor'].map({
                    "TSM-MA": "Multi-Asset",
                    "TSM-Com": "Commodities",
                    "TSM-EQ": "Equity",
                    "TSM-FI": "Fixed Income",
                    "TSM-FX": "Currencies"
                })
            elif 'Quality Minus Junk' in file_path_for_check:
                df_long['region'] = df_long['date'].apply(lambda x: "USA" if x < pd.Timestamp("1989-07-31") else "Global")
                df_long['asset_class'] = "Equity"
            elif 'Century' in file_path_for_check:
                df_long['region'] = df_long['factor'].apply(lambda x: 'US' if 'US' in x else 'Intl' if 'Intl' in x else 'Global')
                df_long['asset_class'] = df_long['factor'].str.extract(
                    '(Stock Selection|Equity indices|Fixed income|Currencies|Commodities|All Macro|All asset classes)'
                ).fillna('Equity')
            else:  # BAB_multi
                region_map = {
                    "USA": "USA",
                    "Global": "Global",
                    "Global Ex USA": "Intl",
                    "Risk Free Rate": "USA"
                }
                df_long['region'] = df_long['factor'].map(region_map)
                df_long['asset_class'] = "Equity" if "Risk Free Rate" not in orig_cols else "Fixed Income"
            
            df_long['associated_paper'] = file_config['paper'] if file_config['paper'] is not None else 'Unknown'
            df_long = df_long[['factor', 'date', 'associated_paper', 'value', 'region', 'asset_class']]
            
            with engine.connect() as connection:
                existing = pd.read_sql(
                    "SELECT factor, date, associated_paper, region FROM factor_returns",
                    connection
                )
                existing['date'] = pd.to_datetime(existing['date']).dt.strftime('%Y-%m-%d')
                existing_keys = set(tuple(row) for row in existing[['factor', 'date', 'associated_paper', 'region']].values)
                logger.debug(f"Existing keys sample (first 5): {list(existing_keys)[:5]}")
                logger.debug(f"Total existing keys: {len(existing_keys)}")
            
            df_long['date_str'] = df_long['date'].dt.strftime('%Y-%m-%d')
            df_long['key'] = df_long.apply(
                lambda row: (row['factor'], row['date_str'], row['associated_paper'], row['region'] if pd.notna(row['region']) else 'Unknown'), axis=1
            )
            logger.debug(f"New keys sample (first 5): {df_long['key'].head().tolist()}")
            df_new = df_long[~df_long['key'].isin(existing_keys)].drop(columns=['key', 'date_str'])
            
            if not df_new.empty:
                logger.debug(f"New rows to insert: {len(df_new)}. Sample (first 5): {df_new[['factor', 'date', 'region']].head().to_dict('records')}")
                for chunk in [df_new[i:i+10000] for i in range(0, len(df_new), 10000)]:
                    chunk.to_sql(
                        'factor_returns',
                        engine,
                        if_exists='append',
                        index=False,
                        dtype={
                            'factor': VARCHAR(50),
                            'date': DATE,
                            'associated_paper': VARCHAR(100),
                            'value': DECIMAL(15, 6),
                            'region': VARCHAR(50),
                            'asset_class': VARCHAR(50)
                        }
                    )
                logger.info(f"Loaded {len(df_new)} new rows into factor_returns from {file_path} (sheet {sheet})")
            else:
                logger.info(f"No new rows to load into factor_returns from {file_path} (sheet {sheet})")
                logger.debug(f"Input date range: {df_long['date'].min()} to {df_long['date'].max()}")
                logger.debug(f"Database date range: {existing['date'].min() if not existing.empty else 'N/A'} to {existing['date'].max() if not existing.empty else 'N/A'}")
            
    except FileNotFoundError as e:
        logger.error(f"File not found: {file_path} (sheet {sheet}): {str(e)}")
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing Excel file {file_path} (sheet {sheet}): {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error processing {file_path} (sheet {sheet}): {type(e).__name__}: {str(e)}")

# Main loop
for key, config in data_files.items():
    logger.info(f"Processing dataset: {key}")
    if key == "BAB_multi":
        for subkey, subconfig in config['sheets'].items():
            process_factors(subconfig, sheet=subconfig['sheet'], parent_path=config['path'])
    else:
        is_com = (key == "COM")
        process_factors(config, sheet=config['sheet'], is_com=is_com)

logger.info("All data processing complete!")

2025-04-11 13:07:25,108 - INFO - Processing dataset: BAB_multi
2025-04-11 13:07:25,110 - INFO - Processing C:\Users\JulianHeron\OneDrive - Credentialed Wealth Advisors\Documents\AQR_Factor_Checks\Betting_Against_Beta_Equity_Factors_Monthly.xlsx (sheet BAB Factors) with is_com=False
2025-04-11 13:07:26,145 - INFO - Read 1129 rows with set columns: ['DATE', 'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'DEU', 'DNK', 'ESP', 'FIN', 'FRA', 'GBR', 'GRC', 'HKG', 'IRL', 'ISR', 'ITA', 'JPN', 'NLD', 'NOR', 'NZL', 'PRT', 'SGP', 'SWE', 'USA', 'Global', 'Global Ex USA', 'Europe', 'North America', 'Pacific']
2025-04-11 13:07:26,149 - INFO - Columns after renaming: ['date', 'BAB', 'BAB', 'BAB']
2025-04-11 13:07:26,174 - DEBUG - After dropping NA dates, 1129 rows remain
2025-04-11 13:07:26,205 - DEBUG - Rows after melting and dropping NA: 2039
2025-04-11 13:07:26,545 - DEBUG - Existing keys sample (first 5): []
2025-04-11 13:07:26,546 - DEBUG - Total existing keys: 0
2025-04-11 13:07:26,567 - DEBUG - New keys sa