In [3]:
import math
import certifi
import ssl
import requests
import sqlalchemy
import pandas as pd
import datetime
import time
import concurrent.futures
from requests.adapters import HTTPAdapter
import logging

# -----------------------------
# Logging Configuration
# -----------------------------
# Root logger at INFO: the logger.debug(...) calls in this file are not emitted
# unless the level is lowered.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# -----------------------------
# Configuration
# -----------------------------
# SQLAlchemy URL for a local SQL Server Express instance via pyodbc.
# trusted_connection=yes means no username/password appears in the URL;
# TrustServerCertificate=yes accepts the server's certificate without validation.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

YCHARTS_API_URL = "https://api.ycharts.com"
# Headers sent with every YCharts request made through fetch_data_point().
# SECURITY NOTE(review): the authorization and session values below are
# hard-coded in source. If they are real credentials they should be rotated and
# loaded from the environment or a secrets store instead of being committed.
API_HEADERS = {
    "X-YCHARTSAUTHORIZATION": "yIIphqbsQysnTvWWxfW33w",  # API key (see security note above)
    "X-YCHARTSEXCELSESSION": "b645cd897b2446bfa3796acfa3a879db",
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    # fetch_data_point() posts a pre-built "points=..." string, hence the form content type.
    "Content-Type": "application/x-www-form-urlencoded"
}

# -----------------------------
# Metric Mapping Dictionary
# -----------------------------
# Keys are Funds_to_Screen column names; values are the YCharts codes sent in
# the "points" request. The keys are interpolated verbatim into the SQL built
# by check_and_update_fund_data(), so they must remain valid SQL identifiers.
yc_metric_mapping = {
    # Stylebox (Equity Exposure) Metrics
    "LCG": "equity_stylebox_large_cap_growth_exposure",
    "LCB": "equity_stylebox_large_cap_blend_exposure",
    "LCV": "equity_stylebox_large_cap_value_exposure",
    "MCG": "equity_stylebox_mid_cap_growth_exposure",
    "MCB": "equity_stylebox_mid_cap_blend_exposure",
    "MCV": "equity_stylebox_mid_cap_value_exposure",
    "SCG": "equity_stylebox_small_cap_growth_exposure",
    "SCB": "equity_stylebox_small_cap_blend_exposure",
    "SCV": "equity_stylebox_small_cap_value_exposure",
    "Giant_Cap_Exposure": "giant_cap_exposure",
    "Large_Cap_Exposure": "large_cap_exposure",
    "Medium_Cap_Exposure": "medium_cap_exposure",
    "Micro_Cap_Exposure": "micro_cap_exposure",
    "Sensitive_Exposure": "sensitive_exposure",
    "Small_Cap_Exposure": "small_cap_exposure",
    "Cyclical_Exposure": "cyclical_exposure",
    "Defensive_Exposure": "defensive_exposure",
    # Bond Term (Maturity) Metrics
    "Term_10_15_Y": "10_to_15_years_maturity_bond_exposure",
    "Term_1_3_Y": "1_to_3_years_maturity_bond_exposure",
    "Term_15_20_Y": "15_to_20_years_maturity_bond_exposure",
    "Term_1_7_D": "1_to_7_days_maturity_bond_exposure",
    "Term_183_364_D": "183_to_364_days_maturity_bond_exposure",
    "Term_20_30_Y": "20_to_30_years_maturity_bond_exposure",
    "Term_30_Plus_Y": "over_30_years_maturity_bond_exposure",
    "Term_31_90_D": "31_to_90_days_maturity_bond_exposure",
    "Term_3_5_Y": "3_to_5_years_maturity_bond_exposure",
    "Term_5_7_Y": "5_to_7_years_maturity_bond_exposure",
    "Term_7_10_Y": "7_to_10_years_maturity_bond_exposure",
    "Term_8_30_D": "8_to_30_days_maturity_bond_exposure",
    "Term_91_182_D": "91_to_182_days_maturity_bond_exposure",
    # Term Exposure Metrics
    "LT_Exposure": "long_term_exposure",
    "Interm_Exposure": "intermediate_term_exposure",
    "ST_Exposure": "short_term_exposure",
    # Bond Rating Metrics
    "AAA_Rated": "aaa_bond_exposure",
    "AA_Rated": "aa_bond_exposure",
    "A_Rated": "a_bond_exposure",
    "BBB_Rated": "bbb_bond_exposure",
    "B_Rated": "b_bond_exposure",
    "BB_Rated": "bb_bond_exposure",
    "Below_B_Rated": "below_b_bond_exposure",
    "Not_Rated": "not_rated_bond_exposure"
}

# Metric / column names in dict insertion order; this order drives both the
# per-symbol fetch loop and the column order of the generated SQL.
METRICS = list(yc_metric_mapping.keys())

# Symbols to process when main() is not in test mode.
# DISTINCT applies to the (SymbolCUSIP, Fund_Type_ID) pair, so a symbol stored
# with more than one Fund_Type_ID comes back as several rows.
FUND_SYMBOLS_SQL = """
SELECT DISTINCT SymbolCUSIP AS Fund_Symbol, Fund_Type_ID
FROM Funds_to_Screen
"""

# -----------------------------
# Custom SSL Context and HTTP Adapter
# -----------------------------
# SECURITY: hostname checking and certificate verification are both switched
# off, so any HTTPS connection made with this context accepts whatever
# certificate the peer presents. Intended for debugging only.
context = ssl.create_default_context()
# check_hostname has to be cleared first; the ssl module rejects CERT_NONE
# while hostname checking is still enabled.
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE  # For debugging only; use CERT_REQUIRED in production.

class CustomHTTPAdapter(HTTPAdapter):
    """HTTPAdapter whose connection pools use the module-level SSL `context`."""

    def init_poolmanager(self, *args, **kwargs):
        """Inject the shared SSL context, then defer to the stock pool setup."""
        kwargs.update(ssl_context=context)
        return super().init_poolmanager(*args, **kwargs)

# One module-level session reused for every YCharts call; all https:// URLs go
# through CustomHTTPAdapter. main() calls into it from several worker threads.
# NOTE(review): requests does not document Session as thread-safe — confirm
# that sharing it across the thread pool is acceptable.
session = requests.Session()
session.mount('https://', CustomHTTPAdapter())

# -----------------------------
# Helper Functions for YCP Calls
# -----------------------------
def build_points_lines(symbol, yc_code, is_mutual):
    """
    Build the "points" entry for a single YCP (data point) request.

    The entry has the form "<ticker>,<yc_code>"; the ticker is prefixed with
    "M:" when is_mutual is truthy. No date component is appended.
    """
    prefix = "M:" if is_mutual else ""
    return f"{prefix}{symbol},{yc_code}"

def _latest_datapoint(data, symbol_for_api, yc_code):
    """
    Return the last datapoint list for yc_code from a decoded YCP response.

    Every level of the payload is shape-checked, so an unexpected structure
    yields None instead of raising. A datapoint is only returned when it is a
    list with at least two elements (the value is read from index 1).
    """
    if not isinstance(data, dict):
        return None
    by_symbol = data.get('response')
    if not isinstance(by_symbol, dict):
        return None
    symbol_entry = by_symbol.get(symbol_for_api)
    if not isinstance(symbol_entry, dict):
        return None
    results = symbol_entry.get("results", {})
    if not isinstance(results, dict):
        return None
    metric_entry = results.get(yc_code)
    if not isinstance(metric_entry, dict):
        return None

    # Preferred location of the series.
    datapoints = metric_entry.get("data")
    if not datapoints:
        # Some responses carry the result under an empty-string key instead;
        # wrap it so it looks like a one-element list of datapoints.
        fallback = metric_entry.get("")
        if isinstance(fallback, dict) and "results" in fallback:
            datapoints = [fallback["results"]]

    if not datapoints or not isinstance(datapoints, list):
        return None
    last_point = datapoints[-1]
    if isinstance(last_point, list) and len(last_point) > 1:
        return last_point
    return None


def fetch_data_point(symbol, metric, fund_type_id):
    """
    Fetch the latest value of one metric for one symbol from the YCP endpoint.

    Args:
        symbol: Fund ticker as stored in the database (without any "M:" prefix).
        metric: Key of yc_metric_mapping (also the database column name).
        fund_type_id: Fund type; the value 3 is treated as a mutual fund and
            makes the ticker sent to the API carry an "M:" prefix.

    Returns:
        The second element of the last datapoint in the response, or None when
        the metric has no mapping, the request fails, the body is not valid
        JSON, or the payload does not contain a usable datapoint.
    """
    is_mutual = (fund_type_id == 3)
    yc_code = yc_metric_mapping.get(metric)
    if not yc_code:
        logger.error(f"No YC mapping found for metric '{metric}'.")
        return None

    points_line = build_points_lines(symbol, yc_code, is_mutual)
    payload = f"points={points_line}"
    api_url = f"{YCHARTS_API_URL}/v3/excel/points"
    logger.info(f"Fetching data point for metric '{metric}' (YC Code: {yc_code}) for symbol '{symbol}' with payload: {payload}")

    try:
        response = session.post(api_url, headers=API_HEADERS, data=payload, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        logger.error(f"API error for symbol '{symbol}' on metric '{metric}' (YC Code: {yc_code}): {e}")
        return None

    # The response is keyed by the ticker exactly as it was sent.
    symbol_for_api = f"M:{symbol}" if is_mutual else symbol
    last_point = _latest_datapoint(data, symbol_for_api, yc_code)
    if last_point is None:
        logger.warning(f"No valid data returned for symbol '{symbol}' on metric '{metric}' (YC Code: {yc_code})")
        return None

    value = last_point[1]
    logger.info(f"Fetched {metric} for {symbol}: {value}")
    return value

# -----------------------------
# Other Helper Functions
# -----------------------------
def convert_metric_to_column_name(metric):
    """
    Translate a metric key into its Funds_to_Screen column name.

    The two are currently identical, so the key is handed back untouched.
    """
    column_name = metric
    return column_name

def verify_columns(engine, table_name, expected_columns):
    """
    Compare expected_columns against the live schema of table_name.

    Nothing is raised or returned: a warning listing the absent columns is
    logged, or an info message when none are absent.
    """
    present = [column['name'] for column in sqlalchemy.inspect(engine).get_columns(table_name)]
    absent = [name for name in expected_columns if name not in present]
    if not absent:
        logger.info(f"All expected columns are present in {table_name}.")
        return
    logger.warning(f"Missing columns in {table_name}: {absent}")

def check_and_update_fund_data(engine, symbol, data):
    """
    Write the metric columns for one symbol to Funds_to_Screen.

    A row whose SymbolCUSIP matches is updated; otherwise a new row is
    inserted. Only SymbolCUSIP and the METRICS columns are touched. `data`
    supplies the bind parameters ('symbol' plus one entry per metric).
    SQLAlchemy errors are logged and rolled back, not raised.
    """
    # Column list, bind placeholders and SET clause all derive from METRICS.
    column_list = ", ".join(METRICS)
    placeholder_list = ", ".join(f":{name}" for name in METRICS)
    assignment_list = ", ".join(f"{name} = :{name}" for name in METRICS)

    exists_sql = "\n    SELECT 1 FROM Funds_to_Screen WHERE SymbolCUSIP = :symbol\n    "
    insert_sql = (
        "\n    INSERT INTO Funds_to_Screen (\n"
        f"        SymbolCUSIP, {column_list}\n"
        "    )\n"
        "    VALUES (\n"
        f"        :symbol, {placeholder_list}\n"
        "    )\n"
        "    "
    )
    update_sql = (
        "\n    UPDATE Funds_to_Screen \n"
        f"    SET {assignment_list}\n"
        "    WHERE SymbolCUSIP = :symbol\n"
        "    "
    )

    with engine.connect() as connection:
        try:
            found = connection.execute(sqlalchemy.text(exists_sql), {'symbol': symbol}).fetchone()
            if found:
                write_sql, outcome = update_sql, f"Updated data for symbol {symbol}"
            else:
                write_sql, outcome = insert_sql, f"Inserted new record for symbol {symbol}"
            connection.execute(sqlalchemy.text(write_sql), data)
            logger.info(outcome)
            connection.commit()
        except sqlalchemy.exc.SQLAlchemyError as db_error:
            logger.error(f"Database error for {symbol}: {db_error}")
            connection.rollback()

def call_update_derived_metrics(engine):
    """
    Run the UpdateDerivedMetrics stored procedure and commit its work.

    A SQLAlchemy error is logged and the transaction rolled back rather than
    propagated to the caller.
    """
    with engine.connect() as connection:
        try:
            procedure_call = sqlalchemy.text("EXEC UpdateDerivedMetrics")
            connection.execute(procedure_call)
            connection.commit()
        except sqlalchemy.exc.SQLAlchemyError as db_error:
            logger.error(f"Error executing stored procedure: {db_error}")
            connection.rollback()
        else:
            logger.info("Executed stored procedure UpdateDerivedMetrics.")

def process_symbol(symbol, fund_type_id, engine):
    """
    Fetch every metric for one symbol and persist the result.

    Metrics are requested one at a time, in METRICS order, via
    fetch_data_point(); the values are collected under their column names
    together with the 'symbol' bind parameter and handed to
    check_and_update_fund_data().

    (Optional: Add logic to skip metrics not applicable for a given fund type.)
    """
    logger.info(f"Starting processing for symbol: {symbol}")

    data = {'symbol': symbol}
    data.update(
        (convert_metric_to_column_name(metric), fetch_data_point(symbol, metric, fund_type_id))
        for metric in METRICS
    )

    # Guarantee a bind value for every metric column, even if the column-name
    # conversion produced a different key.
    for expected_key in METRICS:
        if expected_key not in data:
            data[expected_key] = None

    logger.debug(f"Data dictionary for {symbol}: {data}")
    check_and_update_fund_data(engine, symbol, data)
    logger.info(f"Finished processing for symbol: {symbol}")

def main(test_mode=True):
    """
    Fetch all metrics for every symbol and store them in Funds_to_Screen.

    Args:
        test_mode: When true (the default, matching the previous hard-coded
            behavior) only SPY and AGG are processed, both as non-mutual
            funds. When false the symbol list is read with FUND_SYMBOLS_SQL.

    Symbols are processed concurrently, one task per symbol. An exception
    escaping a task is logged with its symbol and does not stop the others.
    The UpdateDerivedMetrics procedure runs afterwards, and the engine is
    disposed even if something raises along the way.
    """
    start_time = time.time()
    logger.info("Starting data fetch and insertion process.")

    engine = sqlalchemy.create_engine(connection_string,
                                      pool_size=20,
                                      max_overflow=50,
                                      pool_timeout=300)
    try:
        # Verify that all expected columns exist in the Funds_to_Screen table.
        expected_columns = ['SymbolCUSIP'] + METRICS
        verify_columns(engine, "Funds_to_Screen", expected_columns)

        if test_mode:
            # For testing, assume SPY and AGG have fund_type_id 1 (non-mutual)
            symbols_list = [("SPY", 1), ("AGG", 1)]
            logger.info("Test mode enabled: Processing only SPY and AGG.")
        else:
            symbols_df = pd.read_sql(FUND_SYMBOLS_SQL, engine)
            symbols_list = symbols_df[['Fund_Symbol', 'Fund_Type_ID']].values.tolist()
            logger.info(f"Processing {len(symbols_list)} symbols from the database.")

        # Process symbols in parallel; remember which future belongs to which
        # symbol so failures can be attributed.
        failed_symbols = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            future_to_symbol = {
                executor.submit(process_symbol, symbol, fund_type_id, engine): symbol
                for symbol, fund_type_id in symbols_list
            }
            for future in concurrent.futures.as_completed(future_to_symbol):
                symbol = future_to_symbol[future]
                try:
                    future.result()
                except Exception as exc:
                    failed_symbols.append(symbol)
                    logger.error(f"Error processing symbol {symbol}: {exc}")

        # Call the stored procedure to update derived metrics.
        call_update_derived_metrics(engine)

        total_time = time.time() - start_time
        if failed_symbols:
            logger.warning(
                f"Processing finished in {total_time:.2f} seconds with "
                f"{len(failed_symbols)} failed symbol(s): {failed_symbols}"
            )
        else:
            # API and database errors handled inside the workers are logged
            # there and do not count as failures here.
            logger.info(f"All metrics processed successfully in {total_time:.2f} seconds")
    finally:
        engine.dispose()

# Run the pipeline only when this cell/script is executed directly, not on import.
if __name__ == "__main__":
    main()


2025-02-10 14:11:46,926 - INFO - Starting data fetch and insertion process.
2025-02-10 14:11:48,526 - INFO - All expected columns are present in Funds_to_Screen.
2025-02-10 14:11:48,527 - INFO - Test mode enabled: Processing only SPY and AGG.
2025-02-10 14:11:48,554 - INFO - Starting processing for symbol: SPY
2025-02-10 14:11:48,557 - INFO - Fetching data point for metric 'LCG' (YC Code: equity_stylebox_large_cap_growth_exposure) for symbol 'SPY' with payload: points=SPY,equity_stylebox_large_cap_growth_exposure
2025-02-10 14:11:48,573 - INFO - Starting processing for symbol: AGG
2025-02-10 14:11:48,580 - INFO - Fetching data point for metric 'LCG' (YC Code: equity_stylebox_large_cap_growth_exposure) for symbol 'AGG' with payload: points=AGG,equity_stylebox_large_cap_growth_exposure
2025-02-10 14:11:49,080 - INFO - Fetched LCG for SPY: 0.16203
2025-02-10 14:11:49,080 - INFO - Fetching data point for metric 'LCB' (YC Code: equity_stylebox_large_cap_blend_exposure) for symbol 'SPY' with

In [4]:
import math
import certifi
import ssl
import requests
import sqlalchemy
import pandas as pd
import datetime
import time
import concurrent.futures
from requests.adapters import HTTPAdapter
import logging

# -----------------------------
# Logging Configuration
# -----------------------------
# Root logger at INFO: the logger.debug(...) calls in this file are not emitted
# unless the level is lowered.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# -----------------------------
# Configuration
# -----------------------------
# SQLAlchemy URL for a local SQL Server Express instance via pyodbc.
# trusted_connection=yes means no username/password appears in the URL;
# TrustServerCertificate=yes accepts the server's certificate without validation.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

YCHARTS_API_URL = "https://api.ycharts.com"
# Headers sent with every YCharts request made through fetch_data_point().
# SECURITY NOTE(review): the authorization and session values below are
# hard-coded in source. If they are real credentials they should be rotated and
# loaded from the environment or a secrets store instead of being committed.
API_HEADERS = {
    "X-YCHARTSAUTHORIZATION": "yIIphqbsQysnTvWWxfW33w",  # API key (see security note above)
    "X-YCHARTSEXCELSESSION": "b645cd897b2446bfa3796acfa3a879db",
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    # fetch_data_point() posts a pre-built "points=..." string, hence the form content type.
    "Content-Type": "application/x-www-form-urlencoded"
}

# -----------------------------
# Metric Mapping Dictionary
# -----------------------------
# Keys are the Funds_to_Screen columns this script updates; values are the
# YCharts codes sent in the "points" request. The keys are interpolated
# verbatim into the SQL built by check_and_update_fund_data(), so they must
# remain valid SQL identifiers.
yc_metric_mapping = {
    # Stylebox (Equity Exposure) Metrics
    "LCG": "equity_stylebox_large_cap_growth_exposure",
    "LCB": "equity_stylebox_large_cap_blend_exposure",
    "LCV": "equity_stylebox_large_cap_value_exposure",
    "MCG": "equity_stylebox_mid_cap_growth_exposure",
    "MCB": "equity_stylebox_mid_cap_blend_exposure",
    "MCV": "equity_stylebox_mid_cap_value_exposure",
    "SCG": "equity_stylebox_small_cap_growth_exposure",
    "SCB": "equity_stylebox_small_cap_blend_exposure",
    "SCV": "equity_stylebox_small_cap_value_exposure",
    "Giant_Cap_Exposure": "giant_cap_exposure",
    "Large_Cap_Exposure": "large_cap_exposure",
    "Medium_Cap_Exposure": "medium_cap_exposure",
    "Micro_Cap_Exposure": "micro_cap_exposure",
    "Sensitive_Exposure": "sensitive_exposure",
    "Small_Cap_Exposure": "small_cap_exposure",
    "Cyclical_Exposure": "cyclical_exposure",
    "Defensive_Exposure": "defensive_exposure",
    # Bond Term (Maturity) Metrics
    "Term_10_15_Y": "10_to_15_years_maturity_bond_exposure",
    "Term_1_3_Y": "1_to_3_years_maturity_bond_exposure",
    "Term_15_20_Y": "15_to_20_years_maturity_bond_exposure",
    "Term_1_7_D": "1_to_7_days_maturity_bond_exposure",
    "Term_183_364_D": "183_to_364_days_maturity_bond_exposure",
    "Term_20_30_Y": "20_to_30_years_maturity_bond_exposure",
    "Term_30_Plus_Y": "over_30_years_maturity_bond_exposure",
    "Term_31_90_D": "31_to_90_days_maturity_bond_exposure",
    "Term_3_5_Y": "3_to_5_years_maturity_bond_exposure",
    "Term_5_7_Y": "5_to_7_years_maturity_bond_exposure",
    "Term_7_10_Y": "7_to_10_years_maturity_bond_exposure",
    "Term_8_30_D": "8_to_30_days_maturity_bond_exposure",
    "Term_91_182_D": "91_to_182_days_maturity_bond_exposure",
    # Term Exposure Metrics
    "LT_Exposure": "long_term_exposure",
    "Interm_Exposure": "intermediate_term_exposure",
    "ST_Exposure": "short_term_exposure",
    # Bond Rating Metrics
    "AAA_Rated": "aaa_bond_exposure",
    "AA_Rated": "aa_bond_exposure",
    "A_Rated": "a_bond_exposure",
    "BBB_Rated": "bbb_bond_exposure",
    "B_Rated": "b_bond_exposure",
    "BB_Rated": "bb_bond_exposure",
    "Below_B_Rated": "below_b_bond_exposure",
    "Not_Rated": "not_rated_bond_exposure"
}

# Metric / column names in dict insertion order; this order drives both the
# per-symbol fetch loop and the column order of the generated SQL.
METRICS = list(yc_metric_mapping.keys())

# Symbols to process when main() is not in test mode.
# DISTINCT applies to the (SymbolCUSIP, Fund_Type_ID) pair, so a symbol stored
# with more than one Fund_Type_ID comes back as several rows.
FUND_SYMBOLS_SQL = """
SELECT DISTINCT SymbolCUSIP AS Fund_Symbol, Fund_Type_ID
FROM Funds_to_Screen
"""

# -----------------------------
# Custom SSL Context and HTTP Adapter
# -----------------------------
# SECURITY: hostname checking and certificate verification are both switched
# off, so any HTTPS connection made with this context accepts whatever
# certificate the peer presents. Intended for debugging only.
context = ssl.create_default_context()
# check_hostname has to be cleared first; the ssl module rejects CERT_NONE
# while hostname checking is still enabled.
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE  # For debugging only; use CERT_REQUIRED in production.

class CustomHTTPAdapter(HTTPAdapter):
    """HTTPAdapter whose connection pools use the module-level SSL `context`."""

    def init_poolmanager(self, *args, **kwargs):
        """Inject the shared SSL context, then defer to the stock pool setup."""
        kwargs.update(ssl_context=context)
        return super().init_poolmanager(*args, **kwargs)

# One module-level session reused for every YCharts call; all https:// URLs go
# through CustomHTTPAdapter. main() calls into it from several worker threads.
# NOTE(review): requests does not document Session as thread-safe — confirm
# that sharing it across the thread pool is acceptable.
session = requests.Session()
session.mount('https://', CustomHTTPAdapter())

# -----------------------------
# Helper Functions for YCP Calls
# -----------------------------
def build_points_lines(symbol, yc_code, is_mutual):
    """
    Build the "points" entry for a single YCP (data point) request.

    The entry has the form "<ticker>,<yc_code>"; the ticker is prefixed with
    "M:" only when is_mutual is truthy. No date component is appended.
    """
    prefix = "M:" if is_mutual else ""
    return f"{prefix}{symbol},{yc_code}"

def _latest_datapoint(data, symbol_for_api, yc_code):
    """
    Return the last datapoint list for yc_code from a decoded YCP response.

    Every level of the payload is shape-checked, so an unexpected structure
    yields None instead of raising. A datapoint is only returned when it is a
    list with at least two elements (the value is read from index 1).
    """
    if not isinstance(data, dict):
        return None
    by_symbol = data.get('response')
    if not isinstance(by_symbol, dict):
        return None
    symbol_entry = by_symbol.get(symbol_for_api)
    if not isinstance(symbol_entry, dict):
        return None
    results = symbol_entry.get("results", {})
    if not isinstance(results, dict):
        return None
    metric_entry = results.get(yc_code)
    if not isinstance(metric_entry, dict):
        return None

    # Preferred location of the series.
    datapoints = metric_entry.get("data")
    if not datapoints:
        # Some responses carry the result under an empty-string key instead;
        # wrap it so it looks like a one-element list of datapoints.
        fallback = metric_entry.get("")
        if isinstance(fallback, dict) and "results" in fallback:
            datapoints = [fallback["results"]]

    if not datapoints or not isinstance(datapoints, list):
        return None
    last_point = datapoints[-1]
    if isinstance(last_point, list) and len(last_point) > 1:
        return last_point
    return None


def fetch_data_point(symbol, metric, fund_type_id):
    """
    Fetch the latest value of one metric for one symbol from the YCP endpoint.

    Args:
        symbol: Fund ticker as stored in the database (without any "M:" prefix).
        metric: Key of yc_metric_mapping (also the database column name).
        fund_type_id: Fund type; the value 3 is treated as a mutual fund and
            makes the ticker sent to the API carry an "M:" prefix.

    Returns:
        The second element of the last datapoint in the response, or None when
        the metric has no mapping, the request fails, the body is not valid
        JSON, or the payload does not contain a usable datapoint.
    """
    is_mutual = (fund_type_id == 3)
    yc_code = yc_metric_mapping.get(metric)
    if not yc_code:
        logger.error(f"No YC mapping found for metric '{metric}'.")
        return None

    points_line = build_points_lines(symbol, yc_code, is_mutual)
    payload = f"points={points_line}"
    api_url = f"{YCHARTS_API_URL}/v3/excel/points"
    logger.info(f"Fetching data point for metric '{metric}' (YC Code: {yc_code}) for symbol '{symbol}' with payload: {payload}")

    try:
        response = session.post(api_url, headers=API_HEADERS, data=payload, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        logger.error(f"API error for symbol '{symbol}' on metric '{metric}' (YC Code: {yc_code}): {e}")
        return None

    # The response is keyed by the ticker exactly as it was sent.
    symbol_for_api = f"M:{symbol}" if is_mutual else symbol
    last_point = _latest_datapoint(data, symbol_for_api, yc_code)
    if last_point is None:
        logger.warning(f"No valid data returned for symbol '{symbol}' on metric '{metric}' (YC Code: {yc_code})")
        return None

    value = last_point[1]
    logger.info(f"Fetched {metric} for {symbol}: {value}")
    return value

# -----------------------------
# Other Helper Functions
# -----------------------------
def convert_metric_to_column_name(metric):
    """
    Translate a metric key into its Funds_to_Screen column name.

    The two are currently identical, so the key is handed back untouched.
    """
    column_name = metric
    return column_name

def verify_columns(engine, table_name, expected_columns):
    """
    Compare expected_columns against the live schema of table_name.

    Nothing is raised or returned: a warning listing the absent columns is
    logged, or an info message when none are absent.
    """
    present = [column['name'] for column in sqlalchemy.inspect(engine).get_columns(table_name)]
    absent = [name for name in expected_columns if name not in present]
    if not absent:
        logger.info(f"All expected columns are present in {table_name}.")
        return
    logger.warning(f"Missing columns in {table_name}: {absent}")

def check_and_update_fund_data(engine, symbol, data):
    """
    Write the metric columns for one symbol to Funds_to_Screen.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        symbol: Value matched against SymbolCUSIP.
        data: Bind parameters: 'symbol' plus one entry per name in METRICS,
            and optionally 'ProductName'. The dict is not modified.

    If a row for the symbol exists, only the METRICS columns are updated and
    ProductName is left alone. Otherwise a row is inserted, with ProductName
    falling back to the symbol when the caller supplied none. SQLAlchemy
    errors are logged and rolled back, not raised.

    NOTE(review): every metric column is written on update, so a metric whose
    fetch returned None overwrites any previously stored value with NULL —
    confirm this is intended.
    """
    metric_assignments = ", ".join(f"{col} = :{col}" for col in METRICS)
    query_sql = "SELECT ProductName FROM Funds_to_Screen WHERE SymbolCUSIP = :symbol"
    # Build update SQL for only the metrics columns.
    update_sql = "UPDATE Funds_to_Screen SET " + metric_assignments + " WHERE SymbolCUSIP = :symbol"
    # Build insert SQL that includes SymbolCUSIP, ProductName, and the metrics.
    columns = "SymbolCUSIP, ProductName, " + ", ".join(METRICS)
    placeholders = ":symbol, :ProductName, " + ", ".join(f":{col}" for col in METRICS)
    insert_sql = f"INSERT INTO Funds_to_Screen ({columns}) VALUES ({placeholders})"

    with engine.connect() as conn:
        try:
            existing = conn.execute(sqlalchemy.text(query_sql), {'symbol': symbol}).fetchone()
            if existing:
                # Record exists: update metrics only (do not change ProductName).
                conn.execute(sqlalchemy.text(update_sql), data)
                logger.info(f"Updated data for symbol {symbol}")
            else:
                # Work on a copy so the ProductName default does not leak into
                # the caller's dict.
                insert_params = dict(data)
                if insert_params.get("ProductName") is None:
                    insert_params["ProductName"] = symbol
                conn.execute(sqlalchemy.text(insert_sql), insert_params)
                logger.info(f"Inserted new record for symbol {symbol}")
            conn.commit()
        except sqlalchemy.exc.SQLAlchemyError as e:
            logger.error(f"Database error for {symbol}: {e}")
            conn.rollback()

def call_update_derived_metrics(engine):
    """
    Run the UpdateDerivedMetrics stored procedure and commit its work.

    A SQLAlchemy error is logged and the transaction rolled back rather than
    propagated to the caller.
    """
    with engine.connect() as connection:
        try:
            procedure_call = sqlalchemy.text("EXEC UpdateDerivedMetrics")
            connection.execute(procedure_call)
            connection.commit()
        except sqlalchemy.exc.SQLAlchemyError as db_error:
            logger.error(f"Error executing stored procedure: {db_error}")
            connection.rollback()
        else:
            logger.info("Executed stored procedure UpdateDerivedMetrics.")

def process_symbol(symbol, fund_type_id, engine):
    """
    Fetch every metric for one symbol and persist the result.

    Metrics are requested one at a time, in METRICS order, via
    fetch_data_point(); the values are collected under their column names
    together with the 'symbol' bind parameter and handed to
    check_and_update_fund_data().

    (Optional: Add logic to skip metrics not applicable for a given fund type.)
    """
    logger.info(f"Starting processing for symbol: {symbol}")

    data = {'symbol': symbol}
    data.update(
        (convert_metric_to_column_name(metric), fetch_data_point(symbol, metric, fund_type_id))
        for metric in METRICS
    )

    # Guarantee a bind value for every metric column, even if the column-name
    # conversion produced a different key.
    for expected_key in METRICS:
        if expected_key not in data:
            data[expected_key] = None

    logger.debug(f"Data dictionary for {symbol}: {data}")
    check_and_update_fund_data(engine, symbol, data)
    logger.info(f"Finished processing for symbol: {symbol}")

def main(test_mode=True):
    """
    Fetch all metrics for every symbol and store them in Funds_to_Screen.

    Args:
        test_mode: When true (the default, matching the previous hard-coded
            behavior) only SPY and AGG are processed, both as non-mutual
            funds. When false the symbol list is read with FUND_SYMBOLS_SQL.

    Symbols are processed concurrently, one task per symbol. An exception
    escaping a task is logged with its symbol and does not stop the others.
    The UpdateDerivedMetrics procedure runs afterwards, and the engine is
    disposed even if something raises along the way.
    """
    start_time = time.time()
    logger.info("Starting data fetch and insertion process.")

    engine = sqlalchemy.create_engine(connection_string,
                                      pool_size=20,
                                      max_overflow=50,
                                      pool_timeout=300)
    try:
        # Verify that all expected columns exist in the Funds_to_Screen table.
        expected_columns = ['SymbolCUSIP'] + METRICS
        verify_columns(engine, "Funds_to_Screen", expected_columns)

        if test_mode:
            # For testing, assume SPY and AGG have fund_type_id 1 (non-mutual)
            symbols_list = [("SPY", 1), ("AGG", 1)]
            logger.info("Test mode enabled: Processing only SPY and AGG.")
        else:
            symbols_df = pd.read_sql(FUND_SYMBOLS_SQL, engine)
            symbols_list = symbols_df[['Fund_Symbol', 'Fund_Type_ID']].values.tolist()
            logger.info(f"Processing {len(symbols_list)} symbols from the database.")

        # Process symbols in parallel; remember which future belongs to which
        # symbol so failures can be attributed.
        failed_symbols = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            future_to_symbol = {
                executor.submit(process_symbol, symbol, fund_type_id, engine): symbol
                for symbol, fund_type_id in symbols_list
            }
            for future in concurrent.futures.as_completed(future_to_symbol):
                symbol = future_to_symbol[future]
                try:
                    future.result()
                except Exception as exc:
                    failed_symbols.append(symbol)
                    logger.error(f"Error processing symbol {symbol}: {exc}")

        # Call the stored procedure to update derived metrics.
        call_update_derived_metrics(engine)

        total_time = time.time() - start_time
        if failed_symbols:
            logger.warning(
                f"Processing finished in {total_time:.2f} seconds with "
                f"{len(failed_symbols)} failed symbol(s): {failed_symbols}"
            )
        else:
            # API and database errors handled inside the workers are logged
            # there and do not count as failures here.
            logger.info(f"All metrics processed successfully in {total_time:.2f} seconds")
    finally:
        engine.dispose()

# Run the pipeline only when this cell/script is executed directly, not on import.
if __name__ == "__main__":
    main()


2025-02-10 14:33:12,946 - INFO - Starting data fetch and insertion process.
2025-02-10 14:33:13,147 - INFO - All expected columns are present in Funds_to_Screen.
2025-02-10 14:33:13,148 - INFO - Test mode enabled: Processing only SPY and AGG.
2025-02-10 14:33:13,149 - INFO - Starting processing for symbol: SPY
2025-02-10 14:33:13,204 - INFO - Fetching data point for metric 'LCG' (YC Code: equity_stylebox_large_cap_growth_exposure) for symbol 'SPY' with payload: points=SPY,equity_stylebox_large_cap_growth_exposure
2025-02-10 14:33:13,204 - INFO - Starting processing for symbol: AGG
2025-02-10 14:33:13,207 - INFO - Fetching data point for metric 'LCG' (YC Code: equity_stylebox_large_cap_growth_exposure) for symbol 'AGG' with payload: points=AGG,equity_stylebox_large_cap_growth_exposure
2025-02-10 14:33:13,628 - INFO - Fetched LCG for SPY: 0.16203
2025-02-10 14:33:13,629 - INFO - Fetching data point for metric 'LCB' (YC Code: equity_stylebox_large_cap_blend_exposure) for symbol 'SPY' with

In [None]:
import datetime
import json
import logging
import ssl
import time
from concurrent.futures import ThreadPoolExecutor

import certifi
import pandas as pd
import requests
import sqlalchemy
from dateutil.relativedelta import relativedelta
from requests.adapters import HTTPAdapter
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

# Set up logging (root logger at INFO, timestamped single-line format)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# SQL Server connection configuration with increased pool settings.
# trusted_connection=yes means no username/password appears in the URL;
# TrustServerCertificate=yes accepts the server's certificate without validation.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database?"
    "driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)
# The engine is created at module level, so it exists as soon as this cell runs.
engine = sqlalchemy.create_engine(
    connection_string,
    pool_size=20,
    max_overflow=50,
    pool_timeout=300
)

# YCharts API configuration and headers (sent with every request in fetch_yci_metric).
# SECURITY NOTE(review): the authorization and session values below are
# hard-coded in source. If they are real credentials they should be rotated and
# loaded from the environment or a secrets store instead of being committed.
YCHARTS_API_URL = "https://api.ycharts.com"
API_HEADERS = {
    "X-YCHARTSAUTHORIZATION": "yIIphqbsQysnTvWWxfW33w",  # API key (see security note above)
    "X-YCHARTSEXCELSESSION": "b645cd897b2446bfa3796acfa3a879db",
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    "X-YCHARTSIP": "50.58.50.123",
    "Host": "api.ycharts.com",
    "Connection": "Keep-Alive"
}

# Define the YCI metrics.
# The same names appear as columns / bind parameters in update_fund_in_db's SQL;
# presumably each one is also passed as the `metric` path segment to
# get_api_url() — confirm against the caller.
YCI_METRICS = [
    "index_fund", "inverse_fund", "leveraged_fund", "socially_responsible_fund", 
    "synthetic_replication_fund", "aum_usd", "open_to_existing_investors", 
    "open_to_new_investors", "ycharts_url", "investment_strategy", 
    "related_securities", "fund_family", "fund_of_funds"
]

# SQL query to fetch fund symbols from the Funds_to_Screen table.
# DISTINCT applies to the (SymbolCUSIP, Fund_Type_ID) pair, so a symbol stored
# with more than one Fund_Type_ID comes back as several rows.
FUND_SYMBOLS_SQL = """
SELECT DISTINCT SymbolCUSIP AS Fund_Symbol, Fund_Type_ID
FROM Funds_to_Screen
"""

# Create a custom SSL context to bypass certificate verification (for debugging only).
# SECURITY: with check_hostname off and verify_mode CERT_NONE, connections made
# with this context accept any certificate; the certifi CA bundle is loaded but
# nothing is verified against it.
context = ssl.create_default_context(cafile=certifi.where())
# check_hostname has to be cleared first; the ssl module rejects CERT_NONE
# while hostname checking is still enabled.
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE  # In production, use CERT_REQUIRED

# Create a custom HTTPAdapter using the SSL context
class CustomHTTPAdapter(HTTPAdapter):
    """HTTPAdapter whose connection pools use the module-level SSL `context`."""

    def init_poolmanager(self, *args, **kwargs):
        """Inject the shared SSL context, then defer to the stock pool setup."""
        kwargs.update(ssl_context=context)
        return super().init_poolmanager(*args, **kwargs)

# Create a global requests session with the custom adapter mounted for HTTPS.
# fetch_yci_metric() issues all of its GET requests through this session.
session = requests.Session()
session.mount("https://", CustomHTTPAdapter())

def get_api_url(symbol, fund_type_id, metric):
    """
    Build the YCI info URL for one symbol and metric.

    A fund_type_id of 3 selects the "mutual_funds" endpoint and prefixes the
    symbol with "M:"; any other value selects the "companies" endpoint with
    the symbol unchanged.

    This is pure string formatting that cannot fail transiently, so it is
    deliberately not wrapped in retry logic.
    """
    if fund_type_id == 3:
        symbol_for_api, endpoint = f"M:{symbol}", "mutual_funds"
    else:
        symbol_for_api, endpoint = symbol, "companies"
    return f"{YCHARTS_API_URL}/v3/{endpoint}/{symbol_for_api}/info/{metric}?retrieve_ttl=true"

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    # Only transient transport failures are worth repeating; HTTP error
    # statuses and malformed bodies are returned to the caller immediately.
    retry=retry_if_exception_type(
        (requests.exceptions.ConnectionError, requests.exceptions.Timeout)
    ),
    reraise=True,
)
def _request_yci_json(api_url):
    """GET api_url through the shared session and return the decoded JSON body."""
    response = session.get(api_url, headers=API_HEADERS, timeout=30)
    response.raise_for_status()
    return response.json()


def fetch_yci_metric(symbol, fund_type_id, metric):
    """
    Fetch a single YCI metric value for the given fund symbol.

    Connection errors and timeouts are retried (up to three attempts with
    exponential back-off) before giving up. The retry sits on the HTTP call
    itself because this function swallows exceptions, which would otherwise
    prevent any retry from ever triggering.

    For 'related_securities', a list result is reduced to the 'security_id'
    of each dict entry that has one.

    Returns:
        The metric's "data" value from the response, or None when the request
        ultimately fails or the payload lacks the expected structure.
    """
    api_url = get_api_url(symbol, fund_type_id, metric)
    logger.info(f"Fetching {metric} for {symbol} using URL: {api_url}")
    try:
        data = _request_yci_json(api_url)
        # The response is keyed by the ticker exactly as it appears in the URL.
        response_key = f"M:{symbol}" if fund_type_id == 3 else symbol
        metric_value = data.get("response", {}).get(response_key, {}) \
                           .get("results", {}).get(metric, {}) \
                           .get("data")

        if metric == "related_securities" and isinstance(metric_value, list):
            # Extract only security_id from each related security
            metric_value = [sec.get('security_id') for sec in metric_value if isinstance(sec, dict) and 'security_id' in sec]

        logger.info(f"Fetched {metric} for {symbol}: {metric_value}")
        return metric_value
    except Exception as e:
        # Best-effort by design: one failing metric must not abort the others.
        logger.error(f"Error fetching {metric} for {symbol}: {e}")
        return None

def prepare_db_params(data):
    """
    Prepare the dictionary values for database insertion.

    Args:
        data: Mapping of bind-parameter name to raw value.

    Returns:
        A new dict with the same keys. Lists and dicts, and any non-None
        'related_securities' value, are serialized to JSON strings; all
        other values are passed through unchanged.

    None is always passed through as None so that a metric that could not be
    fetched is stored as SQL NULL rather than the JSON text "null".
    """
    def _encode(key, val):
        if val is None:
            return None
        if key == 'related_securities' or isinstance(val, (list, dict)):
            return json.dumps(val)
        return val

    return {key: _encode(key, val) for key, val in data.items()}

def update_fund_in_db(symbol, data):
    """
    Write one fund's metrics to Funds_to_Screen as an update-or-insert.

    A row whose SymbolCUSIP equals ``symbol`` is updated when it exists;
    otherwise a new row is inserted. ``data`` is passed through
    prepare_db_params first and must supply every bind parameter named in
    the statements below. Any exception is logged and swallowed.
    """
    exists_stmt = "SELECT 1 FROM Funds_to_Screen WHERE SymbolCUSIP = :symbol"
    update_stmt = """
    UPDATE Funds_to_Screen 
    SET index_fund = :index_fund,
        inverse_fund = :inverse_fund,
        leveraged_fund = :leveraged_fund,
        socially_responsible_fund = :socially_responsible_fund,
        synthetic_replication_fund = :synthetic_replication_fund,
        aum_usd = :aum_usd,
        open_to_existing_investors = :open_to_existing_investors,
        open_to_new_investors = :open_to_new_investors,
        ycharts_url = :ycharts_url,
        investment_strategy = :investment_strategy,
        related_securities = :related_securities,
        fund_family = :fund_family,
        fund_of_funds = :fund_of_funds
    WHERE SymbolCUSIP = :symbol
    """
    insert_stmt = """
    INSERT INTO Funds_to_Screen 
    (SymbolCUSIP, index_fund, inverse_fund, leveraged_fund, socially_responsible_fund, synthetic_replication_fund, aum_usd, 
     open_to_existing_investors, open_to_new_investors, ycharts_url, investment_strategy, related_securities, fund_family, fund_of_funds)
    VALUES 
    (:symbol, :index_fund, :inverse_fund, :leveraged_fund, :socially_responsible_fund, :synthetic_replication_fund, :aum_usd, 
     :open_to_existing_investors, :open_to_new_investors, :ycharts_url, :investment_strategy, :related_securities, :fund_family, :fund_of_funds)
    """
    try:
        bound = prepare_db_params(data)
        # engine.begin() commits when the block exits normally and rolls back
        # when it exits with an exception, so no explicit rollback is needed.
        with engine.begin() as conn:
            existing = conn.execute(sqlalchemy.text(exists_stmt), {"symbol": symbol}).fetchone()
            if not existing:
                conn.execute(sqlalchemy.text(insert_stmt), bound)
                logger.info(f"Inserted DB record for {symbol}")
                return
            conn.execute(sqlalchemy.text(update_stmt), bound)
            logger.info(f"Updated DB record for {symbol}")
    except Exception as e:
        logger.error(f"Database error for {symbol}: {e}")

def process_fund(row):
    """
    Fetch every metric in YCI_METRICS for one fund and persist the result.

    ``row`` is a dict carrying 'Fund_Symbol' and 'Fund_Type_ID'. Returns the
    fund's symbol once update_fund_in_db has been called.
    """
    symbol, fund_type_id = row["Fund_Symbol"], row["Fund_Type_ID"]
    # Metrics are fetched one by one, in YCI_METRICS order.
    fetched = {
        metric: fetch_yci_metric(symbol, fund_type_id, metric)
        for metric in YCI_METRICS
    }
    update_fund_in_db(symbol, {"symbol": symbol, **fetched})
    return symbol

def main():
    """
    Run the YCI fetch/update pipeline for every fund returned by
    FUND_SYMBOLS_SQL, using a pool of 20 worker threads.

    Each fund is submitted as its own task so that an unexpected exception in
    one fund is logged and counted as unprocessed instead of aborting the
    whole run (which is what iterating ``executor.map`` results would do).
    """
    overall_start = time.time()
    logger.info("Starting YCI data fetch and update process.")

    funds_df = pd.read_sql(FUND_SYMBOLS_SQL, engine)
    funds = funds_df.to_dict(orient="records")
    total_funds = len(funds)
    logger.info(f"Retrieved {total_funds} funds to process.")

    processed_symbols = []
    # Use a ThreadPoolExecutor with max_workers set to 20
    with ThreadPoolExecutor(max_workers=20) as executor:
        submitted = [(fund, executor.submit(process_fund, fund)) for fund in funds]
        # Results are collected in submission order.
        for fund, future in submitted:
            try:
                processed_symbols.append(future.result())
            except Exception as exc:
                logger.error(f"Unhandled error processing fund {fund.get('Fund_Symbol')}: {exc}")

    elapsed = time.time() - overall_start
    logger.info(f"Processed {len(processed_symbols)} funds in {elapsed:.2f} seconds.")

    if len(processed_symbols) != total_funds:
        logger.warning(f"Some funds might not have been processed. Expected {total_funds}, processed {len(processed_symbols)}")
    else:
        logger.info("🎉 YCI data fetch and update complete!")

if __name__ == "__main__":
    main()

2025-02-11 10:04:35,384 - INFO - Starting YCI data fetch and update process.
2025-02-11 10:04:35,451 - INFO - Retrieved 1060 funds to process.
2025-02-11 10:04:35,453 - INFO - Fetching index_fund for NTSX using URL: https://api.ycharts.com/v3/companies/NTSX/info/index_fund?retrieve_ttl=true
2025-02-11 10:04:35,454 - INFO - Fetching index_fund for HIPS using URL: https://api.ycharts.com/v3/companies/HIPS/info/index_fund?retrieve_ttl=true
2025-02-11 10:04:35,455 - INFO - Fetching index_fund for EAOA using URL: https://api.ycharts.com/v3/companies/EAOA/info/index_fund?retrieve_ttl=true
2025-02-11 10:04:35,456 - INFO - Fetching index_fund for AOM using URL: https://api.ycharts.com/v3/companies/AOM/info/index_fund?retrieve_ttl=true
2025-02-11 10:04:35,458 - INFO - Fetching index_fund for AOK using URL: https://api.ycharts.com/v3/companies/AOK/info/index_fund?retrieve_ttl=true
2025-02-11 10:04:35,475 - INFO - Fetching index_fund for EAOK using URL: https://api.ycharts.com/v3/companies/EAOK/i

In [4]:
# Update manager-tenure metrics, calling the API only for values that are missing from the database

import math
import ssl
import requests
import sqlalchemy
import pandas as pd
import time
import concurrent.futures
from requests.adapters import HTTPAdapter
import logging

# -----------------------------
# Logging Configuration
# -----------------------------
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# -----------------------------
# Database and API Configuration
# -----------------------------
# SQLAlchemy URL for the SQL Server database (pyodbc driver, trusted connection).
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
YCHARTS_API_URL = "https://api.ycharts.com"
# Headers sent with every YCharts request.
# NOTE(review): the authorization and session values are committed in source;
# consider loading them from the environment instead.
API_HEADERS = {
    "X-YCHARTSAUTHORIZATION": "yIIphqbsQysnTvWWxfW33w",  # API key
    "X-YCHARTSEXCELSESSION": "b645cd897b2446bfa3796acfa3a879db",
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    "Content-Type": "application/x-www-form-urlencoded"
}

# -----------------------------
# New Metrics Mapping Dictionary
# -----------------------------
# Maps our new (shortened) column names to their corresponding YCharts function codes.
new_metrics_mapping = {
    "average_manager_tenure": "average_manager_tenure",
    "max_manager_tenure": "maximum_manager_tenure",
    "median_manager_tenure": "median_manager_tenure",
    "min_manager_tenure": "minimum_manager_tenure"
}

# List of new metric (column) names to update
NEW_METRICS = list(new_metrics_mapping.keys())

# SQL to retrieve fund symbols and their Fund_Type_ID from Funds_to_Screen
FUND_SYMBOLS_SQL = """
SELECT DISTINCT SymbolCUSIP AS Fund_Symbol, Fund_Type_ID
FROM Funds_to_Screen
"""

# -----------------------------
# Custom SSL Context and HTTP Adapter
# -----------------------------
# WARNING: the two settings below turn off hostname checking and certificate
# verification for every connection that uses this context.
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE  # For debugging; use CERT_REQUIRED in production

class CustomHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that passes the module-level SSL ``context`` to its pool manager."""

    def init_poolmanager(self, *args, **kwargs):
        # Always use the module-level context, replacing any ssl_context passed in.
        kwargs.update(ssl_context=context)
        return super().init_poolmanager(*args, **kwargs)

# Module-level session used by fetch_new_data_point; https:// requests made
# through it are handled by a CustomHTTPAdapter instance.
session = requests.Session()
session.mount('https://', CustomHTTPAdapter())

# -----------------------------
# Helper Functions for YCP Calls (New Metrics)
# -----------------------------
def build_points_lines_new(symbol, yc_code, is_mutual):
    """
    Return the "<symbol>,<yc_code>" points line for a YCP request.

    When ``is_mutual`` is truthy the symbol is prefixed with "M:";
    otherwise it is used unchanged.
    """
    prefix = "M:" if is_mutual else ""
    return f"{prefix}{symbol},{yc_code}"

def fetch_new_data_point(symbol, metric, fund_type_id):
    """
    Request one new-metric value for ``symbol`` from the excel/points endpoint.

    ``metric`` is translated to its YC code through new_metrics_mapping; an
    unmapped metric is logged and yields None. A fund_type_id of 3 marks a
    mutual fund, whose symbol is prefixed with "M:". Returns the second
    element of the last datapoint when the response has that shape, and None
    otherwise or when the request raises a RequestException.
    """
    yc_code = new_metrics_mapping.get(metric)
    if not yc_code:
        logger.error(f"No YC mapping found for new metric '{metric}'.")
        return None

    is_mutual = fund_type_id == 3
    points_line = build_points_lines_new(symbol, yc_code, is_mutual)
    payload = f"points={points_line}"
    api_url = f"{YCHARTS_API_URL}/v3/excel/points"
    logger.info(f"Fetching new data point for metric '{metric}' (YC Code: {yc_code}) for symbol '{symbol}' with payload: {payload}")
    try:
        reply = session.post(api_url, headers=API_HEADERS, data=payload, timeout=60)
        reply.raise_for_status()
        body = reply.json()
        response_key = f"M:{symbol}" if is_mutual else symbol

        # Locate the per-metric entry, if the response contains one.
        metric_found = False
        if 'response' in body and response_key in body['response']:
            results = body['response'][response_key].get("results", {})
            metric_found = yc_code in results

        if metric_found:
            entry = results[yc_code]
            # Prefer the "data" key; otherwise wrap the nested "results"
            # value found under the empty-string key.
            datapoints = entry.get("data")
            if not datapoints:
                fallback = entry.get("")
                if fallback and "results" in fallback:
                    datapoints = [fallback["results"]]
            if datapoints and isinstance(datapoints, list):
                last_point = datapoints[-1]
                if isinstance(last_point, list) and len(last_point) > 1:
                    value = last_point[1]
                    logger.info(f"Fetched new metric '{metric}' for {symbol}: {value}")
                    return value

        logger.warning(f"No valid data returned for symbol '{symbol}' on new metric '{metric}' (YC Code: {yc_code})")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"API error for symbol '{symbol}' on new metric '{metric}' (YC Code: {yc_code}): {e}")
        return None

# -----------------------------
# Helper Functions for Database Updates (New Metrics)
# -----------------------------
def verify_new_columns(engine, table_name, expected_columns):
    """
    Log whether every name in ``expected_columns`` is a column of ``table_name``.

    Missing names are reported with a warning; nothing is returned or raised
    by the check itself.
    """
    present = {column['name'] for column in sqlalchemy.inspect(engine).get_columns(table_name)}
    missing_columns = [name for name in expected_columns if name not in present]
    if not missing_columns:
        logger.info(f"All expected new columns are present in {table_name}.")
        return
    logger.warning(f"Missing new columns in {table_name}: {missing_columns}")

def check_and_update_new_data(engine, symbol, data):
    """
    Update (or insert) the new metric columns for a given symbol.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        symbol: SymbolCUSIP value identifying the row.
        data: Bind parameters; must contain 'symbol' and every name in
            NEW_METRICS. It is not modified.

    The SQL is built dynamically based solely on NEW_METRICS.
    For new records, a default ProductName is supplied (using the symbol)
    when ``data`` has none. SQLAlchemy errors are logged and rolled back.
    """
    query_sql = "SELECT ProductName FROM Funds_to_Screen WHERE SymbolCUSIP = :symbol"
    update_sql = "UPDATE Funds_to_Screen SET " + ", ".join(f"{col} = :{col}" for col in NEW_METRICS) + " WHERE SymbolCUSIP = :symbol"
    columns = "SymbolCUSIP, ProductName, " + ", ".join(NEW_METRICS)
    placeholders = ":symbol, :ProductName, " + ", ".join(f":{col}" for col in NEW_METRICS)
    insert_sql = f"INSERT INTO Funds_to_Screen ({columns}) VALUES ({placeholders})"

    # Work on a copy so the caller's dict does not gain a ProductName key
    # as a side effect of the insert path.
    params = dict(data)

    with engine.connect() as conn:
        try:
            result = conn.execute(sqlalchemy.text(query_sql), {'symbol': symbol}).fetchone()
            if result:
                conn.execute(sqlalchemy.text(update_sql), params)
                logger.info(f"Updated new metrics for symbol {symbol}")
            else:
                if params.get("ProductName") is None:
                    params["ProductName"] = symbol
                conn.execute(sqlalchemy.text(insert_sql), params)
                logger.info(f"Inserted new record for new metrics for symbol {symbol}")
            conn.commit()
        except sqlalchemy.exc.SQLAlchemyError as e:
            logger.error(f"Database error for symbol {symbol} (new metrics): {e}")
            conn.rollback()

def process_symbol_new(symbol, fund_type_id, engine):
    """
    Process a single symbol for new metrics:
      - First, query the database for existing new metric values.
      - For each new metric, if the column already has a non-NULL value, use it;
        otherwise, call the YCP API to fetch the data.
      - Update (or insert) the new metric values in the database.

    Args:
        symbol: SymbolCUSIP value of the fund.
        fund_type_id: Fund type identifier forwarded to fetch_new_data_point.
        engine: SQLAlchemy engine used for the lookup and the write.
    """
    logger.info(f"Starting new metrics processing for symbol: {symbol}")
    data = {'symbol': symbol}

    # Query existing new metric values
    query = "SELECT " + ", ".join(NEW_METRICS) + " FROM Funds_to_Screen WHERE SymbolCUSIP = :symbol"
    with engine.connect() as conn:
        result = conn.execute(sqlalchemy.text(query), {'symbol': symbol}).fetchone()

    # Look columns up by name through Row._mapping. Indexing the row itself
    # with a string (result["col"]) raises on SQLAlchemy 2.0, where rows
    # index like tuples.
    existing = result._mapping if result is not None else None

    for metric in NEW_METRICS:
        current = existing[metric] if existing is not None else None
        if current is not None:
            data[metric] = current
            logger.info(f"Existing value for {metric} found for {symbol}: {current}. Skipping API call.")
        else:
            data[metric] = fetch_new_data_point(symbol, metric, fund_type_id)

    logger.debug(f"New data dictionary for {symbol}: {data}")
    check_and_update_new_data(engine, symbol, data)
    logger.info(f"Finished new metrics processing for symbol: {symbol}")

def main():
    """
    Fetch and store the new metrics for every symbol in Funds_to_Screen.

    Creates the engine, verifies the expected columns, then runs
    process_symbol_new for each (symbol, fund type) pair on 20 worker
    threads. Per-symbol failures are logged with the symbol name and counted,
    the closing log line reflects whether any occurred, and the engine is
    disposed even if the run is interrupted by an exception.
    """
    start_time = time.time()
    logger.info("Starting new metrics data fetch and insertion process.")

    engine = sqlalchemy.create_engine(connection_string,
                                      pool_size=20,
                                      max_overflow=50,
                                      pool_timeout=300)
    failures = 0
    try:
        # Verify that all expected new columns exist.
        verify_new_columns(engine, "Funds_to_Screen", NEW_METRICS)

        # Retrieve all symbols from the database.
        symbols_df = pd.read_sql(FUND_SYMBOLS_SQL, engine)
        symbols_list = symbols_df[['Fund_Symbol', 'Fund_Type_ID']].values.tolist()
        logger.info(f"Processing new metrics for {len(symbols_list)} symbols from the database.")

        # Process symbols in parallel, remembering which future belongs to
        # which symbol so failures can be attributed.
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            future_to_symbol = {
                executor.submit(process_symbol_new, symbol, fund_type_id, engine): symbol
                for symbol, fund_type_id in symbols_list
            }
            for future in concurrent.futures.as_completed(future_to_symbol):
                try:
                    future.result()
                except Exception as exc:
                    failures += 1
                    logger.error(f"Error processing symbol {future_to_symbol[future]} for new metrics: {exc}")
    finally:
        engine.dispose()

    total_time = time.time() - start_time
    if failures:
        logger.warning(f"New metrics processing finished with {failures} failed symbols in {total_time:.2f} seconds")
    else:
        logger.info(f"All new metrics processed successfully in {total_time:.2f} seconds")

if __name__ == "__main__":
    main()

2025-02-11 10:22:40,266 - INFO - Starting new metrics data fetch and insertion process.
2025-02-11 10:22:40,299 - INFO - All expected new columns are present in Funds_to_Screen.
2025-02-11 10:22:40,305 - INFO - Processing new metrics for 1060 symbols from the database.
2025-02-11 10:22:40,306 - INFO - Starting new metrics processing for symbol: NTSX
2025-02-11 10:22:40,310 - INFO - Starting new metrics processing for symbol: HIPS
2025-02-11 10:22:40,313 - INFO - Starting new metrics processing for symbol: EAOA
2025-02-11 10:22:40,313 - INFO - Starting new metrics processing for symbol: AOM
2025-02-11 10:22:40,316 - INFO - Starting new metrics processing for symbol: AOK
2025-02-11 10:22:40,318 - INFO - Starting new metrics processing for symbol: EAOK
2025-02-11 10:22:40,320 - INFO - Starting new metrics processing for symbol: EAOM
2025-02-11 10:22:40,322 - INFO - Starting new metrics processing for symbol: INCM
2025-02-11 10:22:40,322 - INFO - Starting new metrics processing for symbol:

In [5]:
# Update manager-tenure metrics, calling the API only for values that are missing from the database

import math
import ssl
import requests
import sqlalchemy
import pandas as pd
import time
import concurrent.futures
from requests.adapters import HTTPAdapter
import logging

# -----------------------------
# Logging Configuration
# -----------------------------
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# -----------------------------
# Database and API Configuration
# -----------------------------
# SQLAlchemy URL for the SQL Server database (pyodbc driver, trusted connection).
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
YCHARTS_API_URL = "https://api.ycharts.com"
# Headers sent with every YCharts request.
# NOTE(review): the authorization and session values are committed in source;
# consider loading them from the environment instead.
API_HEADERS = {
    "X-YCHARTSAUTHORIZATION": "yIIphqbsQysnTvWWxfW33w",  # API key
    "X-YCHARTSEXCELSESSION": "b645cd897b2446bfa3796acfa3a879db",
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    "Content-Type": "application/x-www-form-urlencoded"
}

# -----------------------------
# New Metrics Mapping Dictionary
# -----------------------------
# Maps our new (shortened) column names to their corresponding YCharts function codes.
new_metrics_mapping = {
    "average_manager_tenure": "average_manager_tenure",
    "max_manager_tenure": "maximum_manager_tenure",
    "median_manager_tenure": "median_manager_tenure",
    "min_manager_tenure": "minimum_manager_tenure"
}

# List of new metric (column) names to update
NEW_METRICS = list(new_metrics_mapping.keys())

# SQL to retrieve fund symbols and their Fund_Type_ID from Funds_to_Screen
FUND_SYMBOLS_SQL = """
SELECT DISTINCT SymbolCUSIP AS Fund_Symbol, Fund_Type_ID
FROM Funds_to_Screen
"""

# -----------------------------
# Custom SSL Context and HTTP Adapter
# -----------------------------
# WARNING: the two settings below turn off hostname checking and certificate
# verification for every connection that uses this context.
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE  # For debugging; use CERT_REQUIRED in production

class CustomHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that passes the module-level SSL ``context`` to its pool manager."""

    def init_poolmanager(self, *args, **kwargs):
        # Always use the module-level context, replacing any ssl_context passed in.
        kwargs.update(ssl_context=context)
        return super().init_poolmanager(*args, **kwargs)

# Module-level session used by fetch_new_data_point; https:// requests made
# through it are handled by a CustomHTTPAdapter instance.
session = requests.Session()
session.mount('https://', CustomHTTPAdapter())

# -----------------------------
# Helper Functions for YCP Calls (New Metrics)
# -----------------------------
def build_points_lines_new(symbol, yc_code, is_mutual):
    """
    Return the "<symbol>,<yc_code>" points line for a YCP request.

    When ``is_mutual`` is truthy the symbol is prefixed with "M:";
    otherwise it is used unchanged.
    """
    prefix = "M:" if is_mutual else ""
    return f"{prefix}{symbol},{yc_code}"

def fetch_new_data_point(symbol, metric, fund_type_id):
    """
    Request one new-metric value for ``symbol`` from the excel/points endpoint.

    ``metric`` is translated to its YC code through new_metrics_mapping; an
    unmapped metric is logged and yields None. A fund_type_id of 3 marks a
    mutual fund, whose symbol is prefixed with "M:". Returns the second
    element of the last datapoint when the response has that shape, and None
    otherwise or when the request raises a RequestException.
    """
    yc_code = new_metrics_mapping.get(metric)
    if not yc_code:
        logger.error(f"No YC mapping found for new metric '{metric}'.")
        return None

    is_mutual = fund_type_id == 3
    points_line = build_points_lines_new(symbol, yc_code, is_mutual)
    payload = f"points={points_line}"
    api_url = f"{YCHARTS_API_URL}/v3/excel/points"
    logger.info(f"Fetching new data point for metric '{metric}' (YC Code: {yc_code}) for symbol '{symbol}' with payload: {payload}")
    try:
        reply = session.post(api_url, headers=API_HEADERS, data=payload, timeout=60)
        reply.raise_for_status()
        body = reply.json()
        response_key = f"M:{symbol}" if is_mutual else symbol

        # Locate the per-metric entry, if the response contains one.
        metric_found = False
        if 'response' in body and response_key in body['response']:
            results = body['response'][response_key].get("results", {})
            metric_found = yc_code in results

        if metric_found:
            entry = results[yc_code]
            # Prefer the "data" key; otherwise wrap the nested "results"
            # value found under the empty-string key.
            datapoints = entry.get("data")
            if not datapoints:
                fallback = entry.get("")
                if fallback and "results" in fallback:
                    datapoints = [fallback["results"]]
            if datapoints and isinstance(datapoints, list):
                last_point = datapoints[-1]
                if isinstance(last_point, list) and len(last_point) > 1:
                    value = last_point[1]
                    logger.info(f"Fetched new metric '{metric}' for {symbol}: {value}")
                    return value

        logger.warning(f"No valid data returned for symbol '{symbol}' on new metric '{metric}' (YC Code: {yc_code})")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"API error for symbol '{symbol}' on new metric '{metric}' (YC Code: {yc_code}): {e}")
        return None

# -----------------------------
# Helper Functions for Database Updates (New Metrics)
# -----------------------------
def verify_new_columns(engine, table_name, expected_columns):
    """
    Log whether every name in ``expected_columns`` is a column of ``table_name``.

    Missing names are reported with a warning; nothing is returned or raised
    by the check itself.
    """
    present = {column['name'] for column in sqlalchemy.inspect(engine).get_columns(table_name)}
    missing_columns = [name for name in expected_columns if name not in present]
    if not missing_columns:
        logger.info(f"All expected new columns are present in {table_name}.")
        return
    logger.warning(f"Missing new columns in {table_name}: {missing_columns}")

def check_and_update_new_data(engine, symbol, data):
    """
    Update (or insert) the new metric columns for a given symbol.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        symbol: SymbolCUSIP value identifying the row.
        data: Bind parameters; must contain 'symbol' and every name in
            NEW_METRICS. It is not modified.

    The SQL is built dynamically based solely on NEW_METRICS.
    For new records, a default ProductName is supplied (using the symbol)
    when ``data`` has none. SQLAlchemy errors are logged and rolled back.
    """
    query_sql = "SELECT ProductName FROM Funds_to_Screen WHERE SymbolCUSIP = :symbol"
    update_sql = "UPDATE Funds_to_Screen SET " + ", ".join(f"{col} = :{col}" for col in NEW_METRICS) + " WHERE SymbolCUSIP = :symbol"
    columns = "SymbolCUSIP, ProductName, " + ", ".join(NEW_METRICS)
    placeholders = ":symbol, :ProductName, " + ", ".join(f":{col}" for col in NEW_METRICS)
    insert_sql = f"INSERT INTO Funds_to_Screen ({columns}) VALUES ({placeholders})"

    # Work on a copy so the caller's dict does not gain a ProductName key
    # as a side effect of the insert path.
    params = dict(data)

    with engine.connect() as conn:
        try:
            result = conn.execute(sqlalchemy.text(query_sql), {'symbol': symbol}).fetchone()
            if result:
                conn.execute(sqlalchemy.text(update_sql), params)
                logger.info(f"Updated new metrics for symbol {symbol}")
            else:
                if params.get("ProductName") is None:
                    params["ProductName"] = symbol
                conn.execute(sqlalchemy.text(insert_sql), params)
                logger.info(f"Inserted new record for new metrics for symbol {symbol}")
            conn.commit()
        except sqlalchemy.exc.SQLAlchemyError as e:
            logger.error(f"Database error for symbol {symbol} (new metrics): {e}")
            conn.rollback()

def process_symbol_new(symbol, fund_type_id, engine):
    """
    Process a single symbol for new metrics:
      - First, query the database for existing new metric values.
      - For each new metric, if the column already has a non-NULL value, use it;
        otherwise, call the YCP API to fetch the data.
      - Update (or insert) the new metric values in the database.

    Args:
        symbol: SymbolCUSIP value of the fund.
        fund_type_id: Fund type identifier forwarded to fetch_new_data_point.
        engine: SQLAlchemy engine used for the lookup and the write.
    """
    logger.info(f"Starting new metrics processing for symbol: {symbol}")
    data = {'symbol': symbol}

    # Query existing new metric values
    query = "SELECT " + ", ".join(NEW_METRICS) + " FROM Funds_to_Screen WHERE SymbolCUSIP = :symbol"
    with engine.connect() as conn:
        result = conn.execute(sqlalchemy.text(query), {'symbol': symbol}).fetchone()

    # Look columns up by name through Row._mapping. Indexing the row itself
    # with a string (result["col"]) raises on SQLAlchemy 2.0, where rows
    # index like tuples.
    existing = result._mapping if result is not None else None

    for metric in NEW_METRICS:
        current = existing[metric] if existing is not None else None
        if current is not None:
            data[metric] = current
            logger.info(f"Existing value for {metric} found for {symbol}: {current}. Skipping API call.")
        else:
            data[metric] = fetch_new_data_point(symbol, metric, fund_type_id)

    logger.debug(f"New data dictionary for {symbol}: {data}")
    check_and_update_new_data(engine, symbol, data)
    logger.info(f"Finished new metrics processing for symbol: {symbol}")

def main():
    """
    Fetch and store the new metrics for every symbol in Funds_to_Screen.

    Creates the engine, verifies the expected columns, then runs
    process_symbol_new for each (symbol, fund type) pair on 20 worker
    threads. Per-symbol failures are logged with the symbol name and counted,
    the closing log line reflects whether any occurred, and the engine is
    disposed even if the run is interrupted by an exception.
    """
    start_time = time.time()
    logger.info("Starting new metrics data fetch and insertion process.")

    engine = sqlalchemy.create_engine(connection_string,
                                      pool_size=20,
                                      max_overflow=50,
                                      pool_timeout=300)
    failures = 0
    try:
        # Verify that all expected new columns exist.
        verify_new_columns(engine, "Funds_to_Screen", NEW_METRICS)

        # Retrieve all symbols from the database.
        symbols_df = pd.read_sql(FUND_SYMBOLS_SQL, engine)
        symbols_list = symbols_df[['Fund_Symbol', 'Fund_Type_ID']].values.tolist()
        logger.info(f"Processing new metrics for {len(symbols_list)} symbols from the database.")

        # Process symbols in parallel, remembering which future belongs to
        # which symbol so failures can be attributed.
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            future_to_symbol = {
                executor.submit(process_symbol_new, symbol, fund_type_id, engine): symbol
                for symbol, fund_type_id in symbols_list
            }
            for future in concurrent.futures.as_completed(future_to_symbol):
                try:
                    future.result()
                except Exception as exc:
                    failures += 1
                    logger.error(f"Error processing symbol {future_to_symbol[future]} for new metrics: {exc}")
    finally:
        engine.dispose()

    total_time = time.time() - start_time
    if failures:
        logger.warning(f"New metrics processing finished with {failures} failed symbols in {total_time:.2f} seconds")
    else:
        logger.info(f"All new metrics processed successfully in {total_time:.2f} seconds")

if __name__ == "__main__":
    main()

2025-02-11 10:28:33,761 - INFO - Starting new metrics data fetch and insertion process.
2025-02-11 10:28:33,786 - INFO - All expected new columns are present in Funds_to_Screen.
2025-02-11 10:28:33,794 - INFO - Processing new metrics for 1060 symbols from the database.
2025-02-11 10:28:33,796 - INFO - Starting new metrics processing for symbol: NTSX
2025-02-11 10:28:33,799 - INFO - Starting new metrics processing for symbol: HIPS
2025-02-11 10:28:33,800 - INFO - Starting new metrics processing for symbol: EAOA
2025-02-11 10:28:33,802 - INFO - Starting new metrics processing for symbol: AOM
2025-02-11 10:28:33,803 - INFO - Starting new metrics processing for symbol: AOK
2025-02-11 10:28:33,806 - INFO - Starting new metrics processing for symbol: EAOK
2025-02-11 10:28:33,809 - INFO - Starting new metrics processing for symbol: EAOM
2025-02-11 10:28:33,811 - INFO - Starting new metrics processing for symbol: INCM
2025-02-11 10:28:33,812 - INFO - Starting new metrics processing for symbol:

In [None]:
import math
import ssl
import requests
import sqlalchemy
import pandas as pd
import time
import concurrent.futures
from requests.adapters import HTTPAdapter
import logging
import os

# Configure logging: INFO level with timestamp, level name and message
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Database connection details: SQLAlchemy URL for SQL Server via pyodbc
DB_CONNECTION_STRING = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

# YCharts API configuration
# NOTE(review): the fallback API key and the session value are committed in
# source; consider requiring the environment variable instead.
YCHARTS_API_URL = "https://api.ycharts.com"
API_HEADERS = {
    "X-YCHARTSAUTHORIZATION": os.getenv("YCHARTS_API_KEY", "yIIphqbsQysnTvWWxfW33w"),  # YCHARTS_API_KEY env var, falling back to the hardcoded key
    "X-YCHARTSEXCELSESSION": "b645cd897b2446bfa3796acfa3a879db",
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    "Content-Type": "application/x-www-form-urlencoded"
}

# Metrics to fetch from YCharts and insert into the database:
# database column name -> YCharts function code
METRICS_MAPPING = {
    "average_manager_tenure": "average_manager_tenure",
    "max_manager_tenure": "maximum_manager_tenure",
    "median_manager_tenure": "median_manager_tenure",
    "min_manager_tenure": "minimum_manager_tenure"
}

# SQL query returning each distinct symbol with its Fund_Type_ID
GET_SYMBOLS_SQL = """
SELECT DISTINCT SymbolCUSIP AS Fund_Symbol, Fund_Type_ID
FROM Funds_to_Screen
"""

# Custom SSL Context for HTTPS requests - Revert for debugging
def create_ssl_context():
    """
    Return a default SSL context with hostname checking and certificate
    verification both switched off (debugging configuration).
    """
    ctx = ssl.create_default_context()
    # check_hostname is cleared first, then verification is disabled.
    ctx.check_hostname, ctx.verify_mode = False, ssl.CERT_NONE
    return ctx

# Custom HTTP Adapter for requests
class CustomHTTPAdapter(HTTPAdapter):
    """HTTPAdapter whose pool manager uses a context from create_ssl_context()."""

    def init_poolmanager(self, *args, **kwargs):
        # A new context is built each time the pool manager is initialised,
        # replacing any ssl_context passed in.
        kwargs.update(ssl_context=create_ssl_context())
        return super().init_poolmanager(*args, **kwargs)

# Function to build the API call payload
def build_api_payload(symbol, metric, is_mutual):
    """
    Return the form body "points=<symbol>,<metric>" for a points request.

    The symbol is prefixed with "M:" when ``is_mutual`` is truthy.
    """
    prefix = "M:" if is_mutual else ""
    return f"points={prefix}{symbol},{metric}"

# Fetch data from YCharts API
def fetch_metric_from_api(symbol, metric, fund_type_id):
    """
    Fetch one metric value for ``symbol`` from the YCharts excel/points endpoint.

    Args:
        symbol: Fund symbol as stored in the database.
        metric: Key of METRICS_MAPPING naming the metric to fetch.
        fund_type_id: Fund type identifier; 3 marks a mutual fund, whose
            symbol is sent with an "M:" prefix.

    Returns:
        The value found in the response: a direct number, the second element
        of the last point, or the last element itself when it is not a
        list/tuple. Returns None when the metric is unmapped, the request
        fails, or the response has no usable value.

    The per-call session is closed before returning so that its connection
    pool is not leaked on every metric request.
    """
    is_mutual = (fund_type_id == 3)
    yc_code = METRICS_MAPPING.get(metric)
    if not yc_code:
        logger.error(f"No YC mapping found for metric '{metric}'.")
        return None

    payload = build_api_payload(symbol, yc_code, is_mutual)
    api_url = f"{YCHARTS_API_URL}/v3/excel/points"
    logger.info(f"Fetching metric '{metric}' for symbol '{symbol}' with payload: {payload}")

    try:
        # Session is a context manager; leaving the block closes its adapters.
        with requests.Session() as session:
            session.mount('https://', CustomHTTPAdapter())
            response = session.post(api_url, headers=API_HEADERS, data=payload, timeout=60)
            response.raise_for_status()
            data = response.json()
    except requests.RequestException as e:
        logger.error(f"API error for symbol '{symbol}' on metric '{metric}': {e}")
        return None

    symbol_for_api = f"M:{symbol}" if is_mutual else symbol
    if 'response' in data and symbol_for_api in data['response']:
        results = data['response'][symbol_for_api].get("results", {})
        if yc_code in results:
            # The value is nested under an empty-string key; treat a null
            # entry the same as a missing one.
            nested_results = results[yc_code].get("") or {}
            if "results" not in nested_results:
                logger.warning(f"Unusual results structure for symbol '{symbol}' on metric '{metric}'")
            else:
                datapoints = nested_results["results"]
                if isinstance(datapoints, (float, int)):  # Check if it's a direct float or int
                    logger.info(f"Direct value returned for symbol '{symbol}' on metric '{metric}': {datapoints}")
                    return datapoints  # Return the float or int directly
                elif datapoints and isinstance(datapoints, list):
                    last_point = datapoints[-1]
                    if isinstance(last_point, (list, tuple)):
                        # Guard against a point too short to carry a value.
                        if len(last_point) > 1:
                            return last_point[1]  # last point's value
                        logger.warning(f"Last datapoint for symbol '{symbol}' on metric '{metric}' has no value element")
                    else:
                        logger.warning(f"Unexpected data format for symbol '{symbol}' on metric '{metric}'. Got: {type(last_point)}")
                        return last_point  # Return the element itself when it is not a list/tuple
                else:
                    logger.warning(f"Datapoints for symbol '{symbol}' on metric '{metric}' are empty or not a list")
    logger.warning(f"No valid data returned for symbol '{symbol}' on metric '{metric}'")
    return None

# Update or insert metrics into the database
def update_db(engine, symbol, data):
    """Write one symbol's metric values to ``Funds_to_Screen`` (update or insert).

    A row is looked up by ``SymbolCUSIP``. When it exists, every column named
    in ``METRICS_MAPPING`` is updated; otherwise a new row is inserted.
    SQLAlchemy errors are logged and the transaction is rolled back; nothing
    is raised to the caller.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        symbol: Value matched against / stored in ``SymbolCUSIP``.
        data: Bind parameters for the statements: ``"symbol"`` plus one entry
            per ``METRICS_MAPPING`` key.
    """
    metric_columns = list(METRICS_MAPPING.keys())
    set_clause = ', '.join(f"{col} = :{col}" for col in metric_columns)
    column_list = ', '.join(metric_columns)
    bind_list = ', '.join(':' + col for col in metric_columns)

    update_sql = f"""
    UPDATE Funds_to_Screen 
    SET {set_clause}
    WHERE SymbolCUSIP = :symbol
    """
    insert_sql = f"""
    INSERT INTO Funds_to_Screen (SymbolCUSIP, {column_list})
    VALUES (:symbol, {bind_list})
    """
    exists_sql = "SELECT 1 FROM Funds_to_Screen WHERE SymbolCUSIP = :symbol"

    with engine.connect() as conn:
        try:
            existing_row = conn.execute(sqlalchemy.text(exists_sql), {"symbol": symbol}).fetchone()
            if not existing_row:
                conn.execute(sqlalchemy.text(insert_sql), data)
                logger.info(f"Inserted metrics for symbol {symbol}")
            else:
                conn.execute(sqlalchemy.text(update_sql), data)
                logger.info(f"Updated metrics for symbol {symbol}")
            conn.commit()
        except sqlalchemy.exc.SQLAlchemyError as e:
            # Best effort: report the failure and undo the partial work.
            logger.error(f"Database error for symbol {symbol}: {e}")
            conn.rollback()

# Process one symbol
def process_symbol(symbol, fund_type_id, engine):
    """Fetch every mapped metric for one symbol and store the results.

    Each key of ``METRICS_MAPPING`` is requested through
    ``fetch_metric_from_api``; the reduced values are handed to ``update_db``
    together with the symbol.

    Args:
        symbol: Fund symbol to process.
        fund_type_id: Passed through unchanged to ``fetch_metric_from_api``.
        engine: SQLAlchemy engine forwarded to ``update_db``.
    """
    logger.info(f"Processing symbol: {symbol}")

    def _as_number(value):
        # Numbers pass through; a non-empty list/tuple contributes its last
        # element when that element is numeric; everything else becomes None.
        if isinstance(value, (float, int)):
            return value
        if isinstance(value, (list, tuple)) and len(value) > 0:
            tail = value[-1]
            if isinstance(tail, (float, int)):
                return tail
        return None

    data = {'symbol': symbol}
    for metric in METRICS_MAPPING.keys():
        fetched = fetch_metric_from_api(symbol, metric, fund_type_id)
        data[metric] = _as_number(fetched)
    update_db(engine, symbol, data)

def main():
    """Refresh metrics for every symbol returned by ``GET_SYMBOLS_SQL``.

    Reads the (symbol, fund type id) pairs from the database, processes them
    on a pool of 10 worker threads via ``process_symbol`` and logs the total
    run time. A failure in one symbol is logged with that symbol's name and
    does not stop the others. The engine is disposed even when the run aborts.
    """
    start_time = time.time()
    logger.info("Starting metrics update process.")

    engine = sqlalchemy.create_engine(
        DB_CONNECTION_STRING,
        pool_size=20,
        max_overflow=50,
        pool_timeout=300
    )

    try:
        symbols_df = pd.read_sql(GET_SYMBOLS_SQL, engine)
        symbols_list = symbols_df.values.tolist()
        logger.info(f"Processing {len(symbols_list)} symbols.")

        # Reduced max_workers for potential rate limit compliance
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            # Remember which symbol each future belongs to so that a failure
            # can be attributed to it in the log.
            future_to_symbol = {
                executor.submit(process_symbol, symbol, fund_type_id, engine): symbol
                for symbol, fund_type_id in symbols_list
            }

            for future in concurrent.futures.as_completed(future_to_symbol):
                symbol = future_to_symbol[future]
                try:
                    future.result()
                except Exception as exc:
                    logger.error(f"Error processing symbol {symbol}: {exc}")

        total_time = time.time() - start_time
        logger.info(f"Finished metrics update in {total_time:.2f} seconds")
    finally:
        # Release pooled connections even if reading symbols or the pool failed.
        engine.dispose()

if __name__ == "__main__":
    main()

2025-02-11 17:24:04,071 - INFO - Starting metrics update process.
2025-02-11 17:24:04,142 - INFO - Processing 1060 symbols.
2025-02-11 17:24:04,143 - INFO - Processing symbol: NTSX
2025-02-11 17:24:04,145 - INFO - Fetching metric 'average_manager_tenure' for symbol 'NTSX' with payload: points=NTSX,average_manager_tenure
2025-02-11 17:24:04,145 - INFO - Processing symbol: HIPS
2025-02-11 17:24:04,163 - INFO - Processing symbol: EAOA
2025-02-11 17:24:04,164 - INFO - Fetching metric 'average_manager_tenure' for symbol 'HIPS' with payload: points=HIPS,average_manager_tenure
2025-02-11 17:24:04,166 - INFO - Processing symbol: AOM
2025-02-11 17:24:04,167 - INFO - Fetching metric 'average_manager_tenure' for symbol 'EAOA' with payload: points=EAOA,average_manager_tenure
2025-02-11 17:24:04,167 - INFO - Processing symbol: AOK
2025-02-11 17:24:04,167 - INFO - Processing symbol: EAOK
2025-02-11 17:24:04,230 - INFO - Fetching metric 'average_manager_tenure' for symbol 'EAOK' with payload: points=

In [6]:
#test of NLP in investment_strategy to see if it can be used to classify funds

import pandas as pd
from sqlalchemy import create_engine
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

# Connection string
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes"
    "&TrustServerCertificate=yes"
)

# Create SQLAlchemy engine
engine = create_engine(connection_string)

# SQL query to fetch data based on specified conditions
query = """
SELECT TOP 100 FundID, investment_strategy
FROM Funds_to_Screen
WHERE CWA_Broad_Category_ID = 28 
  AND YC_Global_Category_ID = 55 
  AND YC_Category_ID = 54
"""

# Fetch data from database
data = pd.read_sql(query, engine)

# Basic cleaning function
def basic_clean(text):
    """Reduce a strategy description to its lower-cased content words.

    Missing values (anything ``pd.isna`` accepts) yield an empty string.
    Otherwise the text is lower-cased and tokenized, tokens that are not
    purely alphanumeric or that are English stop words are dropped, and the
    remainder is joined with single spaces.
    """
    if pd.isna(text):
        return ""
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token.isalnum() and token not in english_stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Apply cleaning
data['cleaned_strategy'] = data['investment_strategy'].apply(basic_clean)

# Define categories and keywords
# Keyword lists per category; 'Other' has none and serves as the fallback label
categories = dict(
    Growth=['growth', 'expand', 'potential'],
    Value=['value', 'undervalued', 'dividend'],
    Other=[],
)


def categorize_strategy(text):
    """Return the first category with a keyword that occurs in ``text``.

    Categories are tried in the order they appear in ``categories`` and the
    keyword test is a plain substring check. ``'Other'`` is returned when no
    keyword of any category is found.
    """
    matching_categories = (
        label
        for label, terms in categories.items()
        if any(term in text for term in terms)
    )
    return next(matching_categories, 'Other')

# Apply categorization
data['Category'] = data['cleaned_strategy'].apply(categorize_strategy)

# Output results to Excel
output_file = 'fund_categories.xlsx'
data.to_excel(output_file, index=False)
print(f"Results have been written to {output_file}")

# Optionally, display the results in the console
print(data[['FundID', 'investment_strategy', 'cleaned_strategy', 'Category']])


Results have been written to fund_categories.xlsx
    FundID                                investment_strategy  \
0      292  The investment seeks long-term capital appreci...   
1      324  The investment seeks current income, growth of...   
2      325  The investment seeks long-term growth of capit...   
3      327  The investment seeks to track the price and yi...   
4      328  The investment seeks to track the price and yi...   
5      329  The investment seeks to track the investment r...   
6      330  The investment seeks to track the performance ...   
7      331  The investment seeks to track the investment r...   
8      332  The investment seeks to track the investment r...   
9      333  The investment seeks to track the investment r...   
10     334  The investment seeks to track the investment r...   
11     335  The investment seeks long-term growth of capit...   
12     336  The investment seeks to provide investment ret...   
13     337  The investment seeks to prov

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JulianHeron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JulianHeron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\JulianHeron\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [14]:
pip install --upgrade pip


Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 1.8/1.8 MB 25.1 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed pip-25.0.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from sqlalchemy import create_engine
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

# Connection details
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database"
    "?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes"
    "&TrustServerCertificate=yes"
)

engine = create_engine(connection_string)

query = """
SELECT FundID, investment_strategy
FROM Funds_to_Screen
WHERE CWA_Broad_Category_ID = 28 
  AND YC_Global_Category_ID = 55 
  AND YC_Category_ID = 54
"""

data = pd.read_sql(query, engine)

def basic_clean(text):
    """Tokenize a strategy description into its content words.

    Returns the lower-cased tokens that are alphanumeric and not English stop
    words, as a list. Note that a missing value (per ``pd.isna``) returns an
    empty *string*, not an empty list.
    """
    if pd.isna(text):
        return ""
    ignored = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return list(filter(lambda tok: tok.isalnum() and tok not in ignored, tokens))

# Apply cleaning and get key phrases
data['key_phrases'] = data['investment_strategy'].apply(basic_clean)

# Here we're not categorizing explicitly but showing key phrases instead
# For actual categorization without predefined keywords, you'd use more advanced NLP:
# - Key phrase extraction algorithms
# - Topic modeling (LDA)
# - Named Entity Recognition (NER)

# Output to Excel
output_file = 'fund_key_phrases.xlsx'
data.to_excel(output_file, index=False)
print(f"Results have been written to {output_file}")

# Display results
print(data[['FundID', 'investment_strategy', 'key_phrases']])

Results have been written to fund_key_phrases.xlsx
    FundID                                investment_strategy  \
0      292  The investment seeks long-term capital appreci...   
1      324  The investment seeks current income, growth of...   
2      325  The investment seeks long-term growth of capit...   
3      327  The investment seeks to track the price and yi...   
4      328  The investment seeks to track the price and yi...   
5      329  The investment seeks to track the investment r...   
6      330  The investment seeks to track the performance ...   
7      331  The investment seeks to track the investment r...   
8      332  The investment seeks to track the investment r...   
9      333  The investment seeks to track the investment r...   
10     334  The investment seeks to track the investment r...   
11     335  The investment seeks long-term growth of capit...   
12     336  The investment seeks to provide investment ret...   
13     337  The investment seeks to pro

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JulianHeron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JulianHeron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
pip list | grep "spacy\|gensim"

Note: you may need to restart the kernel to use updated packages.


'grep' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
import requests
import sqlalchemy
import pandas as pd
import logging
from tenacity import retry, stop_after_attempt, wait_exponential

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# SQL Server connection configuration
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database?"
    "driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = sqlalchemy.create_engine(connection_string)

# YCharts API configuration and headers
# Base URL that the endpoint paths built in this cell are appended to.
YCHARTS_API_URL = "https://api.ycharts.com"
# Headers sent with every request made through requests.get in this cell.
# NOTE(review): the authorization and session values below are hard-coded in
# the notebook and look like live credentials - consider loading them from the
# environment and rotating them if this file has been shared.
API_HEADERS = {
    "X-YCHARTSAUTHORIZATION": "yIIphqbsQysnTvWWxfW33w",  # Replace with your actual API key
    "X-YCHARTSEXCELSESSION": "b645cd897b2446bfa3796acfa3a879db",
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    "X-YCHARTSIP": "50.58.50.123",
    "Host": "api.ycharts.com",
    "Connection": "Keep-Alive"
}

def get_api_url(symbol, fund_type_id, metric):
    """
    Build the YCharts v3 "info" URL for one security and metric.

    Fund type 3 is addressed through the ``mutual_funds`` endpoint with an
    ``M:`` prefix on the symbol; every other fund type uses the ``companies``
    endpoint with the bare symbol.

    This is pure string formatting and cannot fail transiently, so it is not
    wrapped in a retry decorator.

    Args:
        symbol: Security symbol as stored in the database.
        fund_type_id: Fund type identifier; only the value 3 is special-cased.
        metric: Name of the info metric to request.

    Returns:
        The full request URL, including the ``retrieve_ttl=true`` query flag.
    """
    if fund_type_id == 3:
        endpoint, symbol_for_api = "mutual_funds", f"M:{symbol}"
    else:
        endpoint, symbol_for_api = "companies", symbol
    return f"{YCHARTS_API_URL}/v3/{endpoint}/{symbol_for_api}/info/{metric}?retrieve_ttl=true"

def _is_transient_failure(retry_state):
    """Tenacity retry predicate: retry only failures that may succeed later.

    Connection errors, timeouts, HTTP 429 and HTTP 5xx are retried. Other
    errors (for example a 404 for an unknown symbol) are not, because
    repeating the request cannot change the outcome.
    """
    outcome = retry_state.outcome
    if outcome is None or not outcome.failed:
        return False
    exc = outcome.exception()
    if isinstance(exc, (requests.ConnectionError, requests.Timeout)):
        return True
    if isinstance(exc, requests.HTTPError) and exc.response is not None:
        status = exc.response.status_code
        return status == 429 or status >= 500
    return False


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=_is_transient_failure,
    reraise=True,
)
def _get_json(api_url):
    """GET ``api_url`` with the shared headers and return the decoded JSON body.

    Raises on HTTP error statuses so that the retry decorator can see them.
    """
    response = requests.get(api_url, headers=API_HEADERS, timeout=30)
    response.raise_for_status()
    return response.json()


def fetch_yci_metric(symbol, fund_type_id, metric):
    """
    Fetch a single YCI metric value for the given fund symbol.

    The retry policy lives on ``_get_json``. Previously the decorator wrapped
    this function, which catches every exception itself, so a retry could
    never be triggered.

    Args:
        symbol: Security symbol as stored in the database.
        fund_type_id: Fund type identifier (3 selects the ``M:`` prefixed key).
        metric: Name of the info metric to request.

    Returns:
        The ``data`` field of the metric's result, or ``None`` when the request
        fails, the response has an unexpected shape, or the field is absent.
    """
    api_url = get_api_url(symbol, fund_type_id, metric)
    logger.info(f"Fetching {metric} for {symbol} using URL: {api_url}")
    try:
        data = _get_json(api_url)
        # The response is keyed by the same symbol form that was put in the URL.
        response_key = f"M:{symbol}" if fund_type_id == 3 else symbol
        return data.get("response", {}).get(response_key, {}) \
                   .get("results", {}).get(metric, {}) \
                   .get("data")
    except Exception as e:
        # Boundary handler: callers rely on None instead of an exception.
        logger.error(f"Error fetching {metric} for {symbol}: {e}")
        return None

def update_funds_with_benchmarks():
    """
    Fetch benchmark data for all funds and update the Funds_to_Screen table.

    For every distinct (symbol, fund type) pair the benchmark category and
    benchmark index symbol are requested from the API and written to
    ``YC_BM_Category`` / ``YC_BM_Symbol``.

    When both lookups return ``None`` (which is what ``fetch_yci_metric``
    returns on any error) the row is left untouched, so a failed request
    cannot wipe values stored by an earlier successful run.
    """
    FUND_SYMBOLS_SQL = """
    SELECT DISTINCT SymbolCUSIP AS Fund_Symbol, Fund_Type_ID
    FROM Funds_to_Screen
    """
    update_stmt = sqlalchemy.text("""
        UPDATE Funds_to_Screen
        SET YC_BM_Category = :category,
            YC_BM_Symbol = :symbol
        WHERE SymbolCUSIP = :fund_symbol
    """)

    funds_df = pd.read_sql(FUND_SYMBOLS_SQL, engine)

    for _, row in funds_df.iterrows():
        symbol = row["Fund_Symbol"]
        fund_type_id = row["Fund_Type_ID"]

        benchmark_category = fetch_yci_metric(symbol, fund_type_id, "ycharts_benchmark_category")
        benchmark_symbol = fetch_yci_metric(symbol, fund_type_id, "ycharts_benchmark_index_symbol")

        if benchmark_category is None and benchmark_symbol is None:
            logger.warning(f"No benchmark data returned for {symbol}; existing values left unchanged")
            continue

        # One short transaction per fund; committed when the block exits.
        with engine.begin() as conn:
            conn.execute(
                update_stmt,
                {"category": benchmark_category, "symbol": benchmark_symbol, "fund_symbol": symbol},
            )
        logger.info(f"Updated {symbol} with benchmark category: {benchmark_category}, symbol: {benchmark_symbol}")

def manage_benchmarks():
    """
    Check unique benchmark symbols against the Benchmarks table and add missing ones with additional static information.

    For each distinct non-NULL ``YC_BM_Symbol`` in ``Funds_to_Screen`` that has
    no row in ``Benchmarks``, the security name and description are fetched
    from the API and a new ``Benchmarks`` row is inserted. Symbols whose name
    cannot be fetched are skipped with a warning.

    The API calls are made outside any database transaction, so a slow or
    retried HTTP request does not keep a connection and transaction open.
    """
    unique_symbols_sql = """
    SELECT DISTINCT YC_BM_Symbol
    FROM Funds_to_Screen
    WHERE YC_BM_Symbol IS NOT NULL
    """
    exists_stmt = sqlalchemy.text("SELECT 1 FROM Benchmarks WHERE Benchmark_Symbol = :symbol")
    insert_stmt = sqlalchemy.text("""
        INSERT INTO Benchmarks
            (Benchmark_Symbol, Benchmark_Name, DataSource, CWA_Broad_Category, Bench_Creator, Description)
        VALUES
            (:symbol, :name, 'YCharts', 'Benchmark', 'Unknown', :description)
    """)

    unique_symbols_df = pd.read_sql(unique_symbols_sql, engine)

    for symbol in unique_symbols_df["YC_BM_Symbol"]:
        # Short read-only round trip: is this benchmark already recorded?
        with engine.connect() as conn:
            already_known = conn.execute(exists_stmt, {"symbol": symbol}).fetchone() is not None
        if already_known:
            continue

        # Fetch the security name for the new benchmark
        security_name = fetch_yci_metric(symbol, 1, "security_name")  # Assuming fund_type_id for indices is 1
        if not security_name:
            # Without a name no row is inserted, so the description is not fetched.
            logger.warning(f"Could not fetch security name for {symbol}")
            continue
        description = fetch_yci_metric(symbol, 1, "description")

        with engine.begin() as conn:
            conn.execute(insert_stmt, {"symbol": symbol, "name": security_name, "description": description})
        logger.info(f"Added new benchmark {symbol} with name {security_name}")

if __name__ == "__main__":
    update_funds_with_benchmarks()
    manage_benchmarks()

2025-02-12 14:02:35,080 - INFO - Fetching ycharts_benchmark_category for NTSX using URL: https://api.ycharts.com/v3/companies/NTSX/info/ycharts_benchmark_category?retrieve_ttl=true
2025-02-12 14:02:35,443 - INFO - Fetching ycharts_benchmark_index_symbol for NTSX using URL: https://api.ycharts.com/v3/companies/NTSX/info/ycharts_benchmark_index_symbol?retrieve_ttl=true
2025-02-12 14:02:35,783 - INFO - Updated NTSX with benchmark category: Target Allocation, symbol: ^STRB
2025-02-12 14:02:35,785 - INFO - Fetching ycharts_benchmark_category for HIPS using URL: https://api.ycharts.com/v3/companies/HIPS/info/ycharts_benchmark_category?retrieve_ttl=true
2025-02-12 14:02:36,101 - INFO - Fetching ycharts_benchmark_index_symbol for HIPS using URL: https://api.ycharts.com/v3/companies/HIPS/info/ycharts_benchmark_index_symbol?retrieve_ttl=true
2025-02-12 14:02:36,447 - INFO - Updated HIPS with benchmark category: Target Allocation, symbol: ^STRB
2025-02-12 14:02:36,448 - INFO - Fetching ycharts_be

In [3]:
import sqlalchemy
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# SQL Server connection configuration
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database?"
    "driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = sqlalchemy.create_engine(connection_string)

# YCharts API configuration and headers (Assuming you have this setup from before)
# Base URL that fetch_yci_metric prefixes to its endpoint path.
YCHARTS_API_URL = "https://api.ycharts.com"
# Headers sent with every requests.get call in this cell.
# NOTE(review): the authorization and session values are hard-coded and look
# like live credentials - consider reading them from the environment instead.
API_HEADERS = {
    "X-YCHARTSAUTHORIZATION": "yIIphqbsQysnTvWWxfW33w",  # Replace with your actual API key
    "X-YCHARTSEXCELSESSION": "b645cd897b2446bfa3796acfa3a879db",
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    "X-YCHARTSIP": "50.58.50.123",
    "Host": "api.ycharts.com",
    "Connection": "Keep-Alive"
}

import requests
from tenacity import retry, stop_after_attempt, wait_exponential

def _is_transient_failure(retry_state):
    """Tenacity retry predicate: retry only failures that may succeed later.

    Connection errors, timeouts, HTTP 429 and HTTP 5xx are retried. Other
    errors such as 404 are not, because repeating them cannot help.
    """
    outcome = retry_state.outcome
    if outcome is None or not outcome.failed:
        return False
    exc = outcome.exception()
    if isinstance(exc, (requests.ConnectionError, requests.Timeout)):
        return True
    if isinstance(exc, requests.HTTPError) and exc.response is not None:
        status = exc.response.status_code
        return status == 429 or status >= 500
    return False


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=_is_transient_failure,
    reraise=True,
)
def _get_json(api_url):
    """GET ``api_url`` with the shared headers and return the decoded JSON body.

    Raises on HTTP error statuses so that the retry decorator can see them.
    """
    response = requests.get(api_url, headers=API_HEADERS, timeout=30)
    response.raise_for_status()
    return response.json()


def fetch_yci_metric(symbol, metric):
    """Fetch one info metric for ``symbol`` from the ``companies`` endpoint.

    The retry policy lives on ``_get_json``. Previously the decorator wrapped
    this function, which catches every exception itself, so a retry could
    never be triggered.

    NOTE(review): the captured run log for this cell shows 404 responses for
    '^'-prefixed index symbols on this endpoint; a different endpoint is
    presumably required for indices - confirm against the YCharts API docs.

    Args:
        symbol: Security symbol, used both in the URL and as the response key.
        metric: Name of the info metric to request.

    Returns:
        The ``data`` field of the metric's result, or ``None`` when the request
        fails, the response has an unexpected shape, or the field is absent.
    """
    api_url = f"{YCHARTS_API_URL}/v3/companies/{symbol}/info/{metric}?retrieve_ttl=true"
    logger.info(f"Fetching {metric} for {symbol} using URL: {api_url}")
    try:
        data = _get_json(api_url)
        return data.get("response", {}).get(symbol, {}) \
                   .get("results", {}).get(metric, {}) \
                   .get("data")
    except Exception as e:
        # Boundary handler: callers rely on None instead of an exception.
        logger.error(f"Error fetching {metric} for {symbol}: {e}")
        return None

def insert_benchmarks(symbols):
    """Insert a ``Benchmarks`` row for each symbol that is not already present.

    Name and description are fetched through ``fetch_yci_metric``; symbols for
    which either is missing are skipped with a warning.

    The duplicate guard is written as ``INSERT ... SELECT ... WHERE NOT
    EXISTS``, which SQL Server accepts. The previous ``ON CONFLICT ... DO
    NOTHING`` clause is PostgreSQL/SQLite syntax and is rejected by T-SQL.

    Args:
        symbols: Iterable of benchmark symbols to add.
    """
    insert_stmt = sqlalchemy.text("""
        INSERT INTO Benchmarks
            (Benchmark_Symbol, Benchmark_Name, DataSource, CWA_Broad_Category, Bench_Creator, Description)
        SELECT :symbol, :name, 'YCharts', 'Benchmark', 'Unknown', :description
        WHERE NOT EXISTS (
            SELECT 1 FROM Benchmarks WHERE Benchmark_Symbol = :symbol
        )
    """)

    for symbol in symbols:
        name = fetch_yci_metric(symbol, "name")
        description = fetch_yci_metric(symbol, "description")

        if not (name and description):
            logger.warning(f"Could not fetch name and/or description for {symbol}")
            continue

        with engine.begin() as conn:
            result = conn.execute(insert_stmt, {"symbol": symbol, "name": name, "description": description})
            # rowcount 0 means the NOT EXISTS guard suppressed the insert.
            inserted = result.rowcount != 0
        if inserted:
            logger.info(f"Benchmark {symbol} inserted with name: {name}")
        else:
            logger.info(f"Benchmark {symbol} already exists; nothing inserted")

if __name__ == "__main__":
    symbols_to_insert = [
        "^BBMB15YTR", "^BBUSCRTR", "^BBUSMBSTR", "^BBUSTTR", "^BBUTISTR", "^LGALSEUS", 
        "^MSACAS", "^MSACSMCNTR", "^MSACXUSNTR", "^MSACXUSSCN", "^MSEMLA", "^MSIN", 
        "^MSMX", "^MSWMMTR", "^MSWTR", "^PEATR", "^RLVTR", "^RMCTR", "^RMVTR", 
        "^RUJTR", "^RUOTR", "^RUTTR", "^SPXUSTR", "^STRB"
    ]
    insert_benchmarks(symbols_to_insert)

2025-02-12 14:54:19,023 - INFO - Fetching name for ^BBMB15YTR using URL: https://api.ycharts.com/v3/companies/^BBMB15YTR/info/name?retrieve_ttl=true
2025-02-12 14:54:20,786 - ERROR - Error fetching name for ^BBMB15YTR: 404 Client Error: Not Found for url: https://api.ycharts.com/v3/companies/%5EBBMB15YTR/info/name?retrieve_ttl=true
2025-02-12 14:54:20,787 - INFO - Fetching description for ^BBMB15YTR using URL: https://api.ycharts.com/v3/companies/^BBMB15YTR/info/description?retrieve_ttl=true
2025-02-12 14:54:22,239 - ERROR - Error fetching description for ^BBMB15YTR: 404 Client Error: Not Found for url: https://api.ycharts.com/v3/companies/%5EBBMB15YTR/info/description?retrieve_ttl=true
2025-02-12 14:54:22,240 - INFO - Fetching name for ^BBUSCRTR using URL: https://api.ycharts.com/v3/companies/^BBUSCRTR/info/name?retrieve_ttl=true
2025-02-12 14:54:23,362 - ERROR - Error fetching name for ^BBUSCRTR: 404 Client Error: Not Found for url: https://api.ycharts.com/v3/companies/%5EBBUSCRTR/in

In [5]:
import pandas as pd
import sqlalchemy

# SQL Server connection configuration
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database?"
    "driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)

engine = sqlalchemy.create_engine(connection_string)

# SQL query to fetch data, with data cleaning and handling of NULLs
query = """
SELECT 
    f.SymbolCUSIP,
    f.investment_strategy,
    f.YC_BM_Category,
    COALESCE(c.CWA_Broad_Category_Name, 'Unknown') AS CWA_Broad_Category,
    TRIM(f.YC_BM_Symbol) AS YC_BM_Symbol,
    COALESCE(b.Benchmark_Name, 'No Matching Benchmark') AS Benchmark_Name
FROM 
    Funds_to_Screen f
LEFT JOIN 
    CWA_Broad_Category_List c ON f.CWA_Broad_Category_ID = c.ID
LEFT JOIN 
    Benchmarks b ON TRIM(f.YC_BM_Symbol) = b.Benchmark_Symbol
WHERE 
    f.YC_BM_Symbol IS NOT NULL
"""

# Run the benchmark join query and load the result set into pandas
df = pd.read_sql(query, engine)

# Rows where either joined column fell back to its COALESCE default
no_benchmark_match = df['Benchmark_Name'] == 'No Matching Benchmark'
no_category_match = df['CWA_Broad_Category'] == 'Unknown'
print("Rows with missing or default data:")
print(df[no_benchmark_match | no_category_match])

# Write the complete result (not only the flagged rows) to an Excel workbook
output_file = 'funds_with_benchmarks_cleaned.xlsx'
df.to_excel(output_file, index=False)

print(f"Data has been exported to {output_file}")

Rows with missing or default data:
     SymbolCUSIP                                investment_strategy  \
0           NTSX  The investment seeks total return.\n The fund ...   
1           HIPS  The investment seeks to track the performance,...   
2           EAOA  The investment seeks to track the investment r...   
3            AOM  The investment seeks to track the investment r...   
4            AOK  The investment seeks to track the investment r...   
...          ...                                                ...   
1052       WAARX  The investment seeks to maximize long-term tot...   
1053       WHGMX  The investment seeks long-term capital appreci...   
1057       APGRX  The investment seeks growth of capital.\n The ...   
1058         SPY  The investment seeks to provide investment res...   
1059         AGG  The investment seeks to track the investment r...   

                   YC_BM_Category CWA_Broad_Category YC_BM_Symbol  \
0               Target Allocation         A

In [None]:
# Code below is for fixing DB inserts issue with Return Driver's script

In [2]:
# Likely final classification of Return Drivers via Grok

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.sql import text
import re
from collections import defaultdict

# Adjustable scaling factor for fund family distribution
FUND_FAMILY_SCALE_FACTOR = 5  # Adjust this value to change the influence of fund family data

# Adjustable toggle for Excel output (currently on; set to False to skip writing the workbook)
write_to_excel = True  # Set to True to generate Excel output, False to skip

# Define output path for Excel (if used)
output_path = r"C:\Users\JulianHeron\Software Projects\Return_Drivers_V1.xlsx"

# Database connection
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# Define keywords for each category
# Phrases searched for in each fund's text columns, grouped by return-driver
# category. Every phrase in a list is tested separately by count_keywords, so
# a phrase listed twice is counted twice.
# NOTE(review): "low volatility" appears twice under "Rules Based" - confirm
# whether the double weight is intended.
# NOTE(review): "active bottom‑up" appears to contain a non-ASCII hyphen and
# will not match text written with an ASCII '-'; confirm this is intended.
keywords = {
    "Index Based": ["index fund", "tracks", "replicates", "indexed", "underlying index", "thematic", "passive",
                    "economic characteristics that are substantially", "bond index", "market-cap weighted",
                    "low tracking error", "high correlation", "benchmark", "low-cost", "broad market exposure",
                    "aggregate bond", "mirrors", "equal-weighted", "beta", "value index", "growth index"],
    "Rules Based": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing", "momentum",
                    "low volatility", "low vol", "value factor", "quality", "quality factor", "free cash flow",
                    "fcf", "objective", "relatively", "go up", "certain fundamental metrics", "momentum index",
                    "quality index", "relatively lower valuations", "factors", "minimum volatility",
                    "high dividend yield", "enhanced index", "revenue weighted", "dividend weighted",
                    "enhanced returns", "fundamental weighting", "yield weighted", "low volatility", "rotation",
                    "rules based methodology", "cash cows", "alphaDEX", "ranked", "lower volatility",
                    "tilt", "optimized", "component securities", "economic characteristics", "free cash flow yield",
                    "high dividend yields", "ranking system", "consistently increased dividends", "dividend",
                    "dividends", "strong cash", "low debt", "increasing earnings", "earnings", "rising dividend",
                    "achievers", "volatility weighted", "long/cash", "low beta", "low size"],
    # The comma after "Bottom-Up Approach" was missing, so Python's implicit
    # string concatenation fused it with "the advisor" into one unmatchable phrase.
    "Active Discretionary": ["actively managed", "actively-managed", "manager believes", "manager's judgment",
                             "active bottom‑up", "active strategy", "discretionary", "active management",
                             "active-management", "machine learning", "ai", "research-driven", "fundamental",
                             "strategically", "tactical allocation", "active", "rotation", "judgment", "analysis",
                             "outperform", "selection", "tactical", "trend-following", "trend following", "Bottom-Up Approach",
                             "the advisor", "advisor considers", "long-term", "appraisal"],
    "Quant Systematic": ["quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
                        "data-driven", "back-tested", "long-short", "model-based", "rotation", "statistical",
                        "rules-driven", "trend-following", "trend following", "tactical", "machine learning", "ai",
                        "long/short"],
    "Multi Strategy": ["multi-strategy", "multi-asset", "hybrid strategy", "multi-manager", "dynamic allocation",
                      "absolute return", "blended", "combination", "hybrid", "flexible", "alternative"]
}

# Direct mapping keywords (added to existing keywords)
# Additional phrases per category that are appended to the keyword lists above
direct_mappings = {
    "Active Discretionary": ["Active Management", "Actively Managed", "Discretionary", "Active Strategy", "Active",
                             "Active Bottom-Up", "Active-Management", "Actively-Managed", "Actively",
                             "Actively Allocates", "Active Allocation", "Active Trading", "Trading Actively",
                             "Actively Trading"],
    "Rules Based": ["Rules-Based", "Factor-Based", "Multi-Factor"],
    "Quant Systematic": []
}

# Grow each affected keyword list in place with its direct-mapping phrases
for category, terms in direct_mappings.items():
    keywords[category] += terms

# Rebuild the keyword table with every phrase lower-cased; count_keywords
# lower-cases the text it searches, so the phrases must be lower-case too
keywords = {
    category_name: [phrase.lower() for phrase in phrases]
    for category_name, phrases in keywords.items()
}

# Define meaningful categories for direct classification
# Outer key: return-driver category that receives the score.
# Inner key: which category dimension to test ("YC_Category",
# "CWA_Broad_Category" or "YC_Global_Category"); inner value: the category
# names that count as a direct match (compared with Series.isin, i.e. exact
# and case-sensitive).
# NOTE(review): the last "YC_Category" entry under "Quant Systematic" is
# written with an em dash ("Trading—Miscellaneous") while its siblings use
# "--"; confirm it matches the name stored in YC_Category_List.
meaningful_categories = {
    "Index Based": {
        "YC_Category": ["Target Maturity", "Digital Assets", "Single Currency", "Muni Target Maturity"],
        "CWA_Broad_Category": ["Currency", "Digital Asset", "Single Stock"],
        "YC_Global_Category": ["Currency"]
    },
    "Quant Systematic": {
        "CWA_Broad_Category": ["Defined Outcome", "Trading/Tactical"],
        "YC_Global_Category": ["Defined Outcome", "Trading Tools", "Systematic Trend"],
        "YC_Category": ["Trading--Inverse Commodities", "Trading--Inverse Debt", "Trading--Inverse Equity",
                        "Trading--Leveraged Commodities", "Trading--Leveraged Debt", "Trading--Leveraged Equity",
                        "Trading—Miscellaneous"]
    }
}

# Simplified assist categories using pattern matching for "Target-Date" and "Muni"
# Each rule is a dict with:
#   "pattern" or "exact" - the category text to look for ("pattern" rules are
#                          presumably substring matches and "exact" rules full
#                          matches; the code applying them is not in this part
#                          of the file - TODO confirm)
#   "cat_type"           - which category dimension the text is compared against
#   "actions"            - lists of return-driver categories under "remove"
#                          and/or "boost"
# NOTE(review): "inflation-protected bond" is lower-case while every other
# value is title-case; confirm how case is handled when rules are applied.
assist_categories = [
    # Pattern-based rules
    {
        "pattern": "Target-Date",
        "cat_type": "YC_Category",
        "actions": {"remove": ["Rules Based", "Quant Systematic"]}
    },
    {
        "pattern": "Target Date",
        "cat_type": "CWA_Broad_Category",
        "actions": {"remove": ["Rules Based", "Quant Systematic"]}
    },
    {
        "pattern": "Target Date",
        "cat_type": "YC_Global_Category",
        "actions": {"remove": ["Rules Based", "Quant Systematic"]}
    },
    {
        "pattern": "Muni",
        "cat_type": "YC_Category",
        "actions": {"remove": ["Multi Strategy", "Quant Systematic"]}
    },
    {
        "pattern": "US Municipal Fixed Income",
        "cat_type": "YC_Global_Category",
        "actions": {"remove": ["Multi Strategy", "Quant Systematic"]}
    },
    # Exact match rules (non-patterned categories)
    {
        "exact": "Commodity",
        "cat_type": "CWA_Broad_Category",
        "actions": {"remove": ["Rules Based"]}
    },
    {
        "exact": "Commodities Broad Basket",
        "cat_type": "YC_Category",
        "actions": {"remove": ["Rules Based"]}
    },
    {
        "exact": "Commodities Focused",
        "cat_type": "YC_Category",
        "actions": {"remove": ["Rules Based"]}
    },
    {
        "exact": "Commodities Broad Basket",
        "cat_type": "YC_Global_Category",
        "actions": {"remove": ["Rules Based"]}
    },
    {
        "exact": "Commodities Specified",
        "cat_type": "YC_Global_Category",
        "actions": {"remove": ["Rules Based"]}
    },
    {
        "exact": "Long/Short Equity",
        "cat_type": "YC_Global_Category",
        "actions": {"boost": ["Active Discretionary", "Quant Systematic"],
                    "remove": ["Multi Strategy", "Rules Based"]}
    },
    {
        "exact": "inflation-protected bond",
        "cat_type": "YC_Category",
        "actions": {"boost": ["Active Discretionary", "Index Based"],
                    "remove": ["Multi Strategy", "Rules Based", "Quant Systematic"]}
    },
    {
        "exact": "Nontraditional",
        "cat_type": "CWA_Broad_Category",
        "actions": {"boost": ["Active Discretionary", "Rules Based", "Quant Systematic"],
                    "remove": ["Index Based"]}
    },
    {
        "exact": "Nontraditional Equity",
        "cat_type": "YC_Broad_Asset_Class",
        "actions": {"boost": ["Active Discretionary", "Rules Based", "Quant Systematic"]}
    },
    {
        "exact": "Sector/Industry",
        "cat_type": "CWA_Broad_Category",
        "actions": {"boost": ["Active Discretionary", "Index Based", "Rules Based"]}
    }
]

# Load data from database (select only ID columns along with necessary fields)
query_funds = """
SELECT SymbolCUSIP, ProductName, fund_family, investment_strategy, FS_insight, index_fund,
       inverse_fund, leveraged_fund, socially_responsible_fund, synthetic_replication_fund,
       fund_of_funds, ycharts_url, YC_Category_ID, CWA_Broad_Category_ID,
       YC_Global_Category_ID, YC_Broad_Asset_Class_ID,
       currency_hedged_fund
FROM Funds_to_Screen
"""
funds_df = pd.read_sql(query_funds, engine)

# Lookup queries (ID -> display name), keyed by category dimension; each key is
# also the prefix of the matching "<key>_ID" column in funds_df
_category_lookup_sql = {
    "CWA_Broad_Category": "SELECT ID, CWA_Broad_Category_Name FROM CWA_Broad_Category_List",
    "YC_Category": "SELECT ID, Category_Name FROM YC_Category_List",
    "YC_Global_Category": "SELECT ID, Global_Category_Name FROM YC_Global_Category_List",
    "YC_Broad_Asset_Class": "SELECT ID, YC_Broad_Asset_Class_Name FROM YC_Broad_Asset_Class_List",
}
category_mappings = {
    dimension: pd.read_sql(lookup_sql, engine)
    for dimension, lookup_sql in _category_lookup_sql.items()
}

# Left-join each lookup onto funds_df in turn, then discard the lookup's own
# "ID" column so the next merge can bring in a fresh one
for dimension, lookup_df in category_mappings.items():
    joined = funds_df.merge(lookup_df, left_on=f"{dimension}_ID", right_on="ID", how="left")
    funds_df = joined.drop(columns=["ID"])

# Normalize Boolean fields (True/False and 1/0 to 1/0) and add debugging
boolean_cols = ["index_fund", "inverse_fund", "leveraged_fund", "socially_responsible_fund",
               "synthetic_replication_fund", "fund_of_funds", "currency_hedged_fund"]

# String spellings treated as "true". '1.0' is included because pandas stores a
# numeric column that contains NULLs as float, so a set flag arrives as 1.0 and
# would otherwise be normalized to 0.
_TRUTHY_FLAG_STRINGS = {'true', '1', '1.0', 'yes'}


def _to_flag(value):
    """Return 1 when ``value`` stringifies to a recognised true spelling, else 0."""
    return 1 if str(value).strip().lower() in _TRUTHY_FLAG_STRINGS else 0


# Debug raw values before normalization
print("Raw index_fund values before normalization:", funds_df["index_fund"].head().tolist())
# Normalize with broader type handling (None/NaN and unknown spellings become 0)
for col in boolean_cols:
    funds_df[col] = funds_df[col].apply(_to_flag)
# Debug after normalization
print("Index_fund values after normalization:", funds_df["index_fund"].head().tolist())
# Specifically check a few funds
print("VLUE index_fund after normalization:", funds_df[funds_df["SymbolCUSIP"] == "VLUE"]["index_fund"].values)

# Return-driver categories and the snake_case suffix used for their columns
categories = ["Index Based", "Rules Based", "Active Discretionary", "Quant Systematic", "Multi Strategy"]
category_mapping = {label: label.lower().replace(" ", "_") for label in categories}

# Total score column per category (created first so they stay grouped together)
for cat in categories:
    db_cat = category_mapping[cat]
    score_col = f"score_{db_cat}"
    funds_df[score_col] = 0.0

# Per-component score columns, created category by category in a fixed order
_component_prefixes = ("keyword_score", "meaningful_score", "assist_score",
                       "boolean_score", "fundfamily_score")
for cat in categories:
    db_cat = category_mapping[cat]
    for component in _component_prefixes:
        funds_df[f"{component}_{db_cat}"] = 0.0

# Text columns that accumulate the matched keywords for each category
for cat in categories:
    db_cat = category_mapping[cat]
    funds_df[f"matched_keywords_{db_cat}"] = ""

# Debugging: show the resulting column layout
print("Columns after initializing scores:", funds_df.columns.tolist())

# Function to count keywords and return matched keywords
def count_keywords(text, keyword_list):
    """Count the keywords that occur in *text* as whole words, ignoring case.

    Args:
        text: Text to search. A missing value (None/NaN) yields no matches;
            any other non-string value is converted with ``str()`` first.
        keyword_list: Iterable of keyword strings or phrases.

    Returns:
        A ``(count, matched)`` tuple: ``count`` is the number of entries of
        ``keyword_list`` found in the text and ``matched`` is those entries,
        in list order and original spelling, joined by ``"; "``.
    """
    if pd.isna(text):
        return 0, ""
    haystack = str(text).lower()
    # Lower-case the keyword as well: the text is lower-cased, so a keyword
    # written as "Index" or "ETF" could otherwise never match.
    matches = [
        keyword for keyword in keyword_list
        if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', haystack)
    ]
    return len(matches), "; ".join(matches)

# Apply scoring based on keywords and track matched keywords.
# For every category, each text column is scanned once; the per-column match
# counts are added to the keyword score and the matched keywords are collected.
text_columns = ["ProductName", "investment_strategy", "FS_insight"]
for cat, kw_list in keywords.items():
    db_cat = category_mapping[cat]
    score_col = f"keyword_score_{db_cat}"
    matched_col = f"matched_keywords_{db_cat}"
    # One list of per-row match strings per source, seeded with whatever is
    # already recorded in the matched-keywords column.
    matched_by_source = [funds_df[matched_col].fillna("").tolist()]
    for text_col in text_columns:
        results = [count_keywords(value, kw_list) for value in funds_df[text_col]]
        funds_df[score_col] += pd.Series(
            [count for count, _ in results], index=funds_df.index, dtype="int64"
        )
        matched_by_source.append([found for _, found in results])
    # Join only the non-empty parts. This replaces the former regex clean-up,
    # which collapsed a single pair of adjacent separators and could leave a
    # stray "; ;" behind once more text columns are added.
    funds_df[matched_col] = [
        "; ".join(part for part in parts if part)
        for parts in zip(*matched_by_source)
    ]

# Apply meaningful category rules: a fund whose category name is listed for a
# return-driver category receives +10 on that category's meaningful score.
meaningful_source_columns = {
    "CWA_Broad_Category": "CWA_Broad_Category_Name",
    "YC_Category": "Category_Name",
    "YC_Global_Category": "Global_Category_Name"
}
for cat, mappings in meaningful_categories.items():
    db_cat = category_mapping[cat]
    score_col = f"meaningful_score_{db_cat}"
    for map_type, values in mappings.items():
        col_name = meaningful_source_columns[map_type]
        is_match = funds_df[col_name].isin(values)
        if is_match.any():
            # Debug output: show a sample of the funds hit by this rule
            print(f"Meaningful category matches for {cat}, {map_type}:")
            print(funds_df.loc[is_match, ["SymbolCUSIP", col_name]].head())
        funds_df.loc[is_match, score_col] += 10  # High score for direct match

# Apply assist category rules.
# Each rule names a category column (cat_type), selects funds either by a
# case-insensitive regex ("pattern") or by a whole-value match ("exact"), and
# then removes and/or boosts return-driver categories for the selected funds.
assist_source_columns = {
    "CWA_Broad_Category": "CWA_Broad_Category_Name",
    "YC_Category": "Category_Name",
    "YC_Global_Category": "Global_Category_Name",
    "YC_Broad_Asset_Class": "YC_Broad_Asset_Class_Name"
}
for rule in assist_categories:
    cat_type = rule["cat_type"]
    col_name = assist_source_columns[cat_type]
    actions = rule["actions"]
    column_text = funds_df[col_name].fillna("")

    if "pattern" in rule:
        pattern = rule["pattern"]
        mask = column_text.str.contains(pattern, case=False, na=False)
        rule_label = f"pattern '{pattern}'"
    else:
        exact_value = rule["exact"]
        # An "exact" rule compares the whole value (case-insensitive, ignoring
        # surrounding whitespace). The previous str.contains() call treated the
        # value as a regex and also matched it as a substring, which made
        # "exact" rules behave exactly like "pattern" rules.
        # NOTE(review): confirm no existing "exact" rule relied on substring matching.
        mask = column_text.str.strip().str.lower() == str(exact_value).strip().lower()
        rule_label = f"exact '{exact_value}'"

    matches = funds_df.loc[mask, ["SymbolCUSIP", col_name]]
    if not matches.empty:
        print(f"Assist category {rule_label} matches for {cat_type}:")
        print(matches.head())

    # -inf marks a category as excluded; later "+= 5" boosts leave it at -inf.
    if "remove" in actions:
        for remove_cat in actions["remove"]:
            db_remove_cat = category_mapping[remove_cat]
            score_col = f"assist_score_{db_remove_cat}"
            funds_df.loc[mask, score_col] = -float('inf')
    if "boost" in actions:
        for boost_cat in actions["boost"]:
            db_boost_cat = category_mapping[boost_cat]
            score_col = f"assist_score_{db_boost_cat}"
            funds_df.loc[mask, score_col] += 5

# Replace any assist score that is exactly 0.25 with 0 in every category's
# assist column (guards against leftover 0.25 values).
assist_score_columns = [f"assist_score_{category_mapping[cat]}" for cat in categories]
funds_df[assist_score_columns] = funds_df[assist_score_columns].replace(0.25, 0)

# Apply Boolean rules and track contributions.
# -inf excludes a category; any "+= 5" applied to an excluded score leaves it at -inf.
excluded_score = -float('inf')

# Index funds that are also inverse, leveraged, socially responsible or
# synthetically replicated: +20 for "index based".
special_flag_cols = ["inverse_fund", "leveraged_fund",
                     "socially_responsible_fund", "synthetic_replication_fund"]
mask_index = (funds_df["index_fund"] == 1) & funds_df[special_flag_cols].eq(1).any(axis=1)
funds_df.loc[mask_index, "boolean_score_index_based"] += 20

# Every index fund: exclude the two active styles, boost the two passive ones.
mask_remove = funds_df["index_fund"] == 1
for db_name in ("active_discretionary", "quant_systematic"):
    funds_df.loc[mask_remove, f"boolean_score_{db_name}"] = excluded_score
for db_name in ("index_based", "rules_based"):
    funds_df.loc[mask_remove, f"boolean_score_{db_name}"] += 5

# Funds that are not index funds are excluded from "index based".
funds_df.loc[funds_df["index_fund"] == 0, "boolean_score_index_based"] = excluded_score

# Index funds that are also funds of funds: exclude the two active styles.
mask_remove2 = (funds_df["index_fund"] == 1) & (funds_df["fund_of_funds"] == 1)
for db_name in ("active_discretionary", "quant_systematic"):
    funds_df.loc[mask_remove2, f"boolean_score_{db_name}"] = excluded_score

# Funds of funds: +5 for active, quant and multi strategy.
mask_boost = funds_df["fund_of_funds"] == 1
for db_name in ("active_discretionary", "quant_systematic", "multi_strategy"):
    funds_df.loc[mask_boost, f"boolean_score_{db_name}"] += 5

# Currency-hedged funds: +5 for active and index based, excluded from rules based.
mask_currency = funds_df["currency_hedged_fund"] == 1
for db_name in ("active_discretionary", "index_based"):
    funds_df.loc[mask_currency, f"boolean_score_{db_name}"] += 5
funds_df.loc[mask_currency, "boolean_score_rules_based"] = excluded_score

# Merge FundFamilyData and apply as a weighted factor
fund_family_df = pd.read_sql("SELECT FundFamilyName, Dist_Index, Dist_Active, Dist_Rules_Based, Dist_Quant, Dist_Multi FROM FundFamilyData", engine)
# Keep one row per family: with duplicate FundFamilyName rows the left merge
# would emit every affected fund more than once (duplicated scores, database
# updates and Excel rows). This is a no-op when the names are already unique.
fund_family_df = fund_family_df.drop_duplicates(subset="FundFamilyName")
funds_df = funds_df.merge(fund_family_df, left_on="fund_family", right_on="FundFamilyName", how="left")

# Family distribution column -> category suffix of the score it feeds.
family_dist_columns = {
    "Dist_Index": "index_based",
    "Dist_Active": "active_discretionary",
    "Dist_Rules_Based": "rules_based",
    "Dist_Quant": "quant_systematic",
    "Dist_Multi": "multi_strategy",
}
for dist_col, db_cat in family_dist_columns.items():
    # Funds without a matching family get 0; values are divided by 100
    # (presumably stored as percentages - confirm against FundFamilyData).
    funds_df[dist_col] = funds_df[dist_col].fillna(0) / 100
    funds_df[f"fundfamily_score_{db_cat}"] += funds_df[dist_col] * FUND_FAMILY_SCALE_FACTOR

# Sum all intermediate scores into final scores.
# Components are added left to right, so an excluded (-inf) component makes
# the final score -inf as well.
remaining_components = ("meaningful_score", "assist_score", "boolean_score", "fundfamily_score")
for cat in categories:
    db_cat = category_mapping[cat]
    score_col = f"score_{db_cat}"
    running_total = funds_df[f"keyword_score_{db_cat}"]
    for component in remaining_components:
        running_total = running_total + funds_df[f"{component}_{db_cat}"]
    funds_df[score_col] = running_total

# Determine final category with tiebreaker based on keyword scores.
# Return_Driver holds the snake_case category name (e.g. "index_based") of the
# highest final score; when several categories share that score, the one with
# the highest keyword score wins, and the earliest category in `categories`
# wins if the keyword scores are equal too.
#
# The previous implementation looked the snake_case name up in
# category_mapping, whose keys are the display names, so every fund without a
# tie was labelled with the string "None".
score_columns = [f"score_{category_mapping[cat]}" for cat in categories]
db_categories = [category_mapping[cat] for cat in categories]

# Relabel both score tables with the bare category name so they align column-wise.
final_scores = funds_df[score_columns].copy()
final_scores.columns = db_categories
keyword_scores = funds_df[[f"keyword_score_{db_cat}" for db_cat in db_categories]].fillna(0)
keyword_scores.columns = db_categories

best_score = final_scores.max(axis=1)
is_tied_for_best = final_scores.eq(best_score, axis=0)
# Hide the keyword score of every category that did not reach the best score,
# so idxmax only chooses among the leaders (a single leader wins trivially).
tiebreak_scores = keyword_scores.where(is_tied_for_best, -float('inf'))

# Rows whose scores are all NaN have no leader and keep the "None" label.
# NOTE(review): a fund with every category excluded (-inf) is still labelled
# by keyword score, as before - confirm this is intended.
has_leader = is_tied_for_best.any(axis=1)
funds_df["Return_Driver"] = "None"
if has_leader.any():
    funds_df.loc[has_leader, "Return_Driver"] = tiebreak_scores.loc[has_leader].idxmax(axis=1)

# Clean SymbolCUSIP to ensure exact matching with database
funds_df["SymbolCUSIP"] = funds_df["SymbolCUSIP"].str.strip().str.upper()

# Replace -inf, inf and NaN scores with None (NULL in SQL) to avoid invalid float errors.
# The values are rebuilt explicitly instead of using Series.replace([...], None):
# in older pandas releases a None replacement value means "fill from the previous
# row", which would store another fund's score for an excluded category.
for cat in categories:
    score_col = f"score_{category_mapping[cat]}"
    funds_df[score_col] = pd.Series(
        [None if pd.isna(value) or np.isinf(value) else float(value)
         for value in funds_df[score_col]],
        index=funds_df.index,
        dtype=object,
    )

symbols = funds_df["SymbolCUSIP"].tolist()

# Write scores and Return_Driver to the database with logging.
# engine.begin() runs everything in a single transaction: it commits when the
# block exits normally and rolls back if any statement raises, so no separate
# commit() call is needed.
with engine.begin() as conn:
    for cat in categories:
        db_cat = category_mapping[cat]
        score_col = f"score_{db_cat}"
        # db_cat comes from the fixed category_mapping, not from external input,
        # so it is interpolated as the column name; the values are bound parameters.
        update_query = text(f"""
            UPDATE Funds_to_Screen
            SET {db_cat} = :score
            WHERE SymbolCUSIP = :symbol_cusip
        """)
        # Execute the update for each row with logging
        for sym, score in zip(symbols, funds_df[score_col]):
            params = {"symbol_cusip": sym, "score": score}
            result = conn.execute(update_query, params)
            if result.rowcount == 0:
                print(f"Warning: No rows updated for SymbolCUSIP {sym} with score {score} for {db_cat}")
            else:
                print(f"Updated {result.rowcount} row(s) for SymbolCUSIP {sym} with score {score} for {db_cat}")

    # Update the Return_Driver column
    update_driver_query = text("""
        UPDATE Funds_to_Screen
        SET return_driver = :return_driver
        WHERE SymbolCUSIP = :symbol_cusip
    """)
    for sym, driver in zip(symbols, funds_df["Return_Driver"]):
        result = conn.execute(update_driver_query, {"symbol_cusip": sym, "return_driver": driver})
        if result.rowcount == 0:
            print(f"Warning: No rows updated for SymbolCUSIP {sym} with Return_Driver {driver}")
        else:
            print(f"Updated {result.rowcount} row(s) for SymbolCUSIP {sym} with Return_Driver {driver}")

print("Database updates committed successfully.")

# Optional Excel export (off by default): identifying columns first, then every
# per-category score/keyword column, then the inputs the scores were derived from.
if not write_to_excel:
    print("Excel output skipped (write_to_excel=False). Data written to database.")
else:
    per_category_prefixes = (
        "score", "keyword_score", "meaningful_score", "assist_score",
        "boolean_score", "fundfamily_score", "matched_keywords",
    )
    output_columns = ["SymbolCUSIP", "ProductName", "fund_family", "Return_Driver", "ycharts_url"]
    # Prefix-major order: all categories for "score", then all for "keyword_score", ...
    output_columns += [
        f"{prefix}_{category_mapping[cat]}"
        for prefix in per_category_prefixes
        for cat in categories
    ]
    output_columns += ["CWA_Broad_Category_Name", "Category_Name", "Global_Category_Name", "YC_Broad_Asset_Class_Name"]
    output_columns += ["index_fund", "inverse_fund", "leveraged_fund", "socially_responsible_fund", "synthetic_replication_fund", "fund_of_funds", "currency_hedged_fund"]
    output_columns += ["Dist_Index", "Dist_Active", "Dist_Rules_Based", "Dist_Quant", "Dist_Multi"]
    funds_df[output_columns].to_excel(output_path, index=False)
    print(f"Results exported to {output_path}")

Raw index_fund values before normalization: ['0', '1', '1', '1', '1']
Index_fund values after normalization: [0, 1, 1, 1, 1]
VLUE index_fund after normalization: [1]
Columns after initializing scores: ['SymbolCUSIP', 'ProductName', 'fund_family', 'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund', 'synthetic_replication_fund', 'fund_of_funds', 'ycharts_url', 'YC_Category_ID', 'CWA_Broad_Category_ID', 'YC_Global_Category_ID', 'YC_Broad_Asset_Class_ID', 'currency_hedged_fund', 'CWA_Broad_Category_Name', 'Category_Name', 'Global_Category_Name', 'YC_Broad_Asset_Class_Name', 'score_index_based', 'score_rules_based', 'score_active_discretionary', 'score_quant_systematic', 'score_multi_strategy', 'keyword_score_index_based', 'meaningful_score_index_based', 'assist_score_index_based', 'boolean_score_index_based', 'fundfamily_score_index_based', 'keyword_score_rules_based', 'meaningful_score_rules_based', 'assist_score_rules_based', 