In [None]:
# new risk drivers attempt 1

In [9]:
# Version 6.2: Enhanced with updated FundFamilyData distributions, increased keyword weight,
# and adjusted decision tree bonuses (metadata bonus removed)

import pandas as pd
from sqlalchemy import create_engine

"""
classify_return_drivers.py (V6)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V6:
- Uses FundFamilyData distribution fields (Dist_Index, Dist_Active, Dist_Rules_Based, Dist_Quant, Dist_Multi)
  for scoring fund family signals.
- KEYWORDS_WEIGHT is increased to 50.
- Metadata bonus points are removed.
- Decision-tree adjustments:
    * If index_fund is true, a balanced bonus of +50 is added to "Index Based" and +50 to "Factor/Smart Beta."
    * Additionally, if index_fund is true and any of (esg, leveraged, synthetic) flags are true, an extra +100 is added to "Index Based."
- All other scoring components (keywords, category lookup) remain as before.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40
KEYWORDS_WEIGHT = 50    # Increased to 50
CATEGORIES_WEIGHT = 15
# METADATA_WEIGHT is defined but its scores will not be added.
METADATA_WEIGHT = 15
HIGH_CONF_THRESHOLD = 50
LIKELY_FUND_TYPE_WEIGHT = 0.5
DRY_RUN = False
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

# We'll continue to use tier weights for keywords and categories.
TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}

# --- Constants ---
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Updated Keyword Lists ---
high_conf_keywords = {
    "Index Based": [
        "index fund", "tracks", "replicates", "indexed", "underlying index",
        "thematic", "passive", "economic characteristics that are substantially",
        "passively managed"  # For GCLN
    ],
    "Factor/Smart Beta": [
        "rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
        "momentum", "low volatility", "low vol", "value factor", "quality", "quality factor",
        "free cash flow", "fcf", "factor", "objective", "time tested", "relatively", "go up",
        "certain fundamental metrics", "momentum index", "quality index", "relatively lower valuations",
        "factors", "minimum volatility", "high dividend yield",
        "dividend growth rate", "high dividend-paying",
        "fundamental score multiplier", "growth and quality factors"  # For GPOW, TTAI, IQDE, TTAC
    ],
    "Active Discretionary": [
        "actively managed", "actively-managed", "manager believes", "manager's judgment",
        "active bottom‑up", "active strategy", "discretionary", "active management", "active-management",
        "actively invests"  # For DIVD, GREI
    ],
    "Quant/Systematic": [
        "quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
        "rules-based methodology", "equity index futures", "market neutral",
        "quantitative research", "contrarian strategy",
        "dynamically allocating", "economic indicators"  # For KOOL
    ],
    "Multi-Strategy": [
        "multi-strategy", "multi-asset", "hybrid strategy",
        "cautious allocation"  # For TWIO
    ]
}

med_conf_keywords = {
    "Index Based": [
        "bond index", "market-cap weighted", "low tracking error", "high correlation", "benchmark",
        "low-cost", "broad market exposure", "passive", "aggregate bond", "passive", "broad"
    ],
    "Factor/Smart Beta": [
        "enhanced index", "revenue weighted", "dividend weighted", "enhanced returns", "thematic",
        "fundamental weighting", "yield weighted", "quality factor", "low volatility", "rotation",
        "rules based methodology", "cash cows", "alphaDEX", "ranked", "lower volatility"
    ],
    "Active Discretionary": [
        "machine learning", "ai", "research-driven", "fundamental", "strategically",
        "tactical allocation", "active", "rotation"
    ],
    "Quant/Systematic": [
        "data-driven", "systematic", "backtested", "long-short", "model-based", "rotation"
    ],
    "Multi-Strategy": [
        "multi-manager", "dynamic allocation", "multi-asset", "absolute return", "blended"
    ]
}

low_conf_keywords = {
    "Index Based": [
        "index", "mirrors", "equal-weighted", "bond index", "beta", "etf", "value index", "growth index"
    ],
    "Factor/Smart Beta": [
        "fundamental weighting", "enhanced index", "thematic", "tilt", "optimized", "component securities",
        "economic characteristics", "free cash flow yield", "objective", "high dividend yields",
        "ranking system", "consistantly increased dividends", "dividend", "dividend yield", "dividends",
        "strong cash", "low debt", "increasing earnings", "earnings", "rising dividend", "acheivers",
        "volatility weighted", "long/cash", "low beta", "low size"
    ],
    "Active Discretionary": [
        "discretionary", "judgment", "analysis", "outperform", "selection", "tactical",
        "trend-following", "trend following"
    ],
    "Quant/Systematic": [
        "quantitative", "algorithm", "statistical", "rules-driven", "trend-following",
        "trend following", "tactical"
    ],
    "Multi-Strategy": [
        "combination", "hybrid", "multi-asset", "flexible", "alternative"
    ]
}

# --- Updated Category Lookup ---
CATEGORY_LOOKUP = {
    "High": {
        "Defined Outcome": {"Quant/Systematic": 15},
        "Quantitative/Tactical": {"Quant/Systematic": 15},
        "Trading/Tactical": {"Quant/Systematic": 10},
        "Digital Asset": {"Index Based": 15},
        "Currency": {"Index Based": 15},
        "Target Maturity": {"Index Based": 15},
        "Systematic Trend": {"Quant/Systematic": 15}
    },
    "Medium": {
        "Bond Strategy": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Trading Tools": {"Quant/Systematic": 10},
        "Trading": {"Quant/Systematic": 10},
        "Target Date": {"Active Discretionary": 5, "Index Based": 2, "Multi-Strategy": 8},
        "Options Trading": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Long/Short Equity": {"Active Discretionary": 5, "Quant/Systematic": 5},
        "Commodities": {"Active Discretionary": 4, "Factor/Smart Beta": 1, "Quant/Systematic": 4, "Index Based": 4, "Multi-Strategy": 1},
        "Municipal": {"Active Discretionary": 6, "Factor/Smart Beta": 2, "Index Based": 6, "Multi-Strategy": 1},
        "Derivative Income": {"Active Discretionary": 5, "Quant/Systematic": 4, "Factor/Smart Beta": 2, "Index Based": 4},
        "Event Driven": {"Active Discretionary": 8, "Quant/Systematic": 7},
        "Inflation-Protected Bond": {"Active Discretionary": 6, "Index Based": 9}
    },
    "Low": {
        "Multialternative": {"Quant/Systematic": 7, "Active Discretionary": 8},
        "Specialty": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Strategic": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Nontraditional": {"Multi-Strategy": 2, "Quant/Systematic": 3, "Active Discretionary": 3, "Factor/Smart Beta": 3},
        "Equity Hedged": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Multisector Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Nontraditional Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5}
    }
}

# --- Database Connection ---
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query (Corrected Column Names) ---
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.YC_Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.FundFamilyName
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

def score_keywords(text, keyword_dict, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for category, keywords in keyword_dict.items():
        hits = sum(keyword in text for keyword in keywords)
        if len(keywords) > 0:
            scores[category] = min(hits * (max_points / len(keywords)), max_points)
    return scores

def score_categories(row, tier_dict, max_points):
    category_cols = ['YC_Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for col in category_cols:
        if pd.notna(row[col]):
            value = row[col].lower()
            for keyword, weights in tier_dict.items():
                if keyword in value:
                    for cat, points in weights.items():
                        scores[cat] += points
    for cat in scores:
        scores[cat] = min(scores[cat], max_points)
    return scores

def score_metadata(row, max_points):
    # Retained for reference; metadata bonus not added to final score.
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if row['index_fund'] == 'True':
        scores["Index Based"] += max_points
    if row['leveraged_fund'] == 'True' or row['inverse_fund'] == 'True':
        scores["Quant/Systematic"] += max_points / 2
    if row['fund_of_funds'] == 'True':
        scores["Active Discretionary"] += max_points / 3
        scores["Factor/Smart Beta"] += max_points / 3
        scores["Index Based"] += max_points / 3
    return scores

def score_fund_family(row, max_points):
    """
    Updated scoring for Fund Family using FundFamilyData distribution columns.
    Expected columns from FundFamilyData join:
      - Dist_Index, Dist_Active, Dist_Rules_Based, Dist_Quant, Dist_Multi
    Returns a dictionary with bonus scores for each category.
    """
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    try:
        dist_index = float(row.get('Dist_Index', 0))
        dist_active = float(row.get('Dist_Active', 0))
        dist_rules_based = float(row.get('Dist_Rules_Based', 0))
        dist_quant = float(row.get('Dist_Quant', 0))
        dist_multi = float(row.get('Dist_Multi', 0))
    except Exception:
        dist_index = dist_active = dist_rules_based = dist_quant = dist_multi = 0

    scores["Index Based"] += (dist_index / 100.0) * max_points
    scores["Active Discretionary"] += (dist_active / 100.0) * max_points
    scores["Factor/Smart Beta"] += (dist_rules_based / 100.0) * max_points
    scores["Quant/Systematic"] += (dist_quant / 100.0) * max_points
    scores["Multi-Strategy"] += (dist_multi / 100.0) * max_points

    if (dist_index + dist_active + dist_rules_based + dist_quant + dist_multi) == 0:
        scores["Active Discretionary"] += max_points * 0.5

    return scores

def classify_return(row):
    # Combine text fields (lowercased)
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    # --- Compute Tiered Scores for Keywords and Categories ---
    keywords_scores = {tier: score_keywords(text, kw_dict, KEYWORDS_WEIGHT)
                       for tier, kw_dict in zip(TIER_WEIGHTS, [high_conf_keywords, med_conf_keywords, low_conf_keywords])}
    categories_scores = {tier: score_categories(row, CATEGORY_LOOKUP[tier], CATEGORIES_WEIGHT)
                         for tier in TIER_WEIGHTS}

    tier_total_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for tier in TIER_WEIGHTS:
        for cat in RETURN_CATEGORIES:
            tier_total_scores[cat] += keywords_scores[tier][cat] + categories_scores[tier][cat]
    
    # --- Fund Family Scores ---
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)
    for cat in RETURN_CATEGORIES:
        tier_total_scores[cat] += fund_family_scores[cat]

    # --- Decision Tree Rules Adjustments ---
    decision_scores = {cat: 0 for cat in RETURN_CATEGORIES}

    def to_bool(val):
        return str(val).strip().lower() in ["true", "1"]

    index_flag          = to_bool(row.get("index_fund", False))
    esg_flag            = to_bool(row.get("socially_responsible_fund", False))
    leveraged_flag      = to_bool(row.get("leveraged_fund", False))
    synthetic_flag      = to_bool(row.get("synthetic_replication_fund", False))
    currency_hedged_flag = to_bool(row.get("currency_hedged_fund", False))
    fund_of_funds_flag  = to_bool(row.get("fund_of_funds", False))

    if index_flag:
        decision_scores["Index Based"] += 50
        decision_scores["Factor/Smart Beta"] += 50
        if esg_flag or leveraged_flag or synthetic_flag:
            decision_scores["Index Based"] += 100
    if not index_flag:
        decision_scores["Index Based"] = -9999
    if fund_of_funds_flag:
        if index_flag:
            decision_scores["Active Discretionary"] -= 100
            decision_scores["Quant/Systematic"] -= 100
        else:
            decision_scores["Active Discretionary"] += 30
            decision_scores["Quant/Systematic"] += 10
            decision_scores["Multi-Strategy"] += 10
    if currency_hedged_flag:
        decision_scores["Factor/Smart Beta"] = -9999
        decision_scores["Active Discretionary"] += 20
        decision_scores["Index Based"] += 20
    if esg_flag:
        decision_scores["Factor/Smart Beta"] = -9999
    if index_flag and (leveraged_flag or synthetic_flag):
        decision_scores["Active Discretionary"] = -9999
        decision_scores["Multi-Strategy"] = -9999
        decision_scores["Quant/Systematic"] = -9999
        decision_scores["Factor/Smart Beta"] = -9999
        decision_scores["Index Based"] += 50

    # --- Category-Based Decision Rules ---
    category_text = ""
    for col in ['YC_Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']:
        if pd.notna(row[col]):
            category_text += " " + row[col].lower()

    if "defined outcome" in category_text:
        decision_scores["Quant/Systematic"] += 100
    if "target maturity" in category_text:
        decision_scores["Index Based"] += 100
    trading_keywords = ["trading tools", "trading/tactical"]
    trading_categories = ["trading--inverse commodities", "trading--inverse debt", "trading--inverse equity",
                          "trading--leveraged commodities", "trading--leveraged debt", "trading--leveraged equity", "trading--miscellaneous"]
    if any(kw in category_text for kw in trading_keywords) or any(tc in category_text for tc in trading_categories):
        decision_scores["Quant/Systematic"] += 100
    if "target date" in category_text or "target-date" in category_text:
        decision_scores["Factor/Smart Beta"] = -9999
        decision_scores["Quant/Systematic"] = -9999
    if "digital asset" in category_text or "currency" in category_text:
        decision_scores["Index Based"] += 100
    if "commodity" in category_text:
        decision_scores["Factor/Smart Beta"] = -9999
    if "single stock" in category_text:
        decision_scores["Index Based"] += 100
    if "municipal" in category_text:
        decision_scores["Multi-Strategy"] = -9999
    if "systematic trend" in category_text:
        decision_scores["Quant/Systematic"] += 100
    if ("long/short equity" in category_text or "event driven" in category_text) and not index_flag:
        decision_scores["Active Discretionary"] += 50
        decision_scores["Quant/Systematic"] += 50
    if "inflation-protected bond" in category_text:
        decision_scores["Active Discretionary"] += 50
        decision_scores["Index Based"] += 50

    # --- Active Management Variants ---
    active_management_terms = [
        "actively managed", "actively-managed", "active management", "active bottom‑up approach",
        "actively trades", "active management strategy", "actively managed etf",
        "actively managed fund of funds", "actively managed strategy", "active allocation",
        "actively allocates", "tactically allocates assets", "active trading",
        "trade securities actively", "active trading strategy", "bottom‑up approach",
        "exceptional management", "long-term investments", "growth companies",
        "actively managed mandate", "full discretion", "investment quality score",
        "actively-managed etf", "fund advisor’s discretion",
        "actively invests"
    ]
    for term in active_management_terms:
        if term in text:
            decision_scores["Active Discretionary"] += 100
            print(f"SymbolCUSIP: {row['SymbolCUSIP']}, Matched term: {term}")
            break

    # --- Combine All Scores ---
    final_combined_scores = {cat: tier_total_scores.get(cat, 0) + decision_scores.get(cat, 0)
                             for cat in RETURN_CATEGORIES}
    
    # --- Boost Weak Signals ---
    max_score = max(final_combined_scores.values())
    if max_score < 60:
        max_cat = max(final_combined_scores, key=final_combined_scores.get)
        final_combined_scores[max_cat] += 30
        print(f"SymbolCUSIP: {row['SymbolCUSIP']}, Boosted {max_cat} by 30")

    print(f"SymbolCUSIP: {row['SymbolCUSIP']}, Scores: {final_combined_scores}")

    viable_scores = {cat: score for cat, score in final_combined_scores.items() if score > -9999}
    if not viable_scores:
        predicted_category = "Active Discretionary"
        print(f"SymbolCUSIP: {row['SymbolCUSIP']}, Defaulted to Active Discretionary")
    else:
        predicted_category = max(viable_scores, key=viable_scores.get)

    # --- Prepare Output ---
    result = {'Return_Category': predicted_category}
    for cat in RETURN_CATEGORIES:
        result[f"Final_{cat}_Score"] = final_combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
result_cols = ['Return_Category'] + [f"Final_{cat}_Score" for cat in RETURN_CATEGORIES]
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'ycharts_url'
] + [col for col in df.columns if 'Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'YC_Broad_Asset_Class_Name', 'Broad_Category_Name',
    'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name'
]
df[output_cols].to_excel(r"C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v6.xlsx", index=False)

# --- Display Classification Distribution ---
distribution = df['Return_Category'].value_counts().to_dict()
print("\nClassification Distribution:")
for category, count in distribution.items():
    print(f"{category}: {count}")

print("Classification complete. Results saved to 'C:\\Users\\JulianHeron\\Software Projects\\Return Drivers\\classified_return_funds_v6.xlsx'.")

SymbolCUSIP: NTSX, Matched term: actively managed
SymbolCUSIP: NTSX, Scores: {'Index Based': -9979.0, 'Factor/Smart Beta': 9.724137931034482, 'Active Discretionary': 123.25, 'Quant/Systematic': 2.0, 'Multi-Strategy': 0.0}
SymbolCUSIP: HIPS, Scores: {'Index Based': 77.14646464646465, 'Factor/Smart Beta': 83.72413793103448, 'Active Discretionary': 0.0, 'Quant/Systematic': 16.0, 'Multi-Strategy': 42.5}
SymbolCUSIP: EAOA, Scores: {'Index Based': 194.6010101010101, 'Factor/Smart Beta': -9981.275862068966, 'Active Discretionary': -91.75, 'Quant/Systematic': -100.0, 'Multi-Strategy': 2.0}
SymbolCUSIP: AOM, Scores: {'Index Based': 90.05555555555556, 'Factor/Smart Beta': 69.72413793103448, 'Active Discretionary': -85.5, 'Quant/Systematic': -100.0, 'Multi-Strategy': 2.0}
SymbolCUSIP: AOK, Scores: {'Index Based': 94.6010101010101, 'Factor/Smart Beta': 71.44827586206897, 'Active Discretionary': -85.5, 'Quant/Systematic': -100.0, 'Multi-Strategy': 2.0}
SymbolCUSIP: EAOK, Scores: {'Index Based': 194

In [None]:
# next edition

In [11]:
# Version 6.2: Enhanced with updated FundFamilyData distributions, increased keyword weight,
# and adjusted decision tree bonuses (metadata bonus removed)

import pandas as pd
from sqlalchemy import create_engine

"""
classify_return_drivers.py (V6.2)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V6.2:
- Uses FundFamilyData distribution fields for scoring.
- KEYWORDS_WEIGHT is 50.
- Metadata bonus removed.
- Decision-tree adjustments: +50 balanced bonus for index_fund, +100 extra for esg/leveraged/synthetic.
- Tracks defaulted classifications in 'IsDefaulted' column.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40
KEYWORDS_WEIGHT = 50
CATEGORIES_WEIGHT = 15
METADATA_WEIGHT = 15  # Not used
HIGH_CONF_THRESHOLD = 50
LIKELY_FUND_TYPE_WEIGHT = 0.5
DRY_RUN = False
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Updated Keyword Lists ---
high_conf_keywords = {
    "Index Based": [
        "index fund", "tracks", "replicates", "indexed", "underlying index",
        "thematic", "passive", "economic characteristics that are substantially",
        "passively managed"
    ],
    "Factor/Smart Beta": [
        "rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
        "momentum", "low volatility", "low vol", "value factor", "quality", "quality factor",
        "free cash flow", "fcf", "factor", "objective", "time tested", "relatively", "go up",
        "certain fundamental metrics", "momentum index", "quality index", "relatively lower valuations",
        "factors", "minimum volatility", "high dividend yield",
        "dividend growth rate", "high dividend-paying",
        "fundamental score multiplier", "growth and quality factors"
    ],
    "Active Discretionary": [
        "actively managed", "actively-managed", "manager believes", "manager's judgment",
        "active bottom‑up", "active strategy", "discretionary", "active management", "active-management",
        "actively invests"
    ],
    "Quant/Systematic": [
        "quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
        "rules-based methodology", "equity index futures", "market neutral",
        "quantitative research", "contrarian strategy",
        "dynamically allocating", "economic indicators"
    ],
    "Multi-Strategy": [
        "multi-strategy", "multi-asset", "hybrid strategy",
        "cautious allocation"
    ]
}

med_conf_keywords = {
    "Index Based": [
        "bond index", "market-cap weighted", "low tracking error", "high correlation", "benchmark",
        "low-cost", "broad market exposure", "passive", "aggregate bond", "passive", "broad"
    ],
    "Factor/Smart Beta": [
        "enhanced index", "revenue weighted", "dividend weighted", "enhanced returns", "thematic",
        "fundamental weighting", "yield weighted", "quality factor", "low volatility", "rotation",
        "rules based methodology", "cash cows", "alphaDEX", "ranked", "lower volatility"
    ],
    "Active Discretionary": [
        "machine learning", "ai", "research-driven", "fundamental", "strategically",
        "tactical allocation", "active", "rotation"
    ],
    "Quant/Systematic": [
        "data-driven", "systematic", "backtested", "long-short", "model-based", "rotation"
    ],
    "Multi-Strategy": [
        "multi-manager", "dynamic allocation", "multi-asset", "absolute return", "blended"
    ]
}

low_conf_keywords = {
    "Index Based": [
        "index", "mirrors", "equal-weighted", "bond index", "beta", "etf", "value index", "growth index"
    ],
    "Factor/Smart Beta": [
        "fundamental weighting", "enhanced index", "thematic", "tilt", "optimized", "component securities",
        "economic characteristics", "free cash flow yield", "objective", "high dividend yields",
        "ranking system", "consistantly increased dividends", "dividend", "dividend yield", "dividends",
        "strong cash", "low debt", "increasing earnings", "earnings", "rising dividend", "acheivers",
        "volatility weighted", "long/cash", "low beta", "low size"
    ],
    "Active Discretionary": [
        "discretionary", "judgment", "analysis", "outperform", "selection", "tactical",
        "trend-following", "trend following"
    ],
    "Quant/Systematic": [
        "quantitative", "algorithm", "statistical", "rules-driven", "trend-following",
        "trend following", "tactical"
    ],
    "Multi-Strategy": [
        "combination", "hybrid", "multi-asset", "flexible", "alternative"
    ]
}

# --- Updated Category Lookup ---
CATEGORY_LOOKUP = {
    "High": {
        "Defined Outcome": {"Quant/Systematic": 15},
        "Quantitative/Tactical": {"Quant/Systematic": 15},
        "Trading/Tactical": {"Quant/Systematic": 10},
        "Digital Asset": {"Index Based": 15},
        "Currency": {"Index Based": 15},
        "Target Maturity": {"Index Based": 15},
        "Systematic Trend": {"Quant/Systematic": 15}
    },
    "Medium": {
        "Bond Strategy": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Trading Tools": {"Quant/Systematic": 10},
        "Trading": {"Quant/Systematic": 10},
        "Target Date": {"Active Discretionary": 5, "Index Based": 2, "Multi-Strategy": 8},
        "Options Trading": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Long/Short Equity": {"Active Discretionary": 5, "Quant/Systematic": 5},
        "Commodities": {"Active Discretionary": 4, "Factor/Smart Beta": 1, "Quant/Systematic": 4, "Index Based": 4, "Multi-Strategy": 1},
        "Municipal": {"Active Discretionary": 6, "Factor/Smart Beta": 2, "Index Based": 6, "Multi-Strategy": 1},
        "Derivative Income": {"Active Discretionary": 5, "Quant/Systematic": 4, "Factor/Smart Beta": 2, "Index Based": 4},
        "Event Driven": {"Active Discretionary": 8, "Quant/Systematic": 7},
        "Inflation-Protected Bond": {"Active Discretionary": 6, "Index Based": 9}
    },
    "Low": {
        "Multialternative": {"Quant/Systematic": 7, "Active Discretionary": 8},
        "Specialty": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Strategic": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Nontraditional": {"Multi-Strategy": 2, "Quant/Systematic": 3, "Active Discretionary": 3, "Factor/Smart Beta": 3},
        "Equity Hedged": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Multisector Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Nontraditional Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5}
    }
}

# --- Database Connection ---
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.YC_Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.FundFamilyName
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---
def score_keywords(text, keyword_dict, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for category, keywords in keyword_dict.items():
        hits = sum(keyword in text for keyword in keywords)
        if len(keywords) > 0:
            scores[category] = min(hits * (max_points / len(keywords)), max_points)
    return scores

def score_categories(row, tier_dict, max_points):
    category_cols = ['YC_Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for col in category_cols:
        if pd.notna(row[col]):
            value = row[col].lower()
            for keyword, weights in tier_dict.items():
                if keyword in value:
                    for cat, points in weights.items():
                        scores[cat] += points
    for cat in scores:
        scores[cat] = min(scores[cat], max_points)
    return scores

def score_metadata(row, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if row['index_fund'] == 'True':
        scores["Index Based"] += max_points
    if row['leveraged_fund'] == 'True' or row['inverse_fund'] == 'True':
        scores["Quant/Systematic"] += max_points / 2
    if row['fund_of_funds'] == 'True':
        scores["Active Discretionary"] += max_points / 3
        scores["Factor/Smart Beta"] += max_points / 3
        scores["Index Based"] += max_points / 3
    return scores

def score_fund_family(row, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    try:
        dist_index = float(row.get('Dist_Index', 0))
        dist_active = float(row.get('Dist_Active', 0))
        dist_rules_based = float(row.get('Dist_Rules_Based', 0))
        dist_quant = float(row.get('Dist_Quant', 0))
        dist_multi = float(row.get('Dist_Multi', 0))
    except Exception:
        dist_index = dist_active = dist_rules_based = dist_quant = dist_multi = 0

    scores["Index Based"] += (dist_index / 100.0) * max_points
    scores["Active Discretionary"] += (dist_active / 100.0) * max_points
    scores["Factor/Smart Beta"] += (dist_rules_based / 100.0) * max_points
    scores["Quant/Systematic"] += (dist_quant / 100.0) * max_points
    scores["Multi-Strategy"] += (dist_multi / 100.0) * max_points

    if (dist_index + dist_active + dist_rules_based + dist_quant + dist_multi) == 0:
        scores["Active Discretionary"] += max_points * 0.5

    return scores

def classify_return(row):
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) and row['FS_insight'] != "Error parsing response" else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    keywords_scores = {tier: score_keywords(text, kw_dict, KEYWORDS_WEIGHT)
                       for tier, kw_dict in zip(TIER_WEIGHTS, [high_conf_keywords, med_conf_keywords, low_conf_keywords])}
    categories_scores = {tier: score_categories(row, CATEGORY_LOOKUP[tier], CATEGORIES_WEIGHT)
                         for tier in TIER_WEIGHTS}

    tier_total_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for tier in TIER_WEIGHTS:
        for cat in RETURN_CATEGORIES:
            tier_total_scores[cat] += keywords_scores[tier][cat] + categories_scores[tier][cat]
    
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)
    for cat in RETURN_CATEGORIES:
        tier_total_scores[cat] += fund_family_scores[cat]

    decision_scores = {cat: 0 for cat in RETURN_CATEGORIES}

    def to_bool(val):
        return str(val).strip().lower() in ["true", "1"]

    index_flag = to_bool(row.get("index_fund", False))
    esg_flag = to_bool(row.get("socially_responsible_fund", False))
    leveraged_flag = to_bool(row.get("leveraged_fund", False))
    synthetic_flag = to_bool(row.get("synthetic_replication_fund", False))
    currency_hedged_flag = to_bool(row.get("currency_hedged_fund", False))
    fund_of_funds_flag = to_bool(row.get("fund_of_funds", False))

    if index_flag:
        decision_scores["Index Based"] += 50
        decision_scores["Factor/Smart Beta"] += 50
        if esg_flag or leveraged_flag or synthetic_flag:
            decision_scores["Index Based"] += 100
    if not index_flag:
        decision_scores["Index Based"] -= 50
    if fund_of_funds_flag:
        if index_flag:
            decision_scores["Active Discretionary"] -= 50
            decision_scores["Quant/Systematic"] -= 50
        else:
            decision_scores["Active Discretionary"] += 30
            decision_scores["Quant/Systematic"] += 10
            decision_scores["Multi-Strategy"] += 10
    if currency_hedged_flag:
        decision_scores["Factor/Smart Beta"] -= 50
        decision_scores["Active Discretionary"] += 20
        decision_scores["Index Based"] += 20
    if esg_flag:
        decision_scores["Factor/Smart Beta"] -= 50
    if index_flag and (leveraged_flag or synthetic_flag):
        decision_scores["Active Discretionary"] -= 50
        decision_scores["Multi-Strategy"] -= 50
        decision_scores["Quant/Systematic"] -= 50
        decision_scores["Factor/Smart Beta"] -= 50
        decision_scores["Index Based"] += 50

    category_text = ""
    for col in ['YC_Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']:
        if pd.notna(row[col]):
            category_text += " " + row[col].lower()

    if "defined outcome" in category_text:
        decision_scores["Quant/Systematic"] += 100
    if "target maturity" in category_text:
        decision_scores["Index Based"] += 100
    trading_keywords = ["trading tools", "trading/tactical"]
    trading_categories = ["trading--inverse commodities", "trading--inverse debt", "trading--inverse equity",
                          "trading--leveraged commodities", "trading--leveraged debt", "trading--leveraged equity", "trading--miscellaneous"]
    if any(kw in category_text for kw in trading_keywords) or any(tc in category_text for tc in trading_categories):
        decision_scores["Quant/Systematic"] += 100
    if "target date" in category_text or "target-date" in category_text:
        decision_scores["Factor/Smart Beta"] -= 50
        decision_scores["Quant/Systematic"] -= 50
    if "digital asset" in category_text or "currency" in category_text:
        decision_scores["Index Based"] += 100
    if "commodity" in category_text:
        decision_scores["Factor/Smart Beta"] -= 50
    if "single stock" in category_text:
        decision_scores["Index Based"] += 100
    if "municipal" in category_text:
        decision_scores["Multi-Strategy"] -= 50
    if "systematic trend" in category_text:
        decision_scores["Quant/Systematic"] += 100
    if ("long/short equity" in category_text or "event driven" in category_text) and not index_flag:
        decision_scores["Active Discretionary"] += 50
        decision_scores["Quant/Systematic"] += 50
    if "inflation-protected bond" in category_text:
        decision_scores["Active Discretionary"] += 50
        decision_scores["Index Based"] += 50

    active_management_terms = [
        "actively managed", "actively-managed", "active management", "active bottom‑up approach",
        "actively trades", "active management strategy", "actively managed etf",
        "actively managed fund of funds", "actively managed strategy", "active allocation",
        "actively allocates", "tactically allocates assets", "active trading",
        "trade securities actively", "active trading strategy", "bottom‑up approach",
        "exceptional management", "long-term investments", "growth companies",
        "actively managed mandate", "full discretion", "investment quality score",
        "actively-managed etf", "fund advisor’s discretion",
        "actively invests"
    ]
    for term in active_management_terms:
        if term in text:
            decision_scores["Active Discretionary"] += 100
            break

    final_combined_scores = {cat: tier_total_scores.get(cat, 0) + decision_scores.get(cat, 0)
                             for cat in RETURN_CATEGORIES}
    
    max_score = max(final_combined_scores.values())
    if max_score < 60:
        max_cat = max(final_combined_scores, key=final_combined_scores.get)
        final_combined_scores[max_cat] += 30

    viable_scores = {cat: score for cat, score in final_combined_scores.items() if score > -9999}
    is_defaulted = False
    if not viable_scores or max(viable_scores.values()) < 10:
        predicted_category = "Active Discretionary"
        is_defaulted = True
    else:
        predicted_category = max(viable_scores, key=viable_scores.get)

    result = {'Return_Category': predicted_category, 'IsDefaulted': is_defaulted}
    for cat in RETURN_CATEGORIES:
        result[f"Final_{cat}_Score"] = final_combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
result_cols = ['Return_Category', 'IsDefaulted'] + [f"Final_{cat}_Score" for cat in RETURN_CATEGORIES]
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'IsDefaulted', 'ycharts_url'
] + [col for col in df.columns if 'Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'YC_Broad_Asset_Class_Name', 'Broad_Category_Name',
    'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name'
]
df[output_cols].to_excel(r"C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v6.xlsx", index=False)

# --- Display Classification Distribution ---
distribution = df['Return_Category'].value_counts().to_dict()
print("\nClassification Distribution:")
for category, count in distribution.items():
    print(f"{category}: {count}")

defaulted_count = df['IsDefaulted'].sum()
print(f"\nNumber of funds defaulted to Active Discretionary: {defaulted_count}")

print("Classification complete. Results saved to 'C:\\Users\\JulianHeron\\Software Projects\\Return Drivers\\classified_return_funds_v6.xlsx'.")


Classification Distribution:
Active Discretionary: 3667
Index Based: 1657
Factor/Smart Beta: 224
Quant/Systematic: 32
Multi-Strategy: 6

Number of funds defaulted to Active Discretionary: 14
Classification complete. Results saved to 'C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v6.xlsx'.


In [None]:
# next version, totally new scorring

In [15]:
# Version 7.0_2_<timestamp>: New scoring system with stepwise narrowing, fixed TypeError for None in YC category
# Timestamp: 2025-03-13_14:00:00 (replace with actual runtime timestamp)

import pandas as pd
from sqlalchemy import create_engine
import json
from datetime import datetime

"""
classify_return_drivers.py (V7.0.2)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V7.0.2:
- New scoring: 1 point per signal (booleans, categories, keywords), scaled for balance.
- Uses CWA_Category, YC_Category, and Boolean weights (-3 to +3) for narrowing.
- Retains auto-classify keywords (e.g., "active-management" for +10).
- Logs all data in Excel, including DefaultScores in multiple cells if needed.
- Prints distribution in console.
- Fixed TypeError when Global_Category_Name is None by ensuring string handling.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40
KEYWORDS_WEIGHT = 50
CATEGORIES_WEIGHT = 15
MIN_CONFIDENCE_THRESHOLD = 5  # Minimum score to avoid default
DRY_RUN = False
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Category Weights (Directly from your list, no assumptions) ---
CWA_WEIGHTS = {
    "Taxable Fixed Income": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "US Equity, International": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Municipal": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Emerging": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Global Equity": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Sector/Industry": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Nontraditional": {"Index Based": -1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 2},
    "Bond Strategy": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Global Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Strategic": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Commodity": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Country": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Regional": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Quantitative/Tactical": {"Index Based": -1, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": -3},
    "Alternative": {"Index Based": -1, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading/Tactical": {"Index Based": 0, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -3}
}

YC_WEIGHTS = {
    "Large Blend, Large Value, Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Small Blend, Mid-Cap Blend, Mid-Cap Growth, Small Growth, Global Large-Stock Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Diversified Emerging Mkts": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Foreign Large Blend, Foreign Large Value, Global Large-Stock Blend": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Technology": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Short-Term Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Moderate Allocation, Aggressive Allocation, Conservative Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Derivative Income": {"Index Based": 0, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Intermediate Core-Plus Bond, Intermediate Core Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Small Value, Mid-Cap Value": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 0, "Active Discretionary": 3},
    "Ultrashort Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Multisector Bond, Convertibles": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni National Interm, Corporate Bond, Emerging Markets Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Health, Real Estate, Natural Resources": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Global Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Miscellaneous Region, Pacific/Asia ex-Japan Stk": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 2},
    "Moderately Conservative Allocation, Moderately Aggressive Allocation": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": -3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Muni National Short": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Intermediate Government, Short Government": {"Index Based": 2, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Muni": {"Index Based": 0, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Bank Loan": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 1, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Nontraditional Bond": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Commodities Focused, Commodities Broad Basket": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Tactical Allocation": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 3},
    "China Region, Europe Stock, India Equity, Japan Stock, Latin America Stock, Diversified Pacific/Asia": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -1, "Active Discretionary": 1},
    "Financial, Global Real Estate, Miscellaneous Sector, Equity Energy, Industrials, Consumer Cyclical, Communications, Consumer Defensive": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Muni National Long, Muni Single State Long, Muni California Long": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Hedged": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Global Large-Stock Value, Global Small/Mid Stock": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Preferred Stock": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Miscellaneous Fixed Income, Inflation-Protected Bond, Short-Term Inflation-Protected Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Long Government, Long-Term Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni Single State Interm, Muni New York Long, Muni California Intermediate, Muni Pennsylvania, Muni Minnesota, Muni New Jersey, Muni New York Intermediate, Muni Single State Short, Muni Massachusetts, Muni Ohio": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Precious Metals": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Small/Mid Blend, Foreign Small/Mid Value": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Energy Limited Partnership": {"Index Based": 3, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0, "Active Discretionary": 2},
    "Emerging-Markets Local-Currency Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Digital Assets": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 0},
    "Long/Short Equity": {"Index Based": 1, "Factor/Smart Beta": 0, "Quant/Systematic": 3, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Systematic Trend, Event Driven": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Multistrategy": {"Index Based": -2, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3},
    "Equity Market Neutral": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading--Inverse Commodities, Trading--Leveraged Commodities, Trading--Leveraged Debt, Trading--Leveraged Equity": {"Index Based": 2, "Factor/Smart Beta": -3, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": -3},
    "Relative Value Arbitrage": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "US Fixed Income, Miscellaneous Fixed Income, Inflation-Protected Bond, Short-Term Inflation-Protected Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Global Equity Large Cap, Global Large-Stock Blend, Global Large-Stock Value, Global Small/Mid Stock": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "US Municipal Fixed Income, Muni National Long, Muni Single State Long, Muni National Short, Muni Single State Interm, Muni California Long, Muni New York Long, Muni California Intermediate, Muni Pennsylvania, Muni Minnesota, Muni New Jersey, Muni New York Intermediate, Muni Single State Short, Muni Massachusetts, Muni Ohio": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "US Equity Large Cap Blend, US Equity Small Cap, US Equity Mid Cap, US Equity Large Cap Value, US Equity Large Cap Growth, Large Blend, Large Value, Large Growth, Small Blend, Mid-Cap Blend, Mid-Cap Growth, Small Growth, Foreign Large Growth, Global Large-Stock Growth, Europe Equity Large Cap, Europe Equity Mid/Small Cap, Foreign Small/Mid Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Global Emerging Markets Equity, Diversified Emerging Mkts": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Equity Miscellaneous, Technology Sector Equity, Real Estate Sector Equity, Healthcare Sector Equity, Energy Sector Equity, Natural Resources Sector Equity, Consumer Goods & Services Sector Equity, Financials Sector Equity, Industrials Sector Equity, Utilities Sector Equity, Communications Sector Equity, Other Sector Equity, Health, Real Estate, Natural Resources, Financial, Global Real Estate, Miscellaneous Sector, Equity Energy, Industrials, Consumer Cyclical, Communications, Consumer Defensive": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Global Equity Mid/Small Cap": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Cautious Allocation, Moderate Allocation, Aggressive Allocation, Allocation Miscellaneous, Moderately Conservative Allocation, Conservative Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Commodities Specified, Commodities Broad Basket": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Alternative Miscellaneous": {"Index Based": -1, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Fixed Income Miscellaneous, Multisector Bond, Convertibles": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Flexible Allocation": {"Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Greater China Equity, Japan Equity, Latin America Equity, Asia Equity, Australia & New Zealand Equity, China Region, Europe Stock, India Equity, Japan Stock, Latin America Stock, Diversified Pacific/Asia, Pacific/Asia ex-Japan Stk": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 1},
    "Infrastructure Sector Equity, Infrastructure": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": -1, "Multi-Strategy": 0, "Active Discretionary": 2},
    "Asia ex-Japan Equity, Miscellaneous Region": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Options Trading, Tactical Allocation": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 2},
    "Precious Metals Sector Equity, Global Bond-USD Hedged": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0, "Active Discretionary": 3},
    "Trading Tools": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": -3},
    "Market Neutral": {"Index Based": 0, "Factor/Smart Beta": -2, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 2},
    "Multialternative": {"Index Based": 1, "Factor/Smart Beta": -3, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3},
    "India Equity, Korea Equity, Canadian Equity Large Cap, Mexico Equity, Thailand Equity, UK Equity Large Cap": {"Index Based": 2, "Factor/Smart Beta": 2, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Foreign Large Blend, Foreign Large Value, Foreign Small/Mid Blend, Foreign Small/Mid Value": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "High Yield Bond, Short-Term Bond, Ultrashort Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Derivative Income": {"Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 0, "Active Discretionary": 3},
    "Intermediate Core-Plus Bond, Intermediate Core Bond, Ultrashort Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Small Value, Mid-Cap Value": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Muni National Interm, Corporate Bond, Emerging Markets Bond, Global Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Global Allocation": {"Index Based": 2, "Factor/Smart Beta": -3, "Quant/Systematic": -1, "Multi-Strategy": 3, "Active Discretionary": 3},
    "High Yield Muni": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Bank Loan, Nontraditional Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Commodities Focused, Commodities Broad Basket": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Equity Hedged": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Moderately Aggressive Allocation": {"Index Based": 1, "Factor/Smart Beta": 0, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 2},
    "Short Government": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Preferred Stock": {"Index Based": 2, "Factor/Smart Beta": -1, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Equity Precious Metals": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Energy Limited Partnership": {"Index Based": 3, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0, "Active Discretionary": 2},
    "Emerging-Markets Local-Currency Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Digital Assets": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 0},
    "Long/Short Equity": {"Index Based": 1, "Factor/Smart Beta": 0, "Quant/Systematic": 3, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Systematic Trend, Event Driven": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Multistrategy": {"Index Based": -2, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3},
    "Equity Market Neutral": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading--Inverse Commodities, Trading--Leveraged Commodities, Trading--Leveraged Debt, Trading--Leveraged Equity": {"Index Based": 2, "Factor/Smart Beta": -3, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": -3},
    "Relative Value Arbitrage": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "leveraged_fund, inverse_fund": {"Index Based": 3, "Factor/Smart Beta": -3, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": -3},
    "index_based": {"Index Based": 3, "Factor/Smart Beta": 3, "Quant/Systematic": -3, "Multi-Strategy": -3, "Active Discretionary": -3},
    "currency_hedged_fund": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -3, "Active Discretionary": 1},
    "socially_responsible_fund": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": -2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "synthetic_replication_fund": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -1},
    "fund_of_funds": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 3, "Active Discretionary": 2},
    "^PEATR": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3}
}

# --- Consolidated Keyword Lists (no changes to existing words, only consolidation) ---
keywords = {
    "Index Based": high_conf_keywords["Index Based"] + med_conf_keywords["Index Based"] + low_conf_keywords["Index Based"],
    "Factor/Smart Beta": high_conf_keywords["Factor/Smart Beta"] + med_conf_keywords["Factor/Smart Beta"] + low_conf_keywords["Factor/Smart Beta"],
    "Active Discretionary": high_conf_keywords["Active Discretionary"] + med_conf_keywords["Active Discretionary"] + low_conf_keywords["Active Discretionary"],
    "Quant/Systematic": high_conf_keywords["Quant/Systematic"] + med_conf_keywords["Quant/Systematic"] + low_conf_keywords["Quant/Systematic"],
    "Multi-Strategy": high_conf_keywords["Multi-Strategy"] + med_conf_keywords["Multi-Strategy"] + low_conf_keywords["Multi-Strategy"]
}

# --- Auto-Classify Keywords (retained and expanded with your list) ---
AUTO_CLASSIFY_KEYWORDS = {
    "Active Discretionary": ["actively managed", "actively-managed", "active management", "active bottom‑up approach",
                            "actively trades", "active management strategy", "actively managed etf",
                            "actively managed fund of funds", "actively managed strategy", "active allocation",
                            "actively allocates", "tactically allocates assets", "active trading",
                            "trade securities actively", "active trading strategy", "bottom‑up approach",
                            "exceptional management", "long-term investments", "growth companies",
                            "actively managed mandate", "full discretion", "investment quality score",
                            "actively-managed etf", "fund advisor’s discretion", "actively invests",
                            "fundamental approach", "capital preservation", "research-driven"],
    "Factor/Smart Beta": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
                         "rules based methodology", "smart beta", "alphaDEX", "relative strength"],
    "Quant/Systematic": ["quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
                        "rules-based methodology", "equity index futures", "market neutral",
                        "quantitative research", "contrarian strategy", "dynamically allocating",
                        "economic indicators", "artificial intelligence", "trend following",
                        "data-driven", "backtested", "long-short", "model-based"],
    "Index Based": [],  # No auto-classify keywords defined yet, can add if needed
    "Multi-Strategy": ["multi-strategy", "multi-asset", "hybrid strategy", "cautious allocation",
                      "dynamic allocation", "absolute return", "multi-manager", "blended"]
}

# --- Database Connection ---
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.currency_hedged_fund, fs.ycharts_url,
    yc_ba.YC_Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.FundFamilyName
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

# # score_booleans: Applies weights from Boolean's to narrow categories
# # Args: row (pandas Series with boolean columns)
# # Returns: Dictionary of scores per Return_Category
def score_booleans(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for bool_name, weights in [
        ("index_fund", YC_WEIGHTS.get("index_based", {})),
        ("inverse_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("leveraged_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("socially_responsible_fund", YC_WEIGHTS.get("socially_responsible_fund", {})),
        ("synthetic_replication_fund", YC_WEIGHTS.get("synthetic_replication_fund", {})),
        ("fund_of_funds", YC_WEIGHTS.get("fund_of_funds", {})),
        ("currency_hedged_fund", YC_WEIGHTS.get("currency_hedged_fund", {}))
    ]:
        if bool_name in row and pd.notna(row[bool_name]):
            value = str(row[bool_name]).strip().lower()
            if value in ["true", "1"]:
                for cat in RETURN_CATEGORIES:
                    scores[cat] += weights.get(cat, 0)
    return scores

# # score_cwa_category: Applies weights from CWA_Category to refine scores
# # Args: row (pandas Series with CWA_Broad_Category_Name)
# # Returns: Dictionary of scores per Return_Category
def score_cwa_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    weights = CWA_WEIGHTS.get(cwa_category, CWA_WEIGHTS.get('Unknown', {}))
    for cat in RETURN_CATEGORIES:
        scores[cat] += weights.get(cat, 0)
    return scores

# # score_yc_category: Applies weights from YC_Category to further refine scores
# # Args: row (pandas Series with Global_Category_Name)
# # Returns: Dictionary of scores per Return_Category
def score_yc_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    yc_category = row.get('Global_Category_Name', 'Unknown')
    if yc_category is None:
        yc_category = 'Unknown'  # Handle None values explicitly
        print(f"Warning: Global_Category_Name is None for SymbolCUSIP {row.get('SymbolCUSIP', 'Unknown')}, defaulting to 'Unknown'")
    # Match YC_Category by exact or partial name (e.g., "Large Blend" in "Large Blend, Large Value")
    for yc_key, weights in YC_WEIGHTS.items():
        if yc_category in yc_key or any(part.strip() in yc_category for part in yc_key.split(',')):
            for cat in RETURN_CATEGORIES:
                scores[cat] += weights.get(cat, 0)
    return scores

# # score_keywords: Counts keyword hits, scales them, and applies auto-classify bonuses
# # Args: text (string of combined fund text), max_points (maximum keyword weight), keywords (dictionary of keywords)
# # Returns: Dictionary of scores per Return_Category
def score_keywords(text, max_points, keywords):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if not text:
        return scores
    total_hits = 0
    for category, keyword_list in keywords.items():
        hits = sum(keyword in text for keyword in keyword_list)
        total_hits += hits
        if hits > 0:
            scores[category] += min(hits, 3)  # Cap at 3 hits per category
    if total_hits > 0:
        for category in RETURN_CATEGORIES:
            scores[category] = scores[category] * (max_points / total_hits) if scores[category] > 0 else 0
    # Apply auto-classify keywords
    for category, auto_keywords in AUTO_CLASSIFY_KEYWORDS.items():
        for keyword in auto_keywords:
            if keyword in text:
                scores[category] += 10  # Fixed +10 for auto-classify
                break
    return scores

# # score_fund_family: Applies distribution weights from FundFamilyData
# # Args: row (pandas Series with fund family data), max_points (maximum weight)
# # Returns: Dictionary of scores per Return_Category
def score_fund_family(row, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    try:
        dist_index = float(row.get('Dist_Index', 0))
        dist_active = float(row.get('Dist_Active', 0))
        dist_rules_based = float(row.get('Dist_Rules_Based', 0))
        dist_quant = float(row.get('Dist_Quant', 0))
        dist_multi = float(row.get('Dist_Multi', 0))
    except (ValueError, TypeError):
        dist_index = dist_active = dist_rules_based = dist_quant = dist_multi = 0

    total_dist = dist_index + dist_active + dist_rules_based + dist_quant + dist_multi
    if total_dist > 0:
        scores["Index Based"] += (dist_index / total_dist) * max_points
        scores["Active Discretionary"] += (dist_active / total_dist) * max_points
        scores["Factor/Smart Beta"] += (dist_rules_based / total_dist) * max_points
        scores["Quant/Systematic"] += (dist_quant / total_dist) * max_points
        scores["Multi-Strategy"] += (dist_multi / total_dist) * max_points
    else:
        scores["Index Based"] += max_points * 0.5  # Default if no distribution
    return scores

# # classify_return: Combines all scores and determines the final category
# # Args: row (pandas Series with all fund data)
# # Returns: pandas Series with Return_Category, IsDefaulted, DefaultScores, and Final scores
def classify_return(row):
    # Combine text fields
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) and row['FS_insight'] != "Error parsing response" else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    # Step 1: Apply Boolean weights
    bool_scores = score_booleans(row)

    # Step 2: Apply CWA category weights
    cwa_scores = score_cwa_category(row)

    # Step 3: Apply YC category weights
    yc_scores = score_yc_category(row)

    # Step 4: Apply keyword scores
    keyword_scores = score_keywords(text, KEYWORDS_WEIGHT, keywords)

    # Step 5: Apply fund family scores
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)

    # Combine all scores
    combined_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for cat in RETURN_CATEGORIES:
        combined_scores[cat] = (bool_scores.get(cat, 0) +
                               cwa_scores.get(cat, 0) +
                               yc_scores.get(cat, 0) +
                               keyword_scores.get(cat, 0) +
                               fund_family_scores.get(cat, 0))

    # Apply confidence boost if max score is low
    max_score = max(combined_scores.values())
    if max_score < MIN_CONFIDENCE_THRESHOLD:
        max_cat = max(combined_scores, key=combined_scores.get)
        combined_scores[max_cat] += 3  # Boost by 3 to reach threshold

    # Determine viable scores and final category
    viable_scores = {cat: score for cat, score in combined_scores.items() if score > -3}
    is_defaulted = False
    default_scores = None
    if not viable_scores or max(viable_scores.values()) < MIN_CONFIDENCE_THRESHOLD:
        predicted_category = "Active Discretionary"
        is_defaulted = True
        default_scores = json.dumps(combined_scores)  # Log all scores for defaults
    else:
        predicted_category = max(viable_scores, key=viable_scores.get)

    # Prepare result with all scores logged
    result = {
        'Return_Category': predicted_category,
        'IsDefaulted': is_defaulted,
        'DefaultScores': default_scores
    }
    for cat in RETURN_CATEGORIES:
        result[f'Final_{cat}_Score'] = combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
result_cols = ['Return_Category', 'IsDefaulted', 'DefaultScores'] + [f"Final_{cat}_Score" for cat in RETURN_CATEGORIES]
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'IsDefaulted', 'DefaultScores', 'ycharts_url'
] + [col for col in df.columns if 'Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'currency_hedged_fund', 'YC_Broad_Asset_Class_Name',
    'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
    'Dist_Index', 'Dist_Active', 'Dist_Rules_Based', 'Dist_Quant', 'Dist_Multi'
]

# Create a versioned file name with rounded timestamp (e.g., 2025-03-13_15-00-00)
version_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = f"C:\\Users\\JulianHeron\\Software Projects\\Return Drivers\\classified_return_funds_v7.0_3_{version_timestamp}.xlsx"
df[output_cols].to_excel(output_file, index=False)

# --- Display Classification Distribution ---
distribution = df['Return_Category'].value_counts().to_dict()
print("\nClassification Distribution:")
for category, count in distribution.items():
    print(f"{category}: {count}")

defaulted_count = df['IsDefaulted'].sum()
print(f"\nNumber of funds defaulted to Active Discretionary: {defaulted_count}")

print(f"Classification complete. Results saved to '{output_file}'.")

# --- Version Naming with Timestamp ---
print(f"Version: 7.0_3_{version_timestamp}")


Classification Distribution:
Active Discretionary: 3012
Index Based: 1758
Factor/Smart Beta: 697
Quant/Systematic: 68
Multi-Strategy: 51

Number of funds defaulted to Active Discretionary: 0
Classification complete. Results saved to 'C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v7.0_3_2025-03-13_11-41-13.xlsx'.
Version: 7.0_3_2025-03-13_11-41-13


In [None]:
# updated code with new version of grok based on assesment of the above

In [19]:
# Version 7.0_4_<timestamp>: Restored keywords vs. auto-classify distinction, added debug column, null handling
# Timestamp: 2025-03-13_17:00:00 (replace with actual runtime timestamp)

import pandas as pd
from sqlalchemy import create_engine
import json
from datetime import datetime

"""
classify_return_drivers.py (V7.0.4)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V7.0.4:
- Restored keywords dictionary for general scoring, moved text-derived terms there.
- Kept AUTO_CLASSIFY_KEYWORDS for 100% hit/match cases only.
- Added DebugScores column to Excel output.
- Improved null handling in text fields.
- Reduced KEYWORDS_WEIGHT to 40 to balance scoring.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40
KEYWORDS_WEIGHT = 40  # Reduced from 50 to balance with other signals
CATEGORIES_WEIGHT = 15
MIN_CONFIDENCE_THRESHOLD = 5
DRY_RUN = False
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Category Weights (Unchanged) ---
CWA_WEIGHTS = {
    "Taxable Fixed Income": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "US Equity, International": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Municipal": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Emerging": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Global Equity": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Sector/Industry": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Nontraditional": {"Index Based": -1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 2},
    "Bond Strategy": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Global Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Strategic": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Commodity": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Country": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Regional": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Quantitative/Tactical": {"Index Based": -1, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": -3},
    "Alternative": {"Index Based": -1, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading/Tactical": {"Index Based": 0, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -3}
}

YC_WEIGHTS = {
    "Large Blend, Large Value, Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Small Blend, Mid-Cap Blend, Mid-Cap Growth, Small Growth, Global Large-Stock Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Diversified Emerging Mkts": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Foreign Large Blend, Foreign Large Value, Global Large-Stock Blend": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Technology": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Short-Term Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Moderate Allocation, Aggressive Allocation, Conservative Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Derivative Income": {"Index Based": 0, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Intermediate Core-Plus Bond, Intermediate Core Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Small Value, Mid-Cap Value": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 0, "Active Discretionary": 3},
    "Ultrashort Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Multisector Bond, Convertibles": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni National Interm, Corporate Bond, Emerging Markets Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Health, Real Estate, Natural Resources": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Global Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Miscellaneous Region, Pacific/Asia ex-Japan Stk": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 2},
    "Moderately Conservative Allocation, Moderately Aggressive Allocation": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": -3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Muni National Short": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Intermediate Government, Short Government": {"Index Based": 2, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Muni": {"Index Based": 0, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Bank Loan": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 1, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Nontraditional Bond": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Commodities Focused, Commodities Broad Basket": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Tactical Allocation": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 3},
    "China Region, Europe Stock, India Equity, Japan Stock, Latin America Stock, Diversified Pacific/Asia": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -1, "Active Discretionary": 1},
    "Financial, Global Real Estate, Miscellaneous Sector, Equity Energy, Industrials, Consumer Cyclical, Communications, Consumer Defensive": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Muni National Long, Muni Single State Long, Muni California Long": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Hedged": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Global Large-Stock Value, Global Small/Mid Stock": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Preferred Stock": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Miscellaneous Fixed Income, Inflation-Protected Bond, Short-Term Inflation-Protected Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Long Government, Long-Term Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni Single State Interm, Muni New York Long, Muni California Intermediate, Muni Pennsylvania, Muni Minnesota, Muni New Jersey, Muni New York Intermediate, Muni Single State Short, Muni Massachusetts, Muni Ohio": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Precious Metals": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Small/Mid Blend, Foreign Small/Mid Value": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Energy Limited Partnership": {"Index Based": 3, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0, "Active Discretionary": 2},
    "Emerging-Markets Local-Currency Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Digital Assets": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 0},
    "Long/Short Equity": {"Index Based": 1, "Factor/Smart Beta": 0, "Quant/Systematic": 3, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Systematic Trend, Event Driven": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Multistrategy": {"Index Based": -2, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3},
    "Equity Market Neutral": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading--Inverse Commodities, Trading--Leveraged Commodities, Trading--Leveraged Debt, Trading--Leveraged Equity": {"Index Based": 2, "Factor/Smart Beta": -3, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": -3},
    "Relative Value Arbitrage": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "leveraged_fund, inverse_fund": {"Index Based": 3, "Factor/Smart Beta": -3, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": -3},
    "index_based": {"Index Based": 3, "Factor/Smart Beta": 3, "Quant/Systematic": -3, "Multi-Strategy": -3, "Active Discretionary": -3},
    "currency_hedged_fund": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -3, "Active Discretionary": 1},
    "socially_responsible_fund": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": -2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "synthetic_replication_fund": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -1},
    "fund_of_funds": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 3, "Active Discretionary": 2},
    "^PEATR": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3}
}

# --- Consolidated Keyword Lists (Restored and Populated) ---
keywords = {
    "Index Based": ["track", "reflect", "tracks", "indexing", "replicate", "sampling", "closely correspond"],
    "Factor/Smart Beta": ["momentum", "value", "quality", "minimum volatility", "size", "revenue-weighted",
                          "pure value", "pure growth", "high beta", "technical leaders", "low volatility",
                          "fundamental", "factor", "value factors"],
    "Active Discretionary": ["long-term growth", "current income", "capital appreciation", "invests primarily",
                             "superior business models", "attractively valued", "undervalued", "sustainable growth"],
    "Quant/Systematic": ["synthetic", "covered call", "covered put", "FLEX", "options", "options strategies",
                         "spreads", "butterflies", "trend following", "futures", "quantitative factors",
                         "long options", "yield curve", "dynamically adjusting", "inverse exposure",
                         "quant analysis", "trade signals", "tail risk", "put options", "box spread", "momentum score"],
    "Multi-Strategy": ["mix of"]
}

# --- Auto-Classify Keywords (Restored to Original Intent for 100% Matches) ---
AUTO_CLASSIFY_KEYWORDS = {
    "Active Discretionary": ["actively managed", "actively-managed", "active management", "active bottom‑up approach",
                            "actively trades", "active management strategy", "actively managed etf",
                            "actively managed fund of funds", "actively managed strategy", "active allocation",
                            "actively allocates", "tactically allocates assets", "active trading",
                            "trade securities actively", "active trading strategy", "bottom‑up approach",
                            "exceptional management", "long-term investments", "growth companies",
                            "actively managed mandate", "full discretion", "investment quality score",
                            "actively-managed etf", "fund advisor’s discretion", "actively invests",
                            "fundamental approach", "capital preservation", "research-driven"],
    "Factor/Smart Beta": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
                         "rules based methodology", "smart beta", "alphaDEX", "relative strength"],
    "Quant/Systematic": ["quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
                        "rules-based methodology", "equity index futures", "market neutral",
                        "quantitative research", "contrarian strategy", "dynamically allocating",
                        "economic indicators", "artificial intelligence", "trend following",
                        "data-driven", "backtested", "long-short", "model-based"],
    "Index Based": [],
    "Multi-Strategy": ["multi-strategy", "multi-asset", "hybrid strategy", "cautious allocation",
                      "dynamic allocation", "absolute return", "multi-manager", "blended"]
}

# --- Symbol-Based Overrides for Edge Cases ---
SYMBOL_CLASSIFICATIONS = {
    "WGIFX": "Active Discretionary",
    "PGWFX": "Multi-Strategy",
    "OTCIX": "Active Discretionary"
}

# --- Database Connection ---
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.currency_hedged_fund, fs.ycharts_url,
    yc_ba.YC_Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.FundFamilyName
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

def score_booleans(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for bool_name, weights in [
        ("index_fund", YC_WEIGHTS.get("index_based", {})),
        ("inverse_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("leveraged_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("socially_responsible_fund", YC_WEIGHTS.get("socially_responsible_fund", {})),
        ("synthetic_replication_fund", YC_WEIGHTS.get("synthetic_replication_fund", {})),
        ("fund_of_funds", YC_WEIGHTS.get("fund_of_funds", {})),
        ("currency_hedged_fund", YC_WEIGHTS.get("currency_hedged_fund", {}))
    ]:
        if bool_name in row and pd.notna(row[bool_name]):
            value = str(row[bool_name]).strip().lower()
            if value in ["true", "1"]:
                for cat in RETURN_CATEGORIES:
                    scores[cat] += weights.get(cat, 0)
    return scores

def score_cwa_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    weights = CWA_WEIGHTS.get(cwa_category, CWA_WEIGHTS.get('Unknown', {}))
    for cat in RETURN_CATEGORIES:
        scores[cat] += weights.get(cat, 0)
    return scores

def score_yc_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    yc_category = row.get('Global_Category_Name', 'Unknown')
    if yc_category is None:
        yc_category = 'Unknown'
        print(f"Warning: Global_Category_Name is None for SymbolCUSIP {row.get('SymbolCUSIP', 'Unknown')}, defaulting to 'Unknown'")
    for yc_key, weights in YC_WEIGHTS.items():
        if yc_category in yc_key or any(part.strip() in yc_category for part in yc_key.split(',')):
            for cat in RETURN_CATEGORIES:
                scores[cat] += weights.get(cat, 0)
    return scores

def score_keywords(text, max_points, keywords):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if not text:
        return scores
    total_hits = 0
    for category, keyword_list in keywords.items():
        hits = sum(keyword in text for keyword in keyword_list)
        total_hits += hits
        if hits > 0:
            scores[category] += min(hits, 3)
    if total_hits > 0:
        for category in RETURN_CATEGORIES:
            scores[category] = scores[category] * (max_points / total_hits) if scores[category] > 0 else 0
    for category, auto_keywords in AUTO_CLASSIFY_KEYWORDS.items():
        for keyword in auto_keywords:
            if keyword in text:
                scores[category] += 10
                break
    return scores

def score_fund_family(row, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    try:
        dist_index = float(row.get('Dist_Index', 0))
        dist_active = float(row.get('Dist_Active', 0))
        dist_rules_based = float(row.get('Dist_Rules_Based', 0))
        dist_quant = float(row.get('Dist_Quant', 0))
        dist_multi = float(row.get('Dist_Multi', 0))
    except (ValueError, TypeError):
        dist_index = dist_active = dist_rules_based = dist_quant = dist_multi = 0

    total_dist = dist_index + dist_active + dist_rules_based + dist_quant + dist_multi
    if total_dist > 0:
        scores["Index Based"] += (dist_index / total_dist) * max_points
        scores["Active Discretionary"] += (dist_active / total_dist) * max_points
        scores["Factor/Smart Beta"] += (dist_rules_based / total_dist) * max_points
        scores["Quant/Systematic"] += (dist_quant / total_dist) * max_points
        scores["Multi-Strategy"] += (dist_multi / total_dist) * max_points
    else:
        scores["Index Based"] += max_points * 0.5
    return scores

def classify_return(row):
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) and row['FS_insight'] != "Error parsing response" and row['FS_insight'] else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    bool_scores = score_booleans(row)
    cwa_scores = score_cwa_category(row)
    yc_scores = score_yc_category(row)
    keyword_scores = score_keywords(text, KEYWORDS_WEIGHT, keywords)
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)

    combined_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for cat in RETURN_CATEGORIES:
        combined_scores[cat] = (bool_scores.get(cat, 0) +
                               cwa_scores.get(cat, 0) +
                               yc_scores.get(cat, 0) +
                               keyword_scores.get(cat, 0) +
                               fund_family_scores.get(cat, 0))

    debug_scores = {
        "Boolean": bool_scores,
        "CWA": cwa_scores,
        "YC": yc_scores,
        "Keywords": keyword_scores,
        "FundFamily": fund_family_scores,
        "Combined": combined_scores
    }

    if any(term in text for term in ["actively managed", "invests primarily"]):
        if not any(term in text for term in ["track", "reflect"]):
            combined_scores["Active Discretionary"] += 10

    max_score = max(combined_scores.values())
    if max_score < MIN_CONFIDENCE_THRESHOLD:
        max_cat = max(combined_scores, key=combined_scores.get)
        combined_scores[max_cat] += 3

    viable_scores = {cat: score for cat, score in combined_scores.items() if score > -3}
    is_defaulted = False
    default_scores = None
    if not viable_scores or max(viable_scores.values()) < MIN_CONFIDENCE_THRESHOLD:
        predicted_category = "Active Discretionary"
        is_defaulted = True
        default_scores = json.dumps(combined_scores)
    else:
        predicted_category = max(viable_scores, key=viable_scores.get)

    symbol = row['SymbolCUSIP']
    if symbol in SYMBOL_CLASSIFICATIONS:
        predicted_category = SYMBOL_CLASSIFICATIONS[symbol]
        is_defaulted = True
        default_scores = json.dumps(combined_scores)

    result = {
        'Return_Category': predicted_category,
        'IsDefaulted': is_defaulted,
        'DefaultScores': default_scores,
        'DebugScores': json.dumps(debug_scores)
    }
    for cat in RETURN_CATEGORIES:
        result[f'Final_{cat}_Score'] = combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
result_cols = ['Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores'] + [f"Final_{cat}_Score" for cat in RETURN_CATEGORIES]
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores', 'ycharts_url'
] + [col for col in df.columns if 'Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'currency_hedged_fund', 'YC_Broad_Asset_Class_Name',
    'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
    'Dist_Index', 'Dist_Active', 'Dist_Rules_Based', 'Dist_Quant', 'Dist_Multi'
]

version_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = f"C:\\Users\\JulianHeron\\Software Projects\\Return Drivers\\classified_return_funds_v7.0_4_{version_timestamp}.xlsx"
df[output_cols].to_excel(output_file, index=False)

# --- Display Classification Distribution ---
distribution = df['Return_Category'].value_counts().to_dict()
print("\nClassification Distribution:")
for category, count in distribution.items():
    print(f"{category}: {count}")

defaulted_count = df['IsDefaulted'].sum()
print(f"\nNumber of funds defaulted to Active Discretionary or overridden: {defaulted_count}")

print(f"Classification complete. Results saved to '{output_file}'.")

# --- Version Naming with Timestamp ---
print(f"Version: 7.0_4_{version_timestamp}")


Classification Distribution:
Active Discretionary: 3232
Index Based: 1343
Factor/Smart Beta: 816
Quant/Systematic: 191
Multi-Strategy: 4

Number of funds defaulted to Active Discretionary or overridden: 3
Classification complete. Results saved to 'C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v7.0_4_2025-03-13_16-51-31.xlsx'.
Version: 7.0_4_2025-03-13_16-51-31


In [None]:
# latest version updated logic and scoring

In [20]:
# Version 7.0_5_<timestamp>: Restored keywords, adjusted FundFamily via SQL, refined logic
# Timestamp: 2025-03-13_18:00:00 (replace with actual runtime timestamp)

import pandas as pd
from sqlalchemy import create_engine
import json
from datetime import datetime

"""
classify_return_drivers.py (V7.0.5)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V7.0.5:
- Moved keywords to general scoring, kept AUTO_CLASSIFY_KEYWORDS minimal.
- Removed 'track' from keywords, added tiebreaker logic.
- Adjusted FundFamily weights via SQL (YieldMax, Simplify, Pacer).
- Added Quant boost for Nontraditional/Alternative CWA.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40
KEYWORDS_WEIGHT = 40  # Held at 40 per user request
CATEGORIES_WEIGHT = 15
MIN_CONFIDENCE_THRESHOLD = 5
DRY_RUN = False
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Category Weights (Unchanged) ---
CWA_WEIGHTS = {
    "Taxable Fixed Income": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "US Equity, International": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Municipal": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Emerging": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Global Equity": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Sector/Industry": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Nontraditional": {"Index Based": -1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 2},
    "Bond Strategy": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Global Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Strategic": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Commodity": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Country": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Regional": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Quantitative/Tactical": {"Index Based": -1, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": -3},
    "Alternative": {"Index Based": -1, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading/Tactical": {"Index Based": 0, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -3}
}

YC_WEIGHTS = {
    "Large Blend, Large Value, Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Small Blend, Mid-Cap Blend, Mid-Cap Growth, Small Growth, Global Large-Stock Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Diversified Emerging Mkts": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Foreign Large Blend, Foreign Large Value, Global Large-Stock Blend": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Technology": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Short-Term Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Moderate Allocation, Aggressive Allocation, Conservative Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Derivative Income": {"Index Based": 0, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Intermediate Core-Plus Bond, Intermediate Core Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Small Value, Mid-Cap Value": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 0, "Active Discretionary": 3},
    "Ultrashort Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Multisector Bond, Convertibles": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni National Interm, Corporate Bond, Emerging Markets Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Health, Real Estate, Natural Resources": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Global Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Miscellaneous Region, Pacific/Asia ex-Japan Stk": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 2},
    "Moderately Conservative Allocation, Moderately Aggressive Allocation": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": -3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Muni National Short": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Intermediate Government, Short Government": {"Index Based": 2, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Muni": {"Index Based": 0, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Bank Loan": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 1, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Nontraditional Bond": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Commodities Focused, Commodities Broad Basket": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Tactical Allocation": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 3},
    "China Region, Europe Stock, India Equity, Japan Stock, Latin America Stock, Diversified Pacific/Asia": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -1, "Active Discretionary": 1},
    "Financial, Global Real Estate, Miscellaneous Sector, Equity Energy, Industrials, Consumer Cyclical, Communications, Consumer Defensive": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Muni National Long, Muni Single State Long, Muni California Long": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Hedged": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Global Large-Stock Value, Global Small/Mid Stock": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Preferred Stock": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Miscellaneous Fixed Income, Inflation-Protected Bond, Short-Term Inflation-Protected Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Long Government, Long-Term Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni Single State Interm, Muni New York Long, Muni California Intermediate, Muni Pennsylvania, Muni Minnesota, Muni New Jersey, Muni New York Intermediate, Muni Single State Short, Muni Massachusetts, Muni Ohio": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Precious Metals": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Small/Mid Blend, Foreign Small/Mid Value": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Energy Limited Partnership": {"Index Based": 3, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0, "Active Discretionary": 2},
    "Emerging-Markets Local-Currency Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Digital Assets": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 0},
    "Long/Short Equity": {"Index Based": 1, "Factor/Smart Beta": 0, "Quant/Systematic": 3, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Systematic Trend, Event Driven": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Multistrategy": {"Index Based": -2, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3},
    "Equity Market Neutral": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading--Inverse Commodities, Trading--Leveraged Commodities, Trading--Leveraged Debt, Trading--Leveraged Equity": {"Index Based": 2, "Factor/Smart Beta": -3, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": -3},
    "Relative Value Arbitrage": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "leveraged_fund, inverse_fund": {"Index Based": 3, "Factor/Smart Beta": -3, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": -3},
    "index_based": {"Index Based": 3, "Factor/Smart Beta": 3, "Quant/Systematic": -3, "Multi-Strategy": -3, "Active Discretionary": -3},
    "currency_hedged_fund": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -3, "Active Discretionary": 1},
    "socially_responsible_fund": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": -2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "synthetic_replication_fund": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -1},
    "fund_of_funds": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 3, "Active Discretionary": 2},
    "^PEATR": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3}
}

# --- Consolidated Keyword Lists ---
keywords = {
    "Index Based": ["reflect", "correspond", "replicate", "sampling"],
    "Factor/Smart Beta": ["momentum", "value", "quality", "minimum volatility", "size", "revenue-weighted",
                          "pure value", "pure growth", "high beta", "technical leaders", "low volatility",
                          "value factors", "cash cows"],
    "Active Discretionary": ["long-term growth", "current income", "capital appreciation", "invests primarily",
                             "superior business models", "attractively valued", "undervalued", "sustainable growth",
                             "research", "tax managed", "capital growth", "tactical", "bottom-up",
                             "prudent portfolio management"],
    "Quant/Systematic": ["synthetic", "covered call", "covered put", "FLEX", "options", "options strategies",
                         "spreads", "butterflies", "trend following", "futures", "quantitative factors",
                         "long options", "yield curve", "dynamically adjusting", "inverse exposure",
                         "quant analysis", "trade signals", "tail risk", "put options", "box spread",
                         "momentum score", "option income", "convexity"],
    "Multi-Strategy": ["mix of"]
}

# --- Auto-Classify Keywords ---
AUTO_CLASSIFY_KEYWORDS = {
    "Active Discretionary": ["actively managed", "actively-managed", "active management", "active bottom‑up approach",
                            "actively trades", "active management strategy", "actively managed etf",
                            "actively managed fund of funds", "actively managed strategy", "active allocation",
                            "actively allocates", "tactically allocates assets", "active trading",
                            "trade securities actively", "active trading strategy", "bottom‑up approach",
                            "exceptional management", "long-term investments", "growth companies",
                            "actively managed mandate", "full discretion", "investment quality score",
                            "actively-managed etf", "fund advisor’s discretion", "actively invests",
                            "fundamental approach", "capital preservation", "research-driven"],
    "Factor/Smart Beta": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
                         "rules based methodology", "smart beta", "alphaDEX", "relative strength",
                         "trendpilot", "RAFI"],
    "Quant/Systematic": ["quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
                        "rules-based methodology", "equity index futures", "market neutral",
                        "quantitative research", "contrarian strategy", "dynamically allocating",
                        "economic indicators", "artificial intelligence", "trend following",
                        "data-driven", "backtested", "long-short", "model-based",
                        "synthetic covered call"],
    "Index Based": [],
    "Multi-Strategy": ["multi-strategy", "multi-asset", "hybrid strategy", "cautious allocation",
                      "dynamic allocation", "absolute return", "multi-manager", "blended"]
}

# --- Symbol-Based Overrides ---
SYMBOL_CLASSIFICATIONS = {
    "WGIFX": "Active Discretionary",
    "PGWFX": "Multi-Strategy",
    "OTCIX": "Active Discretionary"
}

# --- Database Connection ---
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.currency_hedged_fund, fs.ycharts_url,
    yc_ba.YC_Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.FundFamilyName
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

def score_booleans(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for bool_name, weights in [
        ("index_fund", YC_WEIGHTS.get("index_based", {})),
        ("inverse_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("leveraged_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("socially_responsible_fund", YC_WEIGHTS.get("socially_responsible_fund", {})),
        ("synthetic_replication_fund", YC_WEIGHTS.get("synthetic_replication_fund", {})),
        ("fund_of_funds", YC_WEIGHTS.get("fund_of_funds", {})),
        ("currency_hedged_fund", YC_WEIGHTS.get("currency_hedged_fund", {}))
    ]:
        if bool_name in row and pd.notna(row[bool_name]):
            value = str(row[bool_name]).strip().lower()
            if value in ["true", "1"]:
                # Check text for index intent to avoid misflags
                text = (row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else "") + " " + \
                       (row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else "")
                if bool_name == "index_fund" and any(term in text for term in ["track", "reflect", "replicate", "correspond"]):
                    for cat in RETURN_CATEGORIES:
                        scores[cat] += weights.get(cat, 0)
                elif bool_name != "index_fund":
                    for cat in RETURN_CATEGORIES:
                        scores[cat] += weights.get(cat, 0)
    return scores

def score_cwa_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    weights = CWA_WEIGHTS.get(cwa_category, CWA_WEIGHTS.get('Unknown', {}))
    for cat in RETURN_CATEGORIES:
        scores[cat] += weights.get(cat, 0)
    return scores

def score_yc_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    yc_category = row.get('Global_Category_Name', 'Unknown')
    if yc_category is None:
        yc_category = 'Unknown'
        print(f"Warning: Global_Category_Name is None for SymbolCUSIP {row.get('SymbolCUSIP', 'Unknown')}, defaulting to 'Unknown'")
    for yc_key, weights in YC_WEIGHTS.items():
        if yc_category in yc_key or any(part.strip() in yc_category for part in yc_key.split(',')):
            for cat in RETURN_CATEGORIES:
                scores[cat] += weights.get(cat, 0)
    return scores

def score_keywords(text, max_points, keywords):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if not text:
        return scores
    total_hits = 0
    for category, keyword_list in keywords.items():
        hits = sum(keyword in text for keyword in keyword_list)
        total_hits += hits
        if hits > 0:
            scores[category] += min(hits, 3)
    if total_hits > 0:
        for category in RETURN_CATEGORIES:
            scores[category] = scores[category] * (max_points / total_hits) if scores[category] > 0 else 0
    for category, auto_keywords in AUTO_CLASSIFY_KEYWORDS.items():
        for keyword in auto_keywords:
            if keyword in text:
                scores[category] += 10
                break
    return scores

def score_fund_family(row, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    try:
        dist_index = float(row.get('Dist_Index', 0))
        dist_active = float(row.get('Dist_Active', 0))
        dist_rules_based = float(row.get('Dist_Rules_Based', 0))
        dist_quant = float(row.get('Dist_Quant', 0))
        dist_multi = float(row.get('Dist_Multi', 0))
    except (ValueError, TypeError):
        dist_index = dist_active = dist_rules_based = dist_quant = dist_multi = 0

    total_dist = dist_index + dist_active + dist_rules_based + dist_quant + dist_multi
    if total_dist > 0:
        scores["Index Based"] += (dist_index / total_dist) * max_points
        scores["Active Discretionary"] += (dist_active / total_dist) * max_points
        scores["Factor/Smart Beta"] += (dist_rules_based / total_dist) * max_points
        scores["Quant/Systematic"] += (dist_quant / total_dist) * max_points
        scores["Multi-Strategy"] += (dist_multi / total_dist) * max_points
    else:
        scores["Index Based"] += max_points * 0.5
    return scores

def classify_return(row):
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) and row['FS_insight'] != "Error parsing response" and row['FS_insight'] else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    bool_scores = score_booleans(row)
    cwa_scores = score_cwa_category(row)
    yc_scores = score_yc_category(row)
    keyword_scores = score_keywords(text, KEYWORDS_WEIGHT, keywords)
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)

    combined_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for cat in RETURN_CATEGORIES:
        combined_scores[cat] = (bool_scores.get(cat, 0) +
                               cwa_scores.get(cat, 0) +
                               yc_scores.get(cat, 0) +
                               keyword_scores.get(cat, 0) +
                               fund_family_scores.get(cat, 0))

    # Quant boost for Nontraditional/Alternative CWA with options/futures
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    if cwa_category in ["Nontraditional", "Alternative"] and any(term in text for term in ["options", "futures"]):
        combined_scores["Quant/Systematic"] += 5

    if any(term in text for term in ["actively managed", "invests primarily"]):
        if not any(term in text for term in ["track", "reflect"]):
            combined_scores["Active Discretionary"] += 10

    max_score = max(combined_scores.values())
    if max_score < MIN_CONFIDENCE_THRESHOLD:
        max_cat = max(combined_scores, key=combined_scores.get)
        combined_scores[max_cat] += 3

    viable_scores = {cat: score for cat, score in combined_scores.items() if score > -3}
    is_defaulted = False
    default_scores = None
    if not viable_scores or max(viable_scores.values()) < MIN_CONFIDENCE_THRESHOLD:
        predicted_category = "Active Discretionary"
        is_defaulted = True
        default_scores = json.dumps(combined_scores)
    else:
        # Tiebreaker logic
        max_categories = [cat for cat, score in viable_scores.items() if score == max(viable_scores.values())]
        if len(max_categories) > 1:
            if "Active Discretionary" in max_categories:
                if "Quant/Systematic" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Factor/Smart Beta" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Index Based" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Multi-Strategy" in max_categories:
                    predicted_category = "Multi-Strategy"
                else:
                    predicted_category = max_categories[0]
            elif "Quant/Systematic" in max_categories:
                if "Factor/Smart Beta" in max_categories:
                    predicted_category = "Quant/Systematic"
                elif "Index Based" in max_categories:
                    predicted_category = "Quant/Systematic"
                elif "Multi-Strategy" in max_categories:
                    predicted_category = "Quant/Systematic"
                else:
                    predicted_category = max_categories[0]
            elif "Factor/Smart Beta" in max_categories and "Index Based" in max_categories:
                predicted_category = "Index Based"
            else:
                predicted_category = max_categories[0]
        else:
            predicted_category = max(viable_scores, key=viable_scores.get)

    symbol = row['SymbolCUSIP']
    if symbol in SYMBOL_CLASSIFICATIONS:
        predicted_category = SYMBOL_CLASSIFICATIONS[symbol]
        is_defaulted = True
        default_scores = json.dumps(combined_scores)

    result = {
        'Return_Category': predicted_category,
        'IsDefaulted': is_defaulted,
        'DefaultScores': default_scores,
        'DebugScores': json.dumps({
            "Boolean": bool_scores,
            "CWA": cwa_scores,
            "YC": yc_scores,
            "Keywords": keyword_scores,
            "FundFamily": fund_family_scores,
            "Combined": combined_scores
        })
    }
    for cat in RETURN_CATEGORIES:
        result[f'Final_{cat}_Score'] = combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
result_cols = ['Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores'] + [f"Final_{cat}_Score" for cat in RETURN_CATEGORIES]
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores', 'ycharts_url'
] + [col for col in df.columns if 'Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'currency_hedged_fund', 'YC_Broad_Asset_Class_Name',
    'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
    'Dist_Index', 'Dist_Active', 'Dist_Rules_Based', 'Dist_Quant', 'Dist_Multi'
]

version_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = f"C:\\Users\\JulianHeron\\Software Projects\\Return Drivers\\classified_return_funds_v7.0_5_{version_timestamp}.xlsx"
df[output_cols].to_excel(output_file, index=False)

# --- Display Classification Distribution ---
distribution = df['Return_Category'].value_counts().to_dict()
print("\nClassification Distribution:")
for category, count in distribution.items():
    print(f"{category}: {count}")

defaulted_count = df['IsDefaulted'].sum()
print(f"\nNumber of funds defaulted to Active Discretionary or overridden: {defaulted_count}")

print(f"Classification complete. Results saved to '{output_file}'.")

# --- Version Naming with Timestamp ---
print(f"Version: 7.0_5_{version_timestamp}")


Classification Distribution:
Active Discretionary: 3604
Index Based: 1055
Factor/Smart Beta: 654
Quant/Systematic: 265
Multi-Strategy: 8

Number of funds defaulted to Active Discretionary or overridden: 3
Classification complete. Results saved to 'C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v7.0_5_2025-03-13_17-39-12.xlsx'.
Version: 7.0_5_2025-03-13_17-39-12


In [None]:
# tightened up version, hopefully final

In [23]:
# Version 7.0_6_<timestamp>: Refined keywords, adjusted weights, logic tweaks
# Timestamp: 2025-03-13_19:00:00 (replace with actual runtime timestamp)

import pandas as pd
from sqlalchemy import create_engine
import json
from datetime import datetime

"""
classify_return_drivers.py (V7.0.6)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V7.0.6:
- Refined Factor/Smart Beta keywords, removed broad terms.
- Added specific Index Based and Active Discretionary keywords.
- Increased AUTO_CLASSIFY weights: 'actively managed' to 20, 'rules-based' to 15.
- Boosted index_fund=True Boolean from 3 to 4.
- Added Quant tiebreaker check.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40
KEYWORDS_WEIGHT = 40  # Held at 40 to test keyword tweaks
CATEGORIES_WEIGHT = 15
MIN_CONFIDENCE_THRESHOLD = 5
DRY_RUN = False
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Category Weights (Unchanged) ---
CWA_WEIGHTS = {
    "Taxable Fixed Income": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "US Equity, International": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Municipal": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Emerging": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Global Equity": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Sector/Industry": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Nontraditional": {"Index Based": -1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 2},
    "Bond Strategy": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Global Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Strategic": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Commodity": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Country": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Regional": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Quantitative/Tactical": {"Index Based": -1, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": -3},
    "Alternative": {"Index Based": -1, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading/Tactical": {"Index Based": 0, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -3}
}

YC_WEIGHTS = {
    "Large Blend, Large Value, Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Small Blend, Mid-Cap Blend, Mid-Cap Growth, Small Growth, Global Large-Stock Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Diversified Emerging Mkts": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Foreign Large Blend, Foreign Large Value, Global Large-Stock Blend": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Technology": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Short-Term Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Moderate Allocation, Aggressive Allocation, Conservative Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Derivative Income": {"Index Based": 0, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Intermediate Core-Plus Bond, Intermediate Core Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Small Value, Mid-Cap Value": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 0, "Active Discretionary": 3},
    "Ultrashort Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Multisector Bond, Convertibles": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni National Interm, Corporate Bond, Emerging Markets Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Health, Real Estate, Natural Resources": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Global Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Miscellaneous Region, Pacific/Asia ex-Japan Stk": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 2},
    "Moderately Conservative Allocation, Moderately Aggressive Allocation": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": -3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Muni National Short": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Intermediate Government, Short Government": {"Index Based": 2, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Muni": {"Index Based": 0, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Bank Loan": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 1, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Nontraditional Bond": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Commodities Focused, Commodities Broad Basket": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Tactical Allocation": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 3},
    "China Region, Europe Stock, India Equity, Japan Stock, Latin America Stock, Diversified Pacific/Asia": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -1, "Active Discretionary": 1},
    "Financial, Global Real Estate, Miscellaneous Sector, Equity Energy, Industrials, Consumer Cyclical, Communications, Consumer Defensive": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Muni National Long, Muni Single State Long, Muni California Long": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Hedged": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Global Large-Stock Value, Global Small/Mid Stock": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Preferred Stock": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Miscellaneous Fixed Income, Inflation-Protected Bond, Short-Term Inflation-Protected Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Long Government, Long-Term Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni Single State Interm, Muni New York Long, Muni California Intermediate, Muni Pennsylvania, Muni Minnesota, Muni New Jersey, Muni New York Intermediate, Muni Single State Short, Muni Massachusetts, Muni Ohio": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Precious Metals": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Small/Mid Blend, Foreign Small/Mid Value": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Energy Limited Partnership": {"Index Based": 3, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0, "Active Discretionary": 2},
    "Emerging-Markets Local-Currency Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Digital Assets": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 0},
    "Long/Short Equity": {"Index Based": 1, "Factor/Smart Beta": 0, "Quant/Systematic": 3, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Systematic Trend, Event Driven": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Multistrategy": {"Index Based": -2, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3},
    "Equity Market Neutral": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading--Inverse Commodities, Trading--Leveraged Commodities, Trading--Leveraged Debt, Trading--Leveraged Equity": {"Index Based": 2, "Factor/Smart Beta": -3, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": -3},
    "Relative Value Arbitrage": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "leveraged_fund, inverse_fund": {"Index Based": 3, "Factor/Smart Beta": -3, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": -3},
    "index_based": {"Index Based": 3, "Factor/Smart Beta": 3, "Quant/Systematic": -3, "Multi-Strategy": -3, "Active Discretionary": -3},
    "currency_hedged_fund": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -3, "Active Discretionary": 1},
    "socially_responsible_fund": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": -2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "synthetic_replication_fund": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -1},
    "fund_of_funds": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 3, "Active Discretionary": 2},
    "^PEATR": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3}
}

# --- Consolidated Keyword Lists ---
keywords = {
    "Index Based": ["reflect", "correspond", "replicate", "sampling", "passive", "component securities", "index methodology"],
    "Factor/Smart Beta": ["minimum volatility", "revenue-weighted", "pure value", "pure growth", "high beta",
                          "technical leaders", "value factors", "cash cows", "value factor", "quality factor",
                          "factor", "factors", "Multifactor", "Multi-factor", "low beta", "alphadex", "size", "factor tilt"],
    "Active Discretionary": ["invests primarily", "superior business models", "attractively valued", "undervalued",
                             "sustainable growth", "research", "tax managed", "capital growth", "tactical", "bottom-up",
                             "prudent portfolio management", "active allocation", "fundamental analysis", "active",
                             "investment quality score", "bottom-up approach", "relative value", "fundamental approach"],
    "Quant/Systematic": ["synthetic", "covered call", "covered put", "FLEX", "options", "options strategies",
                         "spreads", "butterflies", "trend following", "futures", "quantitative factors",
                         "long options", "yield curve", "dynamically adjusting", "inverse exposure",
                         "quant analysis", "trade signals", "tail risk", "put options", "box spread",
                         "momentum score", "option income", "convexity", "optimizer", "quantitative model",
                         "systematic tilt", "systematic"],
    "Multi-Strategy": ["mix of"]
}

# --- Auto-Classify Keywords ---
AUTO_CLASSIFY_KEYWORDS = {
    "Active Discretionary": ["actively managed", "actively-managed", "active management", "active bottom‑up approach",
                            "actively trades", "active management strategy", "actively managed etf",
                            "actively managed fund of funds", "actively managed strategy", "active allocation",
                            "actively allocates", "tactically allocates assets", "active trading",
                            "trade securities actively", "active trading strategy", "bottom‑up approach",
                            "exceptional management", "long-term investments", "growth companies",
                            "actively managed mandate", "full discretion",
                            "actively-managed etf", "fund advisor’s discretion", "actively invests",
                            "fundamental approach", "capital preservation", "research-driven"],
    "Factor/Smart Beta": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
                         "rules based methodology", "smart beta", "relative strength", "trendpilot", "RAFI"],
    "Quant/Systematic": ["quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
                        "rules-based methodology", "equity index futures", "market neutral",
                        "quantitative research", "contrarian strategy", "dynamically allocating",
                        "economic indicators", "artificial intelligence", "trend following",
                        "data-driven", "backtested", "long-short", "model-based",
                        "synthetic covered call"],
    "Index Based": [],
    "Multi-Strategy": ["multi-strategy", "multi-asset", "hybrid strategy", "cautious allocation",
                      "dynamic allocation", "absolute return", "multi-manager", "blended"]
}

# --- Symbol-Based Overrides ---
SYMBOL_CLASSIFICATIONS = {
    "WGIFX": "Active Discretionary",
    "PGWFX": "Multi-Strategy",
    "OTCIX": "Active Discretionary"
}

# --- Database Connection ---
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.currency_hedged_fund, fs.ycharts_url,
    yc_ba.YC_Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.FundFamilyName
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

def score_booleans(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for bool_name, weights in [
        ("index_fund", YC_WEIGHTS.get("index_based", {})),
        ("inverse_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("leveraged_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("socially_responsible_fund", YC_WEIGHTS.get("socially_responsible_fund", {})),
        ("synthetic_replication_fund", YC_WEIGHTS.get("synthetic_replication_fund", {})),
        ("fund_of_funds", YC_WEIGHTS.get("fund_of_funds", {})),
        ("currency_hedged_fund", YC_WEIGHTS.get("currency_hedged_fund", {}))
    ]:
        if bool_name in row and pd.notna(row[bool_name]):
            value = str(row[bool_name]).strip().lower()
            if value in ["true", "1"]:
                # Check text for index intent to avoid misflags
                text = (row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else "") + " " + \
                       (row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else "")
                if bool_name == "index_fund" and any(term in text for term in ["reflect", "correspond", "replicate", "component securities"]):
                    for cat in RETURN_CATEGORIES:
                        scores[cat] += weights.get(cat, 0) * 4 / 3  # Increase from +3 to +4
                elif bool_name != "index_fund":
                    for cat in RETURN_CATEGORIES:
                        scores[cat] += weights.get(cat, 0)
    return scores

def score_cwa_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    weights = CWA_WEIGHTS.get(cwa_category, CWA_WEIGHTS.get('Unknown', {}))
    for cat in RETURN_CATEGORIES:
        scores[cat] += weights.get(cat, 0)
    return scores

def score_yc_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    yc_category = row.get('Global_Category_Name', 'Unknown')
    if yc_category is None:
        yc_category = 'Unknown'
        print(f"Warning: Global_Category_Name is None for SymbolCUSIP {row.get('SymbolCUSIP', 'Unknown')}, defaulting to 'Unknown'")
    for yc_key, weights in YC_WEIGHTS.items():
        if yc_category in yc_key or any(part.strip() in yc_category for part in yc_key.split(',')):
            for cat in RETURN_CATEGORIES:
                scores[cat] += weights.get(cat, 0)
    return scores

def score_keywords(text, max_points, keywords):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if not text:
        return scores
    total_hits = 0
    for category, keyword_list in keywords.items():
        hits = sum(keyword in text for keyword in keyword_list)
        total_hits += hits
        if hits > 0:
            scores[category] += min(hits, 3)
    if total_hits > 0:
        for category in RETURN_CATEGORIES:
            scores[category] = scores[category] * (max_points / total_hits) if scores[category] > 0 else 0
    for category, auto_keywords in AUTO_CLASSIFY_KEYWORDS.items():
        for keyword in auto_keywords:
            if keyword in text:
                if category == "Active Discretionary" and keyword == "actively managed":
                    scores[category] += 15  # Increased from 10 to 20, changed back to 15
                elif category == "Factor/Smart Beta" and keyword == "rules-based":
                    scores[category] += 12  # Increased from 10 to 15, changed back to 12
                else:
                    scores[category] += 10
                break
    return scores

def score_fund_family(row, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    try:
        dist_index = float(row.get('Dist_Index', 0))
        dist_active = float(row.get('Dist_Active', 0))
        dist_rules_based = float(row.get('Dist_Rules_Based', 0))
        dist_quant = float(row.get('Dist_Quant', 0))
        dist_multi = float(row.get('Dist_Multi', 0))
    except (ValueError, TypeError):
        dist_index = dist_active = dist_rules_based = dist_quant = dist_multi = 0

    total_dist = dist_index + dist_active + dist_rules_based + dist_quant + dist_multi
    if total_dist > 0:
        scores["Index Based"] += (dist_index / total_dist) * max_points
        scores["Active Discretionary"] += (dist_active / total_dist) * max_points
        scores["Factor/Smart Beta"] += (dist_rules_based / total_dist) * max_points
        scores["Quant/Systematic"] += (dist_quant / total_dist) * max_points
        scores["Multi-Strategy"] += (dist_multi / total_dist) * max_points
    else:
        scores["Index Based"] += max_points * 0.5
    return scores

def classify_return(row):
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) and row['FS_insight'] != "Error parsing response" and row['FS_insight'] else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    bool_scores = score_booleans(row)
    cwa_scores = score_cwa_category(row)
    yc_scores = score_yc_category(row)
    keyword_scores = score_keywords(text, KEYWORDS_WEIGHT, keywords)
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)

    combined_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for cat in RETURN_CATEGORIES:
        combined_scores[cat] = (bool_scores.get(cat, 0) +
                               cwa_scores.get(cat, 0) +
                               yc_scores.get(cat, 0) +
                               keyword_scores.get(cat, 0) +
                               fund_family_scores.get(cat, 0))

    # Quant boost for Nontraditional/Alternative CWA with options/futures
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    if cwa_category in ["Nontraditional", "Alternative"] and any(term in text for term in ["options", "futures"]):
        combined_scores["Quant/Systematic"] += 5

    if any(term in text for term in ["actively managed", "invests primarily"]):
        if not any(term in text for term in ["reflect", "correspond", "replicate"]):
            combined_scores["Active Discretionary"] += 10

    max_score = max(combined_scores.values())
    if max_score < MIN_CONFIDENCE_THRESHOLD:
        max_cat = max(combined_scores, key=combined_scores.get)
        combined_scores[max_cat] += 3

    viable_scores = {cat: score for cat, score in combined_scores.items() if score > -3}
    is_defaulted = False
    default_scores = None
    if not viable_scores or max(viable_scores.values()) < MIN_CONFIDENCE_THRESHOLD:
        predicted_category = "Active Discretionary"
        is_defaulted = True
        default_scores = json.dumps(combined_scores)
    else:
        # Tiebreaker logic
        max_categories = [cat for cat, score in viable_scores.items() if score == max(viable_scores.values())]
        if len(max_categories) > 1:
            if "Quant/Systematic" in max_categories and any(term in text for term in ["quantitative", "quantitative model", "systematic"]):
                predicted_category = "Quant/Systematic"
            elif "Active Discretionary" in max_categories:
                if "Quant/Systematic" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Factor/Smart Beta" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Index Based" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Multi-Strategy" in max_categories:
                    predicted_category = "Multi-Strategy"
                else:
                    predicted_category = max_categories[0]
            elif "Quant/Systematic" in max_categories:
                if "Factor/Smart Beta" in max_categories:
                    predicted_category = "Quant/Systematic"
                elif "Index Based" in max_categories:
                    predicted_category = "Quant/Systematic"
                elif "Multi-Strategy" in max_categories:
                    predicted_category = "Quant/Systematic"
                else:
                    predicted_category = max_categories[0]
            elif "Factor/Smart Beta" in max_categories and "Index Based" in max_categories:
                predicted_category = "Index Based"
            else:
                predicted_category = max_categories[0]
        else:
            predicted_category = max(viable_scores, key=viable_scores.get)

    symbol = row['SymbolCUSIP']
    if symbol in SYMBOL_CLASSIFICATIONS:
        predicted_category = SYMBOL_CLASSIFICATIONS[symbol]
        is_defaulted = True
        default_scores = json.dumps(combined_scores)

    result = {
        'Return_Category': predicted_category,
        'IsDefaulted': is_defaulted,
        'DefaultScores': default_scores,
        'DebugScores': json.dumps({
            "Boolean": bool_scores,
            "CWA": cwa_scores,
            "YC": yc_scores,
            "Keywords": keyword_scores,
            "FundFamily": fund_family_scores,
            "Combined": combined_scores
        })
    }
    for cat in RETURN_CATEGORIES:
        result[f'Final_{cat}_Score'] = combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
result_cols = ['Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores'] + [f"Final_{cat}_Score" for cat in RETURN_CATEGORIES]
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores', 'ycharts_url'
] + [col for col in df.columns if 'Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'currency_hedged_fund', 'YC_Broad_Asset_Class_Name',
    'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
    'Dist_Index', 'Dist_Active', 'Dist_Rules_Based', 'Dist_Quant', 'Dist_Multi'
]

version_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = f"C:\\Users\\JulianHeron\\Software Projects\\Return Drivers\\classified_return_funds_v7.0_6_{version_timestamp}.xlsx"
df[output_cols].to_excel(output_file, index=False)

# --- Display Classification Distribution ---
distribution = df['Return_Category'].value_counts().to_dict()
print("\nClassification Distribution:")
for category, count in distribution.items():
    print(f"{category}: {count}")

defaulted_count = df['IsDefaulted'].sum()
print(f"\nNumber of funds defaulted to Active Discretionary or overridden: {defaulted_count}")

print(f"Classification complete. Results saved to '{output_file}'.")

# --- Version Naming with Timestamp ---
print(f"Version: 7.0_6_{version_timestamp}")


Classification Distribution:
Active Discretionary: 3617
Index Based: 1253
Factor/Smart Beta: 454
Quant/Systematic: 247
Multi-Strategy: 15

Number of funds defaulted to Active Discretionary or overridden: 3
Classification complete. Results saved to 'C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v7.0_6_2025-03-13_20-59-06.xlsx'.
Version: 7.0_6_2025-03-13_20-59-06


In [None]:
# alter just index on the 7.5 version, other keyword enhancements -JBH

In [24]:
# Version 7.0_5_<timestamp>: Restored keywords, adjusted FundFamily via SQL, refined logic
# Timestamp: 2025-03-13_18:00:00 (replace with actual runtime timestamp)

import pandas as pd
from sqlalchemy import create_engine
import json
from datetime import datetime

"""
classify_return_drivers.py (V7.0.5)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V7.0.5:
- Moved keywords to general scoring, kept AUTO_CLASSIFY_KEYWORDS minimal.
- Removed 'track' from keywords, added tiebreaker logic.
- Adjusted FundFamily weights via SQL (YieldMax, Simplify, Pacer).
- Added Quant boost for Nontraditional/Alternative CWA.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40
KEYWORDS_WEIGHT = 40  # Held at 40 per user request
CATEGORIES_WEIGHT = 15
MIN_CONFIDENCE_THRESHOLD = 5
DRY_RUN = False
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Category Weights (Unchanged) ---
CWA_WEIGHTS = {
    "Taxable Fixed Income": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "US Equity, International": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Municipal": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Emerging": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Global Equity": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Sector/Industry": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Nontraditional": {"Index Based": -1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 2},
    "Bond Strategy": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Global Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Strategic": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Commodity": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Country": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Regional": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Quantitative/Tactical": {"Index Based": -1, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": -3},
    "Alternative": {"Index Based": -1, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading/Tactical": {"Index Based": 0, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -3}
}

YC_WEIGHTS = {
    "Large Blend, Large Value, Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Small Blend, Mid-Cap Blend, Mid-Cap Growth, Small Growth, Global Large-Stock Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Diversified Emerging Mkts": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Foreign Large Blend, Foreign Large Value, Global Large-Stock Blend": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Technology": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Short-Term Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Moderate Allocation, Aggressive Allocation, Conservative Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Derivative Income": {"Index Based": 0, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Intermediate Core-Plus Bond, Intermediate Core Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Small Value, Mid-Cap Value": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 0, "Active Discretionary": 3},
    "Ultrashort Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Multisector Bond, Convertibles": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni National Interm, Corporate Bond, Emerging Markets Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Health, Real Estate, Natural Resources": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Global Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Miscellaneous Region, Pacific/Asia ex-Japan Stk": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 2},
    "Moderately Conservative Allocation, Moderately Aggressive Allocation": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": -3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Muni National Short": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Intermediate Government, Short Government": {"Index Based": 2, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Muni": {"Index Based": 0, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Bank Loan": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 1, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Nontraditional Bond": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Commodities Focused, Commodities Broad Basket": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Tactical Allocation": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 3},
    "China Region, Europe Stock, India Equity, Japan Stock, Latin America Stock, Diversified Pacific/Asia": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -1, "Active Discretionary": 1},
    "Financial, Global Real Estate, Miscellaneous Sector, Equity Energy, Industrials, Consumer Cyclical, Communications, Consumer Defensive": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Muni National Long, Muni Single State Long, Muni California Long": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Hedged": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Global Large-Stock Value, Global Small/Mid Stock": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Preferred Stock": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Miscellaneous Fixed Income, Inflation-Protected Bond, Short-Term Inflation-Protected Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Long Government, Long-Term Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni Single State Interm, Muni New York Long, Muni California Intermediate, Muni Pennsylvania, Muni Minnesota, Muni New Jersey, Muni New York Intermediate, Muni Single State Short, Muni Massachusetts, Muni Ohio": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Precious Metals": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Small/Mid Blend, Foreign Small/Mid Value": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Energy Limited Partnership": {"Index Based": 3, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0, "Active Discretionary": 2},
    "Emerging-Markets Local-Currency Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Digital Assets": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 0},
    "Long/Short Equity": {"Index Based": 1, "Factor/Smart Beta": 0, "Quant/Systematic": 3, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Systematic Trend, Event Driven": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Multistrategy": {"Index Based": -2, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3},
    "Equity Market Neutral": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading--Inverse Commodities, Trading--Leveraged Commodities, Trading--Leveraged Debt, Trading--Leveraged Equity": {"Index Based": 2, "Factor/Smart Beta": -3, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": -3},
    "Relative Value Arbitrage": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "leveraged_fund, inverse_fund": {"Index Based": 3, "Factor/Smart Beta": -3, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": -3},
    "index_based": {"Index Based": 4, "Factor/Smart Beta": 3, "Quant/Systematic": -3, "Multi-Strategy": -3, "Active Discretionary": -3},
    "currency_hedged_fund": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -3, "Active Discretionary": 1},
    "socially_responsible_fund": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": -2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "synthetic_replication_fund": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -1},
    "fund_of_funds": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 3, "Active Discretionary": 2},
    "^PEATR": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3}
}

# --- Consolidated Keyword Lists ---
keywords = {
    "Index Based": ["reflect", "correspond", "replicate", "sampling", "replicate", "passive", "underlying index", "vanguard",
                    "benchmark index", "seeks to track", "tracking the performance"],
    "Factor/Smart Beta": ["momentum", "value factor", "quality factor", "minimum volatility", "size", "revenue-weighted",
                          "pure value", "pure growth", "high beta", "technical leaders", "low volatility",
                          "value factors", "cash cows", "alphadex", "low beta", "Enhanced Value", "Sector Neutral Quality Index",
                          "Momentum Value", "avantis", "lower volatility", "strong recent performance", "relatively lower", "relatively higher momentum",
                          "relative to fundamental values", "momentum signals", "positive momentum", " Fidelity U.S. Value Factor",
                          " MSCI USA Momentum", "Low Size"],
    "Active Discretionary": ["long-term growth", "current income", "capital appreciation", "invests primarily",
                             "superior business models", "attractively valued", "undervalued", "sustainable growth",
                             "research", "tax managed", "capital growth", "tactical", "bottom-up",
                             "prudent portfolio management", "active", "advisor"],
    "Quant/Systematic": ["synthetic", "covered call", "covered put", "FLEX", "options", "options strategies",
                         "spreads", "butterflies", "trend following", "futures", "quantitative factors", "machine learning",
                         "long options", "yield curve", "dynamically adjusting", "inverse exposure", "quantitative ranking",
                         "quant analysis", "trade signals", "tail risk", "put options", "box spread", "algorithm-driven",
                         "momentum score", "option income", "convexity", "systematic", "quantitative", "systematic tilt",
                         "implied volatility", "quantitative research", "data-driven", "swaps", "forwards", "currency futures",
                         "volatility futures", "simplify", "rotation"],
    "Multi-Strategy": ["mix of"]
}

# --- Auto-Classify Keywords ---
AUTO_CLASSIFY_KEYWORDS = {
    "Active Discretionary": ["actively managed", "actively-managed", "active management", "active bottom‑up approach",
                            "actively trades", "active management strategy", "actively managed etf",
                            "actively managed fund of funds", "actively managed strategy", "active allocation",
                            "actively allocates", "tactically allocates assets", "active trading",
                            "trade securities actively", "active trading strategy", "bottom‑up approach",
                            "exceptional management", "growth companies",
                            "actively managed mandate", "full discretion", "investment quality score",
                            "actively-managed etf", "fund advisor’s discretion", "actively invests",
                            "fundamental approach", "capital preservation", "research-driven"],
    "Factor/Smart Beta": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
                         "rules based methodology", "smart beta", "relative strength",
                         "trendpilot", "RAFI", "minimum volatility", "momentum", "quality factor", "value factor"],
    "Quant/Systematic": ["quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
                        "rules-based methodology", "equity index futures", "market neutral",
                        "quantitative research", "contrarian strategy", "dynamically allocating",
                        "economic indicators", "artificial intelligence", "trend following",
                        "data-driven", "backtested", "long-short", "model-based",
                        "synthetic covered call"],
    "Index Based": [],
    "Multi-Strategy": ["multi-strategy", "multi-asset", "hybrid strategy", "cautious allocation",
                      "dynamic allocation", "absolute return", "multi-manager", "blended"]
}

# --- Symbol-Based Overrides ---
SYMBOL_CLASSIFICATIONS = {
    "WGIFX": "Active Discretionary",
    "PGWFX": "Multi-Strategy",
    "OTCIX": "Active Discretionary"
}

# --- Database Connection ---
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.currency_hedged_fund, fs.ycharts_url,
    yc_ba.YC_Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.FundFamilyName
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

def score_booleans(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for bool_name, weights in [
        ("index_fund", YC_WEIGHTS.get("index_based", {})),
        ("inverse_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("leveraged_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("socially_responsible_fund", YC_WEIGHTS.get("socially_responsible_fund", {})),
        ("synthetic_replication_fund", YC_WEIGHTS.get("synthetic_replication_fund", {})),
        ("fund_of_funds", YC_WEIGHTS.get("fund_of_funds", {})),
        ("currency_hedged_fund", YC_WEIGHTS.get("currency_hedged_fund", {}))
    ]:
        if bool_name in row and pd.notna(row[bool_name]):
            value = str(row[bool_name]).strip().lower()
            if value in ["true", "1"]:
                # Check text for index intent to avoid misflags
                text = (row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else "") + " " + \
                       (row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else "")
                if bool_name == "index_fund" and any(term in text for term in ["track", "reflect", "replicate", "correspond"]):
                    for cat in RETURN_CATEGORIES:
                        scores[cat] += weights.get(cat, 0) 
                elif bool_name != "index_fund":
                    for cat in RETURN_CATEGORIES:
                        scores[cat] += weights.get(cat, 0)
    return scores

def score_cwa_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    weights = CWA_WEIGHTS.get(cwa_category, CWA_WEIGHTS.get('Unknown', {}))
    for cat in RETURN_CATEGORIES:
        scores[cat] += weights.get(cat, 0)
    return scores

def score_yc_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    yc_category = row.get('Global_Category_Name', 'Unknown')
    if yc_category is None:
        yc_category = 'Unknown'
        print(f"Warning: Global_Category_Name is None for SymbolCUSIP {row.get('SymbolCUSIP', 'Unknown')}, defaulting to 'Unknown'")
    for yc_key, weights in YC_WEIGHTS.items():
        if yc_category in yc_key or any(part.strip() in yc_category for part in yc_key.split(',')):
            for cat in RETURN_CATEGORIES:
                scores[cat] += weights.get(cat, 0)
    return scores

def score_keywords(text, max_points, keywords):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if not text:
        return scores
    total_hits = 0
    for category, keyword_list in keywords.items():
        hits = sum(keyword in text for keyword in keyword_list)
        total_hits += hits
        if hits > 0:
            scores[category] += min(hits, 3)
    if total_hits > 0:
        for category in RETURN_CATEGORIES:
            scores[category] = scores[category] * (max_points / total_hits) if scores[category] > 0 else 0
    for category, auto_keywords in AUTO_CLASSIFY_KEYWORDS.items():
        for keyword in auto_keywords:
            if keyword in text:
                scores[category] += 10
                break
    return scores

def score_fund_family(row, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    try:
        dist_index = float(row.get('Dist_Index', 0))
        dist_active = float(row.get('Dist_Active', 0))
        dist_rules_based = float(row.get('Dist_Rules_Based', 0))
        dist_quant = float(row.get('Dist_Quant', 0))
        dist_multi = float(row.get('Dist_Multi', 0))
    except (ValueError, TypeError):
        dist_index = dist_active = dist_rules_based = dist_quant = dist_multi = 0

    total_dist = dist_index + dist_active + dist_rules_based + dist_quant + dist_multi
    if total_dist > 0:
        scores["Index Based"] += (dist_index / total_dist) * max_points
        scores["Active Discretionary"] += (dist_active / total_dist) * max_points
        scores["Factor/Smart Beta"] += (dist_rules_based / total_dist) * max_points
        scores["Quant/Systematic"] += (dist_quant / total_dist) * max_points
        scores["Multi-Strategy"] += (dist_multi / total_dist) * max_points
    else:
        scores["Index Based"] += max_points * 0.5
    return scores

def classify_return(row):
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) and row['FS_insight'] != "Error parsing response" and row['FS_insight'] else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    bool_scores = score_booleans(row)
    cwa_scores = score_cwa_category(row)
    yc_scores = score_yc_category(row)
    keyword_scores = score_keywords(text, KEYWORDS_WEIGHT, keywords)
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)

    combined_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for cat in RETURN_CATEGORIES:
        combined_scores[cat] = (bool_scores.get(cat, 0) +
                               cwa_scores.get(cat, 0) +
                               yc_scores.get(cat, 0) +
                               keyword_scores.get(cat, 0) +
                               fund_family_scores.get(cat, 0))

    # Quant boost for Nontraditional/Alternative CWA with options/futures
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    if cwa_category in ["Nontraditional", "Alternative"] and any(term in text for term in ["options", "futures"]):
        combined_scores["Quant/Systematic"] += 5

    if any(term in text for term in ["actively managed", "invests primarily"]):
        if not any(term in text for term in ["track", "reflect"]):
            combined_scores["Active Discretionary"] += 10

    max_score = max(combined_scores.values())
    if max_score < MIN_CONFIDENCE_THRESHOLD:
        max_cat = max(combined_scores, key=combined_scores.get)
        combined_scores[max_cat] += 3

    viable_scores = {cat: score for cat, score in combined_scores.items() if score > -3}
    is_defaulted = False
    default_scores = None
    if not viable_scores or max(viable_scores.values()) < MIN_CONFIDENCE_THRESHOLD:
        predicted_category = "Active Discretionary"
        is_defaulted = True
        default_scores = json.dumps(combined_scores)
    else:
        # Tiebreaker logic
        max_categories = [cat for cat, score in viable_scores.items() if score == max(viable_scores.values())]
        if len(max_categories) > 1:
            if "Active Discretionary" in max_categories:
                if "Quant/Systematic" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Factor/Smart Beta" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Index Based" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Multi-Strategy" in max_categories:
                    predicted_category = "Multi-Strategy"
                else:
                    predicted_category = max_categories[0]
            elif "Quant/Systematic" in max_categories:
                if "Factor/Smart Beta" in max_categories:
                    predicted_category = "Quant/Systematic"
                elif "Index Based" in max_categories:
                    predicted_category = "Quant/Systematic"
                elif "Multi-Strategy" in max_categories:
                    predicted_category = "Quant/Systematic"
                else:
                    predicted_category = max_categories[0]
            elif "Factor/Smart Beta" in max_categories and "Index Based" in max_categories:
                predicted_category = "Index Based"
            else:
                predicted_category = max_categories[0]
        else:
            predicted_category = max(viable_scores, key=viable_scores.get)

    symbol = row['SymbolCUSIP']
    if symbol in SYMBOL_CLASSIFICATIONS:
        predicted_category = SYMBOL_CLASSIFICATIONS[symbol]
        is_defaulted = True
        default_scores = json.dumps(combined_scores)

    result = {
        'Return_Category': predicted_category,
        'IsDefaulted': is_defaulted,
        'DefaultScores': default_scores,
        'DebugScores': json.dumps({
            "Boolean": bool_scores,
            "CWA": cwa_scores,
            "YC": yc_scores,
            "Keywords": keyword_scores,
            "FundFamily": fund_family_scores,
            "Combined": combined_scores
        })
    }
    for cat in RETURN_CATEGORIES:
        result[f'Final_{cat}_Score'] = combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
result_cols = ['Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores'] + [f"Final_{cat}_Score" for cat in RETURN_CATEGORIES]
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores', 'ycharts_url'
] + [col for col in df.columns if 'Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'currency_hedged_fund', 'YC_Broad_Asset_Class_Name',
    'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
    'Dist_Index', 'Dist_Active', 'Dist_Rules_Based', 'Dist_Quant', 'Dist_Multi'
]

version_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = f"C:\\Users\\JulianHeron\\Software Projects\\Return Drivers\\classified_return_funds_v7.8_{version_timestamp}.xlsx"
df[output_cols].to_excel(output_file, index=False)

# --- Display Classification Distribution ---
distribution = df['Return_Category'].value_counts().to_dict()
print("\nClassification Distribution:")
for category, count in distribution.items():
    print(f"{category}: {count}")

defaulted_count = df['IsDefaulted'].sum()
print(f"\nNumber of funds defaulted to Active Discretionary or overridden: {defaulted_count}")

print(f"Classification complete. Results saved to '{output_file}'.")

# --- Version Naming with Timestamp ---
print(f"Version: 7.8_{version_timestamp}")


Classification Distribution:
Active Discretionary: 3558
Index Based: 1534
Factor/Smart Beta: 283
Quant/Systematic: 208
Multi-Strategy: 3

Number of funds defaulted to Active Discretionary or overridden: 3
Classification complete. Results saved to 'C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v7.8_2025-03-13_21-08-29.xlsx'.
Version: 7.8_2025-03-13_21-08-29


In [None]:
# Current Final updated only with DB writes

In [28]:
# Version 7.0_5_<timestamp>: Restored keywords, adjusted FundFamily via SQL, refined logic
# Timestamp: 2025-03-13_18:00:00 (replace with actual runtime timestamp)

import pandas as pd
from sqlalchemy import create_engine
import json
from datetime import datetime

"""
classify_return_drivers.py (V7.8)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V7.8:
- Moved keywords to general scoring, kept AUTO_CLASSIFY_KEYWORDS minimal.
- Removed 'track' from keywords, added tiebreaker logic.
- Adjusted FundFamily weights via SQL (YieldMax, Simplify, Pacer).
- Added Quant boost for Nontraditional/Alternative CWA.
- Added update_database function to save results.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40
KEYWORDS_WEIGHT = 40  # Held at 40 per user request
CATEGORIES_WEIGHT = 15
MIN_CONFIDENCE_THRESHOLD = 5
DRY_RUN = False
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Category Weights (Unchanged) ---
CWA_WEIGHTS = {
    "Taxable Fixed Income": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "US Equity, International": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Municipal": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Emerging": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Global Equity": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Sector/Industry": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Nontraditional": {"Index Based": -1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 2},
    "Bond Strategy": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Global Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Strategic": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Commodity": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Country": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Regional": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Quantitative/Tactical": {"Index Based": -1, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": -3},
    "Alternative": {"Index Based": -1, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading/Tactical": {"Index Based": 0, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -3}
}

YC_WEIGHTS = {
    "Large Blend, Large Value, Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Small Blend, Mid-Cap Blend, Mid-Cap Growth, Small Growth, Global Large-Stock Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Diversified Emerging Mkts": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Foreign Large Blend, Foreign Large Value, Global Large-Stock Blend": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Technology": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Short-Term Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Moderate Allocation, Aggressive Allocation, Conservative Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Derivative Income": {"Index Based": 0, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Intermediate Core-Plus Bond, Intermediate Core Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Small Value, Mid-Cap Value": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 0, "Active Discretionary": 3},
    "Ultrashort Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Large Growth": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Multisector Bond, Convertibles": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni National Interm, Corporate Bond, Emerging Markets Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Health, Real Estate, Natural Resources": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -3, "Active Discretionary": 2},
    "Global Allocation": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Miscellaneous Region, Pacific/Asia ex-Japan Stk": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": -2, "Active Discretionary": 2},
    "Moderately Conservative Allocation, Moderately Aggressive Allocation": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": -3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Muni National Short": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Intermediate Government, Short Government": {"Index Based": 2, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "High Yield Muni": {"Index Based": 0, "Factor/Smart Beta": -1, "Quant/Systematic": -3, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Bank Loan": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 1, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Nontraditional Bond": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Commodities Focused, Commodities Broad Basket": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Tactical Allocation": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 1, "Active Discretionary": 3},
    "China Region, Europe Stock, India Equity, Japan Stock, Latin America Stock, Diversified Pacific/Asia": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": -1, "Active Discretionary": 1},
    "Financial, Global Real Estate, Miscellaneous Sector, Equity Energy, Industrials, Consumer Cyclical, Communications, Consumer Defensive": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -1, "Active Discretionary": 2},
    "Muni National Long, Muni Single State Long, Muni California Long": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Hedged": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Global Large-Stock Value, Global Small/Mid Stock": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "Preferred Stock": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -2, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Miscellaneous Fixed Income, Inflation-Protected Bond, Short-Term Inflation-Protected Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Long Government, Long-Term Bond": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": 0, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Muni Single State Interm, Muni New York Long, Muni California Intermediate, Muni Pennsylvania, Muni Minnesota, Muni New Jersey, Muni New York Intermediate, Muni Single State Short, Muni Massachusetts, Muni Ohio": {"Index Based": 2, "Factor/Smart Beta": 0, "Quant/Systematic": -1, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Equity Precious Metals": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1, "Active Discretionary": 3},
    "Foreign Small/Mid Blend, Foreign Small/Mid Value": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Energy Limited Partnership": {"Index Based": 3, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0, "Active Discretionary": 2},
    "Emerging-Markets Local-Currency Bond": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -2, "Active Discretionary": 3},
    "Digital Assets": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 2, "Multi-Strategy": -1, "Active Discretionary": 0},
    "Long/Short Equity": {"Index Based": 1, "Factor/Smart Beta": 0, "Quant/Systematic": 3, "Multi-Strategy": -3, "Active Discretionary": 3},
    "Systematic Trend, Event Driven": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Multistrategy": {"Index Based": -2, "Factor/Smart Beta": -1, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3},
    "Equity Market Neutral": {"Index Based": 1, "Factor/Smart Beta": -1, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "Trading--Inverse Commodities, Trading--Leveraged Commodities, Trading--Leveraged Debt, Trading--Leveraged Equity": {"Index Based": 2, "Factor/Smart Beta": -3, "Quant/Systematic": 2, "Multi-Strategy": 1, "Active Discretionary": -3},
    "Relative Value Arbitrage": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 3, "Multi-Strategy": 2, "Active Discretionary": 3},
    "leveraged_fund, inverse_fund": {"Index Based": 3, "Factor/Smart Beta": -3, "Quant/Systematic": 0, "Multi-Strategy": -3, "Active Discretionary": -3},
    "index_based": {"Index Based": 4, "Factor/Smart Beta": 3, "Quant/Systematic": -3, "Multi-Strategy": -3, "Active Discretionary": -3},
    "currency_hedged_fund": {"Index Based": 3, "Factor/Smart Beta": 1, "Quant/Systematic": -2, "Multi-Strategy": -3, "Active Discretionary": 1},
    "socially_responsible_fund": {"Index Based": 3, "Factor/Smart Beta": 2, "Quant/Systematic": -2, "Multi-Strategy": -1, "Active Discretionary": 3},
    "synthetic_replication_fund": {"Index Based": 2, "Factor/Smart Beta": -2, "Quant/Systematic": 2, "Multi-Strategy": -3, "Active Discretionary": -1},
    "fund_of_funds": {"Index Based": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 0, "Multi-Strategy": 3, "Active Discretionary": 2},
    "^PEATR": {"Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 2, "Multi-Strategy": 3, "Active Discretionary": 3}
}

# --- Consolidated Keyword Lists ---
keywords = {
    "Index Based": ["reflect", "correspond", "replicate", "sampling", "replicate", "passive", "underlying index", "vanguard",
                    "benchmark index", "seeks to track", "tracking the performance"],
    "Factor/Smart Beta": ["momentum", "value factor", "quality factor", "minimum volatility", "size", "revenue-weighted",
                          "pure value", "pure growth", "high beta", "technical leaders", "low volatility",
                          "value factors", "cash cows", "alphadex", "low beta", "Enhanced Value", "Sector Neutral Quality Index",
                          "Momentum Value", "avantis", "lower volatility", "strong recent performance", "relatively lower", "relatively higher momentum",
                          "relative to fundamental values", "momentum signals", "positive momentum", " Fidelity U.S. Value Factor",
                          " MSCI USA Momentum", "Low Size"],
    "Active Discretionary": ["long-term growth", "current income", "capital appreciation", "invests primarily",
                             "superior business models", "attractively valued", "undervalued", "sustainable growth",
                             "research", "tax managed", "capital growth", "tactical", "bottom-up",
                             "prudent portfolio management", "active", "advisor"],
    "Quant/Systematic": ["synthetic", "covered call", "covered put", "FLEX", "options", "options strategies",
                         "spreads", "butterflies", "trend following", "futures", "quantitative factors", "machine learning",
                         "long options", "yield curve", "dynamically adjusting", "inverse exposure", "quantitative ranking",
                         "quant analysis", "trade signals", "tail risk", "put options", "box spread", "algorithm-driven",
                         "momentum score", "option income", "convexity", "systematic", "quantitative", "systematic tilt",
                         "implied volatility", "quantitative research", "data-driven", "swaps", "forwards", "currency futures",
                         "volatility futures", "simplify", "rotation"],
    "Multi-Strategy": ["mix of"]
}

# --- Auto-Classify Keywords ---
AUTO_CLASSIFY_KEYWORDS = {
    "Active Discretionary": ["actively managed", "actively-managed", "active management", "active bottom‑up approach",
                            "actively trades", "active management strategy", "actively managed etf",
                            "actively managed fund of funds", "actively managed strategy", "active allocation",
                            "actively allocates", "tactically allocates assets", "active trading",
                            "trade securities actively", "active trading strategy", "bottom‑up approach",
                            "exceptional management", "growth companies",
                            "actively managed mandate", "full discretion", "investment quality score",
                            "actively-managed etf", "fund advisor’s discretion", "actively invests",
                            "fundamental approach", "capital preservation", "research-driven"],
    "Factor/Smart Beta": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
                         "rules based methodology", "smart beta", "relative strength",
                         "trendpilot", "RAFI", "minimum volatility", "momentum", "quality factor", "value factor"],
    "Quant/Systematic": ["quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility",
                        "rules-based methodology", "equity index futures", "market neutral",
                        "quantitative research", "contrarian strategy", "dynamically allocating",
                        "economic indicators", "artificial intelligence", "trend following",
                        "data-driven", "backtested", "long-short", "model-based",
                        "synthetic covered call"],
    "Index Based": [],
    "Multi-Strategy": ["multi-strategy", "multi-asset", "hybrid strategy", "cautious allocation",
                      "dynamic allocation", "absolute return", "multi-manager", "blended"]
}

# --- Symbol-Based Overrides ---
SYMBOL_CLASSIFICATIONS = {
    "WGIFX": "Active Discretionary",
    "PGWFX": "Multi-Strategy",
    "OTCIX": "Active Discretionary"
}

# --- Database Connection ---
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.currency_hedged_fund, fs.ycharts_url,
    yc_ba.YC_Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.FundFamilyName
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

def score_booleans(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for bool_name, weights in [
        ("index_fund", YC_WEIGHTS.get("index_based", {})),
        ("inverse_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("leveraged_fund", YC_WEIGHTS.get("leveraged_fund, inverse_fund", {})),
        ("socially_responsible_fund", YC_WEIGHTS.get("socially_responsible_fund", {})),
        ("synthetic_replication_fund", YC_WEIGHTS.get("synthetic_replication_fund", {})),
        ("fund_of_funds", YC_WEIGHTS.get("fund_of_funds", {})),
        ("currency_hedged_fund", YC_WEIGHTS.get("currency_hedged_fund", {}))
    ]:
        if bool_name in row and pd.notna(row[bool_name]):
            value = str(row[bool_name]).strip().lower()
            if value in ["true", "1"]:
                # Check text for index intent to avoid misflags
                text = (row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else "") + " " + \
                       (row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else "")
                if bool_name == "index_fund" and any(term in text for term in ["track", "reflect", "replicate", "correspond"]):
                    for cat in RETURN_CATEGORIES:
                        scores[cat] += weights.get(cat, 0) 
                elif bool_name != "index_fund":
                    for cat in RETURN_CATEGORIES:
                        scores[cat] += weights.get(cat, 0)
    return scores

def score_cwa_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    weights = CWA_WEIGHTS.get(cwa_category, CWA_WEIGHTS.get('Unknown', {}))
    for cat in RETURN_CATEGORIES:
        scores[cat] += weights.get(cat, 0)
    return scores

def score_yc_category(row):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    yc_category = row.get('Global_Category_Name', 'Unknown')
    if yc_category is None:
        yc_category = 'Unknown'
        print(f"Warning: Global_Category_Name is None for SymbolCUSIP {row.get('SymbolCUSIP', 'Unknown')}, defaulting to 'Unknown'")
    for yc_key, weights in YC_WEIGHTS.items():
        if yc_category in yc_key or any(part.strip() in yc_category for part in yc_key.split(',')):
            for cat in RETURN_CATEGORIES:
                scores[cat] += weights.get(cat, 0)
    return scores

def score_keywords(text, max_points, keywords):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if not text:
        return scores
    total_hits = 0
    for category, keyword_list in keywords.items():
        hits = sum(keyword in text for keyword in keyword_list)
        total_hits += hits
        if hits > 0:
            scores[category] += min(hits, 3)
    if total_hits > 0:
        for category in RETURN_CATEGORIES:
            scores[category] = scores[category] * (max_points / total_hits) if scores[category] > 0 else 0
    for category, auto_keywords in AUTO_CLASSIFY_KEYWORDS.items():
        for keyword in auto_keywords:
            if keyword in text:
                scores[category] += 10
                break
    return scores

def score_fund_family(row, max_points):
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    try:
        dist_index = float(row.get('Dist_Index', 0))
        dist_active = float(row.get('Dist_Active', 0))
        dist_rules_based = float(row.get('Dist_Rules_Based', 0))
        dist_quant = float(row.get('Dist_Quant', 0))
        dist_multi = float(row.get('Dist_Multi', 0))
    except (ValueError, TypeError):
        dist_index = dist_active = dist_rules_based = dist_quant = dist_multi = 0

    total_dist = dist_index + dist_active + dist_rules_based + dist_quant + dist_multi
    if total_dist > 0:
        scores["Index Based"] += (dist_index / total_dist) * max_points
        scores["Active Discretionary"] += (dist_active / total_dist) * max_points
        scores["Factor/Smart Beta"] += (dist_rules_based / total_dist) * max_points
        scores["Quant/Systematic"] += (dist_quant / total_dist) * max_points
        scores["Multi-Strategy"] += (dist_multi / total_dist) * max_points
    else:
        scores["Index Based"] += max_points * 0.5
    return scores

def classify_return(row):
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) and row['ProductName'] else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) and row['investment_strategy'] else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) and row['FS_insight'] != "Error parsing response" and row['FS_insight'] else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    bool_scores = score_booleans(row)
    cwa_scores = score_cwa_category(row)
    yc_scores = score_yc_category(row)
    keyword_scores = score_keywords(text, KEYWORDS_WEIGHT, keywords)
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)

    combined_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for cat in RETURN_CATEGORIES:
        combined_scores[cat] = (bool_scores.get(cat, 0) +
                               cwa_scores.get(cat, 0) +
                               yc_scores.get(cat, 0) +
                               keyword_scores.get(cat, 0) +
                               fund_family_scores.get(cat, 0))

    # Quant boost for Nontraditional/Alternative CWA with options/futures
    cwa_category = row.get('CWA_Broad_Category_Name', 'Unknown')
    if cwa_category in ["Nontraditional", "Alternative"] and any(term in text for term in ["options", "futures"]):
        combined_scores["Quant/Systematic"] += 5

    if any(term in text for term in ["actively managed", "invests primarily"]):
        if not any(term in text for term in ["track", "reflect"]):
            combined_scores["Active Discretionary"] += 10

    max_score = max(combined_scores.values())
    if max_score < MIN_CONFIDENCE_THRESHOLD:
        max_cat = max(combined_scores, key=combined_scores.get)
        combined_scores[max_cat] += 3

    viable_scores = {cat: score for cat, score in combined_scores.items() if score > -3}
    is_defaulted = False
    default_scores = None
    if not viable_scores or max(viable_scores.values()) < MIN_CONFIDENCE_THRESHOLD:
        predicted_category = "Active Discretionary"
        is_defaulted = True
        default_scores = json.dumps(combined_scores)
    else:
        # Tiebreaker logic
        max_categories = [cat for cat, score in viable_scores.items() if score == max(viable_scores.values())]
        if len(max_categories) > 1:
            if "Active Discretionary" in max_categories:
                if "Quant/Systematic" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Factor/Smart Beta" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Index Based" in max_categories:
                    predicted_category = "Active Discretionary"
                elif "Multi-Strategy" in max_categories:
                    predicted_category = "Multi-Strategy"
                else:
                    predicted_category = max_categories[0]
            elif "Quant/Systematic" in max_categories:
                if "Factor/Smart Beta" in max_categories:
                    predicted_category = "Quant/Systematic"
                elif "Index Based" in max_categories:
                    predicted_category = "Quant/Systematic"
                elif "Multi-Strategy" in max_categories:
                    predicted_category = "Quant/Systematic"
                else:
                    predicted_category = max_categories[0]
            elif "Factor/Smart Beta" in max_categories and "Index Based" in max_categories:
                predicted_category = "Index Based"
            else:
                predicted_category = max_categories[0]
        else:
            predicted_category = max(viable_scores, key=viable_scores.get)

    symbol = row['SymbolCUSIP']
    if symbol in SYMBOL_CLASSIFICATIONS:
        predicted_category = SYMBOL_CLASSIFICATIONS[symbol]
        is_defaulted = True
        default_scores = json.dumps(combined_scores)

    result = {
        'Return_Category': predicted_category,
        'IsDefaulted': is_defaulted,
        'DefaultScores': default_scores,
        'DebugScores': json.dumps({
            "Boolean": bool_scores,
            "CWA": cwa_scores,
            "YC": yc_scores,
            "Keywords": keyword_scores,
            "FundFamily": fund_family_scores,
            "Combined": combined_scores
        })
    }
    for cat in RETURN_CATEGORIES:
        result[f'Final_{cat}_Score'] = combined_scores[cat]

    return pd.Series(result)

# --- New Function to Update Database ---
def update_database(df, engine):
    update_query = """
    UPDATE Funds_to_Screen
    SET return_driver = :Return_Category
    WHERE SymbolCUSIP = :SymbolCUSIP
    """
    with engine.connect() as connection:
        for _, row in df.iterrows():
            connection.execute(
                text(update_query),
                {
                    "SymbolCUSIP": row['SymbolCUSIP'],
                    "Return_Category": row['Return_Category']
                }
            )
        connection.commit()
    print("Database update complete (return_driver updated).")

# --- Apply Classification ---
result_cols = ['Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores'] + [f"Final_{cat}_Score" for cat in RETURN_CATEGORIES]
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'IsDefaulted', 'DefaultScores', 'DebugScores', 'ycharts_url'
] + [col for col in df.columns if 'Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'currency_hedged_fund', 'YC_Broad_Asset_Class_Name',
    'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
    'Dist_Index', 'Dist_Active', 'Dist_Rules_Based', 'Dist_Quant', 'Dist_Multi'
]

version_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = f"C:\\Users\\JulianHeron\\Software Projects\\Return Drivers\\classified_return_funds_v7.8_{version_timestamp}.xlsx"
df[output_cols].to_excel(output_file, index=False)

# --- Update Database ---
update_database(df, engine)

# --- Display Classification Distribution ---
distribution = df['Return_Category'].value_counts().to_dict()
print("\nClassification Distribution:")
for category, count in distribution.items():
    print(f"{category}: {count}")

defaulted_count = df['IsDefaulted'].sum()
print(f"\nNumber of funds defaulted to Active Discretionary or overridden: {defaulted_count}")

print(f"Classification complete. Results saved to '{output_file}'.")

# --- Version Naming with Timestamp ---
print(f"Version: 7.8_{version_timestamp}")

Database update complete (return_driver updated).

Classification Distribution:
Active Discretionary: 3617
Index Based: 1480
Factor/Smart Beta: 283
Quant/Systematic: 203
Multi-Strategy: 3

Number of funds defaulted to Active Discretionary or overridden: 3
Classification complete. Results saved to 'C:\Users\JulianHeron\Software Projects\Return Drivers\classified_return_funds_v7.8_2025-03-13_22-35-13.xlsx'.
Version: 7.8_2025-03-13_22-35-13
