In [5]:
# This code successfully replicated the FactSet request for this call (HTTP 200, see output below).

import requests

def factset_equalsfdsfql_tqqq():
    """
    Replay the FactSet equalsfdsfql request captured in Fiddler for TQQQ-US.

    Sends a POST with the captured headers and cookies, using the
    SYMBOLS/EXPRS payload format instead of FQL_STATE_ID, then prints the
    status code, the response headers, and the body (parsed as JSON when
    possible, raw text otherwise). Nothing is returned.

    WARNING:
      - These cookies and header values are live session credentials. Handle with care.
      - verify=False disables SSL verification (for testing only).

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed, including when the 30-second timeout elapses.
    """

    # Base URL from Fiddler capture
    url = "https://my.apps.factset.com/services/equalsfdsfql"

    # Query parameters from the Fiddler capture
    params = {
        "string_na": "true",
        "APP": "DATA_REQUEST_STREAM"
    }

    # Headers exactly as in your Fiddler capture (Content-Length is omitted since requests calculates it)
    headers = {
        "Host": "my.apps.factset.com",
        "Connection": "keep-alive",
        "sec-ch-ua-platform": "\"Windows\"",
        "Accept-Language": "en-US",
        "sec-ch-ua": "\"Chromium\";v=\"133\", \"Microsoft Edge WebView2\";v=\"133\", \"Not(A:Brand\";v=\"99\", \"Microsoft Edge\";v=\"133\"",
        "X-Fdsa-Long-Request-Deadline": "29",
        "sec-ch-ua-mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0",
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/x-www-form-urlencoded",
        "Origin": "https://my.apps.factset.com",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Accept-Encoding": "gzip, deflate, br, zstd"
    }

    # Cookies exactly as in your Fiddler capture.
    # NOTE(review): these are hardcoded session credentials; they should be
    # loaded from the environment (or rotated) before this notebook is shared.
    cookies = {
        "FdsAuthScheme": "oidc",
        "FdsBrowserId": "6a94fb4a-3b8c-435e-b4ae-6154338c64ee",
        "FdsSessionId": "43cb2419-9e8f-4c76-8b77-ec347e13a69d",
        "AccessedApplications": "FactSet",
        "lightstep_guid%2Fid-widget": "430940a607367703",
        "lightstep_session_id": "0e062d6b44045c9a",
        "TS013442b8": "014dd04807b8df3fac66d3039876db97d9fe1ed4b8a3d0668235353f9b65762a69384c95b511f357169882efca4bbb94e8455cb8f5",
        "TS01742872": "014dd04807b8df3fac66d3039876db97d9fe1ed4b8a3d0668235353f9b65762a69384c95b511f357169882efca4bbb94e8455cb8f5"
    }

    # Instead of the FQL_STATE_ID from your capture (which causes a 400 error),
    # we use the expected payload format:
    #   SYMBOLS=<symbol>&EXPRS=<formula>
    # For a ticker like TQQQ-US with the expression ETP_COUNTERPARTY("HOLDS_DERIVATIVES","TEXT")
    # The body is passed as a pre-built string, so requests sends it verbatim.
    data = 'SYMBOLS=TQQQ-US&EXPRS=ETP_COUNTERPARTY("HOLDS_DERIVATIVES","TEXT")'

    # Make the POST request with verify=False to bypass SSL verification for testing.
    response = requests.post(
        url,
        params=params,
        headers=headers,
        cookies=cookies,
        data=data,
        verify=False,
        # Without a timeout requests waits indefinitely. 30 sits just above the
        # 29 sent in X-Fdsa-Long-Request-Deadline (presumably seconds).
        timeout=30
    )

    # Print out response details
    print("HTTP Status Code:", response.status_code)
    print("Response Headers:", response.headers)
    try:
        print("Response JSON:", response.json())
    except ValueError:
        # response.json() signals a non-JSON body with a ValueError subclass;
        # fall back to the raw text so the body is still visible.
        print("Response Text:", response.text)

# Entry point: issue the captured request when this cell/script is executed directly.
if __name__ == "__main__":
    factset_equalsfdsfql_tqqq()


HTTP Status Code: 200
Response Headers: {'Content-Type': 'application/json; charset=UTF-8', 'Content-Length': '141', 'Connection': 'keep-alive', 'X-Fql-Service-Evaluation-Time': '74.792173', 'X-Fql-Service-State-Id': '5EDEE99D50DEE36E1C01FE3447EA6F0AC1026CCBE029D5E392C6DD2CAAC44A215321B924795086C376951E7AC13E53032BADF2B9397DD83BD229C10F5574D4E26B2CEFF922E51285BC7041322F7118297F0B09808B9F44B18511DF22B4D8CB40919C63C6254E0CAA8F1A6115E57346D3B011A0B82A5B08905EF6FB8AF307893A', 'X-Fql-Service-Version': '1', 'Keep-Alive': 'timeout=30', 'X-DataDirect-Request-Key': '67BF475D1BBB97C3', 'Set-Cookie': 'FdsSessionId=43cb2419-9e8f-4c76-8b77-ec347e13a69d; Path=/; Domain=.factset.com; Expires=Thu, 27-Feb-2025 06:54:53 GMT; Secure; HttpOnly; SameSite=None, AccessedApplications=FactSet; Path=/; Domain=.factset.com; Secure; HttpOnly; SameSite=None, TS013442b8=014dd04807349bd4eac3c5058087e9a43043b5c5ecde27d08b8b0f1143169ebe0a6805477589a6b065e3c976428f8750a4e3823ef3; Path=/; Secure; HTTPOnly, TS01742872=01



In [21]:
import requests
from sqlalchemy import create_engine, text

# Database connection string (adjust as needed).
# The URL names the SQLEXPRESS instance on JULIANS_LAPTOP, the CWA_Fund_Database
# database and ODBC Driver 18, with trusted_connection and TrustServerCertificate
# both set to "yes". The host name is machine-specific.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

# Create the SQLAlchemy engine shared by the database helper functions below.
engine = create_engine(connection_string)

### Functions

def fetch_tickers_from_db():
    """Read SymbolCUSIP and FS_insight for every row of Funds_to_Screen.

    Returns:
        list[dict]: one {"ticker": ..., "insight": ...} dict per row, in the
        order the database returned them.
    """
    statement = text("SELECT SymbolCUSIP, FS_insight FROM Funds_to_Screen")
    tickers_data = []
    with engine.connect() as conn:
        # Materialise the rows while the connection is still open.
        for symbol, fs_insight in conn.execute(statement).fetchall():
            tickers_data.append({"ticker": symbol, "insight": fs_insight})
    return tickers_data

def update_fs_insight(ticker, insight):
    """Store ``insight`` in FS_insight for the row whose SymbolCUSIP equals ``ticker``.

    Both values are sent as bound parameters, and the change is committed
    before the connection is released.
    """
    statement = text("UPDATE Funds_to_Screen SET FS_insight = :insight WHERE SymbolCUSIP = :ticker")
    bound_values = {"insight": insight, "ticker": ticker}
    with engine.connect() as conn:
        conn.execute(statement, bound_values)
        conn.commit()

def factset_equalsfdsfql_for_ticker(ticker):
    """Request the FFD_INSIGHT value for one ticker from the FactSet endpoint.

    Args:
        ticker: Symbol to query. It is converted to text and URL-encoded
            before being placed in the form body.

    Returns:
        The first value found under ``"$value"`` in the first element of the
        JSON response, or one of these sentinel strings:
          - "No insight available": the response was empty or had no "$value" key.
          - "Error parsing response": the body was not JSON or had an unexpected shape.
          - "API call failed": a non-200 status, a timeout, or a transport error.

    Note:
        verify=False disables TLS certificate verification (testing only), and
        the session cookie / FQL state id below are placeholders that must be
        filled in before the call can succeed.
    """
    # Local import: this cell's import block only brings in requests and sqlalchemy.
    from urllib.parse import quote

    url = "https://my.apps.factset.com/services/equalsfdsfql"
    params = {"string_na": "true", "APP": "DATA_REQUEST_STREAM"}
    headers = {
        "Host": "my.apps.factset.com",
        "Connection": "keep-alive",
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/x-www-form-urlencoded",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    }
    cookies = {
        "FdsAuthScheme": "oidc",
        "FdsSessionId": "your-session-id-here",  # Replace with your actual session ID
        # Add other cookies as needed
    }
    fql_state_id = "your-fql-state-id-here"  # Replace with your actual FQL state ID
    # The body is sent verbatim, so the ticker must be percent-encoded here;
    # otherwise a symbol containing '&', '=' or a space would corrupt the form.
    # Plain tickers (letters, digits, '-', '.') are left unchanged by quote().
    encoded_ticker = quote(str(ticker), safe="")
    symbol_exprs = f"{encoded_ticker}%5EFFD_INSIGHT"  # URL-encoded '^' as %5E
    data = f"FQL_STATE_ID={fql_state_id}&SYMBOL_EXPRS={symbol_exprs}"

    try:
        response = requests.post(
            url,
            params=params,
            headers=headers,
            cookies=cookies,
            data=data,
            verify=False,
            timeout=30,  # never block the batch on a single unresponsive call
        )
    except requests.RequestException as e:
        # Timeouts and connection errors are reported like any other failed
        # call, so one bad ticker does not abort the caller's loop.
        print(f"Failed for {ticker}: {e}")
        return "API call failed"

    if response.status_code != 200:
        print(f"Failed for {ticker}: HTTP {response.status_code}")
        return "API call failed"

    try:
        json_data = response.json()
        if json_data and "$value" in json_data[0]:
            return json_data[0]["$value"][0][0]
        return "No insight available"
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError: body is not JSON. The others: JSON with an unexpected shape.
        print(f"Error parsing response for {ticker}: {e}")
        return "Error parsing response"

def main():
    """Fetch tickers and update insights only if FS_insight is missing.

    A ticker is processed when its stored FS_insight is NULL or blank.
    Results that are failure sentinels from factset_equalsfdsfql_for_ticker
    are reported but not written to the database. Persisting them would make
    the row look populated, and every later run would skip it instead of
    retrying.
    """
    # Strings factset_equalsfdsfql_for_ticker returns when the call itself failed.
    # A tuple (not a set) so the membership test also works for unhashable values.
    failure_markers = ("Error parsing response", "API call failed")

    for data in fetch_tickers_from_db():
        ticker = data["ticker"]
        insight = data["insight"]
        if insight is not None and insight.strip() != "":
            print(f"Skipping {ticker} (insight already exists)")
            continue

        print(f"Processing {ticker} (no existing insight)...")
        new_insight = factset_equalsfdsfql_for_ticker(ticker)
        if new_insight in failure_markers:
            print(f"Not updating {ticker}: {new_insight}")
            continue

        update_fs_insight(ticker, new_insight)
        # str() guards the preview: the API value is not guaranteed to be text.
        print(f"Updated {ticker} with insight: {str(new_insight)[:50]}...")

    print("All tickers have been processed.")

# Entry point: run the update pass when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Skipping NTSX (insight already exists)
Skipping HIPS (insight already exists)
Skipping EAOA (insight already exists)
Skipping AOM (insight already exists)
Skipping AOK (insight already exists)
Skipping EAOK (insight already exists)
Skipping EAOM (insight already exists)
Skipping INCM (insight already exists)
Skipping ICVT (insight already exists)
Skipping ZHDG (insight already exists)
Skipping DYTA (insight already exists)
Skipping LEXI (insight already exists)
Skipping RORO (insight already exists)
Skipping TFPN (insight already exists)
Skipping BAMY (insight already exists)
Skipping CGBL (insight already exists)
Skipping AOR (insight already exists)
Skipping EAOR (insight already exists)
Skipping BAMO (insight already exists)
Skipping GHTA (insight already exists)
Skipping MPRO (insight already exists)
Skipping HCMT (insight already exists)
Skipping THY (insight already exists)
Skipping SPAX (insight already exists)
Skipping FMF (insight already exists)
Skipping CTA (insight already 

In [7]:
# Test script for Return Drivers (one category in fund categorization) to see whether scoring will work.

import pandas as pd
from sqlalchemy import create_engine

# Define the five fund categories.
# Order matters: classify_fund builds its score dict in this order and picks the
# winner with max(), which returns the first maximum, so earlier entries win ties.
FUND_CATEGORIES = [
    "Index Based",
    "Factor/Smart Beta",
    "Active Discretionary",
    "Quant/Systematic",
    "Multi-Strategy"
]

# Database connection string.
# The URL names the SQLEXPRESS instance on JULIANS_LAPTOP, the CWA_Fund_Database
# database and ODBC Driver 18, with trusted_connection and TrustServerCertificate
# both set to "yes". The host name is machine-specific.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

# Create the SQLAlchemy engine used to load the fund data below.
engine = create_engine(connection_string)

# SQL query with additional columns.
# Funds_to_Screen is the driving table; every lookup is a LEFT JOIN, so a fund
# with no matching family or category row is kept and gets NULLs in the joined
# columns (the scoring functions below check for missing values with pandas).
query = """
SELECT 
    fs.SymbolCUSIP,
    fs.ProductName,
    fs.fund_family,
    fs.investment_strategy,
    fs.index_fund,
    fs.inverse_fund,
    fs.leveraged_fund,
    fs.socially_responsible_fund,
    fs.synthetic_replication_fund,
    fs.fund_of_funds,
    fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name,
    yc_bc.Broad_Category_Name,
    yc_gc.Global_Category_Name,
    yc_c.Category_Name,
    cwa_bc.CWA_Broad_Category_Name,
    ff.Shop_Style,
    ff.Likely_Fund_Type
FROM 
    Funds_to_Screen fs
LEFT JOIN 
    FundFamily_Data ff ON fs.fund_family = ff.Fund_Family
LEFT JOIN 
    YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN 
    YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN 
    YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN 
    YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN 
    CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""

# Load the query result into a DataFrame (one row per result row).
df = pd.read_sql(query, engine)

# Function to score based on keywords in investment_strategy
def keyword_score(description, category):
    """Count how many of a category's keywords occur in a strategy description.

    Matching is a case-insensitive substring test, and each keyword counts at
    most once. A missing description (None/NaN) or an unknown category scores 0.
    """
    keywords = {
        "Index Based": ["index", "passive", "tracks", "replicates"],
        "Factor/Smart Beta": ["factor", "smart beta", "tilt", "weighted"],
        "Active Discretionary": ["active", "discretionary", "manager", "selection"],
        "Quant/Systematic": ["quantitative", "systematic", "model", "algorithm"],
        "Multi-Strategy": ["multi-strategy", "diversified", "blend", "multiple"]
    }
    if pd.isna(description):
        return 0
    lowered = description.lower()
    hits = 0
    for term in keywords.get(category, []):
        if term in lowered:
            hits += 1
    return hits

# Function to adjust score based on boolean flags
def boolean_adjustment(row, category):
    """Return the bonus a category earns from the fund's flag columns.

    Flags are compared against the string 'True'.
      - Index Based / Factor/Smart Beta: +2 when index_fund is set.
      - Quant/Systematic / Multi-Strategy: +1 when any of leveraged_fund,
        inverse_fund or synthetic_replication_fund is set.
      - Multi-Strategy: a further +1 when fund_of_funds is set.
    """
    bonus = 0
    if category in ("Index Based", "Factor/Smart Beta"):
        if row['index_fund'] == 'True':
            bonus += 2
    if category in ("Quant/Systematic", "Multi-Strategy"):
        structure_flags = ('leveraged_fund', 'inverse_fund', 'synthetic_replication_fund')
        # any() stops at the first set flag, like the chained `or` it replaces.
        if any(row[flag] == 'True' for flag in structure_flags):
            bonus += 1
    if category == "Multi-Strategy":
        if row['fund_of_funds'] == 'True':
            bonus += 1
    return bonus

# Function to map category names to the five categories
def category_mapping(row):
    """Map the fund's category names onto one of the five fund categories.

    The category columns are scanned in a fixed order. For each non-missing
    value the rules below are tried top to bottom (case-insensitive substring
    match), and the first rule that fires decides the result. Returns None
    when no column matches any rule.
    """
    # (substrings, resulting category), in priority order.
    rules = (
        (("passive", "index"), "Index Based"),
        (("smart beta", "factor"), "Factor/Smart Beta"),
        (("active",), "Active Discretionary"),
        (("quantitative", "systematic"), "Quant/Systematic"),
        (("multi", "blend"), "Multi-Strategy"),
    )
    columns = (
        'Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name',
        'Category_Name', 'CWA_Broad_Category_Name'
    )
    for column in columns:
        raw = row[column]
        if pd.isna(raw):
            continue
        value = raw.lower()
        for needles, label in rules:
            if any(needle in value for needle in needles):
                return label
    return None

# Main classification function returning classification and scores
def classify_fund(row):
    """Score one fund row against every category and pick the best one.

    Each category's score is the sum of:
      - keyword hits in investment_strategy,
      - the flag-based adjustment,
      - +3 if the category names map to it,
      - +2 if Likely_Fund_Type names it.

    Returns a pandas Series holding 'Classification' followed by the five
    per-category scores. Ties go to the category listed first in
    FUND_CATEGORIES.
    """
    scores = dict.fromkeys(FUND_CATEGORIES, 0)

    # Keyword hits and flag adjustments, one category at a time.
    strategy_text = row['investment_strategy']
    for category in FUND_CATEGORIES:
        scores[category] = keyword_score(strategy_text, category) + boolean_adjustment(row, category)

    # Weight from the category-name mapping.
    mapped_category = category_mapping(row)
    if mapped_category:
        scores[mapped_category] += 3

    # Hint from FundFamily_Data.
    family_hint = row['Likely_Fund_Type']
    if pd.notna(family_hint) and family_hint in FUND_CATEGORIES:
        scores[family_hint] += 2

    # Assemble the output: winner first, then one score column per category.
    result = {'Classification': max(scores, key=scores.get)}
    for category in ("Index Based", "Factor/Smart Beta", "Active Discretionary",
                     "Quant/Systematic", "Multi-Strategy"):
        result[f"{category} Score"] = scores[category]
    return pd.Series(result)

# Apply classify_fund to every row (axis=1) and append the resulting
# classification/score columns to the loaded data.
classification_results = df.apply(classify_fund, axis=1)
df = pd.concat([df, classification_results], axis=1)

# Columns written to the Excel file, in output order.
columns_to_save = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Classification', 'investment_strategy',
    'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'Broad_Asset_Class_Name',
    'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
    'Shop_Style', 'Likely_Fund_Type', 'ycharts_url', 'Index Based Score',
    'Factor/Smart Beta Score', 'Active Discretionary Score', 'Quant/Systematic Score',
    'Multi-Strategy Score'
]

# Save to Excel (relative path, so the file lands in the current working directory).
df[columns_to_save].to_excel('classified_funds.xlsx', index=False)
print("Results saved to 'classified_funds.xlsx'")

Results saved to 'classified_funds.xlsx'


In [14]:
# Updated script that incorporates the risk-overlay test alongside the return drivers.
# This update also includes the larger keyword list that Grok missed in the last test.
# This update also includes logic for big firms that are listed as "both" in the FundFamily_Data style.
# This update also scans the fund name for keywords in the risk and return drivers.

import pandas as pd
from sqlalchemy import create_engine

# Define categories.
# Order matters: classify_return and classify_risk build their score dicts in
# this order and pick the winner with max(), so earlier entries win ties.
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]
RISK_CATEGORIES = ["None", "Moderate", "Heavy / Persistent", "Offensive / Levered"]

# Updated keyword lists for return drivers.
# keyword_score_return scores a category by counting how many of its entries
# occur as substrings of the lower-cased fund name + investment strategy.
# NOTE(review): because matching is substring-based, short entries such as "ai",
#   "etf", "beta" or "value" also match inside longer, unrelated words
#   (e.g. "ai" in "maintain") - confirm this is intended.
# NOTE(review): several entries appear in more than one category ("benchmark",
#   "rules-based", "quantitative screens", "alternative beta"), so one phrase
#   raises two scores at once.
keywords = {
    "Index Based": [
        "tracks", "replicates", "mirrors", "follows", "indexes", "passive", "benchmark",
        "market-cap weighted", "equal-weighted index", "low tracking error", "high correlation",
        "beta", "etf", "index fund", "closely matches", "minimal deviation", "broad market exposure",
        "cost-efficient", "low-cost", "transparent", "rules-based replication", "market index",
        "sector index", "fixed income index", "commodity index", "currency index", "rules-based", "index",
        "bond index", "aggregate bond", "treasury index", "corporate bond index", "correspond", "underlying index"
    ],
    "Factor/Smart Beta": [
        "factor", "smart beta", "value", "growth", "momentum", "quality", "low volatility",
        "dividend", "fundamental weighting", "equal weighting", "rules-based", "enhanced returns",
        "risk-adjusted", "academic strategies", "tactical", "alternative weighting", "optimized",
        "factor tilt", "multi-factor", "single-factor", "thematic", "sector rotation", "style-based",
        "esg factors", "minimum variance", "quantitative screens", "proprietary index",
        "custom index", "enhanced index", "revenue weighted", "dividend weighted", "yield weighted",
        "dividend yield weighted", "factor weighted", "alternative beta", "tilt"
    ],
    # NOTE(review): "manager" is listed twice below, so a single occurrence in the
    #   text adds 2 to this category's score - confirm whether that is deliberate.
    "Active Discretionary": [
        "active", "discretionary", "fundamental", "research-driven", "stock picking",
        "portfolio manager", "judgment", "analysis", "insight", "conviction", "opportunistic",
        "tactical allocation", "high conviction", "concentrated portfolio", "bottom-up", "top-down",
        "macro-driven", "event-driven", "catalyst-driven", "long-term horizon", "short-term trading",
        "active share", "alpha generation", "outperform", "undervalued", "overvalued", "market timing",
        "sector selection", "security selection", "active risk", "active bets", "manager expertise",
        "proprietary research", "in-depth analysis", "qualitative assessment", "quantitative assessment",
        "active bond management", "credit analysis", "duration management", "yield curve positioning",
        "bond selection", "policy", "capital growth", "current income", "manager", "believes", "research",
        "selection", "actively managed", "capital appreciation", "capital preservation", "benchmark", "deemed",
        "select", "manager", "strategically"
    ],
    "Quant/Systematic": [
        "quantitative", "systematic", "algorithm", "model-based", "data-driven", "statistical",
        "machine learning", "ai", "rules-driven", "backtested", "factor models", "risk parity",
        "momentum strategies", "mean-variance optimization", "arbitrage", "high-frequency",
        "low-frequency", "trend-following", "mean-reversion", "quantitative screens",
        "proprietary models", "signal-based", "factor timing", "dynamic factor weighting",
        "quantitative overlay", "automated trading", "algorithmic trading", "robo-advisor",
        "systematic risk management", "quantitative risk models", "factor decomposition",
        "style analysis", "attribution analysis", "monte carlo simulation", "optimization algorithms",
        "long/short", "sell puts", "sell calls", "straddle", "options strategy", "derivatives-based",
        "quantitative fixed income", "bond factor models", "systematic trading", "algorithmic options"
    ],
    "Multi-Strategy": [
        "multi-strategy", "blended", "diverse", "combination", "hybrid", "integrated", "multi-manager",
        "multi-asset", "allocation across strategies", "dynamic allocation", "tactical overlay",
        "risk-managed", "diversified", "cross-asset", "cross-strategy", "strategy rotation", "adaptive",
        "flexible", "multi-style", "balanced", "mixed approach", "overlay strategy", "core-satellite",
        "barbell strategy", "hedged", "long-short", "market-neutral", "absolute return", "relative value",
        "global macro", "managed futures", "alternative beta", "risk premia", "portable alpha",
        "long/short equity", "options overlay", "derivatives hedging", "multi-asset class",
        "cross-strategy allocation", "alternative strategies"
    ]
}

# Shop lookup for "both" styles.
# Keyed by fund_family; each value maps a return category to the weight that
# classify_return adds to that category's score when the family's Shop_Style is
# "both" (compared case-insensitively). A "both" family that is missing from
# this table gets no family-based weight at all, because the Likely_Fund_Type
# bonus is only applied on the non-"both" branch.
SHOP_BOTH_LOOKUP = {
    "T. Rowe Price": {"Active Discretionary": 3, "Index Based": 1, "Quant/Systematic": 1},
    "Fidelity Investments": {"Active Discretionary": 2, "Index Based": 2, "Quant/Systematic": 1},
    "BlackRock": {"Index Based": 2, "Factor/Smart Beta": 2, "Active Discretionary": 1, "Quant/Systematic": 1},
    "Invesco": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1},
    "WisdomTree": {"Factor/Smart Beta": 3, "Index Based": 1},
    "First Trust": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "AllianceBernstein": {"Active Discretionary": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "JPMorgan": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "John Hancock": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Victory Capital": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Columbia Threadneedle": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Federated": {"Active Discretionary": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1},
    "Hartford Mutual Funds": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Allspring Global Investments": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1}
}

# Database connection.
# The URL names the SQLEXPRESS instance on JULIANS_LAPTOP, the CWA_Fund_Database
# database and ODBC Driver 18, with trusted_connection and TrustServerCertificate
# both set to "yes". The host name is machine-specific.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# SQL query.
# Funds_to_Screen is the driving table; every lookup is a LEFT JOIN, so funds
# without a matching family or category row are kept with NULLs in those columns.
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name, ff.Shop_Style, ff.Likely_Fund_Type
FROM Funds_to_Screen fs
LEFT JOIN FundFamily_Data ff ON fs.fund_family = ff.Fund_Family
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
# Load the query result into a DataFrame (one row per result row).
df = pd.read_sql(query, engine)

# --- Return Generation Classification ---

def keyword_score_return(row, category):
    """Count the return-driver keywords of ``category`` found in the fund's text.

    The searched text is the lower-cased ProductName and investment_strategy
    joined by one space; a missing value contributes an empty string. Each
    entry of the module-level ``keywords`` list for the category adds 1 when
    it occurs as a substring. An unknown category scores 0.
    """
    pieces = []
    for column in ('ProductName', 'investment_strategy'):
        pieces.append(row[column].lower() if pd.notna(row[column]) else "")
    searchable = " ".join(pieces)
    return sum(1 for term in keywords.get(category, []) if term in searchable)

def boolean_adjustment_return(row, category):
    """Return the flag-based bonus for a return category.

    Flags are compared against the string 'True'.
      - Index Based / Factor/Smart Beta: +2 when index_fund is set.
      - Quant/Systematic / Multi-Strategy: +1 when any of leveraged_fund,
        inverse_fund or synthetic_replication_fund is set.
      - Multi-Strategy: a further +1 when fund_of_funds is set.
    """
    bonus = 0
    if category in ("Index Based", "Factor/Smart Beta"):
        if row['index_fund'] == 'True':
            bonus += 2
    if category in ("Quant/Systematic", "Multi-Strategy"):
        structure_flags = ('leveraged_fund', 'inverse_fund', 'synthetic_replication_fund')
        # any() stops at the first set flag, like the chained `or` it replaces.
        if any(row[flag] == 'True' for flag in structure_flags):
            bonus += 1
    if category == "Multi-Strategy":
        if row['fund_of_funds'] == 'True':
            bonus += 1
    return bonus

def category_mapping_return(row):
    """Pick the return category best supported by the fund's category names.

    Every non-missing category column is lower-cased and matched against the
    rules below; each match adds points to one or more return categories, and
    the points from all columns accumulate.

    Args:
        row: Mapping-like fund record (pandas Series or dict) holding the five
            category columns and, optionally, investment_strategy.

    Returns:
        The return category with the highest accumulated score (ties go to
        the one listed first in RETURN_CATEGORIES), or None when no rule
        matched at all.
    """
    category_cols = ['Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']
    scores = {category: 0 for category in RETURN_CATEGORIES}

    # Null-safe strategy text for the commodity rule. row.get()'s default only
    # covers a missing key, so a NULL/NaN cell used to reach .lower() and raise.
    raw_strategy = row.get('investment_strategy', '')
    strategy = raw_strategy.lower() if isinstance(raw_strategy, str) else ""

    for col in category_cols:
        if pd.notna(row[col]):
            value = row[col].lower()
            if "trading tool" in value or value in ["quantitative/tactical", "systematic trend", "tactical allocation", "trading tools", "options trading"]:
                scores["Quant/Systematic"] += 3
            elif value in ["municipal", "muni national", "muni single state"]:
                scores["Index Based"] += 2
                scores["Active Discretionary"] += 2
            elif value in ["nontraditional", "nontraditional bond"]:
                scores["Quant/Systematic"] += 2
                scores["Multi-Strategy"] += 2
                scores["Active Discretionary"] += 1
                scores["Factor/Smart Beta"] += 1
            elif value in ["sector/industry", "communications", "energy", "financials", "healthcare", "industrials", "technology", "utilities"]:
                scores["Index Based"] += 3
            elif value in ["single stock"]:
                scores["Index Based"] += 2
                scores["Quant/Systematic"] += 2
            elif value in ["target maturity", "muni target maturity"]:
                scores["Index Based"] += 3
            elif value in ["digital asset", "digital assets"]:
                scores["Index Based"] += 3
                scores["Factor/Smart Beta"] += 1
                scores["Multi-Strategy"] += 1
            elif value in ["commodity", "commodities broad basket", "commodities specified"]:
                # Commodity funds default to index-based only when the CWA broad
                # category says "Commodity" (case-sensitive); otherwise the
                # strategy text decides, and no points are given if it is silent.
                if row['CWA_Broad_Category_Name'] == "Commodity":
                    scores["Index Based"] += 3
                elif "active" in strategy:
                    scores["Active Discretionary"] += 3
                elif "quantitative" in strategy:
                    scores["Quant/Systematic"] += 3
            elif value in ["alternative", "defined outcome", "multialternative", "multistrategy"]:
                scores["Multi-Strategy"] += 3
            elif value in ["currency", "single currency"]:
                scores["Quant/Systematic"] += 3
            elif value in ["strategic", "bond strategy"]:
                scores["Active Discretionary"] += 3

    # No rule fired anywhere: report "no mapping" instead of an arbitrary winner.
    if max(scores.values()) > 0:
        return max(scores, key=scores.get)
    return None

def classify_return(row):
    """Score one fund row against every return category and pick the winner.

    Each category's score is the sum of:
      - keyword hits in the fund name and strategy,
      - the flag-based adjustment,
      - +2 if the category-name mapping selects it,
      - a family weight: the SHOP_BOTH_LOOKUP weights when Shop_Style is
        "both" (case-insensitive), otherwise +2 for the family's
        Likely_Fund_Type.

    Returns a pandas Series with 'Return_Category' followed by the five
    per-category scores. Ties go to the category listed first in
    RETURN_CATEGORIES.
    """
    scores = {}
    for category in RETURN_CATEGORIES:
        scores[category] = keyword_score_return(row, category) + boolean_adjustment_return(row, category)

    mapped_category = category_mapping_return(row)
    if mapped_category:
        scores[mapped_category] += 2

    shop_style = row['Shop_Style']
    if pd.notna(shop_style) and shop_style.lower() == "both":
        # Mixed shops: spread the family weight according to the lookup table.
        for category, weight in SHOP_BOTH_LOOKUP.get(row['fund_family'], {}).items():
            if category in RETURN_CATEGORIES:
                scores[category] += weight
    elif pd.notna(row['Likely_Fund_Type']) and row['Likely_Fund_Type'] in RETURN_CATEGORIES:
        scores[row['Likely_Fund_Type']] += 2

    result = {'Return_Category': max(scores, key=scores.get)}
    for category in ("Index Based", "Factor/Smart Beta", "Active Discretionary",
                     "Quant/Systematic", "Multi-Strategy"):
        result[f"{category} Score"] = scores[category]
    return pd.Series(result)

# --- Risk Management Overlay Classification ---

def keyword_score_risk(row, category):
    """Count the risk-overlay keywords of ``category`` found in the fund's text.

    The searched text is the lower-cased ProductName and investment_strategy
    joined by one space; a missing value contributes an empty string. Each
    keyword adds 1 when it occurs as a substring. An unknown category scores 0.
    """
    risk_terms = {
        "None": ["long-only", "no derivatives", "no short", "straight equity"],
        "Moderate": ["hedging", "derivatives", "borrowings", "contracts", "futures", "options", "partial hedge", "covered call", "duration management", "hedge"],
        "Heavy / Persistent": ["persistent hedges", "borrowings", "contracts", "futures", "tail-risk", "protective", "overlay", "derivatives", "systematic hedging", "hedge"],
        "Offensive / Levered": ["leverage", "amplified", "geared", "borrowings", "contracts", "futures", "derivatives", "2x", "3x", "hedge"]
    }
    pieces = []
    for column in ('ProductName', 'investment_strategy'):
        pieces.append(row[column].lower() if pd.notna(row[column]) else "")
    searchable = " ".join(pieces)
    hits = 0
    for term in risk_terms.get(category, []):
        if term in searchable:
            hits += 1
    return hits

def boolean_adjustment_risk(row, category):
    """Return the flag-based bonus for a risk-overlay category.

    Flags are compared against the strings 'True' / 'False'.
      - Offensive / Levered: +3 when leveraged_fund is set.
      - Heavy / Persistent and Offensive / Levered: +2 when inverse_fund is set.
      - Moderate and Heavy / Persistent: +1 when synthetic_replication_fund is set.
      - None: +2 only when all three flags are exactly 'False'.
    """
    bonus = 0
    if category == "Offensive / Levered":
        if row['leveraged_fund'] == 'True':
            bonus += 3
    if category in ("Heavy / Persistent", "Offensive / Levered"):
        if row['inverse_fund'] == 'True':
            bonus += 2
    if category in ("Moderate", "Heavy / Persistent"):
        if row['synthetic_replication_fund'] == 'True':
            bonus += 1
    if category == "None":
        structure_flags = ('leveraged_fund', 'inverse_fund', 'synthetic_replication_fund')
        # A missing (non-'False') flag does not count as "no overlay".
        if all(row[flag] == 'False' for flag in structure_flags):
            bonus += 2
    return bonus

def category_mapping_risk(row):
    """Map the fund's category names onto a risk-overlay category.

    The category columns are scanned in a fixed order. The first non-missing
    value whose lower-cased text exactly equals one of the listed names
    decides the result. Returns None when nothing matches.
    """
    # (exact lower-cased names, resulting risk category), in priority order.
    rules = (
        (("us equity", "municipal", "taxable fixed income", "target date"), "None"),
        (("allocation", "bond strategy", "strategic", "target maturity", "convertibles"), "Moderate"),
        (("alternative", "defined outcome", "nontraditional", "specialty", "derivative income"), "Heavy / Persistent"),
        (("trading/tactical", "quantitative/tactical", "commodity", "currency", "digital asset", "trading--leveraged", "trading--inverse"), "Offensive / Levered"),
    )
    columns = ('Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name')
    for column in columns:
        raw = row[column]
        if pd.isna(raw):
            continue
        value = raw.lower()
        for names, label in rules:
            if value in names:
                return label
    return None

def classify_risk(row):
    """Score one fund row against every risk-overlay category and pick the winner.

    Each category's score is the sum of:
      - keyword hits in the fund name and strategy,
      - the flag-based adjustment,
      - +2 if the category-name mapping selects it,
      - a shop-style nudge: +1 to both Heavy / Persistent and
        Offensive / Levered for a 'Quant' shop whose Likely_Fund_Type is
        Quant/Systematic or Multi-Strategy, or +1 to None for an 'Active' shop.

    Returns a pandas Series with 'Risk_Overlay' followed by the four
    per-category scores. Ties go to the category listed first in
    RISK_CATEGORIES.
    """
    scores = {}
    for category in RISK_CATEGORIES:
        scores[category] = keyword_score_risk(row, category) + boolean_adjustment_risk(row, category)

    mapped_category = category_mapping_risk(row)
    if mapped_category:
        scores[mapped_category] += 2

    shop_style = row['Shop_Style']
    if pd.notna(shop_style):
        if shop_style == 'Quant' and row['Likely_Fund_Type'] in ['Quant/Systematic', 'Multi-Strategy']:
            for nudged in ('Heavy / Persistent', 'Offensive / Levered'):
                scores[nudged] += 1
        elif shop_style == 'Active':
            scores['None'] += 1

    result = {'Risk_Overlay': max(scores, key=scores.get)}
    for category in ("None", "Moderate", "Heavy / Persistent", "Offensive / Levered"):
        result[f"{category} Score"] = scores[category]
    return pd.Series(result)

# Apply classifications.
# Each classifier runs row by row (axis=1) and its result columns are appended
# to df; the risk pass therefore runs on the frame that already carries the
# return-category columns.
return_results = df.apply(classify_return, axis=1)
df = pd.concat([df, return_results], axis=1)
risk_results = df.apply(classify_risk, axis=1)
df = pd.concat([df, risk_results], axis=1)

# Columns written to the Excel file, in output order.
columns_to_save = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'Risk_Overlay',
    'ycharts_url','investment_strategy', 'index_fund', 'inverse_fund', 'leveraged_fund',
    'socially_responsible_fund', 'synthetic_replication_fund', 'fund_of_funds',
    'Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name',
    'Category_Name', 'CWA_Broad_Category_Name', 'Shop_Style', 'Likely_Fund_Type',
    'Index Based Score', 'Factor/Smart Beta Score', 'Active Discretionary Score',
    'Quant/Systematic Score', 'Multi-Strategy Score', 'None Score', 'Moderate Score',
    'Heavy / Persistent Score', 'Offensive / Levered Score'
]

# Save to Excel (relative path, so the file lands in the current working directory).
df[columns_to_save].to_excel('classified_funds_v3.xlsx', index=False)
print("Results saved to 'classified_funds_v3.xlsx'")

Results saved to 'classified_funds_v3.xlsx'


In [None]:
# Updated script that incorporates the risk-overlay test alongside the return drivers.
# This update also includes the larger keyword list that Grok missed in the last test.
# This update also includes logic for big firms that are listed as "both" in the FundFamily_Data style.
# This update also scans the fund name for keywords in the risk and return drivers.
# This update adds the ability to skew shop overlay likelihoods.

import pandas as pd
from sqlalchemy import create_engine

# Define categories
# List order matters: the classifiers build their score dicts in this order and pick the
# winner with max(scores, key=scores.get), which returns the first category on a tie.
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]
RISK_CATEGORIES = ["None", "Moderate", "Heavy / Persistent", "Offensive / Levered"]

# Updated keyword lists for return drivers
# Maps each return category to lower-case keywords. keyword_score_return counts how many
# of a category's keywords occur as substrings of the lower-cased fund name + strategy
# text, so a keyword listed under two categories (e.g. "rules-based") scores for both.
return_keywords = {
    "Index Based": [
        "tracks", "replicates", "mirrors", "follows", "indexes", "passive", "benchmark",
        "market-cap weighted", "equal-weighted index", "low tracking error", "high correlation",
        "beta", "etf", "index fund", "closely matches", "minimal deviation", "broad market exposure",
        "cost-efficient", "low-cost", "transparent", "rules-based replication", "market index",
        "sector index", "fixed income index", "commodity index", "currency index", "rules-based", "index",
        "bond index", "aggregate bond", "treasury index", "corporate bond index", "correspond", "underlying index"
    ],
    "Factor/Smart Beta": [
        "factor", "smart beta", "value", "growth", "momentum", "quality", "low volatility",
        "dividend", "fundamental weighting", "equal weighting", "rules-based", "enhanced returns",
        "risk-adjusted", "academic strategies", "tactical", "alternative weighting", "optimized",
        "factor tilt", "multi-factor", "single-factor", "thematic", "sector rotation", "style-based",
        "esg factors", "minimum variance", "quantitative screens", "proprietary index",
        "custom index", "enhanced index", "revenue weighted", "dividend weighted", "yield weighted",
        "dividend yield weighted", "factor weighted", "alternative beta", "tilt"
    ],
    "Active Discretionary": [
        "active", "discretionary", "fundamental", "research-driven", "stock picking",
        "portfolio manager", "judgment", "analysis", "insight", "conviction", "opportunistic",
        "tactical allocation", "high conviction", "concentrated portfolio", "bottom-up", "top-down",
        "macro-driven", "event-driven", "catalyst-driven", "long-term horizon", "short-term trading",
        "active share", "alpha generation", "outperform", "undervalued", "overvalued", "market timing",
        "sector selection", "security selection", "active risk", "active bets", "manager expertise",
        "proprietary research", "in-depth analysis", "qualitative assessment", "quantitative assessment",
        "active bond management", "credit analysis", "duration management", "yield curve positioning",
        "bond selection", "policy", "capital growth", "current income", "manager", "believes", "research",
        "selection", "strategically", "managed"
    ],
    "Quant/Systematic": [
        "quantitative", "systematic", "algorithm", "model-based", "data-driven", "statistical",
        "machine learning", "ai", "rules-driven", "backtested", "factor models", "risk parity",
        "momentum strategies", "mean-variance optimization", "arbitrage", "high-frequency",
        "low-frequency", "trend-following", "mean-reversion", "quantitative screens",
        "proprietary models", "signal-based", "factor timing", "dynamic factor weighting",
        "quantitative overlay", "automated trading", "algorithmic trading", "robo-advisor",
        "systematic risk management", "quantitative risk models", "factor decomposition",
        "style analysis", "attribution analysis", "monte carlo simulation", "optimization algorithms",
        "long/short", "sell puts", "sell calls", "straddle", "options strategy", "derivatives-based",
        "quantitative fixed income", "bond factor models", "systematic trading", "algorithmic options"
    ],
    "Multi-Strategy": [
        "multi-strategy", "blended", "diverse", "combination", "hybrid", "integrated", "multi-manager",
        "multi-asset", "allocation across strategies", "dynamic allocation", "tactical overlay",
        "risk-managed", "diversified", "cross-asset", "cross-strategy", "strategy rotation", "adaptive",
        "flexible", "multi-style", "balanced", "mixed approach", "overlay strategy", "core-satellite",
        "barbell strategy", "hedged", "long-short", "market-neutral", "absolute return", "relative value",
        "global macro", "managed futures", "alternative beta", "risk premia", "portable alpha",
        "long/short equity", "options overlay", "derivatives hedging", "multi-asset class",
        "cross-strategy allocation", "alternative strategies"
    ]
}

# Updated keyword lists for risk overlays
# Maps each risk-overlay category to lower-case keywords counted by keyword_score_risk.
# "borrowings", "contracts", "futures", "derivatives" and "hedge" appear under all three
# non-"None" categories, so a hit on any of them raises those three scores equally.
risk_keywords = {
    "None": ["long-only", "no derivatives", "no short", "straight equity"],
    "Moderate": ["hedging", "derivatives", "borrowings", "contracts", "futures", "options", "partial hedge", "covered call", "duration management", "hedge"],
    "Heavy / Persistent": ["persistent hedges", "borrowings", "contracts", "futures", "tail-risk", "protective", "overlay", "derivatives", "systematic hedging", "hedge"],
    "Offensive / Levered": ["leverage", "amplified", "geared", "borrowings", "contracts", "futures", "derivatives", "2x", "3x", "hedge"]
}

# Updated shop lookup for return drivers
# Per-fund-family score tilts, keyed by the exact fund_family string. classify_return
# adds these weights only for rows whose Shop_Style is "both" (case-insensitive);
# families not listed here contribute nothing.
SHOP_BOTH_LOOKUP = {
    "T. Rowe Price": {"Active Discretionary": 3, "Index Based": 1, "Quant/Systematic": 1},
    "Fidelity Investments": {"Active Discretionary": 2, "Index Based": 2, "Quant/Systematic": 1},
    "BlackRock": {"Index Based": 2, "Factor/Smart Beta": 2, "Active Discretionary": 1, "Quant/Systematic": 1},
    "Invesco": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1},
    "WisdomTree": {"Factor/Smart Beta": 3, "Index Based": 1},
    "First Trust": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "AllianceBernstein": {"Active Discretionary": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "JPMorgan": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "John Hancock": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Victory Capital": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Columbia Threadneedle": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Federated": {"Active Discretionary": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1},
    "Hartford Mutual Funds": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Allspring Global Investments": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1}
}

# New shop lookup for risk overlays (rough estimates, adjust as needed)
# Per-fund-family tilts added to the risk-overlay scores by classify_risk, applied only
# to rows whose Shop_Style is "both" (case-insensitive). Keys must match fund_family exactly.
SHOP_RISK_LOOKUP = {
    "Simplify": {"None": 0, "Moderate": 2, "Heavy / Persistent": 1, "Offensive / Levered": 1},
    "Guggenheim Investments": {"None": 0, "Moderate": 1, "Heavy / Persistent": 1, "Offensive / Levered": 1},
    "ProShares": {"None": 0, "Moderate": 1, "Heavy / Persistent": 1, "Offensive / Levered": 1},
    "Direxion Funds": {"None": 1, "Moderate": 1, "Heavy / Persistent": 1, "Offensive / Levered": 0},
}

# Database connection
# Windows-authenticated (trusted_connection=yes) connection to a named SQL Server Express
# instance; the host name is hard-coded, so this only runs where that instance is reachable.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# SQL query
# LEFT JOINs keep every Funds_to_Screen row even when the fund family or a category ID
# has no match; in that case the joined name/style columns come back NULL.
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name, ff.Shop_Style, ff.Likely_Fund_Type
FROM Funds_to_Screen fs
LEFT JOIN FundFamily_Data ff ON fs.fund_family = ff.Fund_Family
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
# Loads the full result set into memory as the working DataFrame for this cell.
df = pd.read_sql(query, engine)

# --- Return Generation Classification ---

def keyword_score_return(row, category):
    """Count the return-driver keywords of ``category`` found in the fund's text.

    The searched text is the lower-cased ``ProductName`` and ``investment_strategy``
    joined by a single space; a null field contributes an empty string.

    Args:
        row: Record exposing ``ProductName`` and ``investment_strategy``.
        category: Key into ``return_keywords``; an unknown key scores 0.

    Returns:
        Number of keywords of the category contained in the text (each keyword
        counts at most once).
    """
    def lowered(field):
        raw = row[field]
        return raw.lower() if pd.notna(raw) else ""

    haystack = " ".join((lowered('ProductName'), lowered('investment_strategy')))

    # Plain substring containment: a keyword also matches inside a longer word.
    hits = 0
    for term in return_keywords.get(category, []):
        if term in haystack:
            hits += 1
    return hits

def boolean_adjustment_return(row, category):
    """Score bonus for ``category`` derived from the fund's structural flag columns.

    Flags are compared against the string ``'True'``; any other value (including a
    real boolean) counts as not set.

    Args:
        row: Record exposing the flag columns read for the given category.
        category: Return-driver category being scored.

    Returns:
        Integer bonus: +2 for index funds (Index Based, Factor/Smart Beta), +1 for
        leveraged/inverse/synthetic funds (Quant/Systematic, Multi-Strategy), and
        +1 more for fund-of-funds (Multi-Strategy only).
    """
    def flagged(column):
        return row[column] == 'True'

    bonus = 0

    if category in ("Index Based", "Factor/Smart Beta"):
        if flagged('index_fund'):
            bonus += 2

    if category in ("Quant/Systematic", "Multi-Strategy"):
        structural = ('leveraged_fund', 'inverse_fund', 'synthetic_replication_fund')
        if any(flagged(column) for column in structural):
            bonus += 1

    if category == "Multi-Strategy" and flagged('fund_of_funds'):
        bonus += 1

    return bonus

def category_mapping_return(row):
    """Pick the return-driver category suggested by the fund's category labels.

    Every non-null label in the five category columns is lower-cased and compared
    against fixed tag lists; a matching tag adds points to one or more return
    categories.

    Args:
        row: Record (e.g. a DataFrame row) exposing the five category columns and,
            for commodity labels, ``investment_strategy``.

    Returns:
        The highest-scoring category name (the first in RETURN_CATEGORIES order on
        a tie), or None when no label produced any points.
    """
    category_cols = ['Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']
    scores = {category: 0 for category in RETURN_CATEGORIES}
    bond_related = ["municipal", "muni national", "muni single state", "global bond", "taxable fixed income", "fixed income"]

    # investment_strategy can be NULL/NaN. The key still exists in that case, so a
    # .get() default is never used and calling .lower() on the raw value would raise.
    strategy_raw = row.get('investment_strategy')
    strategy_text = strategy_raw.lower() if isinstance(strategy_raw, str) else ""

    for col in category_cols:
        if pd.notna(row[col]):
            value = row[col].lower()
            # Bond-related labels get a small split tilt ("bond strategy" is not in the
            # list, so it is handled only by the chain below).
            if value in bond_related:
                scores["Index Based"] += 1
                scores["Active Discretionary"] += 1
            # Specific actionable tags (municipal labels score here in addition to the
            # bond-related tilt above)
            if "trading tool" in value or value in ["quantitative/tactical", "systematic trend", "tactical allocation", "trading tools", "options trading"]:
                scores["Quant/Systematic"] += 3
            elif value in ["municipal", "muni national", "muni single state"]:
                scores["Index Based"] += 1
                scores["Active Discretionary"] += 2
            elif value in ["nontraditional", "nontraditional bond"]:
                scores["Quant/Systematic"] += 2
                scores["Multi-Strategy"] += 2
                scores["Active Discretionary"] += 1
                scores["Factor/Smart Beta"] += 1
            elif value in ["sector/industry", "communications", "energy", "financials", "healthcare", "industrials", "technology", "utilities"]:
                scores["Index Based"] += 3
            elif value in ["single stock"]:
                scores["Index Based"] += 2
                scores["Quant/Systematic"] += 2
            elif value in ["target maturity", "muni target maturity"]:
                scores["Index Based"] += 3
            elif value in ["digital asset", "digital assets"]:
                scores["Index Based"] += 3
                scores["Factor/Smart Beta"] += 1
                scores["Multi-Strategy"] += 1
            elif value in ["commodity", "commodities broad basket", "commodities specified"]:
                # Commodity labels are conditional: the CWA bucket wins, otherwise the
                # strategy text decides (and nothing is added if neither applies).
                if row['CWA_Broad_Category_Name'] == "Commodity":
                    scores["Index Based"] += 3
                elif "active" in strategy_text:
                    scores["Active Discretionary"] += 3
                elif "quantitative" in strategy_text:
                    scores["Quant/Systematic"] += 3
            elif value in ["alternative", "defined outcome", "multialternative", "multistrategy"]:
                scores["Multi-Strategy"] += 3
            elif value in ["currency", "single currency"]:
                scores["Quant/Systematic"] += 3
            elif value in ["strategic", "bond strategy"]:
                scores["Active Discretionary"] += 3

    max_score = max(scores.values())
    if max_score > 0:
        return max(scores, key=scores.get)
    return None

def classify_return(row):
    """Score one fund against every return-driver category and pick the winner.

    Points come from keyword hits, structural flags, the category-label mapping (+2
    for the mapped category) and a shop tilt: per-family weights when Shop_Style is
    "both", otherwise +2 for the family's Likely_Fund_Type.

    Args:
        row: One fund record from the query result.

    Returns:
        pd.Series with ``Return_Category`` (first category on a tie) followed by the
        five per-category scores.
    """
    tally = {
        name: keyword_score_return(row, name) + boolean_adjustment_return(row, name)
        for name in RETURN_CATEGORIES
    }

    mapped = category_mapping_return(row)
    if mapped:
        tally[mapped] += 2

    shop_style = row['Shop_Style']
    if pd.notna(shop_style) and shop_style.lower() == "both":
        # Large multi-style shops: use the per-family weight table.
        family_weights = SHOP_BOTH_LOOKUP.get(row['fund_family'], {})
        for name, weight in family_weights.items():
            if name in RETURN_CATEGORIES:
                tally[name] += weight
    else:
        likely = row['Likely_Fund_Type']
        if pd.notna(likely) and likely in RETURN_CATEGORIES:
            tally[likely] += 2

    winner = max(tally, key=tally.get)
    return pd.Series({
        'Return_Category': winner,
        'Index Based Score': tally['Index Based'],
        'Factor/Smart Beta Score': tally['Factor/Smart Beta'],
        'Active Discretionary Score': tally['Active Discretionary'],
        'Quant/Systematic Score': tally['Quant/Systematic'],
        'Multi-Strategy Score': tally['Multi-Strategy']
    })

# --- Risk Management Overlay Classification ---

def keyword_score_risk(row, category):
    """Count the risk-overlay keywords of ``category`` found in the fund's text.

    The searched text is the lower-cased ``ProductName`` and ``investment_strategy``
    joined by a single space; a null field contributes an empty string.

    Args:
        row: Record exposing ``ProductName`` and ``investment_strategy``.
        category: Key into ``risk_keywords``; an unknown key scores 0.

    Returns:
        Number of keywords of the category contained in the text.
    """
    fragments = []
    for field in ('ProductName', 'investment_strategy'):
        fragments.append(row[field].lower() if pd.notna(row[field]) else "")
    searchable = fragments[0] + " " + fragments[1]

    # Substring containment, not whole-word matching.
    matched = [term for term in risk_keywords.get(category, []) if term in searchable]
    return len(matched)

def boolean_adjustment_risk(row, category):
    """Score bonus for a risk-overlay ``category`` from the structural flag columns.

    Flags are stored as the strings ``'True'`` / ``'False'`` and are compared as such.

    Args:
        row: Record exposing ``leveraged_fund``, ``inverse_fund`` and
            ``synthetic_replication_fund`` as needed for the category.
        category: Risk-overlay category being scored.

    Returns:
        Integer bonus for the category.
    """
    # (categories that benefit, flag column, points when the flag is 'True')
    flag_rules = (
        (("Offensive / Levered",), 'leveraged_fund', 3),
        (("Heavy / Persistent", "Offensive / Levered"), 'inverse_fund', 2),
        (("Moderate", "Heavy / Persistent"), 'synthetic_replication_fund', 1),
    )

    total = 0
    for beneficiaries, flag_column, points in flag_rules:
        if category in beneficiaries and row[flag_column] == 'True':
            total += points

    # "None" is rewarded only when every structural flag is explicitly 'False'.
    if category == "None":
        structural = ['leveraged_fund', 'inverse_fund', 'synthetic_replication_fund']
        if all(row[name] == 'False' for name in structural):
            total += 2

    return total

def category_mapping_risk(row):
    """Map the fund's category labels to a risk-overlay category.

    The five category columns are checked in a fixed order; the first non-null
    label (lower-cased) that exactly equals a known tag decides the result.

    Args:
        row: Record exposing the five category columns.

    Returns:
        One of "None", "Moderate", "Heavy / Persistent", "Offensive / Levered",
        or the value None when no label matches a known tag.
    """
    overlay_by_tag = {
        **dict.fromkeys(["us equity", "municipal", "taxable fixed income", "target date"], "None"),
        **dict.fromkeys(["allocation", "bond strategy", "strategic", "target maturity", "convertibles"], "Moderate"),
        **dict.fromkeys(["alternative", "defined outcome", "nontraditional", "specialty", "derivative income"], "Heavy / Persistent"),
        **dict.fromkeys(["trading/tactical", "quantitative/tactical", "commodity", "currency", "digital asset", "trading--leveraged", "trading--inverse"], "Offensive / Levered"),
    }

    for column in ('Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name'):
        label = row[column]
        if not pd.notna(label):
            continue
        overlay = overlay_by_tag.get(label.lower())
        if overlay is not None:
            return overlay
    return None

def classify_risk(row):
    """Score one fund against every risk-overlay category and pick the winner.

    Points come from keyword hits, structural flags, the category-label mapping (+2
    for the mapped category) and a shop tilt: per-family weights when Shop_Style is
    "both"; +1 to both hedged/levered categories for a 'Quant' shop whose
    Likely_Fund_Type is Quant/Systematic or Multi-Strategy; +1 to "None" for an
    'Active' shop.

    Args:
        row: One fund record from the query result.

    Returns:
        pd.Series with ``Risk_Overlay`` (first category on a tie) followed by the
        four per-category scores.
    """
    tally = {
        overlay: keyword_score_risk(row, overlay) + boolean_adjustment_risk(row, overlay)
        for overlay in RISK_CATEGORIES
    }

    mapped = category_mapping_risk(row)
    if mapped:
        tally[mapped] += 2

    shop_style = row['Shop_Style']
    if pd.notna(shop_style):
        if shop_style.lower() == "both":
            for overlay, weight in SHOP_RISK_LOOKUP.get(row['fund_family'], {}).items():
                if overlay in RISK_CATEGORIES:
                    tally[overlay] += weight
        elif shop_style == 'Quant' and row['Likely_Fund_Type'] in ['Quant/Systematic', 'Multi-Strategy']:
            tally['Heavy / Persistent'] += 1
            tally['Offensive / Levered'] += 1
        elif shop_style == 'Active':
            tally['None'] += 1

    winner = max(tally, key=tally.get)
    return pd.Series({
        'Risk_Overlay': winner,
        'None Score': tally['None'],
        'Moderate Score': tally['Moderate'],
        'Heavy / Persistent Score': tally['Heavy / Persistent'],
        'Offensive / Levered Score': tally['Offensive / Levered']
    })

# Apply classifications
# Each classifier returns a pd.Series per row, so apply(axis=1) produces a DataFrame of
# result columns that is appended to df column-wise.
return_results = df.apply(classify_return, axis=1)
df = pd.concat([df, return_results], axis=1)
risk_results = df.apply(classify_risk, axis=1)
df = pd.concat([df, risk_results], axis=1)

# Select columns for Excel output: identifiers and the two final labels first, then
# ycharts_url, the raw inputs, and the per-category scores
columns_to_save = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'Risk_Overlay', 'ycharts_url',
    'investment_strategy', 'index_fund', 'inverse_fund', 'leveraged_fund',
    'socially_responsible_fund', 'synthetic_replication_fund', 'fund_of_funds',
    'Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name',
    'Category_Name', 'CWA_Broad_Category_Name', 'Shop_Style', 'Likely_Fund_Type',
    'Index Based Score', 'Factor/Smart Beta Score', 'Active Discretionary Score',
    'Quant/Systematic Score', 'Multi-Strategy Score', 'None Score', 'Moderate Score',
    'Heavy / Persistent Score', 'Offensive / Levered Score'
]

# Save to Excel
# NOTE(review): this is the same file name the previous cell writes, so running this
# cell overwrites that output - confirm whether a distinct name was intended.
df[columns_to_save].to_excel('classified_funds_v3.xlsx', index=False)
print("Results saved to 'classified_funds_v3.xlsx'")

In [16]:
# Separated Return Drivers Script
# updated script to incorporate risk overlays test and the return drivers
# This update also includes the larger keyword list that grok missed in the last test.
# This update also includes logic for big firms that are listed as "both" in the FundFamily_Data style
# Update also scans the fund name for keywords in risk and return drivers
# This update added the ability to skew shop overlay likelihoods

import pandas as pd
from sqlalchemy import create_engine

# Define return generation categories
# List order matters: score dicts are built in this order and max(scores, key=scores.get)
# returns the first category on a tie.
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# Keyword lists for return drivers
# Maps each return category to lower-case keywords that keyword_score_return looks for as
# substrings of the lower-cased fund name + strategy text. Because the fund name is
# scanned too, brand keywords such as "ishares" and "blackrock" act as name-based tilts.
return_keywords = {
    "Index Based": [
        "tracks", "replicates", "mirrors", "follows", "indexes", "passive", "benchmark",
        "market-cap weighted", "equal-weighted index", "low tracking error", "high correlation",
        "beta", "etf", "index fund", "closely matches", "minimal deviation", "broad market exposure",
        "cost-efficient", "low-cost", "transparent", "rules-based replication", "market index",
        "sector index", "fixed income index", "commodity index", "currency index", "rules-based", "index",
        "bond index", "aggregate bond", "treasury index", "corporate bond index", "correspond", "underlying index"
    ],
    "Factor/Smart Beta": [
        "factor", "smart beta", "value", "growth", "momentum", "quality", "low volatility",
        "dividend", "fundamental weighting", "equal weighting", "rules-based", "enhanced returns",
        "risk-adjusted", "academic strategies", "tactical", "alternative weighting", "optimized",
        "factor tilt", "multi-factor", "single-factor", "thematic", "sector rotation", "style-based",
        "esg factors", "minimum variance", "quantitative screens", "proprietary index",
        "custom index", "enhanced index", "revenue weighted", "dividend weighted", "yield weighted",
        "dividend yield weighted", "factor weighted", "alternative beta", "tilt", "ishares"
    ],
    "Active Discretionary": [
        "active", "discretionary", "fundamental", "research-driven", "stock picking",
        "portfolio manager", "judgment", "analysis", "insight", "conviction", "opportunistic",
        "tactical allocation", "high conviction", "concentrated portfolio", "bottom-up", "top-down",
        "macro-driven", "event-driven", "catalyst-driven", "long-term horizon", "short-term trading",
        "active share", "alpha generation", "outperform", "undervalued", "overvalued", "market timing",
        "sector selection", "security selection", "active risk", "active bets", "manager expertise",
        "proprietary research", "in-depth analysis", "qualitative assessment", "quantitative assessment",
        "active bond management", "credit analysis", "duration management", "yield curve positioning",
        "bond selection", "policy", "capital growth", "current income", "manager", "believes", "research",
        "selection", "strategically", "managed", "actively managed", "blackrock"
    ],
    "Quant/Systematic": [
        "quantitative", "systematic", "algorithm", "model-based", "data-driven", "statistical",
        "machine learning", "ai", "rules-driven", "backtested", "factor models", "risk parity",
        "momentum strategies", "mean-variance optimization", "arbitrage", "high-frequency",
        "low-frequency", "trend-following", "mean-reversion", "quantitative screens",
        "proprietary models", "signal-based", "factor timing", "dynamic factor weighting",
        "quantitative overlay", "automated trading", "algorithmic trading", "robo-advisor",
        "systematic risk management", "quantitative risk models", "factor decomposition",
        "style analysis", "attribution analysis", "monte carlo simulation", "optimization algorithms",
        "long/short", "sell puts", "sell calls", "straddle", "options strategy", "derivatives-based",
        "quantitative fixed income", "bond factor models", "systematic trading", "algorithmic options"
    ],
    "Multi-Strategy": [
        "multi-strategy", "blended", "diverse", "combination", "hybrid", "integrated", "multi-manager",
        "multi-asset", "allocation across strategies", "dynamic allocation", "tactical overlay",
        "risk-managed", "diversified", "cross-asset", "cross-strategy", "strategy rotation", "adaptive",
        "flexible", "multi-style", "balanced", "mixed approach", "overlay strategy", "core-satellite",
        "barbell strategy", "hedged", "long-short", "market-neutral", "absolute return", "relative value",
        "global macro", "managed futures", "alternative beta", "risk premia", "portable alpha",
        "long/short equity", "options overlay", "derivatives hedging", "multi-asset class",
        "cross-strategy allocation", "alternative strategies"
    ]
}

# Shop lookup for return drivers
# Per-fund-family score tilts, keyed by the exact fund_family string. classify_return
# adds these weights only for rows whose Shop_Style is "both" (case-insensitive);
# families not listed here contribute nothing.
SHOP_BOTH_LOOKUP = {
    "T. Rowe Price": {"Active Discretionary": 3, "Index Based": 1, "Quant/Systematic": 1},
    "Fidelity Investments": {"Active Discretionary": 2, "Index Based": 2, "Quant/Systematic": 1},
    "BlackRock": {"Index Based": 2, "Factor/Smart Beta": 2, "Active Discretionary": 1, "Quant/Systematic": 1},
    "Invesco": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1},
    "WisdomTree": {"Factor/Smart Beta": 3, "Index Based": 1},
    "First Trust": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 3, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "AllianceBernstein": {"Active Discretionary": 3, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "JPMorgan": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "John Hancock": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Victory Capital": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Columbia Threadneedle": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "Federated": {"Active Discretionary": 2, "Factor/Smart Beta": 1, "Quant/Systematic": 1},
    "Hartford Mutual Funds": {"Active Discretionary": 2, "Index Based": 1, "Factor/Smart Beta": 2, "Quant/Systematic": 1, "Multi-Strategy": 1},
    # The comma after this entry was missing, which made the whole cell a SyntaxError.
    "Allspring Global Investments": {"Active Discretionary": 3, "Index Based": 1, "Factor/Smart Beta": 1, "Quant/Systematic": 1, "Multi-Strategy": 1},
    "American Funds": {"Active Discretionary": 2},
    "AMG Funds": {"Active Discretionary": 2},
    "Amundi US": {"Active Discretionary": 1},
    # "AQR Funds" was listed twice (weights 1 and 2); a dict literal keeps the last
    # value, so the single remaining entry carries the weight of 2.
    "AQR Funds": {"Quant/Systematic": 2},
}

# Category lookup for return drivers
# Keys are lower-case category tags; values are the points added per return category.
# category_mapping_return applies an entry whenever its key is contained in a lower-cased
# category label (substring test), so a label can trigger more than one entry
# (e.g. "digital assets" contains both "digital asset" and "digital assets").
CATEGORY_LOOKUP = {
    "trading tool": {"Quant/Systematic": 3},
    "quantitative/tactical": {"Quant/Systematic": 3},
    "systematic trend": {"Quant/Systematic": 3},
    "tactical allocation": {"Quant/Systematic": 3},
    "trading tools": {"Quant/Systematic": 3},
    "options trading": {"Quant/Systematic": 3},
    "municipal": {"Index Based": 1, "Active Discretionary": 2},  # Reduced Index from +2 to +1
    "muni national": {"Index Based": 1, "Active Discretionary": 2},
    "muni single state": {"Index Based": 1, "Active Discretionary": 2},
    "nontraditional": {"Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 1, "Factor/Smart Beta": 1},
    "nontraditional bond": {"Quant/Systematic": 2, "Multi-Strategy": 2, "Active Discretionary": 1, "Factor/Smart Beta": 1},
    "sector/industry": {"Index Based": 3},
    "communications": {"Index Based": 3},
    "energy": {"Index Based": 3},
    "financials": {"Index Based": 3},
    "healthcare": {"Index Based": 3},
    "industrials": {"Index Based": 3},
    "technology": {"Index Based": 3},
    "utilities": {"Index Based": 3},
    "single stock": {"Index Based": 2, "Quant/Systematic": 2},
    "target maturity": {"Index Based": 3},
    "muni target maturity": {"Index Based": 3},
    "digital asset": {"Index Based": 3, "Factor/Smart Beta": 1, "Multi-Strategy": 1},
    "digital assets": {"Index Based": 3, "Factor/Smart Beta": 1, "Multi-Strategy": 1},
    "commodity": {"Index Based": 3},  # Additional conditional logic lives in category_mapping_return
    "commodities broad basket": {"Index Based": 3},
    "commodities specified": {"Index Based": 3},
    "alternative": {"Multi-Strategy": 3},
    "defined outcome": {"Multi-Strategy": 3},
    "multialternative": {"Multi-Strategy": 3},
    "multistrategy": {"Multi-Strategy": 3},
    "currency": {"Quant/Systematic": 3},
    "single currency": {"Quant/Systematic": 3},
    "strategic": {"Active Discretionary": 3},
    "bond strategy": {"Active Discretionary": 3},
    "global bond": {"Index Based": 1, "Active Discretionary": 1},
    "taxable fixed income": {"Index Based": 1, "Active Discretionary": 1},
    "fixed income": {"Index Based": 1, "Active Discretionary": 1}
}

# Database connection
# Windows-authenticated (trusted_connection=yes) connection to a named SQL Server Express
# instance; the host name is hard-coded, so this only runs where that instance is reachable.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# SQL query
# LEFT JOINs keep every Funds_to_Screen row even when the fund family or a category ID
# has no match; in that case the joined name/style columns come back NULL.
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name, ff.Shop_Style, ff.Likely_Fund_Type
FROM Funds_to_Screen fs
LEFT JOIN FundFamily_Data ff ON fs.fund_family = ff.Fund_Family
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
# Loads the full result set into memory as the working DataFrame for this cell.
df = pd.read_sql(query, engine)

# --- Return Generation Classification ---

def keyword_score_return(row, category):
    """Count the return-driver keywords of ``category`` found in the fund's text.

    The searched text is the lower-cased ``ProductName`` and ``investment_strategy``
    joined by a single space; a null field contributes an empty string.

    Args:
        row: Record exposing ``ProductName`` and ``investment_strategy``.
        category: Key into ``return_keywords``; an unknown key scores 0.

    Returns:
        Number of keywords of the category contained in the text (each keyword
        counts at most once).
    """
    def lowered(field):
        raw = row[field]
        return raw.lower() if pd.notna(raw) else ""

    haystack = " ".join((lowered('ProductName'), lowered('investment_strategy')))

    # Plain substring containment: a keyword also matches inside a longer word.
    hits = 0
    for term in return_keywords.get(category, []):
        if term in haystack:
            hits += 1
    return hits

def boolean_adjustment_return(row, category):
    """Score bonus for ``category`` derived from the fund's structural flag columns.

    Flags are compared against the string ``'True'``; any other value (including a
    real boolean) counts as not set.

    Args:
        row: Record exposing the flag columns read for the given category.
        category: Return-driver category being scored.

    Returns:
        Integer bonus: +2 for index funds (Index Based, Factor/Smart Beta), +1 for
        leveraged/inverse/synthetic funds (Quant/Systematic, Multi-Strategy), and
        +1 more for fund-of-funds (Multi-Strategy only).
    """
    def flagged(column):
        return row[column] == 'True'

    bonus = 0

    if category in ("Index Based", "Factor/Smart Beta"):
        if flagged('index_fund'):
            bonus += 2

    if category in ("Quant/Systematic", "Multi-Strategy"):
        structural = ('leveraged_fund', 'inverse_fund', 'synthetic_replication_fund')
        if any(flagged(column) for column in structural):
            bonus += 1

    if category == "Multi-Strategy" and flagged('fund_of_funds'):
        bonus += 1

    return bonus

def category_mapping_return(row):
    """Pick the return-driver category suggested by the fund's category labels.

    Every non-null label in the five category columns is lower-cased; each
    CATEGORY_LOOKUP key contained in the label adds that entry's points. Commodity
    labels additionally raise one category to at least 3 points, chosen by the CWA
    bucket or, failing that, by the strategy text.

    Args:
        row: Record (e.g. a DataFrame row) exposing the five category columns and,
            for commodity labels, ``investment_strategy``.

    Returns:
        The highest-scoring category name (the first in RETURN_CATEGORIES order on
        a tie), or None when no label produced any points.
    """
    category_cols = ['Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']
    scores = {category: 0 for category in RETURN_CATEGORIES}

    # investment_strategy can be NULL/NaN. The key still exists in that case, so a
    # .get() default is never used and calling .lower() on the raw value would raise.
    strategy_raw = row.get('investment_strategy')
    strategy_text = strategy_raw.lower() if isinstance(strategy_raw, str) else ""

    for col in category_cols:
        if pd.notna(row[col]):
            value = row[col].lower()
            # Substring test: every lookup key contained in the label contributes.
            # NOTE(review): overlapping keys (e.g. "currency" / "single currency")
            # therefore both score for one label - confirm this is intended.
            for keyword, category_weights in CATEGORY_LOOKUP.items():
                if keyword in value:
                    for cat, points in category_weights.items():
                        scores[cat] += points
            # Special case for "commodity" conditional logic: lift one category to
            # at least 3 without stacking on top of points already awarded.
            if value in ["commodity", "commodities broad basket", "commodities specified"]:
                if row['CWA_Broad_Category_Name'] == "Commodity":
                    scores["Index Based"] = max(scores["Index Based"], 3)
                elif "active" in strategy_text:
                    scores["Active Discretionary"] = max(scores["Active Discretionary"], 3)
                elif "quantitative" in strategy_text:
                    scores["Quant/Systematic"] = max(scores["Quant/Systematic"], 3)

    max_score = max(scores.values())
    if max_score > 0:
        return max(scores, key=scores.get)
    return None

def classify_return(row):
    """Score one fund against every return-driver category and pick the winner.

    Points come from keyword hits, structural flags, two phrase bonuses
    ("actively managed", "rules-based"), the category-label mapping (+2 for the
    mapped category) and a shop tilt: per-family weights when Shop_Style is "both",
    otherwise +2 for the family's Likely_Fund_Type.

    Args:
        row: One fund record from the query result.

    Returns:
        pd.Series with ``Return_Category`` (first category on a tie) followed by the
        five per-category scores.
    """
    def lowered(field):
        raw = row[field]
        return raw.lower() if pd.notna(raw) else ""

    searchable = lowered('ProductName') + " " + lowered('investment_strategy')

    # Base keyword scoring plus structural-flag adjustments
    tally = {
        name: keyword_score_return(row, name) + boolean_adjustment_return(row, name)
        for name in RETURN_CATEGORIES
    }

    # Bonus points for strong, explicit phrases in the fund text
    phrase_bonuses = (
        ("actively managed", "Active Discretionary"),
        ("rules-based", "Factor/Smart Beta"),
    )
    for phrase, target in phrase_bonuses:
        if phrase in searchable:
            tally[target] += 2

    # Category mapping
    mapped = category_mapping_return(row)
    if mapped:
        tally[mapped] += 2

    # Shop tilt
    shop_style = row['Shop_Style']
    if pd.notna(shop_style) and shop_style.lower() == "both":
        family_weights = SHOP_BOTH_LOOKUP.get(row['fund_family'], {})
        for name, weight in family_weights.items():
            if name in RETURN_CATEGORIES:
                tally[name] += weight
    else:
        likely = row['Likely_Fund_Type']
        if pd.notna(likely) and likely in RETURN_CATEGORIES:
            tally[likely] += 2

    winner = max(tally, key=tally.get)
    return pd.Series({
        'Return_Category': winner,
        'Index Based Score': tally['Index Based'],
        'Factor/Smart Beta Score': tally['Factor/Smart Beta'],
        'Active Discretionary Score': tally['Active Discretionary'],
        'Quant/Systematic Score': tally['Quant/Systematic'],
        'Multi-Strategy Score': tally['Multi-Strategy']
    })

# Apply classification
# classify_return returns a pd.Series per row, so apply(axis=1) produces a DataFrame of
# result columns that is appended to df column-wise.
return_results = df.apply(classify_return, axis=1)
df = pd.concat([df, return_results], axis=1)

# Select columns for Excel output with updated order: identifiers and the final label,
# then shop info and per-category scores, then the raw inputs
columns_to_save = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'ycharts_url',
    'Shop_Style', 'Likely_Fund_Type', 'Index Based Score', 'Factor/Smart Beta Score', 'Active Discretionary Score',
    'Quant/Systematic Score', 'Multi-Strategy Score', 'investment_strategy', 'index_fund', 'inverse_fund', 'leveraged_fund',
    'socially_responsible_fund', 'synthetic_replication_fund', 'fund_of_funds',
    'Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name',
    'Category_Name', 'CWA_Broad_Category_Name'
]

# Save to Excel with new filename (written to the current working directory)
df[columns_to_save].to_excel('classified_return_funds_v3.xlsx', index=False)
print("Results saved to 'classified_return_funds_v3.xlsx'")

Results saved to 'classified_return_funds_v3.xlsx'


In [None]:
# Separated Risk Overlays screen

import pandas as pd
from sqlalchemy import create_engine

# Define risk management overlay categories
# The list order matters: classify_risk and category_mapping_risk pick the winner
# with max(), which returns the first category in this order when scores tie.
RISK_CATEGORIES = ["None", "Moderate", "Heavy / Persistent", "Offensive / Levered"]

# Updated keyword lists for risk overlays
# keyword_score_risk counts each entry once if it occurs as a plain substring of
# the lower-cased "ProductName + investment_strategy" text, so entries must be
# lower-case.
# NOTE(review): "borrowings", "contracts", "futures", "derivatives" and "hedge"
# appear in all three non-"None" lists, so a hit on any of them raises those three
# categories by the same amount and does not help separate them.
risk_keywords = {
    "None": ["long-only", "no derivatives", "no short", "straight equity"],
    "Moderate": ["hedging", "derivatives", "borrowings", "contracts", "futures", "options", "partial hedge", "covered call", "duration management", "hedge"],
    "Heavy / Persistent": ["persistent hedges", "borrowings", "contracts", "futures", "tail-risk", "protective", "overlay", "derivatives", "systematic hedging", "hedge"],
    "Offensive / Levered": ["leverage", "amplified", "geared", "borrowings", "contracts", "futures", "derivatives", "2x", "3x", "hedge"]
}

# Shop lookup for risk overlays
# Maps fund_family -> {risk category: points}. classify_risk adds these points
# only for rows whose Shop_Style equals "both" (case-insensitive); the
# fund_family match is exact and case-sensitive, and families not listed here
# contribute nothing.
SHOP_RISK_LOOKUP = {
    "T. Rowe Price": {"None": 2, "Moderate": 1},
    "Fidelity Investments": {"None": 2, "Moderate": 1, "Heavy / Persistent": 1},
    "BlackRock": {"None": 2, "Moderate": 1, "Offensive / Levered": 1},
    "Invesco": {"None": 2, "Moderate": 1, "Heavy / Persistent": 1},
    "WisdomTree": {"None": 2, "Moderate": 1},
    "First Trust": {"None": 1, "Moderate": 2, "Heavy / Persistent": 1},
    "AllianceBernstein": {"None": 1, "Moderate": 2, "Heavy / Persistent": 1},
    "JPMorgan": {"None": 1, "Moderate": 2, "Heavy / Persistent": 1, "Offensive / Levered": 1},
    "John Hancock": {"None": 1, "Moderate": 2, "Heavy / Persistent": 1},
    "Victory Capital": {"None": 1, "Moderate": 2, "Heavy / Persistent": 1},
    "Columbia Threadneedle": {"None": 1, "Moderate": 2, "Heavy / Persistent": 1},
    "Federated": {"None": 2, "Moderate": 1},
    "Hartford Mutual Funds": {"None": 1, "Moderate": 2, "Heavy / Persistent": 1},
    "Allspring Global Investments": {"None": 1, "Moderate": 2, "Heavy / Persistent": 1}
}

# Category lookup for risk overlays
# Maps a lower-case fragment of a category label -> {risk category: points}.
# category_mapping_risk lower-cases each of the five category columns and adds the
# points for every key that occurs as a substring, then returns only the
# top-scoring risk category; classify_risk gives that winner a flat +2. The point
# values here therefore only decide which category wins the mapping step.
# Because matching is by substring, a short key also matches longer labels that
# contain it (e.g. "alternative" is contained in "multialternative").
# NOTE(review): "commodity" is not a substring of "commodities" - confirm which
# spelling the category tables actually use.
RISK_CATEGORY_LOOKUP = {
    "us equity": {"None": 2},
    "municipal": {"None": 2},
    "taxable fixed income": {"None": 2},
    "target date": {"None": 2},
    "allocation": {"Moderate": 2},
    "bond strategy": {"Moderate": 2},
    "strategic": {"Moderate": 2},
    "target maturity": {"Moderate": 2},
    "convertibles": {"Moderate": 2},
    "alternative": {"Heavy / Persistent": 2},
    "defined outcome": {"Heavy / Persistent": 2},
    "nontraditional": {"Heavy / Persistent": 2},
    "specialty": {"Heavy / Persistent": 2},
    "derivative income": {"Heavy / Persistent": 2},
    "trading/tactical": {"Offensive / Levered": 2},
    "quantitative/tactical": {"Offensive / Levered": 2},
    "commodity": {"Offensive / Levered": 2},
    "currency": {"Offensive / Levered": 2},
    "digital asset": {"Offensive / Levered": 2},
    "trading--leveraged": {"Offensive / Levered": 3},
    "trading--inverse": {"Offensive / Levered": 3}
}

# Database connection
# SQLAlchemy URL for the CWA_Fund_Database on a named SQL Server Express instance,
# using Windows authentication (trusted_connection=yes) through ODBC Driver 18.
# NOTE(review): the host name is hard-coded, so this cell only runs on a machine
# that can resolve JULIANS_LAPTOP\SQLEXPRESS; consider reading it from config.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# SQL query
# One row per fund in Funds_to_Screen, with the fund-family style columns and the
# five category labels resolved from their ID columns. Every join is a LEFT JOIN,
# so funds without a matching family or category row are kept and the joined
# columns come back as NULL (the scoring functions guard these with pd.notna).
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name, ff.Shop_Style, ff.Likely_Fund_Type
FROM Funds_to_Screen fs
LEFT JOIN FundFamily_Data ff ON fs.fund_family = ff.Fund_Family
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
# Load the full result set into memory as the working DataFrame for this cell.
df = pd.read_sql(query, engine)

# --- Risk Management Overlay Classification ---

def keyword_score_risk(row, category):
    """Count the risk keywords of ``category`` found in a fund's descriptive text.

    Args:
        row: Mapping-like fund record providing 'ProductName' and
            'investment_strategy'; missing (NA) values are treated as "".
        category: Key into ``risk_keywords``; an unknown key has no keywords.

    Returns:
        int: Number of keyword entries that occur as a substring of the
        lower-cased "name + space + strategy" text (each entry counts once).
    """
    name_text = ""
    if pd.notna(row['ProductName']):
        name_text = row['ProductName'].lower()

    strategy_text = ""
    if pd.notna(row['investment_strategy']):
        strategy_text = row['investment_strategy'].lower()

    haystack = f"{name_text} {strategy_text}"

    hit_count = 0
    for needle in risk_keywords.get(category, []):
        if needle in haystack:
            hit_count += 1
    return hit_count

def boolean_adjustment_risk(row, category):
    """Return the bonus a risk category earns from the fund's structural flags.

    The flags are compared against the literal strings 'True' / 'False'; any
    other representation (real booleans, NA, other text) earns nothing.

    Args:
        row: Mapping-like fund record with 'leveraged_fund', 'inverse_fund'
            and 'synthetic_replication_fund'.
        category: Risk overlay category being scored.

    Returns:
        int: Sum of the applicable bonuses:
            leveraged  -> +3 for "Offensive / Levered"
            inverse    -> +2 for "Heavy / Persistent" and "Offensive / Levered"
            synthetic  -> +1 for "Moderate" and "Heavy / Persistent"
            all three flags equal 'False' -> +2 for "None"
    """
    bonus = 0
    if category == "Offensive / Levered":
        if row['leveraged_fund'] == 'True':
            bonus += 3
        if row['inverse_fund'] == 'True':
            bonus += 2
    elif category == "Heavy / Persistent":
        if row['inverse_fund'] == 'True':
            bonus += 2
        if row['synthetic_replication_fund'] == 'True':
            bonus += 1
    elif category == "Moderate":
        if row['synthetic_replication_fund'] == 'True':
            bonus += 1
    elif category == "None":
        flag_columns = ['leveraged_fund', 'inverse_fund', 'synthetic_replication_fund']
        # Only an explicit 'False' on every flag counts as "no overlay".
        if not any(row[col] != 'False' for col in flag_columns):
            bonus += 2
    return bonus

def category_mapping_risk(row):
    """Pick the risk category best supported by the fund's category labels.

    Each non-missing label in the five category columns is lower-cased and
    tested against every key of ``RISK_CATEGORY_LOOKUP`` by substring; the
    points of all matching keys are accumulated per risk category.

    Args:
        row: Mapping-like fund record with the five category-name columns.

    Returns:
        str | None: The risk category with the highest accumulated points
        (first in ``RISK_CATEGORIES`` order on ties), or None when nothing
        scored above zero.
    """
    label_columns = (
        'Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name',
        'Category_Name', 'CWA_Broad_Category_Name',
    )
    tally = dict.fromkeys(RISK_CATEGORIES, 0)

    for column in label_columns:
        if not pd.notna(row[column]):
            continue
        label = row[column].lower()
        for fragment, weights in RISK_CATEGORY_LOOKUP.items():
            if fragment not in label:
                continue
            for overlay, pts in weights.items():
                tally[overlay] += pts

    leader = max(tally, key=tally.get)
    return leader if tally[leader] > 0 else None

def classify_risk(row):
    """Score one fund against every risk overlay category and pick the winner.

    Score components, all added into a single total per category:
      1. keyword hits (``keyword_score_risk``) plus structural-flag bonuses
         (``boolean_adjustment_risk``);
      2. +2 for the category chosen by ``category_mapping_risk``, if any;
      3. a shop tilt: for Shop_Style "both" (case-insensitive) the
         ``SHOP_RISK_LOOKUP`` weights of the fund family; otherwise +1 to both
         hedged/levered categories for a 'Quant' shop whose Likely_Fund_Type is
         'Quant/Systematic' or 'Multi-Strategy', or +1 to "None" for an
         'Active' shop (these two comparisons are case-sensitive).

    Args:
        row: Fund record (one DataFrame row).

    Returns:
        pd.Series: 'Risk_Overlay' (highest total; first in ``RISK_CATEGORIES``
        order on ties) followed by the four per-category totals.
    """
    totals = {
        overlay: keyword_score_risk(row, overlay) + boolean_adjustment_risk(row, overlay)
        for overlay in RISK_CATEGORIES
    }

    category_pick = category_mapping_risk(row)
    if category_pick:
        totals[category_pick] += 2

    shop_style = row['Shop_Style']
    if pd.notna(shop_style):
        if shop_style.lower() == "both":
            family_weights = SHOP_RISK_LOOKUP.get(row['fund_family'], {})
            for overlay, weight in family_weights.items():
                if overlay in RISK_CATEGORIES:
                    totals[overlay] += weight
        elif shop_style == 'Quant' and row['Likely_Fund_Type'] in ['Quant/Systematic', 'Multi-Strategy']:
            totals['Heavy / Persistent'] += 1
            totals['Offensive / Levered'] += 1
        elif shop_style == 'Active':
            totals['None'] += 1

    verdict = max(totals, key=totals.get)
    return pd.Series({
        'Risk_Overlay': verdict,
        'None Score': totals['None'],
        'Moderate Score': totals['Moderate'],
        'Heavy / Persistent Score': totals['Heavy / Persistent'],
        'Offensive / Levered Score': totals['Offensive / Levered']
    })

# Apply classification
# classify_risk returns a Series per fund ('Risk_Overlay' plus four "... Score"
# entries), so apply(axis=1) yields a DataFrame with those five columns.
risk_results = df.apply(classify_risk, axis=1)
# Append the classification columns next to the original fund columns.
df = pd.concat([df, risk_results], axis=1)

# Select columns for Excel output
# (identifiers and the verdict first, then the raw inputs, then the per-category
# scores so each verdict can be audited in the sheet).
columns_to_save = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Risk_Overlay', 'ycharts_url',
    'investment_strategy', 'index_fund', 'inverse_fund', 'leveraged_fund',
    'socially_responsible_fund', 'synthetic_replication_fund', 'fund_of_funds',
    'Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name',
    'Category_Name', 'CWA_Broad_Category_Name', 'Shop_Style', 'Likely_Fund_Type',
    'None Score', 'Moderate Score', 'Heavy / Persistent Score', 'Offensive / Levered Score'
]

# Save to Excel
# Written to the notebook's working directory; an existing file is overwritten.
df[columns_to_save].to_excel('classified_funds_v3_risk.xlsx', index=False)
print("Results saved to 'classified_funds_v3_risk.xlsx'")

In [29]:
# Version 4, now with tiers, confidence intervals per data type etc

import pandas as pd
from sqlalchemy import create_engine

"""
classify_return_drivers.py (V4)

This script classifies investment funds into one of five return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Overview:
- Uses a tiered confidence system (High, Medium, Low) to score funds based on keywords, fund family, categories, and metadata.
- Metadata flags (e.g., index_fund) are scored once, outside the tier system, to avoid overweighting.
- Supports a "Dry Run" mode for testing specific funds.

Tunable Parameters and Switches:
- Clustered at the top for easy adjustment.

Output:
- 'classified_return_funds_v4.xlsx': Contains classified return categories and tier-specific scores.
"""

# --- Tunable Parameters and Dry Run Switches (Adjust Here) ---
# The four *_WEIGHT budgets below sum to 100; each is passed as `max_points` to
# the matching scoring helper.
TIER_CAP = 100  # Maximum points per tier. NOTE(review): not referenced anywhere else in this cell.
FUND_FAMILY_WEIGHT = 40  # Points allocated to fund family tendencies
KEYWORDS_WEIGHT = 30  # Points allocated to keywords (per category, per tier)
CATEGORIES_WEIGHT = 15  # Points allocated to category labels (per-category cap, per tier)
METADATA_WEIGHT = 15  # Points allocated to metadata flags (scored once)
HIGH_CONF_THRESHOLD = 50  # Threshold for classification in High Confidence tier. NOTE(review): not referenced anywhere else in this cell.
LIKELY_FUND_TYPE_WEIGHT = 0.5  # Fraction of FUND_FAMILY_WEIGHT awarded to a single-style shop's Likely_Fund_Type
DRY_RUN = False  # When True, only the funds listed in TEST_FUNDS are classified
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]  # SymbolCUSIP values kept in dry-run mode

# Tier weights for final score normalization
# classify_return multiplies each tier's per-category subtotal by its weight
# before summing; the dict's key order also fixes the order of the output columns.
TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}

# --- Constants ---
# Order matters: ties in the final score resolve to the first category listed.
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Keyword Lists (Insert Updated Keywords Here) ---
# score_keywords awards, per category: hits * (max points / number of keywords in
# that category's list), capped at max points. The length of each list therefore
# sets the value of a single hit: short lists make each hit worth more.
#
# Entries are matched as plain substrings of the lower-cased fund text, so every
# entry must be lower-case or it can never match.
# NOTE(review): several lists repeat an entry ("broad market exposure", "passive",
# "value index"). A repeated entry counts as two hits and also enlarges the
# divisor; confirm whether that double weight is intended before de-duplicating.
# NOTE(review): very short entries such as "ai" also match inside unrelated words
# (e.g. "maintain"); consider whole-word matching if that inflates scores.

# High Confidence: Definitive terms
high_conf_keywords = {
    "Index Based": ["index fund", "tracks", "replicates", "indexed", "underlying index", "thematic"],
    "Factor/Smart Beta": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing"],
    "Active Discretionary": ["actively managed", "actively-managed", "manager believes", "manager's judgment"],
    "Quant/Systematic": ["quantitative", "algorithm-driven", "systematic", "levered"],
    "Multi-Strategy": ["multi-strategy", "multi-asset", "hybrid strategy"]
}

# Medium Confidence: Suggestive terms
med_conf_keywords = {
    "Index Based": ["bond index", "market-cap weighted", "low tracking error", "high correlation", "benchmark", "low-cost", "broad market exposure", "passive",
    "broad market exposure", "aggregate bond", "passive"],
    "Factor/Smart Beta": ["enhanced index", "revenue weighted", "dividend weighted", "enhanced returns", "thematic", "fundamental weighting",
    "yield weighted", "quality factor", "low volatility", "rotation"],
    "Active Discretionary": ["machine learning", "ai","research-driven", "fundamental", "strategically", "tactical allocation", "active", "rotation"],
    # Was "Rotation": the capital R could never match the lower-cased fund text.
    "Quant/Systematic": ["data-driven", "systematic", "backtested", "long-short", "model-based", "rotation"],
    "Multi-Strategy": ["multi-manager", "dynamic allocation", "multi-asset", "absolute return", "blended"]
}

# Low Confidence: Broad or ambiguous terms
low_conf_keywords = {
    "Index Based": ["index", "mirrors", "equal-weighted", "bond index", "beta", "etf","value index", "growth index", "value index"],
    "Factor/Smart Beta": ["fundamental weighting", "enhanced index", "thematic", "tilt", "optimized"],
    "Active Discretionary": ["discretionary", "judgment", "analysis", "outperform", "selection", "tactical", "trend-following", "trend following"],
    "Quant/Systematic": ["quantitative", "algorithm", "statistical", "rules-driven", "trend-following", "trend following", "tactical"],
    "Multi-Strategy": ["combination", "hybrid", "multi-asset", "flexible", "alternative"]
}

# --- Shop Lookup for "Both" Shops Across Tiers (Insert Updated Mappings Below) ---
# Layout: tier -> fund_family -> {return category: points}.
# score_fund_family reads SHOP_BOTH_LOOKUP[tier][fund_family] for rows whose
# Shop_Style is missing or equals "both" (case-insensitive). The fund_family match
# is exact and case-sensitive; the points are added as written (no cap is applied
# in score_fund_family). A family listed in one tier contributes nothing in the
# other tiers.
# NOTE(review): "wisdomtree" is the only all-lower-case family key (the
# risk-overlay table in the earlier cell spells it "WisdomTree"); because the
# lookup is exact-match, confirm the spelling stored in fund_family.
SHOP_BOTH_LOOKUP = {
    "High": {
        "Vanguard": {"Index Based": 30},
        "Dimensional Fund Advisors": {"Factor/Smart Beta": 40},
        "American Funds": {"Active Discretionary": 40},
        "AMG Funds": {"Active Discretionary": 30},
        "AQR Funds": {"Quant/Systematic": 35},
        "Amundi US": {"Active Discretionary": 20},
        "iShares": {"Active Discretionary": 0, "Index Based": 18, "Factor/Smart Beta": 18, "Quant/Systematic": 2, "Multi-Strategy": 2},
        "wisdomtree": {"Active Discretionary": 0, "Index Based": 18, "Factor/Smart Beta": 18, "Quant/Systematic": 2, "Multi-Strategy": 2},
    },
    "Medium": {
        "T. Rowe Price": {"Active Discretionary": 25, "Index Based": 10, "Quant/Systematic": 5},
        "Fidelity Investments": {"Active Discretionary": 25, "Index Based": 10, "Quant/Systematic": 5},
        "Invesco": {"Active Discretionary": 20, "Index Based": 10, "Factor/Smart Beta": 10},
        "JPMorgan": {"Active Discretionary": 20, "Index Based": 5, "Factor/Smart Beta": 10, "Quant/Systematic": 0, "Multi-Strategy": 5},
        "John Hancock": {"Active Discretionary": 20, "Index Based": 0, "Factor/Smart Beta": 10, "Quant/Systematic": 0, "Multi-Strategy": 10},
        "Victory Capital": {"Active Discretionary": 15, "Index Based": 0, "Factor/Smart Beta": 15, "Quant/Systematic": 5, "Multi-Strategy": 5},
        "Columbia Threadneedle": {"Active Discretionary": 30, "Index Based": 0, "Factor/Smart Beta": 5, "Quant/Systematic": 0, "Multi-Strategy": 5},
        "Federated": {"Active Discretionary": 20, "Factor/Smart Beta": 10, "Quant/Systematic": 10},
        "Hartford Mutual Funds": {"Active Discretionary": 20, "Index Based": 0, "Factor/Smart Beta": 20, "Quant/Systematic": 0, "Multi-Strategy": 0},
        "Allspring Global Investments": {"Active Discretionary": 30, "Index Based": 0, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0},
        # score_fund_family adds a further product-name adjustment for BlackRock
        # (iShares-branded vs. other products) on top of these weights.
        "BlackRock": {"Active Discretionary": 25,"Index Based": 0, "Factor/Smart Beta": 0, "Quant/Systematic": 5, "Multi-Strategy": 10}
    },
    "Low": {
    "AdvisorShares": {"Active Discretionary": 15,"Index Based": 5, "Factor/Smart Beta": 15, "Quant/Systematic": 5, "Multi-Strategy": 5}
    }
}

# --- Category Lookup Across Tiers (Insert Updated Mappings Below) ---
# Layout: tier -> category-label fragment -> {return category: points}.
# score_categories tests each fragment as a substring of the fund's five category
# labels and sums the points of every match, then caps each return category at
# max_points. Overlapping fragments therefore stack: a label containing
# "Trading Tools" or "Options Trading" also contains "Trading", and
# "Nontraditional Bond" also contains "Nontraditional".
# NOTE(review): score_categories lower-cases the fund's label before the substring
# test. Confirm these keys are normalised the same way at match time; a key that
# keeps its capital letters cannot match a lower-cased label.
CATEGORY_LOOKUP = {
    "High": {
        "Defined Outcome": {"Quant/Systematic": 15},
        "actively managed": {"Active Discretionary": 15},
        "Quantitative/Tactical": {"Quant/Systematic": 15},
        "Trading/Tactical": {"Quant/Systematic": 10},
        "Digital Asset": {"Index Based": 15},
        "Currency": {"Index Based": 15},
        "Target Maturity": {"Index Based": 15},
        "Systematic Trend": {"Quant/Systematic": 15}
    },
    "Medium": {
        "Bond Strategy": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Trading Tools": {"Quant/Systematic": 10},
        "Trading": {"Quant/Systematic": 10},
        "Target Date": {"Active Discretionary": 5, "Index Based": 2, "Multi-Strategy": 8,},
        "Options Trading": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Long/Short Equity": {"Active Discretionary": 5, "Quant/Systematic": 5},
        "Commodities": {"Active Discretionary": 4, "Factor/Smart Beta": 1, "Quant/Systematic": 4, "Index Based": 4, "Multi-Strategy": 1},
        "Municipal": {"Active Discretionary": 6, "Factor/Smart Beta": 2, "Index Based": 6, "Multi-Strategy": 1},
        "Derivative Income": {"Active Discretionary": 5, "Quant/Systematic": 4, "Factor/Smart Beta": 2, "Index Based": 4},
        "Event Driven": {"Active Discretionary": 8, "Quant/Systematic": 7},
        "Inflation-Protected Bond": {"Active Discretionary": 6, "Index Based": 9}
    },
    "Low": {
        "Multialternative": {"Quant/Systematic": 7, "Active Discretionary": 8},
        "Specialty": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Strategic": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Nontraditional": {"Multi-Strategy": 2, "Quant/Systematic": 3, "Active Discretionary": 3, "Factor/Smart Beta": 3},
        "Equity Hedged": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Multisector Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Nontraditional Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5}
    }
}

# --- Database Connection ---
# SQLAlchemy URL for the CWA_Fund_Database on a named SQL Server Express instance,
# using Windows authentication (trusted_connection=yes) through ODBC Driver 18.
# NOTE(review): the host name is hard-coded, so this cell only runs on a machine
# that can resolve JULIANS_LAPTOP\SQLEXPRESS; consider reading it from config.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
# One row per fund in Funds_to_Screen, including the FS_insight text, with the
# fund-family style columns and the five category labels resolved from their IDs.
# Every join is a LEFT JOIN, so funds without a matching family or category row
# are kept and the joined columns come back as NULL.
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name, ff.Shop_Style, ff.Likely_Fund_Type
FROM Funds_to_Screen fs
LEFT JOIN FundFamily_Data ff ON fs.fund_family = ff.Fund_Family
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
# Load the full result set into memory as the working DataFrame for this cell.
df = pd.read_sql(query, engine)

# --- Apply Dry-Run Filter ---
# In dry-run mode keep only the rows whose SymbolCUSIP is listed in TEST_FUNDS.
if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

## Score Keywords
def score_keywords(text, keyword_dict, max_points):
    """Score each return category by the share of its keywords found in ``text``.

    Matching is a case-insensitive substring test: both the text and every
    keyword are lower-cased before comparison, so a keyword written with capital
    letters (e.g. "Rotation") is still found. Being a substring test, a short
    keyword also matches inside longer words.

    Args:
        text: Free text describing the fund.
        keyword_dict: Mapping of return category -> list of keywords.
        max_points: Points awarded when every keyword of a category is found.

    Returns:
        dict: One entry per name in ``RETURN_CATEGORIES`` (0 when nothing
        matched or the category has no keywords), holding
        ``hits * (max_points / len(keywords))`` capped at ``max_points``.
        A keyword listed twice counts as two hits.
    """
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    haystack = text.lower()
    for category, keywords in keyword_dict.items():
        if not keywords:
            # Nothing to match; also avoids dividing by zero below.
            continue
        hits = sum(keyword.lower() in haystack for keyword in keywords)
        scores[category] = min(hits * (max_points / len(keywords)), max_points)
    return scores

## Score Categories
def score_categories(row, tier_dict, max_points):
    """Score return categories from the fund's category labels for one tier.

    Each non-missing label in the five category columns is lower-cased and
    compared with every lookup key by substring. The keys are lower-cased as
    well, so Title Case keys such as "Defined Outcome" match; previously only
    keys that were already lower-case could ever match.

    Args:
        row: Mapping-like fund record with the five category-name columns.
        tier_dict: Mapping of label fragment -> {return category: points}.
        max_points: Cap applied to each return category's accumulated points.

    Returns:
        dict: One entry per name in ``RETURN_CATEGORIES`` with the summed points
        of all matching fragments across all five columns, capped at
        ``max_points``.
    """
    category_cols = ['Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    # Normalise the lookup keys once so the comparison is case-insensitive.
    normalised_lookup = [(keyword.lower(), weights) for keyword, weights in tier_dict.items()]
    for col in category_cols:
        if pd.notna(row[col]):
            value = row[col].lower()
            for keyword, weights in normalised_lookup:
                if keyword in value:
                    for cat, points in weights.items():
                        scores[cat] += points
    for cat in scores:
        scores[cat] = min(scores[cat], max_points)
    return scores

## Score Metadata (Once, Outside Tiers)
def score_metadata(row, max_points):
    """Score return categories from the fund's metadata flags.

    Flags are compared against the literal string 'True'.

    Args:
        row: Mapping-like fund record with 'index_fund', 'leveraged_fund',
            'inverse_fund' and 'fund_of_funds'.
        max_points: Point budget for the metadata component.

    Returns:
        dict: One entry per name in ``RETURN_CATEGORIES``:
            index_fund            -> full budget to "Index Based"
            leveraged or inverse  -> half the budget to "Quant/Systematic"
            fund_of_funds         -> a third of the budget to each of
                                     "Active Discretionary", "Factor/Smart Beta"
                                     and "Index Based"
        The awards are additive, so "Index Based" can exceed ``max_points`` when
        both index_fund and fund_of_funds are set.
    """
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if row['index_fund'] == 'True':
        scores["Index Based"] += max_points / 1
    if row['leveraged_fund'] == 'True' or row['inverse_fund'] == 'True':
        scores["Quant/Systematic"] += max_points / 2
    if row['fund_of_funds'] == 'True':
        # The original single statement chained the three targets with `and`,
        # which is not a valid augmented-assignment target (SyntaxError).
        scores["Active Discretionary"] += max_points / 3
        scores["Factor/Smart Beta"] += max_points / 3
        scores["Index Based"] += max_points / 3
    return scores

## Score Fund Family
def score_fund_family(row, tier, max_points):
    """Score return categories from what is known about the fund's family.

    Two mutually exclusive paths:
      * Single-style shop (Shop_Style present and not "both", case-insensitive):
        in the "High" and "Medium" tiers the family's Likely_Fund_Type, when it
        is a known return category, earns ``max_points * LIKELY_FUND_TYPE_WEIGHT``.
      * Otherwise (Shop_Style missing or "both"): the points listed in
        ``SHOP_BOTH_LOOKUP[tier][fund_family]`` are added as-is. BlackRock rows
        with a product name get an extra adjustment: iShares-branded products
        earn +30 "Index Based" / +10 "Factor/Smart Beta" in the "High" tier,
        other products earn +20 "Active Discretionary" in the "Medium" tier.

    Args:
        row: Mapping-like fund record.
        tier: Confidence tier name ("High", "Medium" or "Low").
        max_points: Point budget for the fund-family component (only used on
            the single-style path).

    Returns:
        dict: One entry per name in ``RETURN_CATEGORIES``.
    """
    family_scores = dict.fromkeys(RETURN_CATEGORIES, 0)

    shop_style = row['Shop_Style']
    if pd.notna(shop_style) and shop_style.lower() != "both":
        likely_type = row['Likely_Fund_Type']
        if pd.notna(likely_type) and likely_type in RETURN_CATEGORIES and tier in ("High", "Medium"):
            family_scores[likely_type] += max_points * LIKELY_FUND_TYPE_WEIGHT
        return family_scores

    family = row['fund_family']
    for cat, pts in SHOP_BOTH_LOOKUP.get(tier, {}).get(family, {}).items():
        family_scores[cat] += pts

    if family == "BlackRock" and pd.notna(row['ProductName']):
        is_ishares = "ishares" in row['ProductName'].lower()
        if is_ishares and tier == "High":
            family_scores["Index Based"] += 30
            family_scores["Factor/Smart Beta"] += 10
        elif not is_ishares and tier == "Medium":
            family_scores["Active Discretionary"] += 20
    return family_scores

## Classify Return Drivers
def classify_return(row):
    """Classify one fund into a return-generation category with a score breakdown.

    For each confidence tier ("High", "Medium", "Low") three component scores
    are added together per return category: keywords, fund family and category
    labels. Metadata flags are scored once, outside the tiers. The final score
    of a category is the sum over tiers of (tier subtotal * TIER_WEIGHTS[tier])
    plus its metadata score.

    Args:
        row: Fund record (one DataFrame row).

    Returns:
        pd.Series: 'Return_Category' (the highest final score, first in
        ``RETURN_CATEGORIES`` order on ties, or "Unclassified" when every final
        score is 0), then '<tier>_<category>_Score' for every tier/category
        pair (unweighted tier subtotals), then 'Metadata_<category>_Score'.
    """
    # Lower-cased free text used for keyword matching; missing fields become "".
    text = " ".join(
        row[field].lower() if pd.notna(row[field]) else ""
        for field in ('ProductName', 'investment_strategy', 'FS_insight')
    )

    keywords_by_tier = {
        'High': high_conf_keywords,
        'Medium': med_conf_keywords,
        'Low': low_conf_keywords,
    }

    # Unweighted subtotal per tier and category: keywords + fund family + categories.
    tier_scores = {tier: dict.fromkeys(RETURN_CATEGORIES, 0) for tier in TIER_WEIGHTS}
    for tier, tier_keywords in keywords_by_tier.items():
        components = (
            score_keywords(text, tier_keywords, KEYWORDS_WEIGHT),
            score_fund_family(row, tier, FUND_FAMILY_WEIGHT),
            score_categories(row, CATEGORY_LOOKUP[tier], CATEGORIES_WEIGHT),
        )
        for component in components:
            for cat in RETURN_CATEGORIES:
                tier_scores[tier][cat] += component[cat]

    # Metadata is scored once so the flags are not multiplied across tiers.
    metadata_scores = score_metadata(row, METADATA_WEIGHT)

    final_scores = {
        cat: sum(tier_scores[tier][cat] * TIER_WEIGHTS[tier] for tier in TIER_WEIGHTS) + metadata_scores[cat]
        for cat in RETURN_CATEGORIES
    }

    if any(score != 0 for score in final_scores.values()):
        predicted_category = max(final_scores, key=final_scores.get)
    else:
        # No evidence at all for any category.
        predicted_category = "Unclassified"

    result = {'Return_Category': predicted_category}
    result.update({
        f"{tier}_{cat}_Score": tier_scores[tier][cat]
        for tier in TIER_WEIGHTS
        for cat in RETURN_CATEGORIES
    })
    result.update({f"Metadata_{cat}_Score": metadata_scores[cat] for cat in RETURN_CATEGORIES})
    return pd.Series(result)
    
# --- Apply Classification ---
# result_cols repeats, in the same order, the keys of the Series built by
# classify_return: the verdict, every tier/category subtotal, then the metadata
# scores.
result_cols = ['Return_Category'] + [f"{tier}_{cat}_Score" for tier in TIER_WEIGHTS for cat in RETURN_CATEGORIES] + [f"Metadata_{cat}_Score" for cat in RETURN_CATEGORIES]
# apply(axis=1) yields one column per Series key; assign them onto df in place.
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output ---
# Identifiers and the verdict first, then every column whose name contains
# '_Score' (in df's column order), then the raw inputs used for scoring.
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'ycharts_url',
    'Shop_Style', 'Likely_Fund_Type'] + [col for col in df.columns if '_Score' in col] + [
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'Broad_Asset_Class_Name', 'Broad_Category_Name',
    'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name'
]
# Written to the notebook's working directory; an existing file is overwritten.
df[output_cols].to_excel('classified_return_funds_v4.xlsx', index=False)
print("Classification complete. Results saved to 'classified_return_funds_v4.xlsx'.")

Classification complete. Results saved to 'classified_return_funds_v4.xlsx'.


In [30]:
# Version 5, enhanced with granular scoring output

import pandas as pd
from sqlalchemy import create_engine

"""
classify_return_drivers.py (V5)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V5:
- Outputs granular scoring details (component-wise scores per tier, total tier scores, final scores) to the Excel file.
- Improves debugging and verification of scoring logic.
"""

# --- Tunable Parameters and Dry Run Switches ---
# The four *_WEIGHT budgets below sum to 100.
TIER_CAP = 100  # NOTE(review): not referenced in the helper functions defined below in this cell - confirm it is used.
FUND_FAMILY_WEIGHT = 40  # Point budget for fund-family tendencies
KEYWORDS_WEIGHT = 30  # Point budget for keyword matches
CATEGORIES_WEIGHT = 15  # Point budget for category labels
METADATA_WEIGHT = 15  # Point budget for metadata flags
HIGH_CONF_THRESHOLD = 50  # NOTE(review): not referenced in the helper functions defined below in this cell - confirm it is used.
LIKELY_FUND_TYPE_WEIGHT = 0.5  # Fraction of the fund-family budget awarded to a single-style shop's Likely_Fund_Type
DRY_RUN = False  # When True, only the funds listed in TEST_FUNDS are kept after loading
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]  # SymbolCUSIP values kept in dry-run mode

# Weight per confidence tier, keyed by tier name.
TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}

# --- Constants ---
# The scoring helpers create one score entry per name in this list.
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Keyword Lists ---
# One table per confidence tier, each mapping return category -> keywords.
# score_keywords awards hits * (max points / number of keywords in the list),
# capped at max points, so list length sets the value of a single hit.
#
# Entries are matched as plain substrings of the fund text, which the earlier
# version of this classifier lower-cases before matching; keep every entry
# lower-case so it can match.
# NOTE(review): several lists repeat an entry ("broad market exposure", "passive",
# "value index"). A repeated entry counts as two hits and also enlarges the
# divisor; confirm whether that double weight is intended before de-duplicating.
# NOTE(review): very short entries such as "ai" also match inside unrelated words
# (e.g. "maintain"); consider whole-word matching if that inflates scores.
high_conf_keywords = {
    "Index Based": ["index fund", "tracks", "replicates", "indexed", "underlying index", "thematic"],
    "Factor/Smart Beta": ["rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing"],
    "Active Discretionary": ["actively managed", "actively-managed", "manager believes", "manager's judgment"],
    "Quant/Systematic": ["quantitative", "algorithm-driven", "systematic", "levered"],
    "Multi-Strategy": ["multi-strategy", "multi-asset", "hybrid strategy"]
}

med_conf_keywords = {
    "Index Based": ["bond index", "market-cap weighted", "low tracking error", "high correlation", "benchmark", "low-cost", "broad market exposure", "passive",
                    "broad market exposure", "aggregate bond", "passive"],
    "Factor/Smart Beta": ["enhanced index", "revenue weighted", "dividend weighted", "enhanced returns", "thematic", "fundamental weighting",
                          "yield weighted", "quality factor", "low volatility", "rotation"],
    "Active Discretionary": ["machine learning", "ai", "research-driven", "fundamental", "strategically", "tactical allocation", "active", "rotation"],
    # Was "Rotation": the capital R could never match lower-cased fund text.
    "Quant/Systematic": ["data-driven", "systematic", "backtested", "long-short", "model-based", "rotation"],
    "Multi-Strategy": ["multi-manager", "dynamic allocation", "multi-asset", "absolute return", "blended"]
}

low_conf_keywords = {
    "Index Based": ["index", "mirrors", "equal-weighted", "bond index", "beta", "etf", "value index", "growth index", "value index"],
    "Factor/Smart Beta": ["fundamental weighting", "enhanced index", "thematic", "tilt", "optimized"],
    "Active Discretionary": ["discretionary", "judgment", "analysis", "outperform", "selection", "tactical", "trend-following", "trend following"],
    "Quant/Systematic": ["quantitative", "algorithm", "statistical", "rules-driven", "trend-following", "trend following", "tactical"],
    "Multi-Strategy": ["combination", "hybrid", "multi-asset", "flexible", "alternative"]
}

# --- Shop Lookup ---
# Layout: tier -> fund_family -> {return category: points}.
# score_fund_family reads SHOP_BOTH_LOOKUP[tier][fund_family] for rows whose
# Shop_Style is missing or equals "both" (case-insensitive). The fund_family match
# is exact and case-sensitive, and a family listed in one tier contributes nothing
# in the other tiers.
# NOTE(review): "wisdomtree" is the only all-lower-case family key (the
# risk-overlay table in an earlier cell spells it "WisdomTree"); because the
# lookup is exact-match, confirm the spelling stored in fund_family.
SHOP_BOTH_LOOKUP = {
    "High": {
        "Vanguard": {"Index Based": 30},
        "Dimensional Fund Advisors": {"Factor/Smart Beta": 40},
        "American Funds": {"Active Discretionary": 40},
        "AMG Funds": {"Active Discretionary": 30},
        "AQR Funds": {"Quant/Systematic": 35},
        "Amundi US": {"Active Discretionary": 20},
        "iShares": {"Active Discretionary": 0, "Index Based": 18, "Factor/Smart Beta": 18, "Quant/Systematic": 2, "Multi-Strategy": 2},
        "wisdomtree": {"Active Discretionary": 0, "Index Based": 18, "Factor/Smart Beta": 18, "Quant/Systematic": 2, "Multi-Strategy": 2},
    },
    "Medium": {
        "T. Rowe Price": {"Active Discretionary": 25, "Index Based": 10, "Quant/Systematic": 5},
        "Fidelity Investments": {"Active Discretionary": 25, "Index Based": 10, "Quant/Systematic": 5},
        "Invesco": {"Active Discretionary": 20, "Index Based": 10, "Factor/Smart Beta": 10},
        "JPMorgan": {"Active Discretionary": 20, "Index Based": 5, "Factor/Smart Beta": 10, "Quant/Systematic": 0, "Multi-Strategy": 5},
        "John Hancock": {"Active Discretionary": 20, "Index Based": 0, "Factor/Smart Beta": 10, "Quant/Systematic": 0, "Multi-Strategy": 10},
        "Victory Capital": {"Active Discretionary": 15, "Index Based": 0, "Factor/Smart Beta": 15, "Quant/Systematic": 5, "Multi-Strategy": 5},
        "Columbia Threadneedle": {"Active Discretionary": 30, "Index Based": 0, "Factor/Smart Beta": 5, "Quant/Systematic": 0, "Multi-Strategy": 5},
        "Federated": {"Active Discretionary": 20, "Factor/Smart Beta": 10, "Quant/Systematic": 10},
        "Hartford Mutual Funds": {"Active Discretionary": 20, "Index Based": 0, "Factor/Smart Beta": 20, "Quant/Systematic": 0, "Multi-Strategy": 0},
        "Allspring Global Investments": {"Active Discretionary": 30, "Index Based": 0, "Factor/Smart Beta": 0, "Quant/Systematic": 0, "Multi-Strategy": 0},
        "BlackRock": {"Active Discretionary": 25, "Index Based": 0, "Factor/Smart Beta": 0, "Quant/Systematic": 5, "Multi-Strategy": 10}
    },
    "Low": {
        "AdvisorShares": {"Active Discretionary": 15, "Index Based": 5, "Factor/Smart Beta": 15, "Quant/Systematic": 5, "Multi-Strategy": 5}
    }
}

# --- Category Lookup ---
# Layout: tier -> category-label fragment -> {return category: points}.
# score_categories tests each fragment as a substring of the fund's five category
# labels and sums the points of every match, then caps each return category at
# max_points. Overlapping fragments therefore stack: a label containing
# "Trading Tools" or "Options Trading" also contains "Trading", and
# "Nontraditional Bond" also contains "Nontraditional".
# NOTE(review): score_categories lower-cases the fund's label before the substring
# test. Confirm these keys are normalised the same way at match time; a key that
# keeps its capital letters cannot match a lower-cased label.
CATEGORY_LOOKUP = {
    "High": {
        "Defined Outcome": {"Quant/Systematic": 15},
        "actively managed": {"Active Discretionary": 15},
        "Quantitative/Tactical": {"Quant/Systematic": 15},
        "Trading/Tactical": {"Quant/Systematic": 10},
        "Digital Asset": {"Index Based": 15},
        "Currency": {"Index Based": 15},
        "Target Maturity": {"Index Based": 15},
        "Systematic Trend": {"Quant/Systematic": 15}
    },
    "Medium": {
        "Bond Strategy": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Trading Tools": {"Quant/Systematic": 10},
        "Trading": {"Quant/Systematic": 10},
        "Target Date": {"Active Discretionary": 5, "Index Based": 2, "Multi-Strategy": 8},
        "Options Trading": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Long/Short Equity": {"Active Discretionary": 5, "Quant/Systematic": 5},
        "Commodities": {"Active Discretionary": 4, "Factor/Smart Beta": 1, "Quant/Systematic": 4, "Index Based": 4, "Multi-Strategy": 1},
        "Municipal": {"Active Discretionary": 6, "Factor/Smart Beta": 2, "Index Based": 6, "Multi-Strategy": 1},
        "Derivative Income": {"Active Discretionary": 5, "Quant/Systematic": 4, "Factor/Smart Beta": 2, "Index Based": 4},
        "Event Driven": {"Active Discretionary": 8, "Quant/Systematic": 7},
        "Inflation-Protected Bond": {"Active Discretionary": 6, "Index Based": 9}
    },
    "Low": {
        "Multialternative": {"Quant/Systematic": 7, "Active Discretionary": 8},
        "Specialty": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Strategic": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Nontraditional": {"Multi-Strategy": 2, "Quant/Systematic": 3, "Active Discretionary": 3, "Factor/Smart Beta": 3},
        "Equity Hedged": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Multisector Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Nontraditional Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5}
    }
}

# --- Database Connection ---
# SQLAlchemy URL for the CWA_Fund_Database on a named SQL Server Express instance,
# using Windows authentication (trusted_connection=yes) through ODBC Driver 18.
# NOTE(review): the host name is hard-coded, so this cell only runs on a machine
# that can resolve JULIANS_LAPTOP\SQLEXPRESS; consider reading it from config.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
# One row per fund in Funds_to_Screen, including the FS_insight text, with the
# fund-family style columns and the five category labels resolved from their IDs.
# Every join is a LEFT JOIN, so funds without a matching family or category row
# are kept and the joined columns come back as NULL.
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name, ff.Shop_Style, ff.Likely_Fund_Type
FROM Funds_to_Screen fs
LEFT JOIN FundFamily_Data ff ON fs.fund_family = ff.Fund_Family
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
# Load the full result set into memory as the working DataFrame for this cell.
df = pd.read_sql(query, engine)

# In dry-run mode keep only the rows whose SymbolCUSIP is listed in TEST_FUNDS.
if DRY_RUN:
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)]

# --- Function Definitions ---

def score_keywords(text, keyword_dict, max_points):
    """Score each category by the share of its keywords that occur in ``text``.

    Parameters
    ----------
    text : str
        Text to search (substring matching).
    keyword_dict : dict
        Maps a category name to its list of keywords.
    max_points : number
        Score awarded when every keyword of a category is found; each hit is
        worth ``max_points / len(keywords)``.

    Returns
    -------
    dict
        One entry per name in ``RETURN_CATEGORIES`` (0 when nothing matched or
        the category has no keywords), plus any extra category present in
        ``keyword_dict``.
    """
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    # Compare case-insensitively. The callers pass lowercased text, so a
    # keyword written with any capital letter could otherwise never match.
    haystack = text.lower()
    for category, keywords in keyword_dict.items():
        if not keywords:
            continue
        hits = sum(keyword.lower() in haystack for keyword in keywords)
        scores[category] = min(hits * (max_points / len(keywords)), max_points)
    return scores

def score_categories(row, tier_dict, max_points):
    """Score categories from the fund's category-name columns.

    Each of the five category-name columns on ``row`` is lowercased and
    searched for every key of ``tier_dict`` (substring match). A match adds
    that key's per-category points; each category total is then capped at
    ``max_points``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the five category-name columns listed below.
    tier_dict : dict
        Maps a category-name fragment to ``{return_category: points}``.
    max_points : number
        Upper bound for each returned score.

    Returns
    -------
    dict
        Score per name in ``RETURN_CATEGORIES``.
    """
    category_cols = ['Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    # The lookup keys are written in Title Case while the field value is
    # lowercased below, so the keys must be lowercased too. Without this no
    # key ever matches and every category score stays at 0.
    rules = [(keyword.lower(), weights) for keyword, weights in tier_dict.items()]
    for col in category_cols:
        if pd.isna(row[col]):
            continue
        value = row[col].lower()
        for keyword, weights in rules:
            if keyword in value:
                for cat, points in weights.items():
                    scores[cat] += points
    for cat in scores:
        scores[cat] = min(scores[cat], max_points)
    return scores

def score_metadata(row, max_points):
    """Score categories from the fund's boolean metadata flags.

    Rules:
      - ``index_fund`` set: "Index Based" gets ``max_points``.
      - ``leveraged_fund`` or ``inverse_fund`` set: "Quant/Systematic" gets
        ``max_points / 2``.
      - ``fund_of_funds`` set: "Active Discretionary", "Factor/Smart Beta" and
        "Index Based" each get ``max_points / 3``.

    A flag counts as set when its text form, stripped and lowercased, is
    ``"true"`` or ``"1"``. This accepts the string ``'True'`` the original
    comparison required, and also boolean ``True`` and ``'true'``.

    Returns
    -------
    dict
        Score per name in ``RETURN_CATEGORIES``.
    """
    def _flag(name):
        return str(row[name]).strip().lower() in ("true", "1")

    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if _flag('index_fund'):
        scores["Index Based"] += max_points / 1
    if _flag('leveraged_fund') or _flag('inverse_fund'):
        scores["Quant/Systematic"] += max_points / 2
    if _flag('fund_of_funds'):
        scores["Active Discretionary"] += max_points / 3
        scores["Factor/Smart Beta"] += max_points / 3
        scores["Index Based"] += max_points / 3
    return scores

def score_fund_family(row, tier, max_points):
    """Score categories from the fund family's shop style for one tier.

    - When ``Shop_Style`` is present and is not "both", the family's
      ``Likely_Fund_Type`` (if it is one of ``RETURN_CATEGORIES``) receives
      ``max_points * LIKELY_FUND_TYPE_WEIGHT`` in the "High" and "Medium"
      tiers only.
    - Otherwise (style is "both" or missing) the points come from
      ``SHOP_BOTH_LOOKUP[tier][fund_family]``. BlackRock gets a product-name
      adjustment: iShares products gain index/factor points in the "High"
      tier, non-iShares products gain active points in the "Medium" tier.

    Returns a dict with one score per name in ``RETURN_CATEGORIES``.
    """
    tally = dict.fromkeys(RETURN_CATEGORIES, 0)
    style = row['Shop_Style']

    if pd.notna(style) and style.lower() != "both":
        likely_type = row['Likely_Fund_Type']
        type_is_known = pd.notna(likely_type) and likely_type in RETURN_CATEGORIES
        if type_is_known and tier in ("High", "Medium"):
            tally[likely_type] += max_points * LIKELY_FUND_TYPE_WEIGHT
        return tally

    family = row['fund_family']
    for category, bonus in SHOP_BOTH_LOOKUP.get(tier, {}).get(family, {}).items():
        tally[category] += bonus

    if family == "BlackRock" and pd.notna(row['ProductName']):
        is_ishares = "ishares" in row['ProductName'].lower()
        if is_ishares and tier == "High":
            tally["Index Based"] += 30
            tally["Factor/Smart Beta"] += 10
        elif not is_ishares and tier == "Medium":
            tally["Active Discretionary"] += 20
    return tally

def classify_return(row):
    """Classify one fund row and return the category with its score breakdown.

    Scores are built from three tiered components (keywords, fund family,
    category names) plus an untiered metadata score:

    * component score per tier, from ``score_keywords``, ``score_fund_family``
      and ``score_categories``;
    * tier total = sum of the three components for that tier;
    * final score = sum over tiers of ``TIER_WEIGHTS[tier] * tier total`` plus
      the metadata score (assumes ``TIER_WEIGHTS`` maps tier name -> numeric
      weight -- TODO confirm against its definition in this cell).

    The predicted category is the one with the highest final score, or
    "Unclassified" when every final score is 0.

    Previously ``final_scores`` was initialised to zeros and never filled in,
    so every fund came out "Unclassified", and the returned labels did not
    match the ``result_cols`` list the caller assigns into.

    Returns
    -------
    pandas.Series
        Labels, in order: ``Return_Category``; ``{tier}_{component}_{cat}_Score``;
        ``{tier}_Total_{cat}_Score``; ``Metadata_{cat}_Score``;
        ``Final_{cat}_Score`` -- the same order as ``result_cols``.
    """
    components = ('Keywords', 'FundFamily', 'Categories')

    def _blank():
        return {cat: 0 for cat in RETURN_CATEGORIES}

    # Combine text fields (lowercased; a missing field contributes "")
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    # Steps 1-3: per-tier component scores
    component_scores = {comp: {tier: _blank() for tier in TIER_WEIGHTS} for comp in components}
    keyword_sets = {'High': high_conf_keywords, 'Medium': med_conf_keywords, 'Low': low_conf_keywords}
    for tier in ('High', 'Medium', 'Low'):
        component_scores['Keywords'][tier] = score_keywords(text, keyword_sets[tier], KEYWORDS_WEIGHT)
        component_scores['FundFamily'][tier] = score_fund_family(row, tier, FUND_FAMILY_WEIGHT)
        component_scores['Categories'][tier] = score_categories(row, CATEGORY_LOOKUP[tier], CATEGORIES_WEIGHT)

    # Step 4: total per tier
    tier_scores = {
        tier: {
            cat: sum(component_scores[comp][tier][cat] for comp in components)
            for cat in RETURN_CATEGORIES
        }
        for tier in TIER_WEIGHTS
    }

    # Step 5: metadata score (not tiered)
    metadata_scores = score_metadata(row, METADATA_WEIGHT)

    # Step 6: final score = tier-weighted totals + metadata
    final_scores = {
        cat: sum(TIER_WEIGHTS[tier] * tier_scores[tier][cat] for tier in TIER_WEIGHTS) + metadata_scores[cat]
        for cat in RETURN_CATEGORIES
    }

    # Step 7: predicted category
    if all(score == 0 for score in final_scores.values()):
        predicted_category = "Unclassified"
    else:
        predicted_category = max(final_scores, key=final_scores.get)

    # Debugging: Print scores and predicted category for verification
    if row['SymbolCUSIP'] in ['RORO', 'TFPN']:
        print(f"{row['SymbolCUSIP']}: final_scores = {final_scores}")
        print(f"{row['SymbolCUSIP']}: predicted_category = {predicted_category}")

    # Step 8: output, in the same order as the caller's `result_cols`
    result = {'Return_Category': predicted_category}
    for tier in TIER_WEIGHTS:
        for comp in components:
            for cat in RETURN_CATEGORIES:
                result[f"{tier}_{comp}_{cat}_Score"] = component_scores[comp][tier][cat]
    for tier in TIER_WEIGHTS:
        for cat in RETURN_CATEGORIES:
            result[f"{tier}_Total_{cat}_Score"] = tier_scores[tier][cat]
    for cat in RETURN_CATEGORIES:
        result[f"Metadata_{cat}_Score"] = metadata_scores[cat]
    for cat in RETURN_CATEGORIES:
        result[f"Final_{cat}_Score"] = final_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
# Labels for the classifier output, assigned column-by-column in this order:
# category, per-tier component scores, per-tier totals, metadata, final.
result_cols = ['Return_Category']
result_cols.extend(
    f"{tier}_{comp}_{cat}_Score"
    for tier in TIER_WEIGHTS
    for comp in ['Keywords', 'FundFamily', 'Categories']
    for cat in RETURN_CATEGORIES
)
result_cols.extend(f"{tier}_Total_{cat}_Score" for tier in TIER_WEIGHTS for cat in RETURN_CATEGORIES)
result_cols.extend(f"Metadata_{cat}_Score" for cat in RETURN_CATEGORIES)
result_cols.extend(f"Final_{cat}_Score" for cat in RETURN_CATEGORIES)
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output ---
# Identification columns first, then every score column, then the raw inputs.
output_cols = [
    'SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'ycharts_url',
    'Shop_Style', 'Likely_Fund_Type',
]
output_cols.extend(col for col in df.columns if '_Score' in col)
output_cols.extend([
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'Broad_Asset_Class_Name', 'Broad_Category_Name',
    'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
])
df.loc[:, output_cols].to_excel('classified_return_funds_v5.xlsx', index=False)
print("Classification complete. Results saved to 'classified_return_funds_v5.xlsx'.")

Classification complete. Results saved to 'classified_return_funds_v5.xlsx'.


In [53]:
# Version 5, enhanced with granular scoring output, integrated FundFamilyData distributions,
# increased keyword weight, and updated keyword and category lookup lists.

import pandas as pd
from sqlalchemy import create_engine

"""
classify_return_drivers.py (V5 - Updated Keywords & Category Lookup)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements:
- Uses FundFamilyData distribution fields for scoring fund family signals.
- Increased KEYWORDS_WEIGHT to 50.
- Updated keyword lists (high, medium, low) for improved matching.
- Updated CATEGORY_LOOKUP (High tier portion updated as provided).
"""

# --- Tunable Parameters and Dry Run Switches ---
# NOTE(review): TIER_CAP, HIGH_CONF_THRESHOLD and LIKELY_FUND_TYPE_WEIGHT are not
# referenced anywhere else in this cell; METADATA_WEIGHT appears only in
# commented-out code inside classify_return.
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40  # max points from the fund-family distribution
KEYWORDS_WEIGHT = 70    # NOTE(review): value in effect is 70; the header docstring above still says "increased to 50"
CATEGORIES_WEIGHT = 15  # per-tier cap for category-name points
METADATA_WEIGHT = 15
HIGH_CONF_THRESHOLD = 50
LIKELY_FUND_TYPE_WEIGHT = 0.5
DRY_RUN = False  # True restricts the run to TEST_FUNDS
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

# Tier names for keywords and categories.
# NOTE(review): in this cell only the keys (and their order: High, Medium, Low)
# are used; the numeric weights are never multiplied into any score.
TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}

# --- Constants ---
# The five return-generation categories every scorer reports on.
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Updated Keyword Lists ---
# Keyword lists per confidence tier. Each maps a return category to substrings
# searched for in the fund's lowercased name/strategy/insight text, so every
# keyword must itself be lowercase. A hit is worth KEYWORDS_WEIGHT / len(list),
# so a duplicated entry would both double-count a hit and dilute the others.
# Fixes relative to the earlier lists: "alphaDEX" -> "alphadex" (could never
# match lowercased text), "consistantly" -> "consistently", "acheivers" ->
# "achievers", and the duplicate "passive" in the medium Index Based list removed.
high_conf_keywords = {
    "Index Based": [
        "index fund", "tracks", "replicates", "indexed", "underlying index",
        "thematic", "passive", "economic characteristics that are substantially"
    ],
    "Factor/Smart Beta": [
        "rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
        "momentum", "low volatility", "low vol", "value factor", "quality", "quality factor",
        "free cash flow", "fcf", "factor", "objective", "time tested", "relatively", "go up",
        "certain fundamental metrics", "momentum index", "quality index", "relatively lower valuations",
        "factors", "minimum volatility", "high dividend yield"
    ],
    "Active Discretionary": [
        "actively managed", "actively-managed", "manager believes", "manager's judgment",
        "active bottom‑up", "active strategy", "discretionary", "active management", "active-management"
    ],
    "Quant/Systematic": [
        "quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility"
    ],
    "Multi-Strategy": [
        "multi-strategy", "multi-asset", "hybrid strategy"
    ]
}

med_conf_keywords = {
    "Index Based": [
        "bond index", "market-cap weighted", "low tracking error", "high correlation", "benchmark",
        "low-cost", "broad market exposure", "passive", "aggregate bond", "broad"
    ],
    "Factor/Smart Beta": [
        "enhanced index", "revenue weighted", "dividend weighted", "enhanced returns", "thematic",
        "fundamental weighting", "yield weighted", "quality factor", "low volatility", "rotation",
        "rules based methodology", "cash cows", "alphadex", "ranked", "lower volatility"
    ],
    "Active Discretionary": [
        "machine learning", "ai", "research-driven", "fundamental", "strategically",
        "tactical allocation", "active", "rotation"
    ],
    "Quant/Systematic": [
        "data-driven", "systematic", "backtested", "long-short", "model-based", "rotation"
    ],
    "Multi-Strategy": [
        "multi-manager", "dynamic allocation", "multi-asset", "absolute return", "blended"
    ]
}

low_conf_keywords = {
    "Index Based": [
        "index", "mirrors", "equal-weighted", "bond index", "beta", "etf", "value index", "growth index"
    ],
    "Factor/Smart Beta": [
        "fundamental weighting", "enhanced index", "thematic", "tilt", "optimized", "component securities",
        "economic characteristics", "free cash flow yield", "objective", "high dividend yields",
        "ranking system", "consistently increased dividends", "dividend", "dividend yield", "dividends",
        "strong cash", "low debt", "increasing earnings", "earnings", "rising dividend", "achievers",
        "volatility weighted", "long/cash", "low beta", "low size"
    ],
    "Active Discretionary": [
        "discretionary", "judgment", "analysis", "outperform", "selection", "tactical",
        "trend-following", "trend following"
    ],
    "Quant/Systematic": [
        "quantitative", "algorithm", "statistical", "rules-driven", "trend-following",
        "trend following", "tactical"
    ],
    "Multi-Strategy": [
        "combination", "hybrid", "multi-asset", "flexible", "alternative"
    ]
}

# --- Updated Category Lookup ---
# Tier -> category-name fragment -> points per return category.
# score_categories lowercases each category-name field before the substring
# test, while these keys are written in Title Case; matching therefore has to
# be case-insensitive on the key side for any entry to take effect.
# Some keys are substrings of others (e.g. "Trading" / "Trading Tools",
# "Nontraditional" / "Nontraditional Bond"), so one field value can match more
# than one entry in a tier.
CATEGORY_LOOKUP = {
    "High": {
        "Defined Outcome": {"Quant/Systematic": 15},
        "Quantitative/Tactical": {"Quant/Systematic": 15},
        "Trading/Tactical": {"Quant/Systematic": 10},
        "Digital Asset": {"Index Based": 15},
        "Currency": {"Index Based": 15},
        "Target Maturity": {"Index Based": 15},
        "Systematic Trend": {"Quant/Systematic": 15}
    },
    # Medium and Low tiers remain as before (or update as needed)
    "Medium": {
        "Bond Strategy": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Trading Tools": {"Quant/Systematic": 10},
        "Trading": {"Quant/Systematic": 10},
        "Target Date": {"Active Discretionary": 5, "Index Based": 2, "Multi-Strategy": 8},
        "Options Trading": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Long/Short Equity": {"Active Discretionary": 5, "Quant/Systematic": 5},
        "Commodities": {"Active Discretionary": 4, "Factor/Smart Beta": 1, "Quant/Systematic": 4, "Index Based": 4, "Multi-Strategy": 1},
        "Municipal": {"Active Discretionary": 6, "Factor/Smart Beta": 2, "Index Based": 6, "Multi-Strategy": 1},
        "Derivative Income": {"Active Discretionary": 5, "Quant/Systematic": 4, "Factor/Smart Beta": 2, "Index Based": 4},
        "Event Driven": {"Active Discretionary": 8, "Quant/Systematic": 7},
        "Inflation-Protected Bond": {"Active Discretionary": 6, "Index Based": 9}
    },
    "Low": {
        "Multialternative": {"Quant/Systematic": 7, "Active Discretionary": 8},
        "Specialty": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Strategic": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Nontraditional": {"Multi-Strategy": 2, "Quant/Systematic": 3, "Active Discretionary": 3, "Factor/Smart Beta": 3},
        "Equity Hedged": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Multisector Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Nontraditional Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5}
    }
}

# --- Database Connection ---
# NOTE(review): the server/instance and database names are hard-coded for one
# workstation; move them to configuration before sharing this notebook.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
# Pulls the funds to screen with their category names and the fund family's
# Dist_* distribution columns. Every join is a LEFT JOIN, so a fund whose
# family has no FundFamilyData row is kept and its Dist_* values come back NULL.
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.[Fund Family Name]
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    # Take an explicit copy of the test subset: new score columns are assigned
    # to `df` further down, and assigning into a filtered slice makes pandas
    # emit SettingWithCopyWarning.
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)].copy()

# --- Function Definitions ---

def score_keywords(text, keyword_dict, max_points):
    """Score each category by the share of its keywords that occur in ``text``.

    Parameters
    ----------
    text : str
        Text to search (substring matching).
    keyword_dict : dict
        Maps a category name to its list of keywords.
    max_points : number
        Score awarded when every keyword of a category is found; each hit is
        worth ``max_points / len(keywords)``.

    Returns
    -------
    dict
        One entry per name in ``RETURN_CATEGORIES`` (0 when nothing matched or
        the category has no keywords), plus any extra category present in
        ``keyword_dict``.
    """
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    # Compare case-insensitively. classify_return passes lowercased text, so a
    # keyword written with capitals (e.g. "alphaDEX") could otherwise never match.
    haystack = text.lower()
    for category, keywords in keyword_dict.items():
        if not keywords:
            continue
        hits = sum(keyword.lower() in haystack for keyword in keywords)
        scores[category] = min(hits * (max_points / len(keywords)), max_points)
    return scores

def score_categories(row, tier_dict, max_points):
    """Score categories from the fund's category-name columns.

    Each of the five category-name columns on ``row`` is lowercased and
    searched for every key of ``tier_dict`` (substring match). A match adds
    that key's per-category points; each category total is then capped at
    ``max_points``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the five category-name columns listed below.
    tier_dict : dict
        Maps a category-name fragment to ``{return_category: points}``.
    max_points : number
        Upper bound for each returned score.

    Returns
    -------
    dict
        Score per name in ``RETURN_CATEGORIES``.
    """
    category_cols = ['Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']
    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    # The lookup keys are written in Title Case while the field value is
    # lowercased below, so the keys must be lowercased too. Without this no
    # key ever matches and every category score stays at 0.
    rules = [(keyword.lower(), weights) for keyword, weights in tier_dict.items()]
    for col in category_cols:
        if pd.isna(row[col]):
            continue
        value = row[col].lower()
        for keyword, weights in rules:
            if keyword in value:
                for cat, points in weights.items():
                    scores[cat] += points
    for cat in scores:
        scores[cat] = min(scores[cat], max_points)
    return scores

def score_metadata(row, max_points):
    """Score categories from the fund's boolean metadata flags.

    Retained for reference; metadata bonus not added to final score.

    Rules:
      - ``index_fund`` set: "Index Based" gets ``max_points``.
      - ``leveraged_fund`` or ``inverse_fund`` set: "Quant/Systematic" gets
        ``max_points / 2``.
      - ``fund_of_funds`` set: "Active Discretionary", "Factor/Smart Beta" and
        "Index Based" each get ``max_points / 3``.

    A flag counts as set when its text form, stripped and lowercased, is
    ``"true"`` or ``"1"`` -- the same rule as ``to_bool`` in ``classify_return``.
    The original compared against the string ``'True'`` only.

    Returns
    -------
    dict
        Score per name in ``RETURN_CATEGORIES``.
    """
    def _flag(name):
        return str(row[name]).strip().lower() in ("true", "1")

    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if _flag('index_fund'):
        scores["Index Based"] += max_points
    if _flag('leveraged_fund') or _flag('inverse_fund'):
        scores["Quant/Systematic"] += max_points / 2
    if _flag('fund_of_funds'):
        scores["Active Discretionary"] += max_points / 3
        scores["Factor/Smart Beta"] += max_points / 3
        scores["Index Based"] += max_points / 3
    return scores

def score_fund_family(row, max_points):
    """
    Score the fund-family signal from the FundFamilyData distribution columns.

    Expected columns from the FundFamilyData join (read with ``row.get``):
      - Dist_Index, Dist_Active, Dist_Rules_Based, Dist_Quant, Dist_Multi
    Each value is treated as a percentage: a category receives
    ``(value / 100) * max_points``.

    Missing values (None / NaN, e.g. when the LEFT JOIN found no family row)
    and values that cannot be converted to float count as 0 for that column
    only. When all five come out as 0, "Active Discretionary" receives a
    fallback of ``max_points * 0.5``.

    Previously a NaN passed through ``float()`` untouched, so every score
    (and every downstream total) became NaN and the fallback never fired.

    Returns a dictionary with one score per name in ``RETURN_CATEGORIES``.
    """
    def _share(column):
        try:
            value = float(row.get(column, 0))
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float that is not equal to itself.
        return 0.0 if value != value else value

    shares = {
        "Index Based": _share('Dist_Index'),
        "Active Discretionary": _share('Dist_Active'),
        "Factor/Smart Beta": _share('Dist_Rules_Based'),
        "Quant/Systematic": _share('Dist_Quant'),
        "Multi-Strategy": _share('Dist_Multi'),
    }

    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for cat, share in shares.items():
        scores[cat] += (share / 100.0) * max_points

    # No usable distribution for this family: lean towards active management.
    if sum(shares.values()) == 0:
        scores["Active Discretionary"] += max_points * 0.5

    return scores

def classify_return(row):
    """Classify one fund row into a return category and report final scores.

    Pipeline:
      1. Lowercase and concatenate ProductName, investment_strategy, FS_insight.
      2. Sum keyword and category-name scores over the High/Medium/Low tiers.
      3. Add the fund-family distribution score.
      4. Build decision-rule adjustments from the metadata flags, the
         category-name text and a list of "active management" phrases.
         A value of -9999 marks a category as eliminated.
      5. Final score = step 2 + step 3 + step 4; the highest-scoring category
         whose final score is above -9999 wins, else "Unclassified".

    NOTE(review): the numeric values in TIER_WEIGHTS are never applied; all
    three tiers contribute at full weight.
    NOTE(review): the decision rules are order-dependent. Eliminations use
    ``= -9999`` (overwriting earlier bonuses) while later rules use ``+=``,
    and positive tier totals are added on top, so an eliminated category
    normally ends slightly above -9999 and passes the ``> -9999`` filter; it
    loses only because its score is still hugely negative.

    Returns
    -------
    pandas.Series
        ``Return_Category`` followed by ``Final_{cat}_Score`` for each name in
        RETURN_CATEGORIES.
    """
    # Combine text fields (lowercased)
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    # --- Compute Tiered Scores for Keywords and Categories ---
    # zip pairs the TIER_WEIGHTS keys with the keyword dicts positionally, so it
    # relies on TIER_WEIGHTS being declared in High, Medium, Low order.
    keywords_scores = {tier: score_keywords(text, kw_dict, KEYWORDS_WEIGHT)
                       for tier, kw_dict in zip(TIER_WEIGHTS, [high_conf_keywords, med_conf_keywords, low_conf_keywords])}
    categories_scores = {tier: score_categories(row, CATEGORY_LOOKUP[tier], CATEGORIES_WEIGHT)
                         for tier in TIER_WEIGHTS}

    # Unweighted sum across tiers (TIER_WEIGHTS values are not used here).
    tier_total_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for tier in TIER_WEIGHTS:
        for cat in RETURN_CATEGORIES:
            tier_total_scores[cat] += keywords_scores[tier][cat] + categories_scores[tier][cat]

    # --- Metadata scores are disabled: the call below is commented out ---
    # metadata_scores = score_metadata(row, METADATA_WEIGHT)
    # for cat in RETURN_CATEGORIES:
    #     tier_total_scores[cat] += metadata_scores[cat]

    # --- Fund Family Scores (using new distribution) ---
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)
    for cat in RETURN_CATEGORIES:
        tier_total_scores[cat] += fund_family_scores[cat]

    # --- Decision Tree Rules Adjustments ---
    decision_scores = {cat: 0 for cat in RETURN_CATEGORIES}

    def to_bool(val):
        """Return True when the value's stripped, lowercased text is "true" or "1"."""
        return str(val).strip().lower() in ["true", "1"]

    index_flag          = to_bool(row.get("index_fund", False))
    esg_flag            = to_bool(row.get("socially_responsible_fund", False))
    leveraged_flag      = to_bool(row.get("leveraged_fund", False))
    synthetic_flag      = to_bool(row.get("synthetic_replication_fund", False))
    # NOTE(review): currency_hedged_fund is not selected by the SQL query in this
    # cell, so row.get falls back to False and rule 4 below never fires unless
    # the column is added to the query.
    currency_hedged_flag = to_bool(row.get("currency_hedged_fund", False))
    fund_of_funds_flag  = to_bool(row.get("fund_of_funds", False))

    # 1. Index Flag Bonus:
    # For any indexed fund, add a balanced bonus: +50 to "Index Based" and +50 to "Factor/Smart Beta."
    if index_flag:
        decision_scores["Index Based"] += 50
        decision_scores["Factor/Smart Beta"] += 50
    # Additional bonus: if index and any of (esg, leveraged, synthetic) then add extra +100 to "Index Based."
    if index_flag and (esg_flag or leveraged_flag or synthetic_flag):
        decision_scores["Index Based"] += 100
    # 2. No Index Classification: If not index, eliminate Index Based.
    if not index_flag:
        decision_scores["Index Based"] = -9999
    # 3. Fund of Funds:
    if fund_of_funds_flag:
        if index_flag:
            decision_scores["Active Discretionary"] -= 100
            decision_scores["Quant/Systematic"] -= 100
        else:
            decision_scores["Active Discretionary"] += 30
            decision_scores["Quant/Systematic"] += 10
            decision_scores["Multi-Strategy"] += 20
    # 4. Currency Hedged:
    if currency_hedged_flag:
        decision_scores["Factor/Smart Beta"] = -9999
        decision_scores["Active Discretionary"] += 20
        decision_scores["Index Based"] += 20
    # 5. Socially Responsible (ESG):
    # Overwrites the +50 Factor/Smart Beta bonus from rule 1.
    if esg_flag:
        decision_scores["Factor/Smart Beta"] = -9999
    # 6. Leveraged or Synthetic with Index:
    # Eliminates every category except Index Based.
    if index_flag and (leveraged_flag or synthetic_flag):
        decision_scores["Active Discretionary"] = -9999
        decision_scores["Multi-Strategy"] = -9999
        decision_scores["Quant/Systematic"] = -9999
        decision_scores["Factor/Smart Beta"] = -9999
        decision_scores["Index Based"] += 50

    # --- Category-Based Decision Rules ---
    # All non-null category names, lowercased and space-joined, searched by substring.
    category_text = ""
    for col in ['Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']:
        if pd.notna(row[col]):
            category_text += " " + row[col].lower()

    if "defined outcome" in category_text:
        decision_scores["Quant/Systematic"] += 100
    if "target maturity" in category_text:
        decision_scores["Index Based"] += 100
    trading_keywords = ["trading tools", "trading/tactical"]
    trading_categories = ["trading--inverse commodities", "trading--inverse debt", "trading--inverse equity",
                          "trading--leveraged commodities", "trading--leveraged debt", "trading--leveraged equity", "trading--miscellaneous"]
    if any(kw in category_text for kw in trading_keywords) or any(tc in category_text for tc in trading_categories):
        decision_scores["Quant/Systematic"] += 100
    if "target date" in category_text or "target-date" in category_text:
        decision_scores["Factor/Smart Beta"] = -9999
        decision_scores["Quant/Systematic"] = -9999
    if "digital asset" in category_text or "currency" in category_text:
        decision_scores["Index Based"] += 100
    # NOTE(review): "commodity" is not a substring of "commodities"; confirm the
    # actual category names contain the singular form.
    if "commodity" in category_text:
        decision_scores["Factor/Smart Beta"] = -9999
    if "single stock" in category_text:
        decision_scores["Index Based"] += 100
    if "municipal" in category_text:
        decision_scores["Multi-Strategy"] = -9999
    if "systematic trend" in category_text:
        decision_scores["Quant/Systematic"] += 100
    if ("long/short equity" in category_text or "event driven" in category_text) and not index_flag:
        decision_scores["Active Discretionary"] += 50
        decision_scores["Quant/Systematic"] += 50
    if "inflation-protected bond" in category_text:
        decision_scores["Active Discretionary"] += 50
        decision_scores["Index Based"] += 50

    # --- Active Management Variants ---
    # Any one of these phrases in the fund text adds a single +100 to
    # Active Discretionary (the loop stops at the first match).
    active_management_terms = [
        "actively managed",
        "actively-managed",
        "active management",
        "active bottom‑up approach",
        "actively trades",
        "active management strategy",
        "actively managed etf",
        "actively managed fund of funds",
        "actively managed strategy",
        "active allocation",
        "actively allocates",
        "tactically allocates assets",
        "active trading",
        "trade securities actively",
        "active trading strategy",
        "bottom‑up approach"
    ]
    for term in active_management_terms:
        if term in text:
            decision_scores["Active Discretionary"] += 100
            break

    # --- Combine All Scores ---
    final_combined_scores = {cat: tier_total_scores.get(cat, 0) + decision_scores.get(cat, 0)
                             for cat in RETURN_CATEGORIES}
    # Keeps every category whose combined score is strictly above -9999; a NaN
    # score fails this comparison and is dropped as well.
    viable_scores = {cat: score for cat, score in final_combined_scores.items() if score > -9999}
    if not viable_scores:
        predicted_category = "Unclassified"
    else:
        predicted_category = max(viable_scores, key=viable_scores.get)

    # --- Prepare Output ---
    result = {'Return_Category': predicted_category}
    for cat in RETURN_CATEGORIES:
        result[f"Final_{cat}_Score"] = final_combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
# The classifier returns the category followed by one final score per category.
result_cols = ['Return_Category']
result_cols.extend(f"Final_{cat}_Score" for cat in RETURN_CATEGORIES)
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
# Identification columns first, then every score column, then the raw inputs.
output_cols = ['SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'ycharts_url']
output_cols.extend(col for col in df.columns if 'Score' in col)
output_cols.extend([
    'investment_strategy', 'FS_insight', 'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds', 'Broad_Asset_Class_Name', 'Broad_Category_Name',
    'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name',
])
df.loc[:, output_cols].to_excel('classified_return_funds_v5.xlsx', index=False)
print("Classification complete. Results saved to 'classified_return_funds_v5.xlsx'.")


Classification complete. Results saved to 'classified_return_funds_v5.xlsx'.


In [2]:
# Version 6: Enhanced with updated FundFamilyData distributions, increased keyword weight,
# and adjusted decision tree bonuses (metadata bonus removed)

import pandas as pd
from sqlalchemy import create_engine

"""
classify_return_drivers.py (V6)

Classifies investment funds into return generation strategies:
- Index Based
- Factor/Smart Beta
- Active Discretionary
- Quant/Systematic
- Multi-Strategy

Enhancements in V6:
- Uses FundFamilyData distribution fields (Dist_Index, Dist_Active, Dist_Rules_Based, Dist_Quant, Dist_Multi)
  for scoring fund family signals.
- KEYWORDS_WEIGHT is increased to 50.
- Metadata bonus points are removed.
- Decision-tree adjustments:
    * If index_fund is true, a balanced bonus of +50 is added to "Index Based" and +50 to "Factor/Smart Beta."
    * Additionally, if index_fund is true and any of (esg, leveraged, synthetic) flags are true, an extra +100 is added to "Index Based."
- All other scoring components (keywords, category lookup) remain as before.
"""

# --- Tunable Parameters and Dry Run Switches ---
TIER_CAP = 100
FUND_FAMILY_WEIGHT = 40  # max_points passed to the fund-family scorer
KEYWORDS_WEIGHT = 50    # Increased to 50
CATEGORIES_WEIGHT = 15  # max_points (per-tier cap) passed to the category scorer
# METADATA_WEIGHT is defined but its scores will not be added.
METADATA_WEIGHT = 15
HIGH_CONF_THRESHOLD = 50
LIKELY_FUND_TYPE_WEIGHT = 0.5
DRY_RUN = False  # True restricts the run to TEST_FUNDS
TEST_FUNDS = ["VOO", "DFSVX", "BSIIX", "IVV", "FBALX", "LCEAX", "JSMBX"]

# Tier names (High, Medium, Low) with their nominal weights for keywords and categories.
# NOTE(review): confirm the numeric weights are actually multiplied into the
# scores further down in this cell.
TIER_WEIGHTS = {"High": 1.0, "Medium": 0.5, "Low": 0.25}

# --- Constants ---
# The five return-generation categories every scorer reports on.
RETURN_CATEGORIES = ["Index Based", "Factor/Smart Beta", "Active Discretionary", "Quant/Systematic", "Multi-Strategy"]

# --- Updated Keyword Lists ---
# Keyword lists per confidence tier. Each maps a return category to substrings
# searched for in the fund's name/strategy/insight text, which score_keywords
# receives lowercased, so every keyword must itself be lowercase. A hit is
# worth max_points / len(list), so a duplicated entry would both double-count
# a hit and dilute the others.
# Fixes relative to the earlier lists: "alphaDEX" -> "alphadex" (could never
# match lowercased text), "consistantly" -> "consistently", "acheivers" ->
# "achievers", and the duplicate "passive" in the medium Index Based list removed.
high_conf_keywords = {
    "Index Based": [
        "index fund", "tracks", "replicates", "indexed", "underlying index",
        "thematic", "passive", "economic characteristics that are substantially"
    ],
    "Factor/Smart Beta": [
        "rules-based", "factor-based", "factor tilt", "multi-factor", "factor investing",
        "momentum", "low volatility", "low vol", "value factor", "quality", "quality factor",
        "free cash flow", "fcf", "factor", "objective", "time tested", "relatively", "go up",
        "certain fundamental metrics", "momentum index", "quality index", "relatively lower valuations",
        "factors", "minimum volatility", "high dividend yield"
    ],
    "Active Discretionary": [
        "actively managed", "actively-managed", "manager believes", "manager's judgment",
        "active bottom‑up", "active strategy", "discretionary", "active management", "active-management"
    ],
    "Quant/Systematic": [
        "quantitative", "algorithm-driven", "systematic", "levered", "algorithm", "implied volatility"
    ],
    "Multi-Strategy": [
        "multi-strategy", "multi-asset", "hybrid strategy"
    ]
}

med_conf_keywords = {
    "Index Based": [
        "bond index", "market-cap weighted", "low tracking error", "high correlation", "benchmark",
        "low-cost", "broad market exposure", "passive", "aggregate bond", "broad"
    ],
    "Factor/Smart Beta": [
        "enhanced index", "revenue weighted", "dividend weighted", "enhanced returns", "thematic",
        "fundamental weighting", "yield weighted", "quality factor", "low volatility", "rotation",
        "rules based methodology", "cash cows", "alphadex", "ranked", "lower volatility"
    ],
    "Active Discretionary": [
        "machine learning", "ai", "research-driven", "fundamental", "strategically",
        "tactical allocation", "active", "rotation"
    ],
    "Quant/Systematic": [
        "data-driven", "systematic", "backtested", "long-short", "model-based", "rotation"
    ],
    "Multi-Strategy": [
        "multi-manager", "dynamic allocation", "multi-asset", "absolute return", "blended"
    ]
}

low_conf_keywords = {
    "Index Based": [
        "index", "mirrors", "equal-weighted", "bond index", "beta", "etf", "value index", "growth index"
    ],
    "Factor/Smart Beta": [
        "fundamental weighting", "enhanced index", "thematic", "tilt", "optimized", "component securities",
        "economic characteristics", "free cash flow yield", "objective", "high dividend yields",
        "ranking system", "consistently increased dividends", "dividend", "dividend yield", "dividends",
        "strong cash", "low debt", "increasing earnings", "earnings", "rising dividend", "achievers",
        "volatility weighted", "long/cash", "low beta", "low size"
    ],
    "Active Discretionary": [
        "discretionary", "judgment", "analysis", "outperform", "selection", "tactical",
        "trend-following", "trend following"
    ],
    "Quant/Systematic": [
        "quantitative", "algorithm", "statistical", "rules-driven", "trend-following",
        "trend following", "tactical"
    ],
    "Multi-Strategy": [
        "combination", "hybrid", "multi-asset", "flexible", "alternative"
    ]
}

# --- Updated Category Lookup ---
# Tier -> category-name fragment -> points per return category.
# score_categories lowercases each category-name field before the substring
# test, while these keys are written in Title Case; matching therefore has to
# be case-insensitive on the key side for any entry to take effect.
# Some keys are substrings of others (e.g. "Trading" / "Trading Tools",
# "Nontraditional" / "Nontraditional Bond"), so one field value can match more
# than one entry in a tier.
CATEGORY_LOOKUP = {
    "High": {
        "Defined Outcome": {"Quant/Systematic": 15},
        "Quantitative/Tactical": {"Quant/Systematic": 15},
        "Trading/Tactical": {"Quant/Systematic": 10},
        "Digital Asset": {"Index Based": 15},
        "Currency": {"Index Based": 15},
        "Target Maturity": {"Index Based": 15},
        "Systematic Trend": {"Quant/Systematic": 15}
    },
    "Medium": {
        "Bond Strategy": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Trading Tools": {"Quant/Systematic": 10},
        "Trading": {"Quant/Systematic": 10},
        "Target Date": {"Active Discretionary": 5, "Index Based": 2, "Multi-Strategy": 8},
        "Options Trading": {"Active Discretionary": 5, "Factor/Smart Beta": 5, "Quant/Systematic": 5},
        "Long/Short Equity": {"Active Discretionary": 5, "Quant/Systematic": 5},
        "Commodities": {"Active Discretionary": 4, "Factor/Smart Beta": 1, "Quant/Systematic": 4, "Index Based": 4, "Multi-Strategy": 1},
        "Municipal": {"Active Discretionary": 6, "Factor/Smart Beta": 2, "Index Based": 6, "Multi-Strategy": 1},
        "Derivative Income": {"Active Discretionary": 5, "Quant/Systematic": 4, "Factor/Smart Beta": 2, "Index Based": 4},
        "Event Driven": {"Active Discretionary": 8, "Quant/Systematic": 7},
        "Inflation-Protected Bond": {"Active Discretionary": 6, "Index Based": 9}
    },
    "Low": {
        "Multialternative": {"Quant/Systematic": 7, "Active Discretionary": 8},
        "Specialty": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Strategic": {"Multi-Strategy": 4, "Quant/Systematic": 4, "Active Discretionary": 5},
        "Nontraditional": {"Multi-Strategy": 2, "Quant/Systematic": 3, "Active Discretionary": 3, "Factor/Smart Beta": 3},
        "Equity Hedged": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Multisector Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5},
        "Nontraditional Bond": {"Active Discretionary": 5, "Quant/Systematic": 3, "Factor/Smart Beta": 2, "Index Based": 5}
    }
}

# --- Database Connection ---
# NOTE(review): the server/instance and database names are hard-coded for one
# workstation; move them to configuration before sharing this notebook.
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# --- SQL Query ---
# Pulls the funds to screen with their category names and the fund family's
# Dist_* distribution columns. Every join is a LEFT JOIN, so a fund whose
# family has no FundFamilyData row is kept and its Dist_* values come back NULL.
query = """
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.[Fund_Family_Name]
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
"""
df = pd.read_sql(query, engine)

if DRY_RUN:
    # Take an explicit copy of the test subset so that any later column
    # assignment on `df` does not act on a filtered slice (pandas emits
    # SettingWithCopyWarning in that situation).
    df = df[df['SymbolCUSIP'].isin(TEST_FUNDS)].copy()

# --- Function Definitions ---

def score_keywords(text, keyword_dict, max_points):
    """
    Score each category by the share of its keywords that occur in `text`.

    Every category in RETURN_CATEGORIES starts at 0. For each entry of
    `keyword_dict` with a non-empty keyword collection, the score is
    (matched keywords / total keywords) * max_points, capped at max_points.
    Matching is a plain substring test, so callers control case by
    normalising `text` and the keywords themselves.

    Returns a dict mapping category -> score.
    """
    scores = dict.fromkeys(RETURN_CATEGORIES, 0)
    for category, keywords in keyword_dict.items():
        total = len(keywords)
        if total == 0:
            # No keywords configured: leave the category at its current value.
            continue
        matched = 0
        for keyword in keywords:
            if keyword in text:
                matched += 1
        points_per_hit = max_points / total
        scores[category] = min(matched * points_per_hit, max_points)
    return scores

def score_categories(row, tier_dict, max_points):
    """
    Score each category from the fund's category-name columns.

    For every non-missing category column, the lowercased value is searched for
    each keyword of `tier_dict`; on a substring match, the keyword's
    {category: points} weights are added to the running totals. Each total is
    finally capped at `max_points`.

    Returns a dict mapping category -> capped score.
    """
    category_cols = (
        'Broad_Asset_Class_Name',
        'Broad_Category_Name',
        'Global_Category_Name',
        'Category_Name',
        'CWA_Broad_Category_Name',
    )
    totals = dict.fromkeys(RETURN_CATEGORIES, 0)
    for col in category_cols:
        cell = row[col]
        if pd.isna(cell):
            continue
        lowered = cell.lower()
        for keyword, weights in tier_dict.items():
            if keyword not in lowered:
                continue
            for cat, points in weights.items():
                totals[cat] += points
    # Cap every category at the maximum this scoring component may contribute.
    return {cat: min(total, max_points) for cat, total in totals.items()}

def score_metadata(row, max_points):
    """
    Bonus scores derived from the fund's boolean metadata flags.

    Retained for reference; the metadata bonus is not added to the final score.

    Flags are interpreted the same way as in classify_return(): a value counts
    as set when its string form, stripped and lowercased, is "true" or "1".
    This accepts boolean True as well as strings such as "True" or "TRUE",
    rather than only the exact string 'True'.

    Parameters:
        row: mapping/Series with 'index_fund', 'leveraged_fund', 'inverse_fund'
            and 'fund_of_funds' entries.
        max_points: full bonus awarded for an index fund; other rules award
            fractions of it.

    Returns a dict mapping each category in RETURN_CATEGORIES to its bonus.
    """
    def is_set(column):
        return str(row[column]).strip().lower() in ("true", "1")

    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    if is_set('index_fund'):
        scores["Index Based"] += max_points
    if is_set('leveraged_fund') or is_set('inverse_fund'):
        scores["Quant/Systematic"] += max_points / 2
    if is_set('fund_of_funds'):
        # A fund of funds says little about style: split the bonus three ways.
        scores["Active Discretionary"] += max_points / 3
        scores["Factor/Smart Beta"] += max_points / 3
        scores["Index Based"] += max_points / 3
    return scores

def score_fund_family(row, max_points):
    """
    Bonus scores from the fund family's strategy distribution.

    Expected columns from the FundFamilyData join (percentages, 0-100):
      - Dist_Index, Dist_Active, Dist_Rules_Based, Dist_Quant, Dist_Multi

    Each distribution value contributes (value / 100) * max_points to its
    category. A value that is missing, NaN/None or not numeric counts as 0 for
    that column only. NaN matters in particular: float(NaN) does not raise, so
    without this check funds with no FundFamilyData match would carry NaN into
    every downstream score. When the distribution sums to 0 (e.g. unknown fund
    family), "Active Discretionary" receives a default bonus of half of
    max_points.

    Parameters:
        row: mapping/Series supporting .get() for the Dist_* columns.
        max_points: weight of the fund-family component.

    Returns a dict mapping each category in RETURN_CATEGORIES to its bonus.
    """
    def share(column):
        try:
            number = float(row.get(column, 0))
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if pd.isna(number) else number

    dist_index = share('Dist_Index')
    dist_active = share('Dist_Active')
    dist_rules_based = share('Dist_Rules_Based')
    dist_quant = share('Dist_Quant')
    dist_multi = share('Dist_Multi')

    scores = {cat: 0 for cat in RETURN_CATEGORIES}
    scores["Index Based"] += (dist_index / 100.0) * max_points
    scores["Active Discretionary"] += (dist_active / 100.0) * max_points
    scores["Factor/Smart Beta"] += (dist_rules_based / 100.0) * max_points
    scores["Quant/Systematic"] += (dist_quant / 100.0) * max_points
    scores["Multi-Strategy"] += (dist_multi / 100.0) * max_points

    # No usable distribution: lean towards active management by default.
    if (dist_index + dist_active + dist_rules_based + dist_quant + dist_multi) == 0:
        scores["Active Discretionary"] += max_points * 0.5

    return scores

def classify_return(row):
    """
    Classify one fund row into a return category and report per-category scores.

    The score of each category in RETURN_CATEGORIES is the sum of:
      1. tiered keyword scores on ProductName + investment_strategy + FS_insight,
      2. tiered category-name scores,
      3. the fund-family distribution bonus,
      4. decision-rule adjustments driven by the fund flags, the category names
         and active-management phrases in the text.

    A decision rule may eliminate a category. Eliminated categories keep the
    -9999 sentinel in their reported score but are tracked explicitly, so that
    adding tier scores or later bonuses to the sentinel cannot make an
    eliminated category eligible again. The winner is the highest-scoring
    category that is not eliminated; "Unclassified" is returned when none is
    eligible.

    Parameters:
        row: pandas Series (or mapping) with the text columns, flag columns and
            category-name columns selected by the fund query.

    Returns:
        pandas.Series with 'Return_Category' and one 'Final_<category>_Score'
        entry per category.
    """
    # Combine text fields (lowercased)
    product_name = row['ProductName'].lower() if pd.notna(row['ProductName']) else ""
    investment_strategy = row['investment_strategy'].lower() if pd.notna(row['investment_strategy']) else ""
    fs_insight = row['FS_insight'].lower() if pd.notna(row['FS_insight']) else ""
    text = product_name + " " + investment_strategy + " " + fs_insight

    # --- Compute Tiered Scores for Keywords and Categories ---
    keywords_scores = {tier: score_keywords(text, kw_dict, KEYWORDS_WEIGHT)
                       for tier, kw_dict in zip(TIER_WEIGHTS, [high_conf_keywords, med_conf_keywords, low_conf_keywords])}
    categories_scores = {tier: score_categories(row, CATEGORY_LOOKUP[tier], CATEGORIES_WEIGHT)
                         for tier in TIER_WEIGHTS}

    tier_total_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    for tier in TIER_WEIGHTS:
        for cat in RETURN_CATEGORIES:
            tier_total_scores[cat] += keywords_scores[tier][cat] + categories_scores[tier][cat]

    # Metadata scores (score_metadata) are intentionally not added to the final score.

    # --- Fund Family Scores (using new distribution) ---
    fund_family_scores = score_fund_family(row, FUND_FAMILY_WEIGHT)
    for cat in RETURN_CATEGORIES:
        tier_total_scores[cat] += fund_family_scores[cat]

    # --- Decision Tree Rules Adjustments ---
    decision_scores = {cat: 0 for cat in RETURN_CATEGORIES}
    eliminated = set()

    def eliminate(cat):
        # Keep the sentinel in the reported score and remember the elimination.
        decision_scores[cat] = -9999
        eliminated.add(cat)

    def to_bool(val):
        return str(val).strip().lower() in ["true", "1"]

    index_flag          = to_bool(row.get("index_fund", False))
    esg_flag            = to_bool(row.get("socially_responsible_fund", False))
    leveraged_flag      = to_bool(row.get("leveraged_fund", False))
    synthetic_flag      = to_bool(row.get("synthetic_replication_fund", False))
    currency_hedged_flag = to_bool(row.get("currency_hedged_fund", False))
    fund_of_funds_flag  = to_bool(row.get("fund_of_funds", False))

    # 1. Index Flag Bonus:
    # For any indexed fund, add a balanced bonus: +50 to "Index Based" and +50 to "Factor/Smart Beta."
    if index_flag:
        decision_scores["Index Based"] += 50
        decision_scores["Factor/Smart Beta"] += 50
    # Additional bonus: if index and any of (esg, leveraged, synthetic) then add extra +100 to "Index Based."
    if index_flag and (esg_flag or leveraged_flag or synthetic_flag):
        decision_scores["Index Based"] += 100
    # 2. No Index Classification: If not index, eliminate Index Based.
    if not index_flag:
        eliminate("Index Based")
    # 3. Fund of Funds:
    if fund_of_funds_flag:
        if index_flag:
            decision_scores["Active Discretionary"] -= 100
            decision_scores["Quant/Systematic"] -= 100
        else:
            decision_scores["Active Discretionary"] += 30
            decision_scores["Quant/Systematic"] += 10
            decision_scores["Multi-Strategy"] += 10
    # 4. Currency Hedged:
    if currency_hedged_flag:
        eliminate("Factor/Smart Beta")
        decision_scores["Active Discretionary"] += 20
        decision_scores["Index Based"] += 20
    # 5. Socially Responsible (ESG):
    if esg_flag:
        eliminate("Factor/Smart Beta")
    # 6. Leveraged or Synthetic with Index:
    if index_flag and (leveraged_flag or synthetic_flag):
        eliminate("Active Discretionary")
        eliminate("Multi-Strategy")
        eliminate("Quant/Systematic")
        eliminate("Factor/Smart Beta")
        decision_scores["Index Based"] += 50

    # --- Category-Based Decision Rules ---
    category_text = ""
    for col in ['Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name', 'Category_Name', 'CWA_Broad_Category_Name']:
        if pd.notna(row[col]):
            category_text += " " + row[col].lower()

    if "defined outcome" in category_text:
        decision_scores["Quant/Systematic"] += 100
    if "target maturity" in category_text:
        decision_scores["Index Based"] += 100
    trading_keywords = ["trading tools", "trading/tactical"]
    trading_categories = ["trading--inverse commodities", "trading--inverse debt", "trading--inverse equity",
                          "trading--leveraged commodities", "trading--leveraged debt", "trading--leveraged equity", "trading--miscellaneous"]
    if any(kw in category_text for kw in trading_keywords) or any(tc in category_text for tc in trading_categories):
        decision_scores["Quant/Systematic"] += 100
    if "target date" in category_text or "target-date" in category_text:
        eliminate("Factor/Smart Beta")
        eliminate("Quant/Systematic")
    if "digital asset" in category_text or "currency" in category_text:
        decision_scores["Index Based"] += 100
    if "commodity" in category_text:
        eliminate("Factor/Smart Beta")
    if "single stock" in category_text:
        decision_scores["Index Based"] += 100
    if "municipal" in category_text:
        eliminate("Multi-Strategy")
    if "systematic trend" in category_text:
        decision_scores["Quant/Systematic"] += 100
    if ("long/short equity" in category_text or "event driven" in category_text) and not index_flag:
        decision_scores["Active Discretionary"] += 50
        decision_scores["Quant/Systematic"] += 50
    if "inflation-protected bond" in category_text:
        decision_scores["Active Discretionary"] += 50
        decision_scores["Index Based"] += 50

    # --- Active Management Variants ---
    # The "bottom-up" terms were previously spelled with U+2011 (non-breaking
    # hyphen) and therefore never matched text written with an ASCII hyphen.
    # Terms now use ASCII hyphens and the text is normalised the same way, so
    # both spellings match.
    active_management_terms = [
        "actively managed",
        "actively-managed",
        "active management",
        "active bottom-up approach",
        "actively trades",
        "active management strategy",
        "actively managed etf",
        "actively managed fund of funds",
        "actively managed strategy",
        "active allocation",
        "actively allocates",
        "tactically allocates assets",
        "active trading",
        "trade securities actively",
        "active trading strategy",
        "bottom-up approach"
    ]
    hyphen_normalized_text = text.replace("\u2011", "-")
    if any(term in hyphen_normalized_text for term in active_management_terms):
        # One bonus regardless of how many variants appear.
        decision_scores["Active Discretionary"] += 100

    # --- Combine All Scores ---
    final_combined_scores = {cat: tier_total_scores.get(cat, 0) + decision_scores.get(cat, 0)
                             for cat in RETURN_CATEGORIES}
    # The score comparison is kept so NaN scores (never > -9999) stay excluded.
    viable_scores = {cat: score for cat, score in final_combined_scores.items()
                     if cat not in eliminated and score > -9999}
    if not viable_scores:
        predicted_category = "Unclassified"
    else:
        predicted_category = max(viable_scores, key=viable_scores.get)

    # --- Prepare Output ---
    result = {'Return_Category': predicted_category}
    for cat in RETURN_CATEGORIES:
        result[f"Final_{cat}_Score"] = final_combined_scores[cat]

    return pd.Series(result)

# --- Apply Classification ---
# classify_return yields the predicted category plus one final score per
# category; the labels below name the frame columns that receive them.
result_cols = ['Return_Category']
result_cols.extend(f"Final_{cat}_Score" for cat in RETURN_CATEGORIES)
df[result_cols] = df.apply(classify_return, axis=1)

# --- Prepare Output Excel File ---
# Column order: identifying fields, every score column, then the raw inputs
# that fed the classification.
output_cols = ['SymbolCUSIP', 'ProductName', 'fund_family', 'Return_Category', 'ycharts_url']
output_cols += [col for col in df.columns if 'Score' in col]
output_cols += [
    'investment_strategy', 'FS_insight',
    'index_fund', 'inverse_fund', 'leveraged_fund', 'socially_responsible_fund',
    'synthetic_replication_fund', 'fund_of_funds',
    'Broad_Asset_Class_Name', 'Broad_Category_Name', 'Global_Category_Name',
    'Category_Name', 'CWA_Broad_Category_Name',
]
df[output_cols].to_excel('classified_return_funds_v6.xlsx', index=False)
print("Classification complete. Results saved to 'classified_return_funds_v6.xlsx'.")


ProgrammingError: (pyodbc.ProgrammingError) ('42S22', "[42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'Fund_Family_Name'. (207) (SQLExecDirectW); [42S22] [Microsoft][ODBC Driver 18 for SQL Server][SQL Server]Invalid column name 'Broad_Asset_Class_Name'. (207)")
[SQL: 
SELECT 
    fs.SymbolCUSIP, fs.ProductName, fs.fund_family, fs.investment_strategy, fs.FS_insight,
    fs.index_fund, fs.inverse_fund, fs.leveraged_fund, fs.socially_responsible_fund,
    fs.synthetic_replication_fund, fs.fund_of_funds, fs.ycharts_url,
    yc_ba.Broad_Asset_Class_Name, yc_bc.Broad_Category_Name, yc_gc.Global_Category_Name,
    yc_c.Category_Name, cwa_bc.CWA_Broad_Category_Name,
    ff.Dist_Index, ff.Dist_Active, ff.Dist_Rules_Based, ff.Dist_Quant, ff.Dist_Multi
FROM Funds_to_Screen fs
LEFT JOIN FundFamilyData ff ON fs.fund_family = ff.[Fund_Family_Name]
LEFT JOIN YC_Broad_Asset_Class_List yc_ba ON fs.YC_Broad_Asset_Class_ID = yc_ba.ID
LEFT JOIN YC_Broad_Category_List yc_bc ON fs.YC_Broad_Category_ID = yc_bc.ID
LEFT JOIN YC_Global_Category_List yc_gc ON fs.YC_Global_Category_ID = yc_gc.ID
LEFT JOIN YC_Category_List yc_c ON fs.YC_Category_ID = yc_c.ID
LEFT JOIN CWA_Broad_Category_List cwa_bc ON fs.CWA_Broad_Category_ID = cwa_bc.ID
]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [34]:
import logging
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
import sqlalchemy
import urllib3

# Disable SSL warnings (requests below are sent with verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# YCharts API configuration for YCI style requests.
# SECURITY: credentials are read from the environment (YCHARTS_API_KEY,
# YCHARTS_EXCEL_SESSION). The literal fallbacks only keep this cell working
# until those variables are set.
# FIXME: rotate these credentials and delete the literals -- they are live
# secrets stored in the notebook.
headers_YCI = {
    "X-YCHARTSAUTHORIZATION": os.environ.get("YCHARTS_API_KEY", "yIIphqbsQysnTvWWxfW33w"),
    "X-YCHARTSEXCELSESSION": os.environ.get("YCHARTS_EXCEL_SESSION", "b645cd897b2446bfa3796acfa3a879db"),
    "X-YCHARTSEXCELVERSION": "4.4",
    "X-YCHARTSOPERATINGSYSTEM": "Microsoft Windows NT 10.0.26100.0",
    "X-YCHARTSIP": "",
    "Host": "api.ycharts.com",
    "Connection": "Keep-Alive"
}
yc_base_url = "https://api.ycharts.com/v3/"

# Configure the Database Connection (SQL Server, Windows authentication)
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/CWA_Fund_Database?"
    "driver=ODBC+Driver+18+for+SQL+Server&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = sqlalchemy.create_engine(connection_string)

# Simple rate limiter to allow at most 13 calls per second across threads
class RateLimiter:
    """
    Thread-safe limiter that spaces calls at least 1/rate seconds apart.

    wait() holds the lock while it sleeps, so concurrent callers are released
    one at a time, each `interval` seconds after the previous one.
    Intervals are measured with time.monotonic(), which cannot jump backwards
    when the system clock is adjusted (time.time() can).

    Parameters:
        rate: maximum number of calls per second; must be positive.

    Raises:
        ValueError: if rate is not positive.
    """

    def __init__(self, rate):
        if rate <= 0:
            raise ValueError("rate must be a positive number of calls per second")
        self.rate = rate
        self.interval = 1.0 / rate
        self.lock = threading.Lock()
        self.last_time = time.monotonic()

    def wait(self):
        """Block until at least `interval` seconds have passed since the previous call."""
        with self.lock:
            remaining = self.interval - (time.monotonic() - self.last_time)
            if remaining > 0:
                time.sleep(remaining)
            self.last_time = time.monotonic()

rate_limiter = RateLimiter(rate=13)

def fetch_currency_hedged_fund(symbol, fund_type_id, timeout=30):
    """
    Fetch the 'currency_hedged_fund' metric for a given fund symbol using YCI style API.

    Parameters:
        symbol: fund symbol as stored in Funds_to_Screen.SymbolCUSIP.
        fund_type_id: Fund_Type_ID of the fund; 3 selects the mutual-fund
            endpoint (symbol prefixed with "M:"), anything else the companies
            endpoint.
        timeout: seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        A tuple (symbol, value) where value is the raw "data" entry of the
        metric (e.g., "TRUE", "FALSE"), or None when the request fails, times
        out, returns a non-200 status or lacks the expected keys. Errors are
        logged, never raised.
    """
    rate_limiter.wait()  # Ensure we don't exceed the shared rate limit
    # Mutual funds (Fund_Type_ID = 3) use the "M:" prefix in both the URL and
    # the key under which the response is returned.
    is_mutual_fund = fund_type_id == 3
    if is_mutual_fund:
        endpoint = f"mutual_funds/M:{symbol}/info/currency_hedged_fund?retrieve_ttl=true"
    else:
        endpoint = f"companies/{symbol}/info/currency_hedged_fund?retrieve_ttl=true"
    response_key = f"M:{symbol}" if is_mutual_fund else symbol
    url = f"{yc_base_url}{endpoint}"
    logging.info(f"Fetching {symbol}: {url}")
    try:
        # NOTE(review): verify=False disables TLS certificate validation while
        # the API key is sent -- confirm this is still required.
        response = requests.get(url, headers=headers_YCI, verify=False, timeout=timeout)
        if response.status_code != 200:
            logging.error(f"HTTP error for {symbol}: {response.status_code}")
            return symbol, None

        data = response.json()
        if "response" not in data or response_key not in data["response"]:
            logging.warning(f"No response data for {symbol} (expected key: {response_key})")
            return symbol, None

        results = data["response"][response_key].get("results", {})
        if "currency_hedged_fund" not in results:
            logging.warning(f"'currency_hedged_fund' metric not found for {symbol}")
            return symbol, None

        value = results["currency_hedged_fund"].get("data")
        logging.info(f"Fetched {symbol}: {value}")
        return symbol, value
    except Exception as e:
        # Best effort: one failing symbol must not abort the whole batch.
        logging.error(f"Error fetching {symbol}: {e}")
        return symbol, None

def update_database(results, batch_size=500):
    """
    Update the Funds_to_Screen table with the fetched currency_hedged_fund values in batches.

    Parameters:
        results: iterable of (SymbolCUSIP, currency_hedged_fund) tuples.
        batch_size: number of rows sent per executemany call.

    All batches run inside a single transaction (engine.begin()). The bind
    parameters are built directly from the tuples instead of via a DataFrame
    and iterrows(), which was slower and could coerce value types.
    """
    params = [{"symbol": symbol, "value": value} for symbol, value in results]
    total = len(params)
    logging.info(f"Updating database for {total} funds...")
    statement = sqlalchemy.text("""
                    UPDATE Funds_to_Screen 
                    SET currency_hedged_fund = :value 
                    WHERE SymbolCUSIP = :symbol
                """)
    with engine.begin() as conn:
        for start in range(0, total, batch_size):
            # A list of parameter dicts makes SQLAlchemy run the statement once per row.
            conn.execute(statement, params[start:start + batch_size])
    logging.info("Database update complete.")

def main():
    """
    Fill in currency_hedged_fund for every fund where it is still NULL.

    Reads the pending funds from Funds_to_Screen, fetches the metric for each
    one concurrently (throttled by the shared rate limiter inside
    fetch_currency_hedged_fund), then writes all results back in batches.
    """
    # Retrieve all funds that need the currency_hedged_fund update
    query = "SELECT SymbolCUSIP, Fund_Type_ID FROM Funds_to_Screen WHERE currency_hedged_fund IS NULL"
    funds = pd.read_sql(query, engine)
    logging.info(f"Found {len(funds)} funds needing currency_hedged_fund update.")

    results = []
    # Use a ThreadPoolExecutor with a max of 13 workers to respect our rate limit
    with ThreadPoolExecutor(max_workers=13) as executor:
        futures = [
            executor.submit(fetch_currency_hedged_fund, symbol, fund_type_id)
            for symbol, fund_type_id in zip(funds["SymbolCUSIP"], funds["Fund_Type_ID"])
        ]
        for future in as_completed(futures):
            symbol, value = future.result()
            results.append((symbol, value))

    # Announce the update before it runs, then update the database in batches
    print("Updating Database")
    update_database(results, batch_size=500)

    #end script
    print("All funds processed")

# Entry point: call main() only when this code runs with __name__ set to
# "__main__", not when it is imported under another module name.
if __name__ == "__main__":
    main()


2025-03-01 18:14:06,752 - INFO - Found 40 funds needing currency_hedged_fund update.
2025-03-01 18:14:06,805 - INFO - Fetching AMPD: https://api.ycharts.com/v3/companies/AMPD/info/currency_hedged_fund?retrieve_ttl=true
2025-03-01 18:14:06,883 - INFO - Fetching VMOT: https://api.ycharts.com/v3/companies/VMOT/info/currency_hedged_fund?retrieve_ttl=true
2025-03-01 18:14:06,961 - INFO - Fetching PP: https://api.ycharts.com/v3/companies/PP/info/currency_hedged_fund?retrieve_ttl=true
2025-03-01 18:14:07,050 - INFO - Fetching VCAR: https://api.ycharts.com/v3/companies/VCAR/info/currency_hedged_fund?retrieve_ttl=true
2025-03-01 18:14:07,127 - INFO - Fetching OFIYX: https://api.ycharts.com/v3/mutual_funds/M:OFIYX/info/currency_hedged_fund?retrieve_ttl=true
2025-03-01 18:14:07,219 - INFO - Fetching TWIO: https://api.ycharts.com/v3/companies/TWIO/info/currency_hedged_fund?retrieve_ttl=true
2025-03-01 18:14:07,301 - INFO - Fetching DFNV: https://api.ycharts.com/v3/companies/DFNV/info/currency_hedg

Updating Database
All funds processed


In [48]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, Float, String, DateTime, Boolean

# Step 1: Load the Excel file into a pandas DataFrame
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
df = pd.read_excel(r"C:\Users\JulianHeron\Software Projects\Fund family table upload.xlsx")

# Step 2: Determine SQL data types based on the DataFrame's data.
# Columns matching none of the checks get no entry and fall back to the
# default type chosen by DataFrame.to_sql.
dtype_dict = {}
for col in df.columns:
    if pd.api.types.is_integer_dtype(df[col]):
        # Use INT for integer columns
        dtype_dict[col] = Integer()
    elif pd.api.types.is_float_dtype(df[col]):
        # Use FLOAT for floating-point numbers
        dtype_dict[col] = Float()
    elif pd.api.types.is_string_dtype(df[col]):
        # Use VARCHAR with the maximum length of strings in the column.
        # .str.len() yields floats (NaN for blanks) when values are missing, so
        # the maximum is converted to a plain int; an all-blank or all-empty
        # column gets an unbounded String() instead of an invalid VARCHAR length.
        max_len = df[col].str.len().max()
        if pd.notna(max_len) and max_len > 0:
            dtype_dict[col] = String(length=int(max_len))
        else:
            dtype_dict[col] = String()
    elif pd.api.types.is_datetime64_any_dtype(df[col]):
        # Use DATETIME for date and time values
        dtype_dict[col] = DateTime()
    elif pd.api.types.is_bool_dtype(df[col]):
        # Use BIT for boolean values (SQL Server equivalent)
        dtype_dict[col] = Boolean()

# Step 3: Create a connection to the SQL Server database
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)
engine = create_engine(connection_string)

# Step 4: Create the table "FundFamilyData" and insert the data.
# if_exists='replace' drops and recreates the table, so its column names are
# exactly the headers of the Excel sheet.
df.to_sql('FundFamilyData', engine, if_exists='replace', index=False, dtype=dtype_dict)

163