In [None]:
# Comparison code for grok scripts

import pandas as pd
import os
import glob
import sys
from datetime import datetime

def list_files(directory, pattern="*.xlsx"):
    return sorted(glob.glob(os.path.join(directory, pattern)))

def select_file(directory, prompt):
    files = list_files(directory)
    if not files:
        print(f"No files found in {directory}")
        sys.exit(1)
    print(f"\n{prompt}")
    for idx, file in enumerate(files, 1):
        print(f"{idx}. {os.path.basename(file)}")
    while True:
        try:
            choice = int(input("Enter the number of your selection: ")) - 1
            if 0 <= choice < len(files):
                return files[choice]
            print("Invalid selection. Try again.")
        except ValueError:
            print("Please enter a valid number.")

def compare_runs(file1_path=None, file2_path=None, output_base_path=None):
    # Default paths
    input_base_path = r"C:\Users\JulianHeron\Software Projects\Test files"
    output_base_path = output_base_path or r"C:\Users\JulianHeron\Software Projects\Test files\Comparison files"

    # If files not provided, prompt user to select
    if not file1_path:
        file1_path = select_file(input_base_path, "Select the first file to compare:")
    if not file2_path:
        file2_path = select_file(input_base_path, "Select the second file to compare:")

    # Read the two Excel files
    try:
        df1 = pd.read_excel(file1_path)
        df2 = pd.read_excel(file2_path)
    except Exception as e:
        print(f"Error reading files: {e}")
        return

    # Ensure SymbolCUSIP is present
    if 'SymbolCUSIP' not in df1.columns or 'SymbolCUSIP' not in df2.columns:
        print("SymbolCUSIP column missing in one or both files")
        return

    df1.set_index('SymbolCUSIP', inplace=True)
    df2.set_index('SymbolCUSIP', inplace=True)

    common_cusips = df1.index.intersection(df2.index)
    df_merged = pd.DataFrame(index=common_cusips)

    columns_to_compare = ['Final_Classification', 'Method_Used', 'Audit_Log', 'ProductName', 'fund_family', 'investment_strategy', 'FS_insight']
    differences = []
    for col in columns_to_compare:
        if col in df1.columns and col in df2.columns:
            df_merged[f'{col}_Run1'] = df1.loc[common_cusips, col]
            df_merged[f'{col}_Run2'] = df2.loc[common_cusips, col]
            diff_mask = df_merged[f'{col}_Run1'].astype(str) != df_merged[f'{col}_Run2'].astype(str)
            df_merged[f'{col}_Changed'] = diff_mask
            differences.append(diff_mask)

    any_diff = pd.DataFrame(differences).T.any(axis=0)
    df_merged['Has_Changes'] = any_diff
    df_merged.reset_index(inplace=True)

    df_diff = df_merged[df_merged['Has_Changes']].copy()
    summary_stats = {
        'Total_Funds_Compared': len(common_cusips),
        'Funds_With_Changes': len(df_diff),
        'Classification_Changes': df_diff['Final_Classification_Changed'].sum(),
        'Method_Changes': df_diff['Method_Used_Changed'].sum(),
        'Audit_Log_Changes': df_diff['Audit_Log_Changed'].sum(),
        'File1': file1_path,
        'File2': file2_path
    }

    # Generate output filename based on input filenames
    file1_name = os.path.splitext(os.path.basename(file1_path))[0]
    file2_name = os.path.splitext(os.path.basename(file2_path))[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"Comparison_{file1_name}_vs_{file2_name}_{timestamp}.xlsx"
    output_path = os.path.join(output_base_path, output_filename)

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with pd.ExcelWriter(output_path) as writer:
        df_diff.to_excel(writer, sheet_name='Differences', index=False)
        pd.Series(summary_stats).to_frame('Count').to_excel(writer, sheet_name='Summary')

    print(f"Comparison results exported to {output_path}")
    print("Summary Statistics:")
    for key, value in summary_stats.items():
        print(f"  {key}: {value}")

if __name__ == "__main__":
    # Command-line arguments or interactive selection
    if len(sys.argv) == 3:
        file1_path = sys.argv[1]
        file2_path = sys.argv[2]
        compare_runs(file1_path, file2_path)
    else:
        compare_runs()

In [22]:
import pandas as pd
from sqlalchemy import create_engine
import re
import os
import logging
from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
from datetime import datetime
import glob

# Setup logging to both console and file with UTF-8 encoding
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
try:
    console_handler.stream.reconfigure(encoding='utf-8')
except AttributeError:
    # Fallback for environments where reconfigure isn't available
    pass

logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.FileHandler("classification.log", encoding='utf-8'),
        console_handler
    ]
)

# Configuration Flags
CONFIG = {
    # Auto-Classification Flags
    "use_auto_classification": False,          # UAC
    "use_fund_family_mapping": False,          # FFM
    "use_direct_keyword_mapping": False,       # DKM
    "use_direct_category_mapping": False,      # DCM

    # Exposure-Based Classification Flags
    "use_exposure_classification": True,     # UEC
    "use_proof_disproof_all_tiers": True,    # PDT
    "use_slight_none_proof_disproof": True,  # SNP

    # Validation and Fallback Flags
    "use_validation": True,                  # UVL
    "use_fallback_evaluation": True,         # UFE
    "use_fallback_scoring": True,            # UFS
    "use_fallback_tiebreaker": True,         # UFT

    # Scoring and Decision Weights
    "use_keyword_scoring": False,             # UKS
    "keyword_score_weight": 1.0,              # KSW
    "category_score_weight": 2.0,             # CSW

    # Concurrency Settings
    "use_concurrency": False,                 # UCN
    "max_workers": 4,                         # MXW
    "concurrency_executor": "process",        # CEX

    # Detailed Logging
    "use_detailed_logging": True              # UDL
}

# Connection string
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

# Define keyword mappings for general scoring
keyword_mappings = {
    "Slight/None": [
        "long-only", "no derivatives", "no hedging", "no leverage", "no options", "no short",
        "position adjustment", "occasional hedging", "covered call", "put-write", "light hedge",
        "may include options", "limited use of derivatives", "for risk management purposes",
        "minor hedging", "occasional short positions", "overwrite", "investment grade", "core"
    ],
    "Moderate": [
        "hedged", "currency hedge", "protective put", "partial hedge", "hedged equity",
        "covered call", "convexity option overlay", "option overlay", "put/spread collar",
        "forward agreement", "enhanced index strategy", "BuyWrite", "Buy-Write", "buy write",
        "option spread", "volatility hedge", "put options", "enhance", "options-based income",
        "ELN", "premium income", "call option", "FLEX options", "option premium", "write calls",
        "sell calls", "protective puts", "equity-linked notes", "structured notes",
        "risk mitigation", "downside protection", "limited hedging", "multi-asset"
    ],
    "Persistent Systematic": [
        "tail-risk", "trend-following", "systematic hedging", "overlay", "CTA", "managed futures",
        "defined outcome", "long-short", "market neutral", "systematic strategy", "return stacking",
        "option writing", "straddle", "derivative income", "futures contracts", "swap contract",
        "forward agreement", "enhanced index strategy", "volatility hedge", "put options",
        "options-based income", "ELN", "option premium", "swap", "forward", "futures", "future",
        "VIX", "managed futures strategy", "trend strategy", "quantitative hedging",
        "systematic options", "options overlay strategy", "futures overlay", "swaps-based",
        "multi-asset", "Flex Options", "Flexible Exchange Options", "YieldMax", "buffer"
    ],
    "Heavy Amplification": [
        "2x", "3x", "Uncapped Accelerator", "-2x", "-3x", "YieldMax"
    ]
}

# Direct mapping keywords (often in ProductName)
direct_keyword_mappings = {
    "Persistent Systematic": ["Market Neutral", "managed futures", "Premia", "Return Stacked ETFs"]
}

# Direct mapping categories
direct_category_mappings = {
    "Persistent Systematic": {
        "YC_Category": ["Defined Outcome"],
        "CWA_Broad_Category": ["Defined Outcome"],
        "YC_Global_Category": ["market neutral"]
    },
    "Heavy Amplification": {
        "YC_Category": [
            "Trading--Leveraged Equity", "Trading--Leveraged Debt", "Trading--Leveraged Commodities"
        ],
        "CWA_Broad_Category": ["Single Stock"]
    }
}

# Helper category mappings (narrow possibilities)
helper_category_mappings = {
    ("Persistent Systematic", "Heavy Amplification"): {
        "YC_Category": [
            "Trading--Inverse Commodities", "Trading--Inverse Debt", "Trading--Inverse Equity",
            "Trading--Miscellaneous"
        ],
        "CWA_Broad_Category": ["Trading/Tactical"],
        "YC_Global_Category": ["Trading Tools"]
    },
    ("Persistent Systematic", "Moderate"): {
        "YC_Global_Category": ["Multialternative", "Long/Short Equity"],
        "YC_Category": ["Equity Hedged"]
    }
}

# Weak helper category mappings (may or may not have overlay)
weak_helper_category_mappings = {
    ("Slight/None", "Moderate"): {
        "YC_Global_Category": ["Flexible Allocation", "Alternative Miscellaneous"],
        "return_driver": ["Index Based", "Factor/Smart Beta"]
    },
    ("Slight/None", "Moderate", "Persistent Systematic"): {
        "YC_Category": ["Relative Value Arbitrage"]
    },
    ("Heavy Amplification", "Persistent Systematic", "Moderate", "Slight/None"): {
        "return_driver": ["Quant/Systematic"]
    },
    ("Persistent Systematic", "Moderate", "Slight/None"): {
        "return_driver": ["Active Discretionary", "Multi-Strategy"]
    }
}

# Proof/Disproof definitions for Tier 1 (Slight/None)
slight_proof_phrases = [
    "used for minor duration or risk tweaks",
    "occasional use for limited exposure adjustments",
    "used for position adjustments on a case-by-case basis",
    "applied sparingly to fine-tune risk",
    "used on an ad hoc basis for hedging",
    "employed occasionally for cash management",
    "derivatives used optionally for risk",
    "can utilize swaps for adjustments",
    "can use futures to track index",
    "can employ derivatives occasionally",
    "derivatives used discretionarily",
    "derivatives permitted for limited purposes",
    "may invest in derivatives sparingly",
    "may employ futures for cash flow",
    "uses derivatives to adjust exposure",
    "utilizes futures contracts to equitize cash",
    "will not use it to increase leveraged exposure",
    "use of derivatives is permitted within limits",
    "may use derivatives",
    "may invest in derivatives",
    "derivatives only to mitigate",
    "may utilize derivatives for managing duration, sector exposure, yield curve and risk mitigation",
    "may utilize derivatives for managing duration",
    "may invest in derivatives, including foreign currency derivatives",
    "may also invest in futures contracts and options to manage market exposure",
    "may use derivatives to leverage exposure or manage cash",
    "may also use derivatives to leverage or hedge exposure",
    "may hedge foreign currency exposure through derivatives, although it is not required to do so"
]

slight_disproof_phrases = [
    "employs currency forward contracts to hedge exposure",
    "derivatives are integral to its hedging strategy",
    "systematically uses derivatives",
    "uses a quantitative model to generate derivative signals",
    "systematic use of derivatives",
    "derivatives are central",
    "invests primarily in futures, call options, and put options",
    "hedges currency exposure with derivatives",
    "writes call options on index",
    "invests in futures to offset risk",
    "enters swap transactions for protection",
    "uses futures to enhance exposure",
    "invests in derivatives through subsidiary",
    "hedges interest rates with options",
    "allocates assets to options strategy",
    "employs leverage through inverse floaters",
    "employs options strategies regularly",
    "rolled according to a fixed schedule",
    "currency‐related derivatives to hedge",
    "may invest up to 15% of its total assets in credit default swaps",
    "applies an options collar strategy",
    "derivatives to hedge currency exposure",
    "writing covered calls",
    "selects put options through a laddered approach that rolls monthly"
]

# Placeholders for proof/disproof phrases for other tiers
moderate_proof_phrases = []
moderate_disproof_phrases = []

persistent_systematic_proof_phrases = []
persistent_systematic_disproof_phrases = []

heavy_amplification_proof_phrases = []
heavy_amplification_disproof_phrases = []

# Update proof_disproof_keywords to incorporate keywords and phrases for all tiers
proof_disproof_keywords = {
    "Slight/None": {
        "proof": keyword_mappings["Slight/None"] + slight_proof_phrases,
        "disproof": (
            keyword_mappings["Moderate"] +
            keyword_mappings["Persistent Systematic"] +
            keyword_mappings["Heavy Amplification"] +
            slight_disproof_phrases
        )
    },
    "Moderate": {
        "proof": keyword_mappings["Moderate"] + moderate_proof_phrases,
        "disproof": (
            keyword_mappings["Slight/None"] +
            keyword_mappings["Persistent Systematic"] +
            keyword_mappings["Heavy Amplification"] +
            moderate_disproof_phrases
        )
    },
    "Persistent Systematic": {
        "proof": keyword_mappings["Persistent Systematic"] + persistent_systematic_proof_phrases,
        "disproof": (
            keyword_mappings["Slight/None"] +
            keyword_mappings["Moderate"] +
            keyword_mappings["Heavy Amplification"] +
            persistent_systematic_disproof_phrases
        )
    },
    "Heavy Amplification": {
        "proof": keyword_mappings["Heavy Amplification"] + heavy_amplification_proof_phrases,
        "disproof": (
            keyword_mappings["Slight/None"] +
            keyword_mappings["Moderate"] +
            keyword_mappings["Persistent Systematic"] +
            heavy_amplification_disproof_phrases
        )
    }
}

# Utility Functions
def safe_lower(value):
    return value.lower() if isinstance(value, str) else ""

def search_keywords(text, keywords):
    if pd.isna(text):
        return 0
    text = str(text).lower()
    return sum(1 for kw in keywords if re.search(r'\b' + re.escape(kw.lower()) + r'\b', text))

def sanitize_excel_text(value):
    if pd.isna(value):
        return value
    text = str(value).replace("<", "[lt]").replace(">", "[gt]")
    if text.startswith(("=", "+", "-", "@")):
        text = "'" + text
    return text

# Function to generate output filename based on flags
def generate_output_filename(base_path):
    # Define shorthand abbreviations for each flag (same as in CONFIG comments)
    shorthand_map = {
        "use_auto_classification": "UAC",
        "use_fund_family_mapping": "FFM",
        "use_direct_keyword_mapping": "DKM",
        "use_direct_category_mapping": "DCM",
        "use_exposure_classification": "UEC",
        "use_proof_disproof_all_tiers": "PDT",
        "use_slight_none_proof_disproof": "SNP",
        "use_validation": "UVL",
        "use_fallback_evaluation": "UFE",
        "use_fallback_scoring": "UFS",
        "use_fallback_tiebreaker": "UFT",
        "use_keyword_scoring": "UKS",
        "use_concurrency": "UCN",
        "use_detailed_logging": "UDL"
    }

    disabled_flags = [key for key, value in CONFIG.items() if isinstance(value, bool) and not value]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    if disabled_flags:
        # Use shorthand abbreviations for disabled flags
        max_flags = 5  # Increased to 5 since shorthands are shorter
        short_flags = disabled_flags[:max_flags]
        # Map each flag to its shorthand abbreviation
        flags_str = "_".join([shorthand_map.get(flag, flag.replace("use_", "")) for flag in short_flags])
        if len(disabled_flags) > max_flags:
            flags_str += f"_plus_{len(disabled_flags)-max_flags}_more"
        filename = f"Risk_Overlays_{timestamp}_No_{flags_str}.xlsx"
        total_path = os.path.join(base_path, filename)
        if len(total_path) > 200:  # Safe limit for Windows path length
            existing_files = glob.glob(os.path.join(base_path, "Risk_Overlays_*.xlsx"))
            increment = len(existing_files) + 1
            filename = f"Risk_Overlays_{timestamp}_Run_{increment}.xlsx"
    else:
        existing_files = glob.glob(os.path.join(base_path, "Risk_Overlays_*.xlsx"))
        increment = len(existing_files) + 1
        filename = f"Risk_Overlays_{timestamp}_Run_{increment}.xlsx"
    
    return os.path.join(base_path, filename)

# Step 1: Auto-Classification
def auto_classify(row, audit_log, stats_counter):
    if not CONFIG["use_auto_classification"]:
        audit_log.append("Auto-classification disabled")
        return None

    if CONFIG["use_fund_family_mapping"]:
        if pd.notna(row['fund_family']) and "return stacked etfs" in row['fund_family'].lower():
            audit_log.append("Auto-classified: 'Return Stacked ETFs' → Persistent Systematic")
            stats_counter["auto_classify_fund_family"] += 1
            return "Persistent Systematic"
    
    if CONFIG["use_direct_keyword_mapping"]:
        for category, keywords in direct_keyword_mappings.items():
            for field in ['ProductName', 'investment_strategy', 'FS_insight']:
                if pd.notna(row[field]) and search_keywords(row[field], keywords) > 0:
                    audit_log.append(f"Auto-classified: {category} via keyword in {field}")
                    stats_counter[f"auto_classify_keyword_{category}"] += 1
                    return category
    
    if CONFIG["use_direct_category_mapping"]:
        for category, mappings in direct_category_mappings.items():
            for db_field, values in mappings.items():
                vals_lower = [v.lower() for v in values]
                field_val = safe_lower(row.get(db_field, ''))
                if field_val in vals_lower:
                    audit_log.append(f"Auto-classified: {category} via {db_field}={field_val}")
                    stats_counter[f"auto_classify_category_{category}"] += 1
                    return category
    
    return None

# Step 2: Exposure-Based Classification with Proof/Disproof
def tally_proof_disproof(row, tier_name, audit_log, stats_counter):
    fields = ["ProductName", "investment_strategy", "FS_insight"]
    
    proof_phrases = {
        "Slight/None": slight_proof_phrases,
        "Moderate": moderate_proof_phrases,
        "Persistent Systematic": persistent_systematic_proof_phrases,
        "Heavy Amplification": heavy_amplification_proof_phrases
    }.get(tier_name, [])
    
    disproof_phrases = {
        "Slight/None": slight_disproof_phrases,
        "Moderate": moderate_disproof_phrases,
        "Persistent Systematic": persistent_systematic_disproof_phrases,
        "Heavy Amplification": heavy_amplification_disproof_phrases
    }.get(tier_name, [])
    
    proof_score = 0
    disproof_score = 0
    proof_found = []
    disproof_found = []
    
    # Check specific proof/disproof phrases for the tier
    for field in fields:
        if pd.notna(row[field]):
            text = row[field].lower()
            for phrase in proof_phrases:
                if phrase in text:
                    proof_score += 1
                    proof_found.append(phrase)
                    stats_counter[f"proof_{tier_name}_{phrase}"] += 1
            for phrase in disproof_phrases:
                if phrase in text:
                    disproof_score += 1
                    disproof_found.append(phrase)
                    stats_counter[f"disproof_{tier_name}_{phrase}"] += 1
    
    # Also check keyword-based proof/disproof
    proof_keywords = proof_disproof_keywords[tier_name]["proof"]
    disproof_keywords = proof_disproof_keywords[tier_name]["disproof"]
    
    for field in fields:
        if pd.notna(row[field]):
            text = str(row[field]).lower()
            proof_found.extend([kw for kw in proof_keywords if re.search(r'\b' + re.escape(kw.lower()) + r'\b', text) and kw not in proof_found])
            disproof_found.extend([kw for kw in disproof_keywords if re.search(r'\b' + re.escape(kw.lower()) + r'\b', text) and kw not in disproof_found])
    
    proof_score += len([kw for kw in proof_found if kw not in proof_phrases])
    disproof_score += len([kw for kw in disproof_found if kw not in disproof_phrases])
    
    if proof_found:
        stats_counter[f"proof_{tier_name}_keywords"] += len(proof_found)
        audit_log.append(f"{tier_name} Proof Keywords/Phrases Found: {proof_found}")
    if disproof_found:
        stats_counter[f"disproof_{tier_name}_keywords"] += len(disproof_found)
        audit_log.append(f"{tier_name} Disproof Keywords/Phrases Found: {disproof_found}")
    
    audit_log.append(f"{tier_name} Proof Score={proof_score}, Disproof Score={disproof_score}")
    return proof_score, disproof_score

def classify_by_exposures_with_disproof(row, audit_log, stats_counter):
    if not CONFIG["use_exposure_classification"]:
        audit_log.append("Exposure-based classification disabled")
        return "Unclassified"

    exposure_cols = ['cash_long', 'cash_short', 'stock_long', 'stock_short', 'bond_long', 'bond_short', 'other_long', 'other_short']
    for col in exposure_cols:
        val = pd.to_numeric(row[col], errors='coerce')
        row[col] = 0 if pd.isna(val) else val

    long_total = (row['cash_long'] + row['stock_long'] + row['bond_long'] + row['other_long']) * 100
    short_total = (row['cash_short'] + row['stock_short'] + row['bond_short'] + row['other_short']) * 100
    other_total = (row['other_long'] + row['other_short']) * 100

    long_r = round(long_total, 4)
    short_r = round(short_total, 4)
    other_r = round(other_total, 4)
    audit_log.append(f"Exposures: long={long_r}%, short={short_r}%, other={other_r}%")

    tiers = [
        ("Slight/None", lambda: abs(short_r) < 1 and abs(other_r) < 1 and long_r <= 100.2),
        ("Moderate", lambda: short_r <= 10 and other_r <= 10),
        ("Persistent Systematic", lambda: short_r <= 50 or other_r <= 50),
        ("Heavy Amplification", lambda: True)
    ]

    best_classification = None
    best_proof_score = -1

    for tier_name, condition in tiers:
        if condition():
            audit_log.append(f"Tier {tier_name} matches exposure criteria")
            if CONFIG["use_proof_disproof_all_tiers"]:
                proof_score, disproof_score = tally_proof_disproof(row, tier_name, audit_log, stats_counter)
                if proof_score >= disproof_score and proof_score > best_proof_score:
                    best_classification = tier_name
                    best_proof_score = proof_score
                    audit_log.append(f"New best classification: {tier_name} with Proof={proof_score}, Disproof={disproof_score}")
            else:
                return tier_name
        else:
            audit_log.append(f"Tier {tier_name} does not match exposure criteria")

    if best_classification:
        stats_counter[f"exposure_classify_{best_classification}"] += 1
        return best_classification
    else:
        audit_log.append("No tier passed proof/disproof, defaulting to Heavy Amplification")
        stats_counter["exposure_classify_default_heavy"] += 1
        return "Heavy Amplification"

# Step 3: Validation and Fallback Evaluation
def validate_classification(row, classification, audit_log, stats_counter):
    if not CONFIG["use_validation"]:
        return False

    questionable = False
    reasons = []

    for cats, mappings in helper_mappings.items():
        if classification not in cats:
            continue
        for field, expected_values in mappings.items():
            vals_lower = [v.lower() for v in expected_values]
            actual_val = safe_lower(row.get(field, ''))
            if actual_val in vals_lower:
                questionable = True
                reason = f"Conflicts with {field}={actual_val}"
                reasons.append(reason)
                stats_counter[f"validation_conflict_{field}"] += 1

    if classification == "Persistent Systematic" and "long-only" in safe_lower(row.get('investment_strategy', '')):
        questionable = True
        reasons.append("Persistent Systematic but 'long-only' found")
        stats_counter["validation_conflict_long_only"] += 1
    elif classification == "Slight/None" and safe_lower(row.get('YC_Category_Name', '')) == "defined outcome":
        questionable = True
        reasons.append("Slight/None but 'Defined Outcome' category")
        stats_counter["validation_conflict_defined_outcome"] += 1

    if questionable:
        audit_log.append(f"Questionable classification: {reasons}")
    return questionable

def fallback_evaluation(row, audit_log, stats_counter):
    if not CONFIG["use_fallback_evaluation"]:
        audit_log.append("Fallback evaluation disabled, defaulting to Slight/None")
        stats_counter["fallback_default_slight_none"] += 1
        return "Slight/None"

    scores = {"Slight/None": 0, "Moderate": 0, "Persistent Systematic": 0, "Heavy Amplification": 0}
    
    if CONFIG["use_fallback_scoring"]:
        if CONFIG["use_keyword_scoring"]:
            for category, keywords in keyword_mappings.items():
                for field in ['ProductName', 'investment_strategy', 'FS_insight']:
                    count = search_keywords(row[field], keywords)
                    scores[category] += count * CONFIG["keyword_score_weight"]
                    if count > 0:
                        stats_counter[f"fallback_keywords_{category}"] += count
        if row.get('YC_Category_Name', '').lower() in ["government bond", "corporate bond"]:
            scores["Slight/None"] += CONFIG["category_score_weight"]
            stats_counter["fallback_adjust_slight_none"] += 1
        if row.get('YC_Category_Name', '').lower() in ["long-short equity", "equity hedged"]:
            scores["Persistent Systematic"] += CONFIG["category_score_weight"]
            stats_counter["fallback_adjust_persistent_systematic"] += 1

    audit_log.append(f"Fallback scores: {scores}")
    max_score = max(scores.values())
    top_categories = [cat for cat, score in scores.items() if score == max_score]
    
    if len(top_categories) == 1:
        stats_counter[f"fallback_classify_{top_categories[0]}"] += 1
        return top_categories[0]
    
    if CONFIG["use_fallback_tiebreaker"]:
        conservative_order = ["Slight/None", "Moderate", "Persistent Systematic", "Heavy Amplification"]
        final = min(top_categories, key=lambda x: conservative_order.index(x))
        stats_counter[f"fallback_conservative_{final}"] += 1
        audit_log.append(f"Fallback conservative tiebreaker: {final}")
        return final
    else:
        final = top_categories[0]
        stats_counter[f"fallback_first_{final}"] += 1
        audit_log.append(f"Fallback no tiebreaker, first choice: {final}")
        return final

# Main Classification Function
def classify_fund(row, stats_counter):
    audit_log = []
    method_used = None

    classification = auto_classify(row, audit_log, stats_counter)
    if classification:
        method_used = "Auto-Classification"
        audit_log.append(f"Final: {classification} via {method_used}")
        return classification, method_used, audit_log

    classification = classify_by_exposures_with_disproof(row, audit_log, stats_counter)
    method_used = "Exposure-Based"

    questionable = validate_classification(row, classification, audit_log, stats_counter)
    if questionable:
        audit_log.append("Validation flagged as questionable, moving to fallback")
        classification = fallback_evaluation(row, audit_log, stats_counter)
        method_used = "Fallback Evaluation"

    audit_log.append(f"Final: {classification} via {method_used}")
    return classification, method_used, audit_log

def process_row(row, stats_counter):
    classification, method_used, audit_log = classify_fund(row, stats_counter)
    return {
        'classification': classification,
        'method_used': method_used,
        'audit_log': audit_log,
        'row': row.to_dict()
    }

def main():
    engine = create_engine(connection_string)
    query = """
    SELECT 
        f.SymbolCUSIP, f.ProductName, f.fund_family, f.investment_strategy, f.FS_insight,
        f.index_fund, f.inverse_fund, f.leveraged_fund, f.synthetic_replication_fund,
        f.fund_of_funds, f.ycharts_url, f.currency_hedged_fund,
        f.cash_long, f.cash_net, f.cash_short, f.stock_long, f.stock_net, f.stock_short,
        f.bond_long, f.bond_net, f.bond_short, f.other_long, f.other_net, f.other_short,
        f.return_driver, f.YC_BM_Symbol,
        cwa.CWA_Broad_Category_Name,
        yc.Category_Name AS YC_Category_Name,
        ycg.Global_Category_Name,
        ycba.YC_Broad_Asset_Class_Name
    FROM Funds_to_Screen f
    LEFT JOIN CWA_Broad_Category_List cwa ON f.CWA_Broad_Category_ID = cwa.ID
    LEFT JOIN YC_Category_List yc ON f.YC_Category_ID = yc.ID
    LEFT JOIN YC_Global_Category_List ycg ON f.YC_Global_Category_ID = ycg.ID
    LEFT JOIN YC_Broad_Asset_Class_List ycba ON f.YC_Broad_Asset_Class_ID = ycba.ID
    """

    try:
        df = pd.read_sql(query, engine)
    except Exception as e:
        logging.error(f"Database error: {e}")
        return
    finally:
        engine.dispose()

    stats_counter = Counter()
    classifications_counter = Counter()
    questionable_counter = 0
    results = []

    if CONFIG["use_concurrency"]:
        with ProcessPoolExecutor(max_workers=CONFIG["max_workers"]) as executor:
            process_row_with_stats = partial(process_row, stats_counter=stats_counter)
            futures = [executor.submit(process_row_with_stats, row) for _, row in df.iterrows()]
            for future in as_completed(futures):
                try:
                    result = future.result()
                    classification = result['classification']
                    method_used = result['method_used']
                    audit_log = result['audit_log']
                    row_dict = result['row']
                    
                    classifications_counter[classification] += 1
                    if "Fallback Evaluation" in method_used:
                        questionable_counter += 1
                    new_row = pd.Series(row_dict)
                    new_row['Final_Classification'] = classification
                    new_row['Method_Used'] = method_used
                    new_row['Audit_Log'] = "; ".join(audit_log)
                    results.append(new_row)
                except Exception as e:
                    logging.error(f"Error processing row in ProcessPoolExecutor: {e}")
                    continue
    else:
        for idx, row in df.iterrows():
            classification, method_used, audit_log = classify_fund(row, stats_counter)
            classifications_counter[classification] += 1
            if "Fallback Evaluation" in method_used:
                questionable_counter += 1
            new_row = row.copy()
            new_row['Final_Classification'] = classification
            new_row['Method_Used'] = method_used
            new_row['Audit_Log'] = "; ".join(audit_log)
            results.append(new_row)

    out_df = pd.DataFrame(results)
    out_df['Audit_Log'] = out_df['Audit_Log'].apply(sanitize_excel_text)

    columns_front = ['SymbolCUSIP', 'ProductName', 'fund_family', 'Final_Classification', 'Method_Used', 'ycharts_url']
    other_cols = [c for c in out_df.columns if c not in columns_front]
    out_df = out_df[columns_front + other_cols]

    base_path = r"C:\Users\JulianHeron\Software Projects\Test files"
    output_path = generate_output_filename(base_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    out_df.to_excel(output_path, index=False)

    # Logging section integrated here
    if CONFIG.get("use_detailed_logging", True):
        logging.info("=== Detailed Classification Summary ===")
        logging.info(f"Total Funds Processed: {len(out_df)}")
        
        logging.info("\nDistribution of Classifications:")
        for cat, count in classifications_counter.items():
            logging.info(f"  {cat}: {count} ({count/len(out_df)*100:.2f}%)")
        
        logging.info("\nClassification Methods Breakdown:")
        methods = ["Auto-Classification", "Exposure-Based", "Fallback Evaluation"]
        for method in methods:
            if method == "Auto-Classification":
                # Sum sub-method counts for top-level Auto-Classification
                auto_count = (
                    stats_counter.get("auto_classify_fund_family", 0) +
                    sum(stats_counter.get(f"auto_classify_keyword_{cat}", 0) for cat in keyword_mappings.keys()) +
                    sum(stats_counter.get(f"auto_classify_category_{cat}", 0) for cat in direct_category_mappings.keys())
                )
                logging.info(f"  {method}: {auto_count}")
                logging.info("    By Fund Family:")
                logging.info(f"      Return Stacked ETFs: {stats_counter.get('auto_classify_fund_family', 0)}")
                logging.info("    By Keywords:")
                for cat in keyword_mappings.keys():
                    logging.info(f"      {cat}: {stats_counter.get(f'auto_classify_keyword_{cat}', 0)}")
                logging.info("    By Category:")
                for cat in direct_category_mappings.keys():
                    logging.info(f"      {cat}: {stats_counter.get(f'auto_classify_category_{cat}', 0)}")
            elif method == "Exposure-Based":
                # Sum sub-method counts for top-level Exposure-Based
                exposure_count = sum(
                    stats_counter.get(f"exposure_classify_{tier}", 0) for tier in keyword_mappings.keys()
                ) + stats_counter.get("exposure_classify_default_heavy", 0)
                logging.info(f"  {method}: {exposure_count}")
                logging.info("    By Tier:")
                for tier in keyword_mappings.keys():
                    logging.info(f"      {tier}: {stats_counter.get(f'exposure_classify_{tier}', 0)}")
                logging.info(f"    Default to Heavy Amplification: {stats_counter.get('exposure_classify_default_heavy', 0)}")
                logging.info("    Proof/Disproof Triggers:")
                for tier in keyword_mappings.keys():
                    logging.info(f"      {tier} Proof Keywords: {stats_counter.get(f'proof_{tier}_keywords', 0)}")
                    logging.info(f"      {tier} Disproof Keywords: {stats_counter.get(f'disproof_{tier}_keywords', 0)}")
                if CONFIG["use_slight_none_proof_disproof"]:
                    logging.info("    Slight/None Specific Proof/Disproof:")
                    for phrase in slight_proof_phrases:
                        logging.info(f"      Proof Phrase '{phrase}': {stats_counter.get(f'proof_slight_none_{phrase}', 0)}")
                    for phrase in slight_disproof_phrases:
                        logging.info(f"      Disproof Phrase '{phrase}': {stats_counter.get(f'disproof_slight_none_{phrase}', 0)}")
            elif method == "Fallback Evaluation":
                logging.info(f"  {method}: {sum(stats_counter.get(f'fallback_classify_{tier}', 0) for tier in keyword_mappings.keys())}")
                logging.info(f"    Total Questionable Classifications: {questionable_counter}")
                logging.info("    Validation Conflicts:")
                logging.info(f"      Long-Only Conflicts: {stats_counter.get('validation_conflict_long_only', 0)}")
                logging.info(f"      Defined Outcome Conflicts: {stats_counter.get('validation_conflict_defined_outcome', 0)}")
                for field in ['YC_Category', 'YC_Global_Category']:
                    logging.info(f"      {field} Conflicts: {stats_counter.get(f'validation_conflict_{field}', 0)}")
                logging.info("    Fallback Classifications:")
                for tier in keyword_mappings.keys():
                    logging.info(f"      {tier}: {stats_counter.get(f'fallback_classify_{tier}', 0)}")
                logging.info("    Fallback Conservative Tiebreakers:")
                for tier in keyword_mappings.keys():
                    logging.info(f"      {tier}: {stats_counter.get(f'fallback_conservative_{tier}', 0)}")

    logging.info(f"Exported classification results => {output_path}")

if __name__ == "__main__":
    main()

2025-03-05 17:53:42,984 - INFO - === Detailed Classification Summary ===
2025-03-05 17:53:42,986 - INFO - Total Funds Processed: 5586
2025-03-05 17:53:42,986 - INFO - 
Distribution of Classifications:
2025-03-05 17:53:42,987 - INFO -   Persistent Systematic: 1019 (18.24%)
2025-03-05 17:53:42,988 - INFO -   Moderate: 888 (15.90%)
2025-03-05 17:53:42,988 - INFO -   Heavy Amplification: 353 (6.32%)
2025-03-05 17:53:42,989 - INFO -   Slight/None: 3326 (59.54%)
2025-03-05 17:53:42,990 - INFO - 
Classification Methods Breakdown:
2025-03-05 17:53:42,991 - INFO -   Auto-Classification: 0
2025-03-05 17:53:42,991 - INFO -     By Fund Family:
2025-03-05 17:53:42,992 - INFO -       Return Stacked ETFs: 0
2025-03-05 17:53:42,993 - INFO -     By Keywords:
2025-03-05 17:53:42,993 - INFO -       Slight/None: 0
2025-03-05 17:53:42,994 - INFO -       Moderate: 0
2025-03-05 17:53:42,994 - INFO -       Persistent Systematic: 0
2025-03-05 17:53:42,995 - INFO -       Heavy Amplification: 0
2025-03-05 17:53:

In [None]:
# Exposures Code with overlay.. replacing risk drivers

import re
import pandas as pd

def safe_lower(text):
    return str(text).lower() if pd.notna(text) else ""

def classify_by_exposures_with_disproof(row, audit_log, stats_counter):
    if not CONFIG["use_exposure_classification"]:
        return "Unclassified Leverage", "No Overlay", "No Risk Management", "Unclassified Purpose"

    exposure_cols = ['cash_long', 'cash_short', 'stock_long', 'stock_short', 'bond_long', 'bond_short', 'other_long', 'other_short']
    for col in exposure_cols:
        val = pd.to_numeric(row[col], errors='coerce')
        row[col] = 0 if pd.isna(val) else val

    # Step 1: Total known positions
    total_long = (row['cash_long'] + row['stock_long'] + row['bond_long'] + row['other_long']) * 100
    total_short = (row['cash_short'] + row['stock_short'] + row['bond_short'] + row['other_short']) * 100
    other_total = (row['other_long'] + row['other_short']) * 100

    # Step 2: Adjust bond positions to balance to 100%
    long_adjustment = 100 - total_long if total_long < 100 else max(0, total_long - 100)
    short_adjustment = 100 - total_short if total_short < 100 else max(0, total_short - 100)
    adjusted_bond_long = row['bond_long'] * 100 + long_adjustment if total_long < 100 else max(0, row['bond_long'] * 100 - long_adjustment)
    adjusted_bond_short = row['bond_short'] * 100 + short_adjustment if total_short < 100 else max(0, row['bond_short'] * 100 - short_adjustment)
    adjusted_total_long = (row['cash_long'] + row['stock_long'] + (adjusted_bond_long / 100) + row['other_long']) * 100
    adjusted_total_short = (row['cash_short'] + row['stock_short'] + (adjusted_bond_short / 100) + row['other_short']) * 100

    # Step 3: Calculate non-derivative long/short
    non_deriv_long = (adjusted_total_long - other_total) / (1 - other_total/100) if other_total < 100 else 0
    non_deriv_short = (adjusted_total_short - other_total) / (1 - other_total/100) if other_total < 100 else 0
    net_non_deriv = non_deriv_long - non_deriv_short

    # Step 4: Calculate leverage (excess over 100%)
    net_exposure = (adjusted_total_long - adjusted_total_short)
    derivative_leverage = 1.5 * other_total
    leverage = max(0, net_exposure + derivative_leverage - 100)
    other_r = round(other_total, 4)
    audit_log.append(f"Total Long: {round(total_long, 4)}%, Total Short: {round(total_short, 4)}%, Adjusted Net: {round(net_exposure, 4)}%, Derivative Leverage: {round(derivative_leverage, 4)}%, Total Leverage: {round(leverage, 4)}%")

    # Leverage magnitude tiers
    if leverage < 5:
        leverage_class = "Slight"
    elif leverage <= 20:
        leverage_class = "Moderate"
    elif leverage <= 50:
        leverage_class = "Substantial"
    else:
        leverage_class = "Heavy"

    # Binary derivative overlay
    overlay_keywords = ["derivatives", "overlay", "options", "warrants", "swaps", "forwards", "collars", "futures", "hedges"]
    has_overlay = other_r > 0 and any(kw in safe_lower(row.get('investment_strategy', '')) for kw in overlay_keywords)

    # Risk management overlay with chained permissive keywords
    permissive_patterns = [rf"(may use|can utilize|permitted to use)\s+(derivatives|options|warrants|swaps|forwards|collars|futures|hedges)"]
    risk_keywords = ["hedges", "risk mitigation", "downside protection"]
    has_permissive = any(re.search(pattern, safe_lower(row.get('investment_strategy', ''))) for pattern in permissive_patterns)
    has_risk_specific = any(kw in safe_lower(row.get('investment_strategy', '')) for kw in risk_keywords)
    risk_management = "Yes" if (has_overlay and (other_r >= 1 or has_risk_specific)) else "No"
    if has_permissive and other_r < 1:
        risk_management = "No"  # Incidental use

    stats_counter[f"exposure_leverage_{leverage_class}"] += 1
    stats_counter[f"overlay_{'Yes' if has_overlay else 'No'}"] += 1
    stats_counter[f"risk_management_{risk_management}"] += 1
    purpose = determine_purpose(row, audit_log, stats_counter)
    audit_log.append(f"Leverage: {leverage_class}, Overlay: {'Yes' if has_overlay else 'No'}, Risk Management: {risk_management}, Purpose: {purpose}")
    return leverage_class, "Yes" if has_overlay else "No", risk_management, purpose



In [2]:
# Leverage exposure and overlay information

import re
import pandas as pd
import logging
from collections import defaultdict

# Initialize logging to console and file
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s',
    handlers=[
        logging.StreamHandler(),  # Output to console
        logging.FileHandler('classification_results.log')  # Save to file
    ]
)

# Configuration
CONFIG = {
    "use_exposure_classification": True,
    "use_detailed_logging": True
}

def safe_lower(text):
    """Utility to safely convert text to lowercase."""
    return str(text).lower() if pd.notna(text) else ""

def auto_classify(row, audit_log, stats_counter):
    """Placeholder for auto-classification logic."""
    return None

def determine_purpose(row, audit_log, stats_counter):
    """Classify the purpose of derivative use based on categories and keywords."""
    fields = ["ProductName", "investment_strategy", "FS_insight"]
    purpose_scores = {"Incidental": 0, "Overlay": 0, "Systematic": 0, "Amplification": 0}

    # Boolean and category checks
    if row.get('index_fund', False) and not row.get('leveraged_fund', False):
        purpose_scores["Incidental"] += 2
    if row.get('leveraged_fund', False) or "2x" in str(row.get('ProductName', '')).lower() or "3x" in str(row.get('ProductName', '')).lower():
        purpose_scores["Amplification"] += 2
    if row.get('YC_Category_Name', '').lower() in ["market neutral", "long-short equity", "managed futures"]:
        purpose_scores["Systematic"] += 2
    if row.get('YC_Category_Name', '').lower() in ["option income", "covered call"]:
        purpose_scores["Overlay"] += 2

    # Keyword/phrase checks
    incidental_phrases = ["no derivatives or hedging strategies used"]
    overlay_phrases = ["writes covered call options", "options overlay strategy"]
    systematic_phrases = ["long and short positions", "managed futures strategy"]
    amplification_phrases = ["targets 2x daily returns"]

    for field in fields:
        if pd.notna(row[field]):
            text = row[field].lower()
            for phrase in incidental_phrases:
                if phrase in text:
                    purpose_scores["Incidental"] += 1
            for phrase in overlay_phrases:
                if phrase in text:
                    purpose_scores["Overlay"] += 1
            for phrase in systematic_phrases:
                if phrase in text:
                    purpose_scores["Systematic"] += 1
            for phrase in amplification_phrases:
                if phrase in text:
                    purpose_scores["Amplification"] += 1

    max_score = max(purpose_scores.values())
    if max_score == 0:
        return "Unclassified Purpose"
    purpose = max(purpose_scores, key=purpose_scores.get)
    stats_counter[f"purpose_{purpose}"] += 1
    audit_log.append(f"Purpose Scores: {purpose_scores}")
    return purpose

def classify_by_exposures_with_disproof(row, audit_log, stats_counter):
    """Classify funds by leverage, derivative overlay, risk management, and purpose."""
    if not CONFIG["use_exposure_classification"]:
        return "Unclassified Leverage", "No Overlay", "No Risk Management", "Unclassified Purpose"

    exposure_cols = ['cash_long', 'cash_short', 'stock_long', 'stock_short', 'bond_long', 'bond_short', 'other_long', 'other_short']
    for col in exposure_cols:
        val = pd.to_numeric(row[col], errors='coerce')
        row[col] = 0 if pd.isna(val) else val

    # Step 1: Total known positions
    total_long = (row['cash_long'] + row['stock_long'] + row['bond_long'] + row['other_long']) * 100
    total_short = (row['cash_short'] + row['stock_short'] + row['bond_short'] + row['other_short']) * 100
    other_total = (row['other_long'] + row['other_short']) * 100

    # Step 2: Adjust bond positions to balance to 100%
    long_adjustment = 100 - total_long if total_long < 100 else max(0, total_long - 100)
    short_adjustment = 100 - total_short if total_short < 100 else max(0, total_short - 100)
    adjusted_bond_long = row['bond_long'] * 100 + long_adjustment if total_long < 100 else max(0, row['bond_long'] * 100 - long_adjustment)
    adjusted_bond_short = row['bond_short'] * 100 + short_adjustment if total_short < 100 else max(0, row['bond_short'] * 100 - short_adjustment)
    adjusted_total_long = (row['cash_long'] + row['stock_long'] + (adjusted_bond_long / 100) + row['other_long']) * 100
    adjusted_total_short = (row['cash_short'] + row['stock_short'] + (adjusted_bond_short / 100) + row['other_short']) * 100

    # Step 3: Calculate non-derivative long/short
    non_deriv_long = (adjusted_total_long - other_total) / (1 - other_total/100) if other_total < 100 else 0
    non_deriv_short = (adjusted_total_short - other_total) / (1 - other_total/100) if other_total < 100 else 0
    net_non_deriv = non_deriv_long - non_deriv_short

    # Step 4: Calculate leverage (excess over 100%)
    net_exposure = (adjusted_total_long - adjusted_total_short)
    derivative_leverage = 1.5 * other_total
    leverage = max(0, net_exposure + derivative_leverage - 100)
    other_r = round(other_total, 4)
    audit_log.append(f"Total Long: {round(total_long, 4)}%, Total Short: {round(total_short, 4)}%, Adjusted Net: {round(net_exposure, 4)}%, Derivative Leverage: {round(derivative_leverage, 4)}%, Total Leverage: {round(leverage, 4)}%")

    # Leverage magnitude tiers
    if leverage < 5:
        leverage_class = "Slight"
    elif leverage <= 20:
        leverage_class = "Moderate"
    elif leverage <= 50:
        leverage_class = "Substantial"
    else:
        leverage_class = "Heavy"

    # Binary derivative overlay
    overlay_keywords = ["derivatives", "overlay", "options", "warrants", "swaps", "forwards", "collars", "futures", "hedges"]
    has_overlay = other_r > 0 and any(kw in safe_lower(row.get('investment_strategy', '')) for kw in overlay_keywords)

    # Risk management overlay with chained permissive keywords
    permissive_patterns = [rf"(may use|can utilize|permitted to use)\s+(derivatives|options|warrants|swaps|forwards|collars|futures|hedges)"]
    risk_keywords = ["hedges", "risk mitigation", "downside protection"]
    has_permissive = any(re.search(pattern, safe_lower(row.get('investment_strategy', ''))) for pattern in permissive_patterns)
    has_risk_specific = any(kw in safe_lower(row.get('investment_strategy', '')) for kw in risk_keywords)
    risk_management = "Yes" if (has_overlay and (other_r >= 1 or has_risk_specific)) else "No"
    if has_permissive and other_r < 1:
        risk_management = "No"  # Incidental use

    stats_counter[f"exposure_leverage_{leverage_class}"] += 1
    stats_counter[f"overlay_{'Yes' if has_overlay else 'No'}"] += 1
    stats_counter[f"risk_management_{risk_management}"] += 1
    purpose = determine_purpose(row, audit_log, stats_counter)
    audit_log.append(f"Leverage: {leverage_class}, Overlay: {'Yes' if has_overlay else 'No'}, Risk Management: {risk_management}, Purpose: {purpose}")
    return leverage_class, "Yes" if has_overlay else "No", risk_management, purpose

def classify_fund(row, stats_counter):
    """Integrate classification steps."""
    audit_log = []
    method_used = None

    classification = auto_classify(row, audit_log, stats_counter)
    if classification:
        method_used = "Auto-Classification"
        leverage, overlay, risk_mgmt, purpose = classification, "No Overlay", "No Risk Management", "Unclassified Purpose"
    else:
        leverage, overlay, risk_mgmt, purpose = classify_by_exposures_with_disproof(row, audit_log, stats_counter)
        method_used = "Exposure-Based"

    audit_log.append(f"Final: {leverage} {overlay} {risk_mgmt} {purpose} via {method_used}")
    return leverage, overlay, risk_mgmt, purpose, method_used, audit_log

def sanitize_excel_text(text):
    """Sanitize text for Excel output."""
    return str(text).replace('\r', '').replace('\n', ' ')

def main(df):
    """Main function to classify funds and log results."""
    stats_counter = defaultdict(int)
    results = []
    
    for idx, row in df.iterrows():
        try:
            leverage, overlay, risk_mgmt, purpose, method_used, audit_log = classify_fund(row, stats_counter)
            new_row = row.copy()
            new_row['Leverage'] = leverage
            new_row['Overlay'] = overlay
            new_row['Risk_Management'] = risk_mgmt
            new_row['Purpose'] = purpose
            new_row['Method_Used'] = method_used
            new_row['Audit_Log'] = "; ".join(audit_log)
            results.append(new_row)
        except Exception as e:
            logging.error(f"Error processing row {idx}: {str(e)}")
            continue

    out_df = pd.DataFrame(results)
    out_df['Classification'] = out_df['Leverage'] + " " + out_df['Overlay'] + " " + out_df['Risk_Management'] + " " + out_df['Purpose']

    if CONFIG.get("use_detailed_logging", True):
        logging.info("=== Detailed Classification Summary ===")
        logging.info(f"Total Funds Processed: {len(out_df)}")
        logging.info("\nLeverage Distribution:")
        for lev in ["Slight", "Moderate", "Substantial", "Heavy"]:
            count = stats_counter.get(f"exposure_leverage_{lev}", 0)
            logging.info(f"  {lev}: {count} ({count/len(out_df)*100:.2f}%)")
        logging.info("\nOverlay Distribution:")
        for ovr in ["Yes", "No"]:
            count = stats_counter.get(f"overlay_{ovr}", 0)
            logging.info(f"  {ovr}: {count} ({count/len(out_df)*100:.2f}%)")
        logging.info("\nRisk Management Distribution:")
        for rm in ["Yes", "No"]:
            count = stats_counter.get(f"risk_management_{rm}", 0)
            logging.info(f"  {rm}: {count} ({count/len(out_df)*100:.2f}%)")
        logging.info("\nPurpose Distribution:")
        for purp in ["Incidental", "Overlay", "Systematic", "Amplification"]:
            count = stats_counter.get(f"purpose_{purp}", 0)
            logging.info(f"  {purp}: {count} ({count/len(out_df)*100:.2f}%)")

    # Save results to CSV
    out_df.to_csv("classified_funds_output.csv", index=False)
    logging.info("Results saved to 'classified_funds_output.csv'")
    return out_df

# Example usage
if __name__ == "__main__":
    # Load your DataFrame (replace with your file path)
    try:
        df = pd.read_csv("your_fund_data.csv")  # Adjust path as needed
        out_df = main(df)
    except FileNotFoundError:
        logging.error("Data file 'your_fund_data.csv' not found. Please provide the correct path.")
    except Exception as e:
        logging.error(f"Error running main: {str(e)}")

Data file 'your_fund_data.csv' not found. Please provide the correct path.


In [1]:
# Leverage only test

import pandas as pd
from sqlalchemy import create_engine
import re
import os
import logging
from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
from datetime import datetime
import glob
from typing import Dict, List, Tuple, Optional

class FundClassifier:
    """Class to handle fund leverage measurement and overlay strategy detection"""
    
    def __init__(self):
        # Configuration setup
        self.config = {
            "use_concurrency": False,
            "max_workers": 4,
            "use_detailed_logging": True,
            "derivatives_multiplier": 1.5,  # Multiplier for 'other' (derivatives) exposure
            "other_threshold": 0.02,  # 2% threshold for overlay strategy detection
            "other_cap": 0.9  # Cap for 'other' in adjustment formula to avoid division issues
        }
        
        # Database connection
        self.connection_string = (
            "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
            "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
            "&trusted_connection=yes&TrustServerCertificate=yes"
        )
        
        # Keyword mappings for overlay detection
        self.derivative_keywords = ["options", "futures", "forwards", "swaps", "warrants"]
        self.permissive_phrases = ["may use", "can use", "might use", "occasionally uses"]
        
        # Setup logging
        self._setup_logging()

    def _setup_logging(self):
        """Configure logging to console and file"""
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
        try:
            console_handler.stream.reconfigure(encoding='utf-8')
        except AttributeError:
            pass

        logging.basicConfig(
            level=logging.INFO,
            handlers=[
                logging.FileHandler("exposures_overlays.log", encoding='utf-8'),
                console_handler
            ]
        )

    def _sanitize_excel_text(self, value) -> str:
        """Sanitize text for Excel output"""
        if pd.isna(value):
            return value
        text = str(value).replace("<", "[lt]").replace(">", "[gt]")
        return f"'{text}" if text.startswith(("=", "+", "-", "@")) else text

    def _generate_output_filename(self, base_path: str) -> str:
        """Generate unique output filename based on timestamp"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        increment = len(glob.glob(os.path.join(base_path, "exposures_overlays_*.xlsx"))) + 1
        return os.path.join(base_path, f"exposures_overlays_{timestamp}_Run_{increment}.xlsx")

    def _calculate_leverage(self, row: pd.Series, audit_log: List[str], stats: Counter) -> Tuple[float, float, float]:
        """
        Calculate leverage based on exposures, with derivative adjustment
        Returns: (adjusted_long, adjusted_short, total_leverage)
        """
        # Define exposure columns
        long_cols = ['cash_long', 'stock_long', 'bond_long', 'other_long']
        short_cols = ['cash_short', 'stock_short', 'bond_short', 'other_short']
        
        # Convert to numeric and handle NaN
        for col in long_cols + short_cols:
            row[col] = pd.to_numeric(row.get(col, 0), errors='coerce') or 0

        # Calculate raw totals
        total_long = sum(row[col] for col in long_cols) * 100  # Convert to percentage
        total_short = sum(row[col] for col in short_cols) * 100
        other_long = row['other_long'] * 100
        other_short = row['other_short'] * 100
        
        audit_log.append(f"Raw Exposures: Total Long={total_long:.2f}%, Total Short={total_short:.2f}%, "
                        f"Other Long={other_long:.2f}%, Other Short={other_short:.2f}%")

        # Cap 'other' for adjustment to avoid division issues
        other_long_adj = min(other_long / 100, self.config["other_cap"])  # Convert back to decimal for formula
        other_short_adj = min(other_short / 100, self.config["other_cap"])
        
        if other_long / 100 > self.config["other_cap"]:
            audit_log.append(f"Capped other_long at {self.config['other_cap']*100}% for adjustment")
            stats["other_capped_long"] += 1
        if other_short / 100 > self.config["other_cap"]:
            audit_log.append(f"Capped other_short at {self.config['other_cap']*100}% for adjustment")
            stats["other_capped_short"] += 1

        # Calculate adjusted non-derivative holdings
        adjusted_long = ((total_long - other_long) / (1 - other_long_adj)) if other_long_adj < 1 else 0
        adjusted_short = ((total_short - other_short) / (1 - other_short_adj)) if other_short_adj < 1 else 0
        
        # Calculate total leverage with derivative multiplier
        derivative_impact = (other_long + other_short) * self.config["derivatives_multiplier"]
        total_leverage = (adjusted_long - adjusted_short) + derivative_impact
        
        audit_log.append(f"Adjusted Long={adjusted_long:.2f}%, Adjusted Short={adjusted_short:.2f}%, "
                        f"Derivative Impact={derivative_impact:.2f}%, Total Leverage={total_leverage:.2f}%")
        
        stats["leverage_calculated"] += 1
        return adjusted_long, adjusted_short, total_leverage

    def _categorize_leverage(self, total_leverage: float, audit_log: List[str], stats: Counter) -> str:
        """Categorize leverage based on deviation from 100%"""
        deviation = abs(total_leverage - 100)
        
        if deviation < 5:
            category = "Slight"
        elif deviation < 15:
            category = "Moderate"
        elif deviation < 30:
            category = "Substantial"
        else:
            category = "Heavy"
            
        audit_log.append(f"Leverage Deviation={deviation:.2f}%, Category={category}")
        stats[f"leverage_{category.lower()}"] += 1
        return category

    def _detect_overlay(self, row: pd.Series, audit_log: List[str], stats: Counter) -> str:
        """Detect overlay strategy based on 'other' exposure and keywords"""
        # Calculate total 'other' exposure
        other_long = pd.to_numeric(row.get('other_long', 0), errors='coerce') or 0
        other_short = pd.to_numeric(row.get('other_short', 0), errors='coerce') or 0
        total_other = (other_long + other_short) * 100  # Convert to percentage
        
        # Search for keywords in text fields
        text_fields = ['investment_strategy', 'ProductName', 'FS_insight']
        text = " ".join(str(row.get(field, '')).lower() for field in text_fields if pd.notna(row.get(field)))
        
        # Check for derivative keywords
        has_keywords = False
        has_permissive = False
        found_keywords = []
        found_permissive = []
        
        for keyword in self.derivative_keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', text):
                has_keywords = True
                found_keywords.append(keyword)
                # Check for permissive phrases preceding the keyword
                for phrase in self.permissive_phrases:
                    if re.search(r'\b' + re.escape(phrase) + r'\s+' + re.escape(keyword) + r'\b', text):
                        has_permissive = True
                        found_permissive.append(f"{phrase} {keyword}")
                        break
        
        audit_log.append(f"Total Other Exposure={total_other:.2f}%, Keywords Found={found_keywords}, "
                        f"Permissive Phrases={found_permissive}")
        
        # Classify overlay
        if total_other < self.config["other_threshold"] * 100:
            overlay = "Incidental/None"
        elif total_other >= self.config["other_threshold"] * 100:
            if has_keywords and not has_permissive:
                overlay = "Strategic"
            else:
                overlay = "Incidental/None"
        else:
            overlay = "Incidental/None"
            
        audit_log.append(f"Overlay Category={overlay}")
        stats[f"overlay_{overlay.lower().replace('/', '_')}"] += 1
        return overlay

    def process_fund(self, row: pd.Series, stats: Counter) -> Dict:
        """Process a single fund row"""
        audit_log = []
        
        # Calculate leverage
        adjusted_long, adjusted_short, total_leverage = self._calculate_leverage(row, audit_log, stats)
        leverage_category = self._categorize_leverage(total_leverage, audit_log, stats)
        
        # Detect overlay strategy
        overlay_category = self._detect_overlay(row, audit_log, stats)
        
        return {
            'adjusted_long': adjusted_long,
            'adjusted_short': adjusted_short,
            'total_leverage': total_leverage,
            'leverage_category': leverage_category,
            'overlay_category': overlay_category,
            'audit_log': audit_log,
            'row': row.to_dict()
        }

    def run(self):
        """Main execution method"""
        engine = create_engine(self.connection_string)
        query = """
        SELECT SymbolCUSIP, ProductName, fund_family, investment_strategy, FS_insight,
               cash_long, cash_short, stock_long, stock_short,
               bond_long, bond_short, other_long, other_short
        FROM Funds_to_Screen
        """

        try:
            df = pd.read_sql(query, engine)
        except Exception as e:
            logging.error(f"Database error: {e}")
            return
        finally:
            engine.dispose()

        stats = Counter()
        results = []

        if self.config["use_concurrency"]:
            with ProcessPoolExecutor(max_workers=self.config["max_workers"]) as executor:
                futures = [executor.submit(self.process_fund, row, stats) for _, row in df.iterrows()]
                results = [future.result() for future in as_completed(futures)]
        else:
            results = [self.process_fund(row, stats) for _, row in df.iterrows()]

        # Process results
        out_df = pd.DataFrame([r['row'] for r in results])
        out_df['Adjusted_Long'] = [r['adjusted_long'] for r in results]
        out_df['Adjusted_Short'] = [r['adjusted_short'] for r in results]
        out_df['Total_Leverage'] = [r['total_leverage'] for r in results]
        out_df['Leverage_Category'] = [r['leverage_category'] for r in results]
        out_df['Overlay_Category'] = [r['overlay_category'] for r in results]
        out_df['Audit_Log'] = [self._sanitize_excel_text("; ".join(r['audit_log'])) for r in results]

        # Column ordering
        columns_front = ['SymbolCUSIP', 'ProductName', 'fund_family', 
                        'Adjusted_Long', 'Adjusted_Short', 'Total_Leverage', 
                        'Leverage_Category', 'Overlay_Category']
        other_cols = [c for c in out_df.columns if c not in columns_front]
        out_df = out_df[columns_front + other_cols]

        # Export
        base_path = r"C:\Users\JulianHeron\Software Projects\Test files"
        output_path = self._generate_output_filename(base_path)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        out_df.to_excel(output_path, index=False)

        # Logging
        if self.config["use_detailed_logging"]:
            logging.info(f"Processed {len(out_df)} funds")
            logging.info("Leverage Categories:")
            for cat in ["Slight", "Moderate", "Substantial", "Heavy"]:
                logging.info(f"  {cat}: {stats[f'leverage_{cat.lower()}']}")
            logging.info("Overlay Categories:")
            logging.info(f"  Incidental/None: {stats['overlay_incidental_none']}")
            logging.info(f"  Strategic: {stats['overlay_strategic']}")
            logging.info(f"Other Exposures Capped: Long={stats['other_capped_long']}, Short={stats['other_capped_short']}")
            logging.info(f"Results saved to {output_path}")

if __name__ == "__main__":
    classifier = FundClassifier()
    classifier.run()

2025-03-10 10:14:25,125 - INFO - Processed 5586 funds
2025-03-10 10:14:25,127 - INFO - Leverage Categories:
2025-03-10 10:14:25,127 - INFO -   Slight: 5093
2025-03-10 10:14:25,129 - INFO -   Moderate: 186
2025-03-10 10:14:25,130 - INFO -   Substantial: 82
2025-03-10 10:14:25,130 - INFO -   Heavy: 225
2025-03-10 10:14:25,131 - INFO - Overlay Categories:
2025-03-10 10:14:25,131 - INFO -   Incidental/None: 5449
2025-03-10 10:14:25,132 - INFO -   Strategic: 137
2025-03-10 10:14:25,132 - INFO - Other Exposures Capped: Long=26, Short=8
2025-03-10 10:14:25,133 - INFO - Results saved to C:\Users\JulianHeron\Software Projects\Test files\exposures_overlays_20250310_101423_Run_1.xlsx


In [2]:
# new gross and net leverage from doge

import pandas as pd
from sqlalchemy import create_engine
import logging
import os
from datetime import datetime

# Setup logging to both console and file with UTF-8 encoding
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
try:
    console_handler.stream.reconfigure(encoding='utf-8')
except AttributeError:
    pass

logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.FileHandler("classification.log", encoding='utf-8'),
        console_handler
    ]
)

# Configuration
CONFIG = {
    "use_detailed_logging": True
}

# Database connection string
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

# Create database connection
def create_db_connection():
    try:
        engine = create_engine(connection_string)
        logging.info("Database connection established successfully.")
        return engine
    except Exception as e:
        logging.error(f"Error connecting to database: {str(e)}")
        raise

# Retrieve data from Funds_to_Screen table with category joins
def load_fund_data():
    engine = create_db_connection()
    try:
        query = """
        SELECT 
            f.SymbolCUSIP, f.ProductName, f.fund_family, f.ycharts_url,
            f.cash_long, f.cash_net, f.cash_short,
            f.stock_long, f.stock_net, f.stock_short,
            f.bond_long, f.bond_net, f.bond_short,
            f.other_long, f.other_net, f.other_short,
            f.preferred_long, f.preferred_net, f.preferred_short,
            f.convertible_long, f.convertible_net, f.convertible_short,
            yc.Category_Name AS YC_Category_Name
        FROM Funds_to_Screen f
        LEFT JOIN YC_Category_List yc ON f.YC_Category_ID = yc.ID
        """
        df = pd.read_sql(query, engine)
        logging.info(f"Loaded {len(df)} rows from Funds_to_Screen table.")
        return df
    except Exception as e:
        logging.error(f"Error loading data from database: {str(e)}")
        raise
    finally:
        engine.dispose()

# Classify leverage use and effect
def classify_leverage(row, audit_log):
    # Define exposure columns for each category
    exposure_categories = {
        'cash': ['cash_long', 'cash_short', 'cash_net'],
        'stock': ['stock_long', 'stock_short', 'stock_net'],
        'bond': ['bond_long', 'bond_short', 'bond_net'],
        'other': ['other_long', 'other_short', 'other_net'],
        'preferred': ['preferred_long', 'preferred_short', 'preferred_net'],
        'convertible': ['convertible_long', 'convertible_short', 'convertible_net']
    }

    # Ensure all exposure columns are numeric and handle missing values
    for category in exposure_categories:
        for col in exposure_categories[category]:
            val = pd.to_numeric(row.get(col, 0), errors='coerce')
            row[col] = 0 if pd.isna(val) else val * 100  # Convert to percentage

    # Calculate gross leverage for each category
    gross_leverage = {}
    for category in exposure_categories:
        long_val = row[exposure_categories[category][0]]
        short_val = row[exposure_categories[category][1]]
        gross_leverage[category] = long_val + short_val

    # Determine the dominant category (highest gross leverage)
    dominant_category = max(gross_leverage, key=gross_leverage.get)
    total_gross_leverage = gross_leverage[dominant_category]

    # Classify Gross Leverage (Use)
    if total_gross_leverage < 100:
        leverage_use = "Low"
    elif 100 <= total_gross_leverage <= 250:
        leverage_use = "Medium"
    else:
        leverage_use = "High"

    # Calculate Net Leverage (Effect) for the dominant category
    net_leverage = row[exposure_categories[dominant_category][2]]

    # Classify Net Leverage (Effect) with long-only adjustment
    long_only_categories = ["Taxable Fixed Income", "US Equity", "Global Bond-USD Hedged"]
    yc_category = row.get('YC_Category_Name', '').lower()
    is_long_only = any(cat.lower() in yc_category for cat in long_only_categories) or dominant_category in ['bond', 'convertible']

    if is_long_only and abs(net_leverage) < 10:
        leverage_effect = "Slight"
    else:
        if abs(net_leverage) < 10:
            leverage_effect = "Slight"
        elif 10 <= abs(net_leverage) <= 50:
            leverage_effect = "Strategic"
        elif 50 < abs(net_leverage) <= 100:
            leverage_effect = "Systematic"
        else:
            leverage_effect = "Amplified"

    audit_log.append(
        f"Gross Leverage (Use): {round(total_gross_leverage, 4)}% (Dominant: {dominant_category}), "
        f"Class: {leverage_use}, "
        f"Net Leverage (Effect): {round(net_leverage, 4)}%, "
        f"Class: {leverage_effect}"
    )

    return leverage_use, leverage_effect

# Process and classify each fund
def process_fund(row):
    audit_log = []
    leverage_use, leverage_effect = classify_leverage(row, audit_log)
    return {
        'SymbolCUSIP': row.get('SymbolCUSIP', ''),
        'ProductName': row.get('ProductName', ''),
        'fund_family': row.get('fund_family', ''),
        'Leverage_Use': leverage_use,
        'Leverage_Effect': leverage_effect,
        'ycharts_url': row.get('ycharts_url', ''),
        'Audit_Log': "; ".join(audit_log)
    }

# Main function
def main():
    # Load data
    df = load_fund_data()

    # Process each row
    results = [process_fund(row) for _, row in df.iterrows()]

    # Create DataFrame
    out_df = pd.DataFrame(results)

    # Define output path
    base_path = r"C:\Users\JulianHeron\Software Projects\Test files"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"Risk_Overlays_{timestamp}.xlsx"
    output_path = os.path.join(base_path, output_filename)
    os.makedirs(base_path, exist_ok=True)

    # Save to Excel
    out_df.to_excel(output_path, index=False)
    logging.info(f"Results saved to '{output_path}'")

    # Detailed logging
    if CONFIG.get("use_detailed_logging", True):
        logging.info("=== Detailed Classification Summary ===")
        logging.info(f"Total Funds Processed: {len(out_df)}")
        logging.info("\nLeverage Use Distribution:")
        for lev in ["Low", "Medium", "High"]:
            count = len(out_df[out_df['Leverage_Use'] == lev])
            logging.info(f"  {lev}: {count} ({count/len(out_df)*100:.2f}%)")
        logging.info("\nLeverage Effect Distribution:")
        for eff in ["Slight", "Strategic", "Systematic", "Amplified"]:
            count = len(out_df[out_df['Leverage_Effect'] == eff])
            logging.info(f"  {eff}: {count} ({count/len(out_df)*100:.2f}%)")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"Error running main: {str(e)}")

2025-03-10 16:04:19,537 - INFO - Database connection established successfully.
2025-03-10 16:04:19,630 - INFO - Loaded 5586 rows from Funds_to_Screen table.
2025-03-10 16:04:19,894 - ERROR - Error running main: 'NoneType' object has no attribute 'lower'


In [3]:
# new gross and net leverage from doge

import pandas as pd
from sqlalchemy import create_engine
import logging
import os
from datetime import datetime

# Setup logging to both console and file with UTF-8 encoding
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
try:
    console_handler.stream.reconfigure(encoding='utf-8')
except AttributeError:
    pass

logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.FileHandler("classification.log", encoding='utf-8'),
        console_handler
    ]
)

# Configuration
CONFIG = {
    "use_detailed_logging": True
}

# Database connection string
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

# Create database connection
def create_db_connection():
    try:
        engine = create_engine(connection_string)
        logging.info("Database connection established successfully.")
        return engine
    except Exception as e:
        logging.error(f"Error connecting to database: {str(e)}")
        raise

# Retrieve data from Funds_to_Screen table with category joins
def load_fund_data():
    engine = create_db_connection()
    try:
        query = """
        SELECT 
            f.SymbolCUSIP, f.ProductName, f.fund_family, f.ycharts_url,
            f.cash_long, f.cash_net, f.cash_short,
            f.stock_long, f.stock_net, f.stock_short,
            f.bond_long, f.bond_net, f.bond_short,
            f.other_long, f.other_net, f.other_short,
            f.preferred_long, f.preferred_net, f.preferred_short,
            f.convertible_long, f.convertible_net, f.convertible_short,
            yc.Category_Name AS YC_Category_Name
        FROM Funds_to_Screen f
        LEFT JOIN YC_Category_List yc ON f.YC_Category_ID = yc.ID
        """
        df = pd.read_sql(query, engine)
        logging.info(f"Loaded {len(df)} rows from Funds_to_Screen table.")
        return df
    except Exception as e:
        logging.error(f"Error loading data from database: {str(e)}")
        raise
    finally:
        engine.dispose()

# Utility function to safely handle None/NaN values
def safe_lower(value):
    return str(value).lower() if pd.notna(value) else ""

# Classify leverage use and effect
def classify_leverage(row, audit_log):
    # Define exposure columns for each category
    exposure_categories = {
        'cash': ['cash_long', 'cash_short', 'cash_net'],
        'stock': ['stock_long', 'stock_short', 'stock_net'],
        'bond': ['bond_long', 'bond_short', 'bond_net'],
        'other': ['other_long', 'other_short', 'other_net'],
        'preferred': ['preferred_long', 'preferred_short', 'preferred_net'],
        'convertible': ['convertible_long', 'convertible_short', 'convertible_net']
    }

    # Ensure all exposure columns are numeric and handle missing values
    for category in exposure_categories:
        for col in exposure_categories[category]:
            val = pd.to_numeric(row.get(col, 0), errors='coerce')
            row[col] = 0 if pd.isna(val) else val * 100  # Convert to percentage

    # Calculate gross leverage for each category
    gross_leverage = {}
    for category in exposure_categories:
        long_val = row[exposure_categories[category][0]]
        short_val = row[exposure_categories[category][1]]
        gross_leverage[category] = long_val + short_val

    # Determine the dominant category (highest gross leverage)
    dominant_category = max(gross_leverage, key=gross_leverage.get)
    total_gross_leverage = gross_leverage[dominant_category]

    # Classify Gross Leverage (Use)
    if total_gross_leverage < 100:
        leverage_use = "Low"
    elif 100 <= total_gross_leverage <= 250:
        leverage_use = "Medium"
    else:
        leverage_use = "High"

    # Calculate Net Leverage (Effect) for the dominant category
    net_leverage = row[exposure_categories[dominant_category][2]]

    # Classify Net Leverage (Effect) with long-only adjustment
    long_only_categories = ["Taxable Fixed Income", "US Equity", "Global Bond-USD Hedged"]
    yc_category = safe_lower(row.get('YC_Category_Name', ''))
    is_long_only = any(cat.lower() in yc_category for cat in long_only_categories) or dominant_category in ['bond', 'convertible']

    if is_long_only and abs(net_leverage) < 10:
        leverage_effect = "Slight"
    else:
        if abs(net_leverage) < 10:
            leverage_effect = "Slight"
        elif 10 <= abs(net_leverage) <= 50:
            leverage_effect = "Strategic"
        elif 50 < abs(net_leverage) <= 100:
            leverage_effect = "Systematic"
        else:
            leverage_effect = "Amplified"

    audit_log.append(
        f"Gross Leverage (Use): {round(total_gross_leverage, 4)}% (Dominant: {dominant_category}), "
        f"Class: {leverage_use}, "
        f"Net Leverage (Effect): {round(net_leverage, 4)}%, "
        f"Class: {leverage_effect}"
    )

    return leverage_use, leverage_effect

# Process and classify each fund
def process_fund(row):
    audit_log = []
    leverage_use, leverage_effect = classify_leverage(row, audit_log)
    return {
        'SymbolCUSIP': row.get('SymbolCUSIP', ''),
        'ProductName': row.get('ProductName', ''),
        'fund_family': row.get('fund_family', ''),
        'Leverage_Use': leverage_use,
        'Leverage_Effect': leverage_effect,
        'ycharts_url': row.get('ycharts_url', ''),
        'Audit_Log': "; ".join(audit_log)
    }

# Main function
def main():
    # Load data
    df = load_fund_data()

    # Process each row
    results = [process_fund(row) for _, row in df.iterrows()]

    # Create DataFrame
    out_df = pd.DataFrame(results)

    # Define output path
    base_path = r"C:\Users\JulianHeron\Software Projects\Test files"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"Risk_Overlays_{timestamp}.xlsx"
    output_path = os.path.join(base_path, output_filename)
    os.makedirs(base_path, exist_ok=True)

    # Save to Excel
    out_df.to_excel(output_path, index=False)
    logging.info(f"Results saved to '{output_path}'")

    # Detailed logging
    if CONFIG.get("use_detailed_logging", True):
        logging.info("=== Detailed Classification Summary ===")
        logging.info(f"Total Funds Processed: {len(out_df)}")
        logging.info("\nLeverage Use Distribution:")
        for lev in ["Low", "Medium", "High"]:
            count = len(out_df[out_df['Leverage_Use'] == lev])
            logging.info(f"  {lev}: {count} ({count/len(out_df)*100:.2f}%)")
        logging.info("\nLeverage Effect Distribution:")
        for eff in ["Slight", "Strategic", "Systematic", "Amplified"]:
            count = len(out_df[out_df['Leverage_Effect'] == eff])
            logging.info(f"  {eff}: {count} ({count/len(out_df)*100:.2f}%)")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"Error running main: {str(e)}")

2025-03-10 16:05:21,591 - INFO - Database connection established successfully.
2025-03-10 16:05:21,941 - INFO - Loaded 5586 rows from Funds_to_Screen table.
2025-03-10 16:05:25,903 - INFO - Results saved to 'C:\Users\JulianHeron\Software Projects\Test files\Risk_Overlays_20250310_160524.xlsx'
2025-03-10 16:05:25,903 - INFO - === Detailed Classification Summary ===
2025-03-10 16:05:25,904 - INFO - Total Funds Processed: 5586
2025-03-10 16:05:25,905 - INFO - 
Leverage Use Distribution:
2025-03-10 16:05:25,908 - INFO -   Low: 4888 (87.50%)
2025-03-10 16:05:25,911 - INFO -   Medium: 625 (11.19%)
2025-03-10 16:05:25,913 - INFO -   High: 73 (1.31%)
2025-03-10 16:05:25,915 - INFO - 
Leverage Effect Distribution:
2025-03-10 16:05:25,918 - INFO -   Slight: 101 (1.81%)
2025-03-10 16:05:25,925 - INFO -   Strategic: 113 (2.02%)
2025-03-10 16:05:25,930 - INFO -   Systematic: 5059 (90.57%)
2025-03-10 16:05:25,935 - INFO -   Amplified: 313 (5.60%)


In [22]:
import pandas as pd
from sqlalchemy import create_engine
import logging
import os
from datetime import datetime

# Setup logging to both console and file with UTF-8 encoding
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
try:
    console_handler.stream.reconfigure(encoding='utf-8')
except AttributeError:
    pass

logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.FileHandler("classification.log", encoding='utf-8'),
        console_handler
    ]
)

# Configuration
CONFIG = {
    "use_detailed_logging": True
}

# Database connection string
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

# Create database connection
def create_db_connection():
    try:
        engine = create_engine(connection_string)
        logging.info("Database connection established successfully.")
        return engine
    except Exception as e:
        logging.error(f"Error connecting to database: {str(e)}")
        raise

# Helper function to convert strings
def str_to_bool(value):
    if isinstance(value, str):  # Check if the value is a string
        value = value.strip().lower()  # Remove whitespace and standardize case
        if value in ['true', '1']:
            return True
        elif value in ['false', '0']:
            return False
    return bool(value)  # Fallback for non-string types (e.g., None or actual booleans)

# Retrieve data from Funds_to_Screen table with boolean tags
def load_fund_data():
    engine = create_db_connection()
    try:
        query = """
        SELECT 
            f.SymbolCUSIP, f.ProductName, f.fund_family, f.ycharts_url,
            f.cash_long, f.cash_net, f.cash_short,
            f.stock_long, f.stock_net, f.stock_short,
            f.bond_long, f.bond_net, f.bond_short,
            f.other_long, f.other_net, f.other_short,
            f.preferred_long, f.preferred_net, f.preferred_short,
            f.convertible_long, f.convertible_net, f.convertible_short,
            f.leveraged_fund, f.inverse_fund, f.currency_hedged_fund
        FROM Funds_to_Screen f
        """
        df = pd.read_sql(query, engine)
        logging.info(f"Loaded {len(df)} rows from Funds_to_Screen table.")
        return df
    except Exception as e:
        logging.error(f"Error loading data from database: {str(e)}")
        raise
    finally:
        engine.dispose()

# Classify leverage use and effect
def classify_leverage(row, audit_log):
    # Define exposure categories
    exposure_categories = {
        'cash': ['cash_long', 'cash_short', 'cash_net'],
        'stock': ['stock_long', 'stock_short', 'stock_net'],
        'bond': ['bond_long', 'bond_short', 'bond_net'],
        'other': ['other_long', 'other_short', 'other_net'],
        'preferred': ['preferred_long', 'preferred_short', 'preferred_net'],
        'convertible': ['convertible_long', 'convertible_short', 'convertible_net']
    }

    # Convert exposure data to percentages and handle missing values
    total_exposure = 0
    for category in exposure_categories:
        for col in exposure_categories[category]:
            val = pd.to_numeric(row.get(col, 0), errors='coerce')
            row[col] = val * 100 if not pd.isna(val) else 0  # Convert float to percentage (e.g., 0.024 → 2.4%)
            if col in ['cash_long', 'stock_long', 'bond_long', 'other_long', 'preferred_long', 'convertible_long',
                       'cash_short', 'stock_short', 'bond_short', 'other_short', 'preferred_short', 'convertible_short']:
                total_exposure += row[col]

    # Handle zero-data funds
    if total_exposure == 0:
        audit_log.append("Warning: No exposure data for this fund, defaulting to Unclassified/Unclassified")
        return "Unclassified", "Unclassified"

    # Calculate total long, short, net, and other exposures
    total_long = sum(row[exposure_categories[cat][0]] for cat in exposure_categories)
    total_short = sum(row[exposure_categories[cat][1]] for cat in exposure_categories)
    total_net = sum(row[exposure_categories[cat][2]] for cat in exposure_categories)
    total_other = row['other_long'] + row['other_short']
    total_gross_leverage = total_long + total_short
    calculated_net_leverage = total_long - total_short

    # Calculate exposure qualifiers
    total_short_percent = total_short / total_gross_leverage * 100 if total_gross_leverage > 0 else 0
    total_other_percent = total_other / total_gross_leverage * 100 if total_gross_leverage > 0 else 0
    total_cash = row['cash_long']
    cash_percent = total_cash / total_gross_leverage * 100 if total_gross_leverage > 0 else 0

    # Debug logging
    audit_log.append(f"Debug: Total Long = {total_long}%, Total Short = {total_short}%, Total Other = {total_other}%, "
                     f"Total Net (sum) = {total_net}%, Calculated Net = {calculated_net_leverage}%")

    # Classify Leverage Use
    if total_gross_leverage < 101:
        leverage_use = "None"
    elif total_gross_leverage <= 150:
        leverage_use = "Low"
    elif total_gross_leverage <= 250:
        leverage_use = "Medium"
    else:
        leverage_use = "High"

    # Explicitly convert currency_hedged_fund to boolean
    currency_hedged = row.get('currency_hedged_fund', False)
    if isinstance(currency_hedged, str):
        currency_hedged = currency_hedged.lower() == 'true'
    else:
        currency_hedged = bool(currency_hedged)

    # Convert leveraged_fund and inverse_fund to boolean using str_to_bool
    leveraged_fund = str_to_bool(row.get('leveraged_fund', False))
    inverse_fund = str_to_bool(row.get('inverse_fund', False))

    # Classify Leverage Effect with updated conditions
    high_leverage = (abs(calculated_net_leverage) >= 150 or total_other_percent > 15 or total_gross_leverage > 150)
    if (leveraged_fund or inverse_fund) and high_leverage:
        leverage_effect = "Amplified"
    elif high_leverage:
        leverage_effect = "Systematic"
    elif total_short_percent >= 30 and abs(calculated_net_leverage) < 50:
        leverage_effect = "Systematic"
    elif 5 <= total_short_percent < 30 or (5 <= total_other_percent <= 15) or currency_hedged:
        leverage_effect = "Systematic"
    elif total_short_percent < 5 and total_other_percent < 2 and abs(total_net - 100) < 10 and total_gross_leverage <= 105:
        leverage_effect = "Slight"
    else:
        leverage_effect = "Strategic"

    # Final logging
    audit_log.append(
        f"Gross Leverage (Use): {round(total_gross_leverage, 4)}%, Class: {leverage_use}, "
        f"Net Leverage (Effect): {round(total_net, 4)}%, Class: {leverage_effect}, "
        f"Short: {round(total_short_percent, 2)}%, Other: {round(total_other_percent, 2)}%, Cash: {round(cash_percent, 2)}%"
    )

    return leverage_use, leverage_effect

# Process and classify each fund
def process_fund(row):
    audit_log = []
    leverage_use, leverage_effect = classify_leverage(row, audit_log)
    result = {
        'SymbolCUSIP': row.get('SymbolCUSIP', ''),
        'ProductName': row.get('ProductName', ''),
        'fund_family': row.get('fund_family', ''),
        'Leverage_Use': leverage_use,
        'Leverage_Effect': leverage_effect,
        'ycharts_url': row.get('ycharts_url', ''),
        'Audit_Log': "; ".join(audit_log),
        # Add all exposure data
        'cash_long': row.get('cash_long', 0),
        'cash_short': row.get('cash_short', 0),
        'cash_net': row.get('cash_net', 0),
        'stock_long': row.get('stock_long', 0),
        'stock_short': row.get('stock_short', 0),
        'stock_net': row.get('stock_net', 0),
        'bond_long': row.get('bond_long', 0),
        'bond_short': row.get('bond_short', 0),
        'bond_net': row.get('bond_net', 0),
        'other_long': row.get('other_long', 0),
        'other_short': row.get('other_short', 0),
        'other_net': row.get('other_net', 0),
        'preferred_long': row.get('preferred_long', 0),
        'preferred_short': row.get('preferred_short', 0),
        'preferred_net': row.get('preferred_net', 0),
        'convertible_long': row.get('convertible_long', 0),
        'convertible_short': row.get('convertible_short', 0),
        'convertible_net': row.get('convertible_net', 0),
        # Add boolean tags
        'leveraged_fund': row.get('leveraged_fund', False),
        'inverse_fund': row.get('inverse_fund', False),
        'currency_hedged_fund': row.get('currency_hedged_fund', False)
    }
    return result
# Main function
def main():
    # Load data
    df = load_fund_data()

    # Process each row
    results = [process_fund(row) for _, row in df.iterrows()]

    # Create DataFrame
    out_df = pd.DataFrame(results)

    # Define output path
    base_path = r"C:\Users\JulianHeron\Software Projects\Test files"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"Risk_Overlays_{timestamp}.xlsx"
    output_path = os.path.join(base_path, output_filename)
    os.makedirs(base_path, exist_ok=True)

    # Save to Excel
    out_df.to_excel(output_path, index=False)
    logging.info(f"Results saved to '{output_path}'")

    # Detailed logging
    if CONFIG.get("use_detailed_logging", True):
        logging.info("=== Detailed Classification Summary ===")
        logging.info(f"Total Funds Processed: {len(out_df)}")
        logging.info("\nLeverage Use Distribution:")
        for lev in ["None", "Low", "Medium", "High", "Unclassified"]:
            count = len(out_df[out_df['Leverage_Use'] == lev])
            logging.info(f"  {lev}: {count} ({count/len(out_df)*100:.2f}%)")
        logging.info("\nLeverage Effect Distribution:")
        for eff in ["Slight", "Strategic", "Systematic", "Amplified", "Missing Data", "Unclassified"]:
            count = len(out_df[out_df['Leverage_Effect'] == eff])
            logging.info(f"  {eff}: {count} ({count/len(out_df)*100:.2f}%)")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"Error running main: {str(e)}")

2025-03-12 14:17:43,063 - INFO - Database connection established successfully.
2025-03-12 14:17:43,280 - INFO - Loaded 5586 rows from Funds_to_Screen table.
2025-03-12 14:17:46,804 - INFO - Results saved to 'C:\Users\JulianHeron\Software Projects\Test files\Risk_Overlays_20250312_141745.xlsx'
2025-03-12 14:17:46,806 - INFO - === Detailed Classification Summary ===
2025-03-12 14:17:46,806 - INFO - Total Funds Processed: 5586
2025-03-12 14:17:46,807 - INFO - 
Leverage Use Distribution:
2025-03-12 14:17:46,809 - INFO -   None: 4250 (76.08%)
2025-03-12 14:17:46,811 - INFO -   Low: 980 (17.54%)
2025-03-12 14:17:46,813 - INFO -   Medium: 163 (2.92%)
2025-03-12 14:17:46,814 - INFO -   High: 145 (2.60%)
2025-03-12 14:17:46,815 - INFO -   Unclassified: 48 (0.86%)
2025-03-12 14:17:46,816 - INFO - 
Leverage Effect Distribution:
2025-03-12 14:17:46,818 - INFO -   Slight: 4517 (80.86%)
2025-03-12 14:17:46,819 - INFO -   Strategic: 266 (4.76%)
2025-03-12 14:17:46,821 - INFO -   Systematic: 744 (13.3

In [24]:
# Final Risk drivers code 

import pandas as pd
from sqlalchemy import create_engine, text  # Add text import
import logging
import os
from datetime import datetime

# Setup logging to both console and file with UTF-8 encoding
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
try:
    console_handler.stream.reconfigure(encoding='utf-8')
except AttributeError:
    pass

logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.FileHandler("classification.log", encoding='utf-8'),
        console_handler
    ]
)

# Configuration
CONFIG = {
    "use_detailed_logging": True
}

# Database connection string
connection_string = (
    "mssql+pyodbc://JULIANS_LAPTOP\\SQLEXPRESS/"
    "CWA_Fund_Database?driver=ODBC+Driver+18+for+SQL+Server"
    "&trusted_connection=yes&TrustServerCertificate=yes"
)

# Create database connection
def create_db_connection():
    try:
        engine = create_engine(connection_string)
        logging.info("Database connection established successfully.")
        return engine
    except Exception as e:
        logging.error(f"Error connecting to database: {str(e)}")
        raise

# Helper function to convert strings
def str_to_bool(value):
    if isinstance(value, str):  # Check if the value is a string
        value = value.strip().lower()  # Remove whitespace and standardize case
        if value in ['true', '1']:
            return True
        elif value in ['false', '0']:
            return False
    return bool(value)  # Fallback for non-string types (e.g., None or actual booleans)

# Retrieve data from Funds_to_Screen table with boolean tags
def load_fund_data():
    engine = create_db_connection()
    try:
        query = """
        SELECT 
            f.SymbolCUSIP, f.ProductName, f.fund_family, f.ycharts_url,
            f.cash_long, f.cash_net, f.cash_short,
            f.stock_long, f.stock_net, f.stock_short,
            f.bond_long, f.bond_net, f.bond_short,
            f.other_long, f.other_net, f.other_short,
            f.preferred_long, f.preferred_net, f.preferred_short,
            f.convertible_long, f.convertible_net, f.convertible_short,
            f.leveraged_fund, f.inverse_fund, f.currency_hedged_fund
        FROM Funds_to_Screen f
        """
        df = pd.read_sql(query, engine)
        logging.info(f"Loaded {len(df)} rows from Funds_to_Screen table.")
        return df
    except Exception as e:
        logging.error(f"Error loading data from database: {str(e)}")
        raise
    finally:
        engine.dispose()

# Classify leverage use and effect
def classify_leverage(row, audit_log):
    # Define exposure categories
    exposure_categories = {
        'cash': ['cash_long', 'cash_short', 'cash_net'],
        'stock': ['stock_long', 'stock_short', 'stock_net'],
        'bond': ['bond_long', 'bond_short', 'bond_net'],
        'other': ['other_long', 'other_short', 'other_net'],
        'preferred': ['preferred_long', 'preferred_short', 'preferred_net'],
        'convertible': ['convertible_long', 'convertible_short', 'convertible_net']
    }

    # Convert exposure data to percentages and handle missing values
    total_exposure = 0
    for category in exposure_categories:
        for col in exposure_categories[category]:
            val = pd.to_numeric(row.get(col, 0), errors='coerce')
            row[col] = val * 100 if not pd.isna(val) else 0  # Convert float to percentage (e.g., 0.024 → 2.4%)
            if col in ['cash_long', 'stock_long', 'bond_long', 'other_long', 'preferred_long', 'convertible_long',
                       'cash_short', 'stock_short', 'bond_short', 'other_short', 'preferred_short', 'convertible_short']:
                total_exposure += row[col]

    # Handle zero-data funds
    if total_exposure == 0:
        audit_log.append("Warning: No exposure data for this fund, defaulting to Unclassified/Unclassified")
        return "Unclassified", "Unclassified"

    # Calculate total long, short, net, and other exposures
    total_long = sum(row[exposure_categories[cat][0]] for cat in exposure_categories)
    total_short = sum(row[exposure_categories[cat][1]] for cat in exposure_categories)
    total_net = sum(row[exposure_categories[cat][2]] for cat in exposure_categories)
    total_other = row['other_long'] + row['other_short']
    total_gross_leverage = total_long + total_short
    calculated_net_leverage = total_long - total_short

    # Calculate exposure qualifiers
    total_short_percent = total_short / total_gross_leverage * 100 if total_gross_leverage > 0 else 0
    total_other_percent = total_other / total_gross_leverage * 100 if total_gross_leverage > 0 else 0
    total_cash = row['cash_long']
    cash_percent = total_cash / total_gross_leverage * 100 if total_gross_leverage > 0 else 0

    # Debug logging
    audit_log.append(f"Debug: Total Long = {total_long}%, Total Short = {total_short}%, Total Other = {total_other}%, "
                     f"Total Net (sum) = {total_net}%, Calculated Net = {calculated_net_leverage}%")

    # Classify Leverage Use
    if total_gross_leverage < 101:
        leverage_use = "None"
    elif total_gross_leverage <= 150:
        leverage_use = "Low"
    elif total_gross_leverage <= 250:
        leverage_use = "Medium"
    else:
        leverage_use = "High"

    # Explicitly convert currency_hedged_fund to boolean
    currency_hedged = row.get('currency_hedged_fund', False)
    if isinstance(currency_hedged, str):
        currency_hedged = currency_hedged.lower() == 'true'
    else:
        currency_hedged = bool(currency_hedged)

    # Convert leveraged_fund and inverse_fund to boolean using str_to_bool
    leveraged_fund = str_to_bool(row.get('leveraged_fund', False))
    inverse_fund = str_to_bool(row.get('inverse_fund', False))

    # Classify Leverage Effect with updated conditions
    high_leverage = (abs(calculated_net_leverage) >= 150 or total_other_percent > 15 or total_gross_leverage > 150)
    if (leveraged_fund or inverse_fund) and high_leverage:
        leverage_effect = "Amplified"
    elif high_leverage:
        leverage_effect = "Systematic"
    elif total_short_percent >= 30 and abs(calculated_net_leverage) < 50:
        leverage_effect = "Systematic"
    elif 5 <= total_short_percent < 30 or (5 <= total_other_percent <= 15) or currency_hedged:
        leverage_effect = "Systematic"
    elif total_short_percent < 5 and total_other_percent < 2 and abs(total_net - 100) < 10 and total_gross_leverage <= 105:
        leverage_effect = "Slight"
    else:
        leverage_effect = "Strategic"

    # Final logging
    audit_log.append(
        f"Gross Leverage (Use): {round(total_gross_leverage, 4)}%, Class: {leverage_use}, "
        f"Net Leverage (Effect): {round(total_net, 4)}%, Class: {leverage_effect}, "
        f"Short: {round(total_short_percent, 2)}%, Other: {round(total_other_percent, 2)}%, Cash: {round(cash_percent, 2)}%"
    )

    return leverage_use, leverage_effect

# Process and classify each fund
def process_fund(row):
    audit_log = []
    leverage_use, leverage_effect = classify_leverage(row, audit_log)
    result = {
        'SymbolCUSIP': row.get('SymbolCUSIP', ''),
        'ProductName': row.get('ProductName', ''),
        'fund_family': row.get('fund_family', ''),
        'Leverage_Use': leverage_use,
        'Leverage_Effect': leverage_effect,
        'ycharts_url': row.get('ycharts_url', ''),
        'Audit_Log': "; ".join(audit_log),
        # Add all exposure data
        'cash_long': row.get('cash_long', 0),
        'cash_short': row.get('cash_short', 0),
        'cash_net': row.get('cash_net', 0),
        'stock_long': row.get('stock_long', 0),
        'stock_short': row.get('stock_short', 0),
        'stock_net': row.get('stock_net', 0),
        'bond_long': row.get('bond_long', 0),
        'bond_short': row.get('bond_short', 0),
        'bond_net': row.get('bond_net', 0),
        'other_long': row.get('other_long', 0),
        'other_short': row.get('other_short', 0),
        'other_net': row.get('other_net', 0),
        'preferred_long': row.get('preferred_long', 0),
        'preferred_short': row.get('preferred_short', 0),
        'preferred_net': row.get('preferred_net', 0),
        'convertible_long': row.get('convertible_long', 0),
        'convertible_short': row.get('convertible_short', 0),
        'convertible_net': row.get('convertible_net', 0),
        # Add boolean tags
        'leveraged_fund': row.get('leveraged_fund', False),
        'inverse_fund': row.get('inverse_fund', False),
        'currency_hedged_fund': row.get('currency_hedged_fund', False)
    }
    return result

# Updated function to update existing records in Funds_to_Screen
def update_funds_to_screen(df):
    engine = create_db_connection()
    try:
        with engine.connect() as connection:
            for index, row in df.iterrows():
                update_query = text("""
                    UPDATE Funds_to_Screen
                    SET Leverage_Use = :leverage_use,
                        Leverage_Effect = :leverage_effect
                    WHERE SymbolCUSIP = :symbol_cusip
                """)
                connection.execute(
                    update_query,
                    {
                        'leverage_use': row['Leverage_Use'],
                        'leverage_effect': row['Leverage_Effect'],
                        'symbol_cusip': row['SymbolCUSIP']
                    }
                )
            connection.commit()
        logging.info(f"Successfully updated {len(df)} records in Funds_to_Screen table")
    except Exception as e:
        logging.error(f"Error updating database: {str(e)}")
        raise
    finally:
        engine.dispose()

# Main function
def main():
    # Load data
    df = load_fund_data()

    # Process each row
    results = [process_fund(row) for _, row in df.iterrows()]

    # Create DataFrame
    out_df = pd.DataFrame(results)

    # Define output path
    base_path = r"C:\Users\JulianHeron\Software Projects\Test files"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"Risk_Overlays_{timestamp}.xlsx"
    output_path = os.path.join(base_path, output_filename)
    os.makedirs(base_path, exist_ok=True)

    # Save to Excel
    out_df.to_excel(output_path, index=False)
    logging.info(f"Results saved to '{output_path}'")

    # Update database with classification results
    update_funds_to_screen(out_df)

    # Detailed logging
    if CONFIG.get("use_detailed_logging", True):
        logging.info("=== Detailed Classification Summary ===")
        logging.info(f"Total Funds Processed: {len(out_df)}")
        logging.info("\nLeverage Use Distribution:")
        for lev in ["None", "Low", "Medium", "High", "Unclassified"]:
            count = len(out_df[out_df['Leverage_Use'] == lev])
            logging.info(f"  {lev}: {count} ({count/len(out_df)*100:.2f}%)")
        logging.info("\nLeverage Effect Distribution:")
        for eff in ["Slight", "Strategic", "Systematic", "Amplified", "Missing Data", "Unclassified"]:
            count = len(out_df[out_df['Leverage_Effect'] == eff])
            logging.info(f"  {eff}: {count} ({count/len(out_df)*100:.2f}%)")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"Error running main: {str(e)}")

2025-03-12 14:30:51,110 - INFO - Database connection established successfully.
2025-03-12 14:30:51,606 - INFO - Loaded 5586 rows from Funds_to_Screen table.
2025-03-12 14:30:58,296 - INFO - Results saved to 'C:\Users\JulianHeron\Software Projects\Test files\Risk_Overlays_20250312_143054.xlsx'
2025-03-12 14:30:58,298 - INFO - Database connection established successfully.
2025-03-12 14:31:09,705 - INFO - Successfully updated 5586 records in Funds_to_Screen table
2025-03-12 14:31:09,718 - INFO - === Detailed Classification Summary ===
2025-03-12 14:31:09,727 - INFO - Total Funds Processed: 5586
2025-03-12 14:31:09,730 - INFO - 
Leverage Use Distribution:
2025-03-12 14:31:09,740 - INFO -   None: 4250 (76.08%)
2025-03-12 14:31:09,753 - INFO -   Low: 980 (17.54%)
2025-03-12 14:31:09,755 - INFO -   Medium: 163 (2.92%)
2025-03-12 14:31:09,757 - INFO -   High: 145 (2.60%)
2025-03-12 14:31:09,759 - INFO -   Unclassified: 48 (0.86%)
2025-03-12 14:31:09,765 - INFO - 
Leverage Effect Distribution:
