# Global FPGrowth Feature Importance Analysis

## Overview

This notebook performs **global FPGrowth analysis** across all cohorts to create universal encoding features for machine learning models. The analysis covers three item types:

1. **Drug Names**: Pharmacy events (drug co-prescriptions)
2. **ICD Codes**: Diagnosis codes (condition associations)
3. **CPT Codes**: Procedure codes (treatment patterns)

## Use Cases

- **CatBoost Feature Engineering**: Creates consistent encodings across training/validation/test sets
- **Population-Level Insights**: Discovers association patterns across all patients
- **Feature Importance**: Identifies most frequent patterns in the population

## Key Outputs (per item type)

Each item type gets its own folder with:
- **Global Encoding Map**: Universal encodings for ML
- **Frequent Itemsets**: Combinations that appear frequently
- **Association Rules**: Co-occurrence patterns
- **Summary Metrics**: Processing statistics

## S3 Output Structure

```
gold/fpgrowth/global/
├── drug_name/
│   ├── encoding_map.json
│   ├── itemsets.json
│   ├── rules.json
│   └── metrics.json
├── icd_code/
│   └── (same files)
└── cpt_code/
    └── (same files)
```

## Parameters

- **Min Support**: 0.005 (an itemset must appear in at least 0.5% of patient transactions to be kept)
- **Min Confidence**: 0.01 (a rule must have at least 1% confidence to be kept)
- **Data Source**: Local cohort data from `data/gold/cohorts_F1120/`

---


## Setup and Imports


In [None]:
import os
import sys
import json
import pandas as pd
import numpy as np
from datetime import datetime
import logging
from pathlib import Path
import time

# MLxtend for FP-Growth
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Project root
# When the working directory is named '3_fpgrowth_analysis', the project root
# is its parent; otherwise the working directory itself is used as the root.
project_root = Path.cwd().parent if Path.cwd().name == '3_fpgrowth_analysis' else Path.cwd()
# Put the project root first on sys.path. This must happen before the
# helpers_1997_13 imports below (presumably that package lives directly under
# the project root -- confirm against the repository layout).
sys.path.insert(0, str(project_root))

# Project utilities
from helpers_1997_13.common_imports import s3_client, S3_BUCKET
from helpers_1997_13.duckdb_utils import get_duckdb_connection
from helpers_1997_13.s3_utils import save_to_s3_json, save_to_s3_parquet
from helpers_1997_13.drug_utils import encode_drug_name
from helpers_1997_13.visualization_utils import create_network_visualization

# Echo the resolved environment so a notebook run records where it executed.
print(f"‚úì Project root: {project_root}")
print(f"‚úì All imports successful")
print(f"‚úì Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


## Configuration


In [None]:
# ---------------------------------------------------------------------------
# Analysis thresholds
# ---------------------------------------------------------------------------
MIN_SUPPORT = 0.005     # itemsets below 0.5% support are dropped
MIN_CONFIDENCE = 0.01   # rules below 1% confidence are dropped
TOP_K = 50              # number of top itemsets to analyze

# Item types handled by the pipeline, in processing order
ITEM_TYPES = ['drug_name', 'icd_code', 'cpt_code']

# ---------------------------------------------------------------------------
# Input / output locations
# ---------------------------------------------------------------------------
# All per-item-type result folders are written underneath this S3 prefix.
S3_OUTPUT_BASE = "s3://{}/gold/fpgrowth/global".format(S3_BUCKET)

# Root folder of the local cohort parquet files.
LOCAL_DATA_PATH = project_root.joinpath("data", "gold", "cohorts_F1120")

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logger = logging.getLogger('global_fpgrowth')
logger.setLevel(logging.INFO)
# Attach a handler only once so re-running this cell does not duplicate output.
if not logger.handlers:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# Echo the effective configuration.
for _label, _value in (
    ("Min Support", MIN_SUPPORT),
    ("Min Confidence", MIN_CONFIDENCE),
    ("Item Types", ITEM_TYPES),
    ("S3 Output", S3_OUTPUT_BASE),
    ("Local Data", LOCAL_DATA_PATH),
    ("Local Data Exists", LOCAL_DATA_PATH.exists()),
):
    print(f"‚úì {_label}: {_value}")


## Step 1: Define Item Extraction Functions

Create functions to extract different item types from cohort data.


In [None]:
def extract_global_items(local_data_path, item_type, logger):
    """
    Extract all unique items of the specified type from local cohort data.

    Every ``cohort.parquet`` file matched by the recursive glob under
    ``local_data_path`` is read through DuckDB, and the distinct, non-null,
    non-empty values of the column(s) backing ``item_type`` are returned.

    Args:
        local_data_path: ``pathlib.Path`` of the cohort data root folder.
        item_type: 'drug_name', 'icd_code', or 'cpt_code'
        logger: Logger used for progress messages; also handed to
            ``get_duckdb_connection``.

    Returns:
        list: The distinct item values in the order produced by the query's
        ``ORDER BY item``.

    Raises:
        ValueError: If ``item_type`` is not one of the supported values. This
            is raised before any database connection is opened.
    """
    logger.info(f"Extracting global {item_type}s from local cohort data...")
    start_time = time.time()

    # Build glob pattern for all parquet files. Single quotes are doubled so a
    # path containing one cannot terminate the SQL string literal early.
    parquet_pattern = str(local_data_path / "**" / "cohort.parquet").replace("'", "''")
    source = f"read_parquet('{parquet_pattern}', hive_partitioning=1)"

    # Build the query first: an unsupported item_type must fail before a
    # connection is opened, otherwise that connection would never be closed.
    if item_type == 'drug_name':
        query = f"""
        SELECT DISTINCT drug_name as item
        FROM {source}
        WHERE drug_name IS NOT NULL
          AND drug_name != ''
          AND event_type = 'PHARMACY'
        ORDER BY item
        """
    elif item_type == 'icd_code':
        # Collect from all ICD diagnosis columns
        icd_columns = (
            'primary_icd_diagnosis_code',
            'two_icd_diagnosis_code',
            'three_icd_diagnosis_code',
            'four_icd_diagnosis_code',
            'five_icd_diagnosis_code',
        )
        union_sql = "\n            UNION ALL\n            ".join(
            f"SELECT {col} as icd FROM {source} "
            f"WHERE {col} IS NOT NULL AND event_type = 'MEDICAL'"
            for col in icd_columns
        )
        query = f"""
        WITH all_icds AS (
            {union_sql}
        )
        SELECT DISTINCT icd as item FROM all_icds WHERE icd != '' ORDER BY item
        """
    elif item_type == 'cpt_code':
        query = f"""
        SELECT DISTINCT procedure_code as item
        FROM {source}
        WHERE procedure_code IS NOT NULL
          AND procedure_code != ''
          AND event_type = 'MEDICAL'
        ORDER BY item
        """
    else:
        raise ValueError(f"Unknown item_type: {item_type}")

    # Get DuckDB connection; close it even when the query fails.
    con = get_duckdb_connection(logger=logger)
    try:
        logger.info(f"Running query for {item_type}...")
        df = con.execute(query).df()
    finally:
        con.close()

    items = df['item'].tolist()

    elapsed = time.time() - start_time
    logger.info(f"‚úì Extracted {len(items):,} unique {item_type}s in {elapsed:.1f}s")

    return items

# Smoke-test the extractor on drug names and show a few of the values found.
print("Testing item extraction...")
test_items = extract_global_items(
    local_data_path=LOCAL_DATA_PATH,
    item_type='drug_name',
    logger=logger,
)
print(f"‚úì Found {len(test_items):,} drugs")
print(f"  Sample: {test_items[:5]}")


## Step 2: Define Transaction Creation Functions

Create patient-level transactions for each item type.


In [None]:
def create_global_transactions(local_data_path, item_type, logger):
    """
    Create patient-level transactions from local cohort data.

    Every ``cohort.parquet`` file matched by the recursive glob under
    ``local_data_path`` is read through DuckDB. Rows are grouped by
    ``mi_person_key`` and each patient becomes one transaction: the sorted
    list of that patient's distinct items.

    For 'icd_code' all five diagnosis columns are read, the same set that
    ``extract_global_items`` uses, so every extracted ICD code can actually
    occur in a transaction.

    Args:
        local_data_path: ``pathlib.Path`` of the cohort data root folder.
        item_type: 'drug_name', 'icd_code', or 'cpt_code'
        logger: Logger used for progress messages; also handed to
            ``get_duckdb_connection``.

    Returns:
        list[list]: One sorted, de-duplicated item list per patient.

    Raises:
        ValueError: If ``item_type`` is not one of the supported values. This
            is raised before any database connection is opened.
    """
    logger.info(f"Creating global {item_type} transactions...")
    start_time = time.time()

    # Build glob pattern for all parquet files. Single quotes are doubled so a
    # path containing one cannot terminate the SQL string literal early.
    parquet_pattern = str(local_data_path / "**" / "cohort.parquet").replace("'", "''")
    source = f"read_parquet('{parquet_pattern}', hive_partitioning=1)"

    # Build the query first so an unsupported item_type fails before a
    # connection is opened. DISTINCT drops repeated (patient, item) rows in
    # DuckDB instead of shipping them to pandas; the transactions are
    # de-duplicated below either way, so the result is unchanged.
    if item_type == 'drug_name':
        query = f"""
        SELECT DISTINCT mi_person_key, drug_name as item
        FROM {source}
        WHERE drug_name IS NOT NULL AND drug_name != '' AND event_type = 'PHARMACY'
        """
    elif item_type == 'icd_code':
        icd_columns = (
            'primary_icd_diagnosis_code',
            'two_icd_diagnosis_code',
            'three_icd_diagnosis_code',
            'four_icd_diagnosis_code',
            'five_icd_diagnosis_code',
        )
        union_sql = "\n            UNION ALL\n            ".join(
            f"SELECT mi_person_key, {col} as icd FROM {source} "
            f"WHERE {col} IS NOT NULL AND event_type = 'MEDICAL'"
            for col in icd_columns
        )
        query = f"""
        WITH all_icds AS (
            {union_sql}
        )
        SELECT DISTINCT mi_person_key, icd as item FROM all_icds WHERE icd != ''
        """
    elif item_type == 'cpt_code':
        query = f"""
        SELECT DISTINCT mi_person_key, procedure_code as item
        FROM {source}
        WHERE procedure_code IS NOT NULL AND procedure_code != '' AND event_type = 'MEDICAL'
        """
    else:
        raise ValueError(f"Unknown item_type: {item_type}")

    # Get DuckDB connection; close it even when the query fails.
    con = get_duckdb_connection(logger=logger)
    try:
        logger.info(f"Loading {item_type} events...")
        df = con.execute(query).df()
    finally:
        con.close()

    # Group by patient and create item lists
    logger.info(f"Grouping by patient...")
    transactions = (
        df.groupby('mi_person_key')['item']
        .apply(lambda x: sorted(set(x.tolist())))
        .tolist()
    )

    elapsed = time.time() - start_time
    logger.info(f"‚úì Created {len(transactions):,} patient transactions in {elapsed:.1f}s")

    return transactions

print("‚úì Transaction creation function defined")


## Step 3: Process All Item Types

Run FP-Growth analysis for each item type (drug_name, icd_code, cpt_code).


In [None]:
def _build_item_encoding_map(items, itemsets, rules):
    """Map each item to ``"<item>_<support>_<confidence>"`` using one pass over itemsets and rules."""
    # Highest support of any frequent itemset that contains the item.
    best_support = {}
    for members, support in zip(itemsets['itemsets'], itemsets['support']):
        for member in members:
            if member not in best_support or support > best_support[member]:
                best_support[member] = support

    # Highest confidence of any rule that mentions the item on either side.
    best_confidence = {}
    if not rules.empty:
        for antecedents, consequents, confidence in zip(
            rules['antecedents'], rules['consequents'], rules['confidence']
        ):
            for member in antecedents | consequents:
                if member not in best_confidence or confidence > best_confidence[member]:
                    best_confidence[member] = confidence

    encoding_map = {}
    for item in items:
        # Items that never reach the thresholds keep 0.0 for both metrics.
        support = best_support.get(item, 0.0)
        confidence = best_confidence.get(item, 0.0)
        # Simple encoding: item_supportXXX_confidenceYYY
        encoding_map[item] = f"{item}_{int(support*1000):04d}_{int(confidence*1000):04d}"
    return encoding_map


def process_item_type(item_type, local_data_path, s3_output_base, min_support, min_confidence, logger):
    """
    Process a single item type end-to-end: extract, create transactions, run FP-Growth, save results.

    Steps: extract the distinct items, build one transaction per patient,
    one-hot encode the transactions, mine frequent itemsets with FP-Growth,
    derive association rules, build the item encoding map, and write the
    encoding map, itemsets, rules (only when there are any) and metrics as
    JSON under ``{s3_output_base}/{item_type}``.

    Args:
        item_type: 'drug_name', 'icd_code', or 'cpt_code'
        local_data_path: ``pathlib.Path`` of the cohort data root folder.
        s3_output_base: Base output location; the item type is appended as a
            sub-folder.
        min_support: Minimum support passed to ``fpgrowth``.
        min_confidence: Minimum confidence passed to ``association_rules``.
        logger: Logger used for progress messages.

    Returns:
        dict: ``item_type``, ``total_items``, ``total_patients``,
        ``total_itemsets``, ``total_rules``, ``elapsed_seconds`` and
        ``s3_folder``.
    """
    logger.info(f"\n{'='*80}")
    logger.info(f"Processing {item_type.upper()}")
    logger.info(f"{'='*80}")

    overall_start = time.time()

    # Step 1: Extract items
    items = extract_global_items(local_data_path, item_type, logger)

    # Step 2: Create transactions
    transactions = create_global_transactions(local_data_path, item_type, logger)

    # Step 3: Encode transactions
    logger.info(f"Encoding {len(transactions):,} transactions...")
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
    logger.info(f"‚úì Encoded to {df_encoded.shape} matrix")

    # Step 4: Run FP-Growth
    logger.info(f"Running FP-Growth...")
    itemsets = fpgrowth(df_encoded, min_support=min_support, use_colnames=True)
    itemsets = itemsets.sort_values('support', ascending=False).reset_index(drop=True)
    logger.info(f"‚úì Found {len(itemsets):,} frequent itemsets")

    # Step 5: Generate association rules
    logger.info(f"Generating association rules...")
    try:
        rules = association_rules(itemsets, metric="confidence", min_threshold=min_confidence)
        rules = rules.sort_values('lift', ascending=False).reset_index(drop=True)
        logger.info(f"‚úì Generated {len(rules):,} association rules")
    except ValueError as e:
        # Keep going without rules; an empty frame makes the later steps skip them.
        logger.warning(f"Could not generate rules: {e}")
        rules = pd.DataFrame()

    # Step 6: Create encoding map (lookup tables instead of rescanning every
    # itemset and rule once per item)
    logger.info(f"Creating encoding map...")
    encoding_map = _build_item_encoding_map(items, itemsets, rules)
    logger.info(f"‚úì Created encoding map with {len(encoding_map):,} items")

    # Step 7: Save to S3
    s3_folder = f"{s3_output_base}/{item_type}"
    logger.info(f"Saving to {s3_folder}...")

    # Convert frozensets to lists (frozensets are not JSON serializable)
    itemsets_json = itemsets.copy()
    itemsets_json['itemsets'] = itemsets_json['itemsets'].apply(list)

    # Save encoding map
    encoding_path = f"{s3_folder}/encoding_map.json"
    save_to_s3_json(encoding_map, encoding_path)

    # Save itemsets
    itemsets_path = f"{s3_folder}/itemsets.json"
    save_to_s3_json(itemsets_json.to_dict(orient='records'), itemsets_path)

    # Save rules
    if not rules.empty:
        rules_json = rules.copy()
        rules_json['antecedents'] = rules_json['antecedents'].apply(list)
        rules_json['consequents'] = rules_json['consequents'].apply(list)
        rules_path = f"{s3_folder}/rules.json"
        save_to_s3_json(rules_json.to_dict(orient='records'), rules_path)

    # Save metrics
    summary = {
        'timestamp': datetime.now().isoformat(),
        'item_type': item_type,
        'total_items': len(items),
        'total_patients': len(transactions),
        'total_itemsets': len(itemsets),
        'total_rules': len(rules),
        'min_support': min_support,
        'min_confidence': min_confidence,
        # np.mean([]) is NaN, which is not valid JSON; report 0.0 when there
        # are no transactions.
        'avg_items_per_patient': (
            float(np.mean([len(t) for t in transactions])) if transactions else 0.0
        )
    }
    metrics_path = f"{s3_folder}/metrics.json"
    save_to_s3_json(summary, metrics_path)

    elapsed = time.time() - overall_start
    logger.info(f"‚úì {item_type} complete in {elapsed:.1f}s ({elapsed/60:.1f}min)")

    return {
        'item_type': item_type,
        'total_items': len(items),
        'total_patients': len(transactions),
        'total_itemsets': len(itemsets),
        'total_rules': len(rules),
        'elapsed_seconds': elapsed,
        's3_folder': s3_folder
    }

# Run the full pipeline once per configured item type. A failure in one item
# type is logged and recorded in `results`; the remaining types still run.
print("\nüöÄ Starting FP-Growth analysis for all item types...\n")
results = []

for item_type in ITEM_TYPES:
    try:
        result = process_item_type(
            item_type=item_type,
            local_data_path=LOCAL_DATA_PATH,
            s3_output_base=S3_OUTPUT_BASE,
            min_support=MIN_SUPPORT,
            min_confidence=MIN_CONFIDENCE,
            logger=logger,
        )
        results.append(result)
        print(f"\n‚úÖ {item_type}: {result['total_itemsets']:,} itemsets, {result['total_rules']:,} rules")
    except Exception as e:
        logger.error(f"‚ùå Failed to process {item_type}: {e}")
        results.append({
            'item_type': item_type,
            'error': str(e),
        })

print(f"\n{'=' * 80}")
print("ALL ITEM TYPES PROCESSED")
print('=' * 80)


## Step 4: Run FP-Growth Algorithm

Apply FP-Growth to discover frequent drug itemsets.


In [None]:
# The items, transactions and encoded matrix built by process_item_type() are
# locals of that function, so this step-by-step drug walkthrough has to build
# its own copies at module scope. The later walkthrough cells read
# `global_drug_names`, `transactions` and the results computed from
# `df_encoded`.
global_drug_names = extract_global_items(LOCAL_DATA_PATH, 'drug_name', logger)
transactions = create_global_transactions(LOCAL_DATA_PATH, 'drug_name', logger)
te = TransactionEncoder()
df_encoded = pd.DataFrame(te.fit(transactions).transform(transactions), columns=te.columns_)

logger.info(f"Running FP-Growth with min_support={MIN_SUPPORT}...")
start_time = time.time()

# Run FP-Growth
frequent_itemsets = fpgrowth(df_encoded, min_support=MIN_SUPPORT, use_colnames=True)

# Sort by support
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False).reset_index(drop=True)

elapsed = time.time() - start_time
logger.info(f"‚úì Found {len(frequent_itemsets):,} frequent itemsets in {elapsed:.1f}s")

# Itemset sizes are computed once and reused for the three size buckets.
itemset_sizes = frequent_itemsets['itemsets'].apply(len)

print(f"\nüìä Frequent Itemsets:")
print(f"  Total itemsets: {len(frequent_itemsets):,}")
print(f"  Size 1 (single drugs): {(itemset_sizes == 1).sum():,}")
print(f"  Size 2 (pairs): {(itemset_sizes == 2).sum():,}")
print(f"  Size 3+: {(itemset_sizes >= 3).sum():,}")
print(f"\n  Top 10 frequent itemsets:")
print(frequent_itemsets.head(10))


## Step 5: Generate Association Rules

Create association rules from frequent itemsets.


In [None]:
logger.info(f"Generating association rules with min_confidence={MIN_CONFIDENCE}...")
start_time = time.time()

# Derive rules from the frequent itemsets, strongest lift first. When
# association_rules raises ValueError, fall back to an empty frame so the
# cells that follow can test `rules.empty`.
try:
    rules = (
        association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)
        .sort_values('lift', ascending=False)
        .reset_index(drop=True)
    )

    elapsed = time.time() - start_time
    logger.info(f"‚úì Generated {len(rules):,} association rules in {elapsed:.1f}s")

    print("\nüìä Association Rules:")
    print(f"  Total rules: {len(rules):,}")
    print(f"  Avg confidence: {rules['confidence'].mean():.3f}")
    print(f"  Avg lift: {rules['lift'].mean():.2f}")
    print("\n  Top 10 rules by lift:")
    shown_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
    print(rules[shown_columns].head(10))

except ValueError as e:
    logger.warning(f"Could not generate rules: {e}")
    rules = pd.DataFrame()
    print("\n‚ö†Ô∏è No association rules generated (itemsets may be too sparse)")


## Step 6: Create Global Drug Encoding Map

Generate universal drug encodings with FP-Growth metrics for ML features.


In [None]:
def create_global_encoding_map(drug_names, itemsets_df, rules_df, logger):
    """
    Create global drug encoding map with FP-Growth metrics.

    For each drug the encoding is produced by ``encode_drug_name`` from two
    metrics: the highest support of any frequent itemset containing the drug,
    and the highest confidence of any rule mentioning the drug as antecedent
    or consequent. Drugs that appear in no itemset / rule get 0.0.

    Args:
        drug_names: Iterable of drug names to encode.
        itemsets_df: DataFrame with an ``itemsets`` column (collections of
            drug names) and a ``support`` column.
        rules_df: DataFrame with ``antecedents``, ``consequents`` and
            ``confidence`` columns, or an empty DataFrame when no rules exist.
        logger: Logger used for progress messages.

    Returns:
        dict: ``{drug_name: encoding}`` for every entry of ``drug_names``.
    """
    logger.info("Creating global drug encoding map...")
    start_time = time.time()

    # Build both lookups in a single pass each, instead of rescanning every
    # itemset and every rule once per drug.
    best_support = {}
    for members, support in zip(itemsets_df['itemsets'], itemsets_df['support']):
        for member in members:
            if member not in best_support or support > best_support[member]:
                best_support[member] = support

    best_confidence = {}
    if not rules_df.empty:
        for antecedents, consequents, confidence in zip(
            rules_df['antecedents'], rules_df['consequents'], rules_df['confidence']
        ):
            for member in set(antecedents) | set(consequents):
                if member not in best_confidence or confidence > best_confidence[member]:
                    best_confidence[member] = confidence

    encoding_map = {}
    for drug in drug_names:
        encoding_map[drug] = encode_drug_name(
            drug,
            support=best_support.get(drug, 0.0),
            confidence=best_confidence.get(drug, 0.0),
        )

    elapsed = time.time() - start_time
    logger.info(f"‚úì Created encoding map for {len(encoding_map):,} drugs in {elapsed:.1f}s")

    return encoding_map

# Build the drug encoding map and preview the first ten entries.
encoding_map = create_global_encoding_map(
    drug_names=global_drug_names,
    itemsets_df=frequent_itemsets,
    rules_df=rules,
    logger=logger,
)

print("\nüìä Global Drug Encoding Map:")
print(f"  Total drugs encoded: {len(encoding_map):,}")
print("\n  Sample encodings (first 10 drugs):")
_preview = list(encoding_map.items())[:10]
for i, (drug, encoding) in enumerate(_preview, start=1):
    # Drug names are cut and padded to 30 characters so the arrows line up.
    print(f"    {i}. {drug[:30]:30s} ‚Üí {encoding}")


## Step 7: Save Results to S3

Upload all results to S3 for downstream analysis.


In [None]:
logger.info("Saving results to S3...")
start_time = time.time()

# frozensets are not JSON serializable, so work on copies whose set-valued
# columns hold plain lists.
itemsets_json = frequent_itemsets.copy()
itemsets_json['itemsets'] = itemsets_json['itemsets'].apply(list)

if rules.empty:
    rules_json = pd.DataFrame()
else:
    rules_json = rules.copy()
    rules_json['antecedents'] = rules_json['antecedents'].apply(list)
    rules_json['consequents'] = rules_json['consequents'].apply(list)

# Maps each kind of result to the location it was written to.
results_saved = {}

try:
    # 1. Drug encoding map
    encoding_path = f"{S3_OUTPUT_BASE}/drug_encoding_map.json"
    save_to_s3_json(encoding_map, encoding_path)
    results_saved['encoding_map'] = encoding_path
    logger.info(f"‚úì Saved encoding map to {encoding_path}")

    # 2. Frequent itemsets
    itemsets_path = f"{S3_OUTPUT_BASE}/global_itemsets.json"
    itemset_records = itemsets_json.to_dict(orient='records')
    save_to_s3_json(itemset_records, itemsets_path)
    results_saved['itemsets'] = itemsets_path
    logger.info(f"‚úì Saved itemsets to {itemsets_path}")

    # 3. Association rules (skipped when there are none)
    if not rules_json.empty:
        rules_path = f"{S3_OUTPUT_BASE}/global_rules.json"
        rule_records = rules_json.to_dict(orient='records')
        save_to_s3_json(rule_records, rules_path)
        results_saved['rules'] = rules_path
        logger.info(f"‚úì Saved rules to {rules_path}")

    # 4. Summary metrics
    drugs_per_patient = [len(t) for t in transactions]
    summary = {
        'timestamp': datetime.now().isoformat(),
        'total_drugs': len(global_drug_names),
        'total_patients': len(transactions),
        'total_itemsets': len(frequent_itemsets),
        'total_rules': len(rules),
        'min_support': MIN_SUPPORT,
        'min_confidence': MIN_CONFIDENCE,
        'avg_drugs_per_patient': float(np.mean(drugs_per_patient)),
        'output_paths': results_saved
    }

    summary_path = f"{S3_OUTPUT_BASE}/global_metrics.json"
    save_to_s3_json(summary, summary_path)
    # Recorded after the upload, so the stored summary lists only the other paths.
    results_saved['summary'] = summary_path
    logger.info(f"‚úì Saved summary to {summary_path}")

    elapsed = time.time() - start_time
    logger.info(f"‚úì All results saved to S3 in {elapsed:.1f}s")

    print("\n‚úÖ Results saved to S3:")
    for result_type, path in results_saved.items():
        print(f"  {result_type}: {path}")

except Exception as e:
    # Report the failure, then re-raise so the notebook run stops here.
    logger.error(f"Error saving to S3: {e}")
    print(f"\n‚ùå Error saving to S3: {e}")
    raise


## Summary and Next Steps


In [None]:
# Final report: dataset statistics, mining results, output locations and
# suggested follow-up work.
print("=" * 80)
print("GLOBAL FPGROWTH ANALYSIS - SUMMARY")
print("=" * 80)

print("\nüìä Dataset Statistics:")
print(f"  Total unique drugs: {len(global_drug_names):,}")
print(f"  Total patients: {len(transactions):,}")
print(f"  Avg drugs/patient: {np.mean([len(t) for t in transactions]):.1f}")

print("\nüîç FP-Growth Results:")
print(f"  Min support: {MIN_SUPPORT} ({MIN_SUPPORT*100:.2f}%)")
print(f"  Min confidence: {MIN_CONFIDENCE} ({MIN_CONFIDENCE*100:.1f}%)")
print(f"  Frequent itemsets: {len(frequent_itemsets):,}")
print(f"  Association rules: {len(rules):,}")

print("\nüíæ Output Files (S3):")
for result_type, path in results_saved.items():
    print(f"  {result_type}: {path}")

print("\nüéØ Next Steps:")
print(f"  1. Load encoding map in CatBoost: load_from_s3_json('{results_saved['encoding_map']}')")
print("  2. Use encodings for feature engineering in ML models")
print("  3. Analyze association rules for drug interaction patterns")
print("  4. Run cohort-specific FPGrowth analysis for detailed insights")

print(f"\n‚úì Analysis complete: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 80)
