# Global FPGrowth Feature Importance Analysis

## Overview

This notebook performs **global FPGrowth analysis** across all cohorts to create universal drug encoding features for machine learning models. The results are used for:

1. **CatBoost Feature Engineering**: Creates consistent drug encodings across training/validation/test sets
2. **Population-Level Insights**: Discovers drug association patterns across all patients
3. **Feature Importance**: Identifies which drug patterns are most frequent in the population

## Key Outputs

- **Global Drug Encoding Map**: Universal drug encodings for ML (`gold/fpgrowth/global/drug_encoding_map.json`)
- **Frequent Itemsets**: Drug combinations that appear frequently (`gold/fpgrowth/global/global_itemsets.json`)
- **Association Rules**: Drug co-prescription patterns (`gold/fpgrowth/global/global_rules.json`)
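- **Network Visualization**: Interactive drug association network (`gold/fpgrowth/global/drug_network.html`). Note: the steps shown below import the visualization helper but do not call it, so this file is not produced by the cells in this notebook as written.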

## Parameters

- **Min Support**: 0.005 (an itemset must appear in at least 0.5% of patient transactions to be kept)
- **Min Confidence**: 0.01 (a rule must reach at least 1% confidence to be kept)
- **Data Source**: Local cohort data from `data/gold/cohorts_F1120/`

---


## Setup and Imports


In [None]:
import os
import sys
import json
import pandas as pd
import numpy as np
from datetime import datetime
import logging
from pathlib import Path
import time

# MLxtend for FP-Growth
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Project root
# The repository root is the parent directory when the notebook is run from
# inside '3_fpgrowth_analysis'; otherwise the current working directory is
# used as the root.
project_root = Path.cwd().parent if Path.cwd().name == '3_fpgrowth_analysis' else Path.cwd()
# Prepend the root to sys.path. This has to run before the helpers_1997_13
# imports below (presumably that package lives under the project root - verify).
sys.path.insert(0, str(project_root))

# Project utilities
# NOTE(review): s3_client, save_to_s3_parquet and create_network_visualization
# are not referenced in the cells visible in this notebook - confirm whether
# these imports are still needed.
from helpers_1997_13.common_imports import s3_client, S3_BUCKET
from helpers_1997_13.duckdb_utils import get_duckdb_connection
from helpers_1997_13.s3_utils import save_to_s3_json, save_to_s3_parquet
from helpers_1997_13.drug_utils import encode_drug_name
from helpers_1997_13.visualization_utils import create_network_visualization

# Report the resolved root and the time the run started.
print(f"‚úì Project root: {project_root}")
print(f"‚úì All imports successful")
print(f"‚úì Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


## Configuration


In [None]:
# Mining thresholds used by the FP-Growth and rule-generation cells
MIN_SUPPORT = 0.005  # 0.5% support threshold
MIN_CONFIDENCE = 0.01  # 1% confidence threshold
TOP_K = 50  # Top K itemsets to analyze

# Where the cohort parquet files are read from (relative to the project root)
LOCAL_DATA_PATH = project_root.joinpath("data", "gold", "cohorts_F1120")

# S3 prefix under which every output of this notebook is written
S3_OUTPUT_BASE = f"s3://{S3_BUCKET}/gold/fpgrowth/global"

# Notebook logger. The handler is attached only when the logger has none yet,
# so re-running this cell does not stack handlers and duplicate every message.
logger = logging.getLogger('global_fpgrowth')
logger.setLevel(logging.INFO)
if not logger.handlers:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# Echo the effective configuration
print(f"‚úì Min Support: {MIN_SUPPORT}")
print(f"‚úì Min Confidence: {MIN_CONFIDENCE}")
print(f"‚úì S3 Output: {S3_OUTPUT_BASE}")
print(f"‚úì Local Data: {LOCAL_DATA_PATH}")
print(f"‚úì Local Data Exists: {LOCAL_DATA_PATH.exists()}")


## Step 1: Extract All Drug Names from Cohorts

Query all cohort parquet files and extract the unique, non-empty drug names from pharmacy events to create the global drug universe.


In [None]:
def extract_global_drug_names(local_data_path, logger):
    """
    Extract all unique drug names from local cohort data.

    Runs a single DuckDB query over every ``cohort.parquet`` file matched by
    ``<local_data_path>/**/cohort.parquet`` and keeps the distinct, non-null,
    non-empty ``drug_name`` values of rows whose ``event_type`` is
    ``'PHARMACY'``.

    Args:
        local_data_path: ``pathlib.Path`` of the cohort root directory.
        logger: Logger used for progress messages; it is also passed to
            ``get_duckdb_connection``.

    Returns:
        list: The distinct drug names, in the order produced by the query's
        ``ORDER BY drug_name``.

    Raises:
        Any exception raised while connecting or running the query is
        propagated; the connection is closed first.
    """
    logger.info("Extracting global drug names from local cohort data...")
    start_time = time.time()

    # Build glob pattern for all parquet files. Single quotes are doubled
    # (standard SQL escaping) so a path containing an apostrophe cannot
    # terminate the string literal inside the query.
    parquet_pattern = str(local_data_path / "**" / "cohort.parquet").replace("'", "''")

    query = f"""
    SELECT DISTINCT drug_name
    FROM read_parquet('{parquet_pattern}', hive_partitioning=1)
    WHERE drug_name IS NOT NULL
      AND drug_name != ''
      AND event_type = 'PHARMACY'
    ORDER BY drug_name
    """

    # Get DuckDB connection
    con = get_duckdb_connection(logger=logger)
    try:
        logger.info("Running query...")
        df = con.execute(query).df()
    finally:
        # Release the connection even when the query raises.
        con.close()

    drug_names = df['drug_name'].tolist()

    elapsed = time.time() - start_time
    logger.info(f"‚úì Extracted {len(drug_names):,} unique drug names in {elapsed:.1f}s")

    return drug_names

# Build the global drug universe from the local cohort files
global_drug_names = extract_global_drug_names(LOCAL_DATA_PATH, logger)

print(f"\nüìä Global Drug Statistics:")
print(f"  Total unique drugs: {len(global_drug_names):,}")
print(f"\n  Sample drugs (first 10):")
# Pairing with 1..10 stops after ten names, or earlier if the list is shorter
for i, drug in zip(range(1, 11), global_drug_names):
    print(f"    {i}. {drug}")


## Step 2: Create Global Drug Transactions

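Create patient-level drug transactions for the FP-Growth algorithm: one transaction per patient, containing the sorted set of distinct drugs from that patient's pharmacy events.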


In [None]:
def create_global_drug_transactions(local_data_path, logger):
    """
    Create patient-level drug transactions from local cohort data.

    Queries every ``cohort.parquet`` file matched by
    ``<local_data_path>/**/cohort.parquet`` for rows whose ``event_type`` is
    ``'PHARMACY'`` and whose ``drug_name`` is non-null and non-empty, then
    groups the rows by ``mi_person_key``.

    Args:
        local_data_path: ``pathlib.Path`` of the cohort root directory.
        logger: Logger used for progress messages; it is also passed to
            ``get_duckdb_connection``.

    Returns:
        list: One entry per ``mi_person_key`` group; each entry is the sorted
        list of that patient's distinct drug names.

    Raises:
        Any exception raised while connecting or running the query is
        propagated; the connection is closed first.
    """
    logger.info("Creating global drug transactions...")
    start_time = time.time()

    # Build glob pattern for all parquet files. Single quotes are doubled
    # (standard SQL escaping) so a path containing an apostrophe cannot
    # terminate the string literal inside the query.
    parquet_pattern = str(local_data_path / "**" / "cohort.parquet").replace("'", "''")

    # DISTINCT drops repeated (patient, drug) rows inside DuckDB, so duplicate
    # rows are never materialised in pandas. The grouped result is unchanged
    # because each patient's drugs are reduced to a set below anyway.
    query = f"""
    SELECT DISTINCT
        mi_person_key,
        drug_name
    FROM read_parquet('{parquet_pattern}', hive_partitioning=1)
    WHERE drug_name IS NOT NULL
      AND drug_name != ''
      AND event_type = 'PHARMACY'
    """

    # Get DuckDB connection
    con = get_duckdb_connection(logger=logger)
    try:
        logger.info("Loading pharmacy events...")
        df = con.execute(query).df()
    finally:
        # Release the connection even when the query raises.
        con.close()

    # Group by patient and create drug lists
    logger.info("Grouping by patient...")
    transactions = (
        df.groupby('mi_person_key')['drug_name']
        .apply(lambda names: sorted(set(names.tolist())))
        .tolist()
    )

    elapsed = time.time() - start_time
    logger.info(f"‚úì Created {len(transactions):,} patient transactions in {elapsed:.1f}s")

    return transactions

# Execute transaction creation
transactions = create_global_drug_transactions(LOCAL_DATA_PATH, logger)

# The statistics below and every later step need at least one patient. Fail
# here with an explicit message instead of the opaque "max() arg is an empty
# sequence" ValueError the statistics would otherwise raise.
if not transactions:
    raise ValueError(
        f"No pharmacy transactions found under {LOCAL_DATA_PATH}; "
        "check that the cohort parquet files exist and contain PHARMACY events."
    )

# Number of distinct drugs per patient, computed once and reused by every
# statistic below.
drugs_per_patient = [len(t) for t in transactions]

print(f"\nüìä Transaction Statistics:")
print(f"  Total patients: {len(transactions):,}")
print(f"  Avg drugs per patient: {np.mean(drugs_per_patient):.1f}")
print(f"  Median drugs per patient: {np.median(drugs_per_patient):.0f}")
print(f"  Max drugs per patient: {max(drugs_per_patient)}")
print(f"\n  Sample transaction (first patient):")
print(f"    Patient has {len(transactions[0])} drugs: {transactions[0][:5]}...")


## Step 3: Encode Transactions for FP-Growth

Use TransactionEncoder to convert transactions into one-hot encoded format.


In [None]:
logger.info("Encoding transactions...")
start_time = time.time()

# Fit the encoder on the transactions first, then transform the same
# transactions into the array that becomes the patient-by-drug frame.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

elapsed = time.time() - start_time
logger.info(f"‚úì Encoded transactions in {elapsed:.1f}s")

# Report the size of the encoded matrix and preview its top-left corner
print(f"\nüìä Encoded Transaction Matrix:")
print(f"  Shape: {df_encoded.shape} (patients √ó drugs)")
print(f"  Memory: {df_encoded.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
print(f"\n  Sample (first 5 patients √ó first 5 drugs):")
print(df_encoded.iloc[:5, :5])


## Step 4: Run FP-Growth Algorithm

Apply FP-Growth to discover frequent drug itemsets.


In [None]:
logger.info(f"Running FP-Growth with min_support={MIN_SUPPORT}...")
start_time = time.time()

# Mine the frequent itemsets, rank them by descending support and renumber
# the rows 0..n-1 in a single chained expression.
frequent_itemsets = (
    fpgrowth(df_encoded, min_support=MIN_SUPPORT, use_colnames=True)
    .sort_values('support', ascending=False)
    .reset_index(drop=True)
)

elapsed = time.time() - start_time
logger.info(f"‚úì Found {len(frequent_itemsets):,} frequent itemsets in {elapsed:.1f}s")

# Break the result down by itemset size and show the most frequent ones
print(f"\nüìä Frequent Itemsets:")
print(f"  Total itemsets: {len(frequent_itemsets):,}")
print(f"  Size 1 (single drugs): {(frequent_itemsets['itemsets'].apply(len) == 1).sum():,}")
print(f"  Size 2 (pairs): {(frequent_itemsets['itemsets'].apply(len) == 2).sum():,}")
print(f"  Size 3+: {(frequent_itemsets['itemsets'].apply(len) >= 3).sum():,}")
print(f"\n  Top 10 frequent itemsets:")
print(frequent_itemsets.head(10))


## Step 5: Generate Association Rules

Create association rules from frequent itemsets.


In [None]:
logger.info(f"Generating association rules with min_confidence={MIN_CONFIDENCE}...")
start_time = time.time()

# Generate rules.
# Only the association_rules call sits inside the try: a ValueError from it is
# treated as "no rules could be generated" (presumably mlxtend raises it for an
# empty itemset frame - confirm against the installed version). Sorting and
# reporting live in the else branch so that an unrelated ValueError there is
# not misreported as "no rules" and does not replace `rules` with an empty frame.
try:
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)
except ValueError as e:
    logger.warning(f"Could not generate rules: {e}")
    # Later cells check `rules.empty`, so fall back to an empty frame.
    rules = pd.DataFrame()
    print(f"\n‚ö†Ô∏è No association rules generated (itemsets may be too sparse)")
else:
    # Strongest associations (highest lift) first, with a fresh 0..n-1 index.
    rules = rules.sort_values('lift', ascending=False).reset_index(drop=True)

    elapsed = time.time() - start_time
    logger.info(f"‚úì Generated {len(rules):,} association rules in {elapsed:.1f}s")

    print(f"\nüìä Association Rules:")
    print(f"  Total rules: {len(rules):,}")
    print(f"  Avg confidence: {rules['confidence'].mean():.3f}")
    print(f"  Avg lift: {rules['lift'].mean():.2f}")
    print(f"\n  Top 10 rules by lift:")
    print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))


## Step 6: Create Global Drug Encoding Map

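Generate universal drug encodings with FP-Growth metrics for ML features. Each drug is encoded together with the highest support of any frequent itemset that contains it and the highest confidence of any association rule that involves it (0.0 when the drug appears in none).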


In [None]:
def create_global_encoding_map(drug_names, itemsets_df, rules_df, logger):
    """
    Create global drug encoding map with FP-Growth metrics.

    For every drug the encoding is built by ``encode_drug_name`` from:

    * ``support``: the largest ``support`` among the itemsets that contain
      the drug, or ``0.0`` when no itemset contains it;
    * ``confidence``: the largest ``confidence`` among the rules that have
      the drug in their ``antecedents`` or ``consequents``, or ``0.0`` when
      no rule mentions it.

    The per-drug maxima are collected in one pass over the itemsets and one
    pass over the rules, rather than re-scanning both tables for every drug.

    Args:
        drug_names: Iterable of drug names to encode.
        itemsets_df: DataFrame with ``itemsets`` (collections of drug names)
            and ``support`` columns; may be empty.
        rules_df: DataFrame with ``antecedents``, ``consequents`` and
            ``confidence`` columns; may be empty.
        logger: Logger used for progress messages.

    Returns:
        dict: Maps each drug name to the value returned by
        ``encode_drug_name`` for it (its format is defined by that helper,
        which is not visible here).
    """
    logger.info("Creating global drug encoding map...")
    start_time = time.time()

    # Highest support seen for each drug across all itemsets containing it.
    best_support = {}
    if not itemsets_df.empty:
        for itemset, support in zip(itemsets_df['itemsets'], itemsets_df['support']):
            for drug in itemset:
                if support > best_support.get(drug, 0.0):
                    best_support[drug] = support

    # Highest confidence seen for each drug across all rules mentioning it on
    # either side.
    best_confidence = {}
    if not rules_df.empty:
        rule_rows = zip(rules_df['antecedents'], rules_df['consequents'], rules_df['confidence'])
        for antecedents, consequents, confidence in rule_rows:
            for drug in (*antecedents, *consequents):
                if confidence > best_confidence.get(drug, 0.0):
                    best_confidence[drug] = confidence

    # Drugs that never reached the support threshold fall back to 0.0 for
    # both metrics.
    encoding_map = {
        drug: encode_drug_name(
            drug,
            support=best_support.get(drug, 0.0),
            confidence=best_confidence.get(drug, 0.0),
        )
        for drug in drug_names
    }

    elapsed = time.time() - start_time
    logger.info(f"‚úì Created encoding map for {len(encoding_map):,} drugs in {elapsed:.1f}s")

    return encoding_map

# Encode every drug in the global universe using the mined itemsets and rules
encoding_map = create_global_encoding_map(global_drug_names, frequent_itemsets, rules, logger)

print(f"\nüìä Global Drug Encoding Map:")
print(f"  Total drugs encoded: {len(encoding_map):,}")
print(f"\n  Sample encodings (first 10 drugs):")
# Pairing with 1..10 stops after ten entries, or earlier if the map is smaller
for i, (drug, encoding) in zip(range(1, 11), encoding_map.items()):
    print(f"    {i}. {drug[:30]:30s} ‚Üí {encoding}")


## Step 7: Save Results to S3

Upload all results to S3 for downstream analysis.


In [None]:
logger.info("Saving results to S3...")
start_time = time.time()

# The itemset/antecedent/consequent cells are converted to plain lists for
# JSON serialization. Work on copies so the original frames stay untouched
# for the cells that follow.
itemsets_json = frequent_itemsets.copy()
itemsets_json['itemsets'] = itemsets_json['itemsets'].apply(lambda members: list(members))

if rules.empty:
    rules_json = pd.DataFrame()
else:
    rules_json = rules.copy()
    for side in ('antecedents', 'consequents'):
        rules_json[side] = rules_json[side].apply(lambda members: list(members))

# Artifact label -> S3 path, filled in as each upload succeeds
results_saved = {}

try:
    # 1. Drug encoding map
    encoding_path = f"{S3_OUTPUT_BASE}/drug_encoding_map.json"
    save_to_s3_json(encoding_map, encoding_path)
    results_saved['encoding_map'] = encoding_path
    logger.info(f"‚úì Saved encoding map to {encoding_path}")

    # 2. Frequent itemsets, one record per itemset
    itemsets_path = f"{S3_OUTPUT_BASE}/global_itemsets.json"
    itemset_records = itemsets_json.to_dict(orient='records')
    save_to_s3_json(itemset_records, itemsets_path)
    results_saved['itemsets'] = itemsets_path
    logger.info(f"‚úì Saved itemsets to {itemsets_path}")

    # 3. Association rules (skipped entirely when there are none)
    if not rules_json.empty:
        rules_path = f"{S3_OUTPUT_BASE}/global_rules.json"
        rule_records = rules_json.to_dict(orient='records')
        save_to_s3_json(rule_records, rules_path)
        results_saved['rules'] = rules_path
        logger.info(f"‚úì Saved rules to {rules_path}")

    # 4. Summary metrics; 'output_paths' lists the artifacts written so far
    summary = dict(
        timestamp=datetime.now().isoformat(),
        total_drugs=len(global_drug_names),
        total_patients=len(transactions),
        total_itemsets=len(frequent_itemsets),
        total_rules=len(rules),
        min_support=MIN_SUPPORT,
        min_confidence=MIN_CONFIDENCE,
        avg_drugs_per_patient=float(np.mean([len(t) for t in transactions])),
        output_paths=results_saved,
    )

    summary_path = f"{S3_OUTPUT_BASE}/global_metrics.json"
    save_to_s3_json(summary, summary_path)
    results_saved['summary'] = summary_path
    logger.info(f"‚úì Saved summary to {summary_path}")

    elapsed = time.time() - start_time
    logger.info(f"‚úì All results saved to S3 in {elapsed:.1f}s")

    print(f"\n‚úÖ Results saved to S3:")
    for result_type, path in results_saved.items():
        print(f"  {result_type}: {path}")

except Exception as e:
    # Report the failure in both the log and the cell output, then re-raise
    # so the notebook run stops here.
    logger.error(f"Error saving to S3: {e}")
    print(f"\n‚ùå Error saving to S3: {e}")
    raise


## Summary and Next Steps


In [None]:
# Horizontal rule reused at the top, under the title and at the bottom
banner = "=" * 80

print(banner)
print("GLOBAL FPGROWTH ANALYSIS - SUMMARY")
print(banner)

# Input data overview
print(f"\nüìä Dataset Statistics:")
print(f"  Total unique drugs: {len(global_drug_names):,}")
print(f"  Total patients: {len(transactions):,}")
print(f"  Avg drugs/patient: {np.mean([len(t) for t in transactions]):.1f}")

# Mining thresholds and result counts
print(f"\nüîç FP-Growth Results:")
print(f"  Min support: {MIN_SUPPORT} ({MIN_SUPPORT*100:.2f}%)")
print(f"  Min confidence: {MIN_CONFIDENCE} ({MIN_CONFIDENCE*100:.1f}%)")
print(f"  Frequent itemsets: {len(frequent_itemsets):,}")
print(f"  Association rules: {len(rules):,}")

# Every artifact recorded by the save step
print(f"\nüíæ Output Files (S3):")
for result_type, path in results_saved.items():
    print(f"  {result_type}: {path}")

# Pointers for downstream work
print(f"\nüéØ Next Steps:")
print(f"  1. Load encoding map in CatBoost: load_from_s3_json('{results_saved['encoding_map']}')")
print(f"  2. Use encodings for feature engineering in ML models")
print(f"  3. Analyze association rules for drug interaction patterns")
print(f"  4. Run cohort-specific FPGrowth analysis for detailed insights")

print(f"\n‚úì Analysis complete: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(banner)
