# STRATEGY 1: STRATIFIED SAMPLING BORUTA
## Best for: 4M+ row datasets - Balanced accuracy & speed
### Runtime: ~60 minutes for 200 features

**Approach:**
- Run Boruta 5 times on independent 50k stratified samples
- Aggregate results with stability metrics
- Keep features confirmed in ≥60% of runs
- Apply hard gate: drop only if BOTH targets reject

## Setup and Imports

In [None]:
import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit

# Confirmation message emitted once the import statements in this cell have run.
print("✓ Imports loaded")

## Define Boruta Function

In [None]:
def run_boruta_stratified_sampling(
    data: pd.DataFrame,
    feature_cols: list,
    y_col: str,
    n_samples: int = 50000,
    n_iterations: int = 5,
    min_confirmation_rate: float = 0.6,
    random_state: int = 42,
    max_iter: int = 40,
    n_estimators: int = 300,
    perc: int = 100
):
    """
    Run Boruta on several stratified samples and aggregate the outcomes.

    Each iteration draws a sample stratified on ``y_col`` (seeded with
    ``random_state + iteration``), replaces +/-inf with NaN, median-imputes
    the features and fits ``BorutaPy`` around a random forest. Per-feature
    results are then combined across iterations.

    Args:
        data: Frame holding the feature columns and the target column.
        feature_cols: Candidate feature names. Names missing from ``data``
            are ignored and duplicates are collapsed (first occurrence wins).
        y_col: Name of the target column; it is cast to ``int``.
        n_samples: Rows per sample. When this is >= ``len(data)`` every
            iteration uses the full frame (only the seeds differ), because
            ``StratifiedShuffleSplit`` cannot return the entire dataset.
        n_iterations: Number of Boruta runs to aggregate (must be >= 1).
        min_confirmation_rate: Minimum share of runs in which a feature must
            be "confirmed" for its final status to be "confirmed".
        random_state: Base seed for sampling, the forest and Boruta.
        max_iter: Maximum Boruta iterations per run.
        n_estimators: Tree count, forwarded to both the forest and
            ``BorutaPy`` so that Boruta does not replace it with its own
            automatic choice.
        perc: Percentile of shadow-feature importance used by Boruta.

    Returns:
        DataFrame with one row per feature and the columns ``feature``,
        ``confirmation_rate``, ``avg_ranking`` and ``final_status``
        ("confirmed" or "rejected"), sorted by ``avg_ranking`` ascending.

    Raises:
        ValueError: If no requested feature exists in ``data``, or if
            ``n_iterations`` or ``n_samples`` is smaller than 1.
    """

    # Keep only features present in the frame; de-duplicate while preserving
    # order so that every feature produces exactly one result row.
    feature_cols = list(dict.fromkeys(c for c in feature_cols if c in data.columns))
    if not feature_cols:
        raise ValueError("None of the requested feature_cols are present in data.")
    if n_iterations < 1:
        raise ValueError(f"n_iterations must be >= 1, got {n_iterations}.")
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}.")

    print(f"\nRunning {n_iterations} iterations with {n_samples:,} samples each...")
    print(f"Total dataset size: {len(data):,} rows")

    # StratifiedShuffleSplit rejects an integer train_size that is not
    # strictly smaller than the number of rows, so small frames are used whole.
    use_full_data = n_samples >= len(data)

    status_runs = []
    ranking_runs = []

    for iteration in range(n_iterations):
        seed = random_state + iteration

        print(f"\n{'='*60}")
        print(f"Iteration {iteration + 1}/{n_iterations}")
        print(f"{'='*60}")

        # Stratified sampling
        if use_full_data:
            sample_data = data
        else:
            splitter = StratifiedShuffleSplit(
                n_splits=1,
                train_size=n_samples,
                random_state=seed
            )
            # Only the row count and the labels matter for the split, so a
            # placeholder array stands in for the feature matrix.
            sample_idx, _ = next(splitter.split(np.zeros(len(data)), data[y_col]))
            sample_data = data.iloc[sample_idx]

        class_dist = sample_data[y_col].value_counts(normalize=True)
        print(f"Sample size: {len(sample_data):,}")
        print(f"Class distribution: {class_dist.to_dict()}")

        # Prepare data (replace() returns a new frame, so ``data`` is untouched)
        X = sample_data[feature_cols].replace([np.inf, -np.inf], np.nan)
        y = sample_data[y_col].astype(int).values

        # A column with no observed value in this sample has no median and
        # would be dropped by the imputer, misaligning Boruta's masks with
        # feature_cols. Fill it with a constant so the column count is kept.
        all_missing = X.columns[X.isna().all()]
        if len(all_missing) > 0:
            X[all_missing] = 0.0

        X = SimpleImputer(strategy="median").fit_transform(X)

        # Run Boruta
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            n_jobs=-1,
            class_weight="balanced_subsample",
            random_state=seed,
            min_samples_leaf=5,
            max_features="sqrt"
        )

        b = BorutaPy(
            estimator=rf,
            # Forward the caller's value: with "auto" Boruta overwrites the
            # forest's tree count and the n_estimators argument has no effect.
            n_estimators=n_estimators,
            max_iter=max_iter,
            perc=perc,
            random_state=seed,
            verbose=1
        )

        b.fit(X, y)

        # Record results; "confirmed" is written last so it takes precedence.
        status = np.array(["rejected"] * len(feature_cols), dtype=object)
        status[b.support_weak_] = "tentative"
        status[b.support_] = "confirmed"

        confirmed_count = (status == "confirmed").sum()
        tentative_count = (status == "tentative").sum()
        rejected_count = (status == "rejected").sum()

        print(f"Results: Confirmed={confirmed_count}, Tentative={tentative_count}, Rejected={rejected_count}")

        status_runs.append(status)
        ranking_runs.append(np.asarray(b.ranking_))

    # Aggregate results
    print(f"\n{'='*60}")
    print("Aggregating results...")
    print(f"{'='*60}")

    # Every run covers the same features in the same order, so the per-run
    # vectors can be stacked column-wise: one row per feature, one column per run.
    status_matrix = np.column_stack(status_runs)
    ranking_matrix = np.column_stack(ranking_runs)

    confirmation_rate = (status_matrix == "confirmed").sum(axis=1) / status_matrix.shape[1]
    avg_ranking = ranking_matrix.mean(axis=1)

    result = pd.DataFrame({
        "feature": feature_cols,
        "confirmation_rate": confirmation_rate,
        "avg_ranking": avg_ranking,
    })
    result["final_status"] = [
        "confirmed" if rate >= min_confirmation_rate else "rejected"
        for rate in result["confirmation_rate"]
    ]

    # Stable sort keeps the input feature order among equal rankings.
    result = result.sort_values("avg_ranking", kind="stable")

    confirmed = (result["final_status"] == "confirmed").sum()
    rejected = (result["final_status"] == "rejected").sum()

    print(f"\nFinal: Confirmed={confirmed}, Rejected={rejected}")

    return result

# Confirmation message emitted once the definition cell has executed.
print("✓ Function defined")

## Prepare Feature List

In [None]:
# Targets and identifier columns must never be offered to Boruta as features.
exclude_cols = ["applied_flag", "approved", "AFFINITY_ID"]

# Candidate features: int/float columns of df minus the excluded names.
# NOTE(review): "int"/"float" may not match narrower numeric dtypes
# (e.g. int8, float16) — confirm whether "number" is the intended selector.
feature_cols_boruta = [
    column
    for column in df.select_dtypes(["int", "float"]).columns
    if column not in exclude_cols
]

print(f"Total features for Boruta: {len(feature_cols_boruta)}")
print(f"Dataset size - df: {len(df):,} rows")
print(f"Dataset size - approved_df: {len(approved_df):,} rows")

## Run Boruta for APPLIED_FLAG

In [None]:
# Section banner: rule, title, rule — one item per output line.
print("="*70, "RUNNING BORUTA FOR APPLIED_FLAG", "="*70, sep="\n")

# Run the aggregated Boruta selection against the applied_flag target on df,
# then give the generic result columns target-specific names so they can be
# merged with the result of the other target without clashing.
boruta_applied = run_boruta_stratified_sampling(
    data=df,
    feature_cols=feature_cols_boruta,
    y_col="applied_flag",
    random_state=42,
    n_samples=50000,
    n_iterations=5,
    min_confirmation_rate=0.6,
    max_iter=40,
    n_estimators=300,
).rename(
    columns={
        "confirmation_rate": "applied_confirmation_rate",
        "avg_ranking": "boruta_applied_ranking",
        "final_status": "boruta_applied_status",
    }
)

print("\n✓ Applied flag processing complete")
display(boruta_applied.head(10))

## Run Boruta for APPROVED

In [None]:
# Section banner: rule, title, rule — one item per output line.
print("="*70, "RUNNING BORUTA FOR APPROVED", "="*70, sep="\n")

# Run the aggregated Boruta selection against the approved target on
# approved_df, then give the generic result columns target-specific names so
# they can be merged with the result of the other target without clashing.
boruta_approved = run_boruta_stratified_sampling(
    data=approved_df,
    feature_cols=feature_cols_boruta,
    y_col="approved",
    random_state=42,
    n_samples=50000,
    n_iterations=5,
    min_confirmation_rate=0.6,
    max_iter=40,
    n_estimators=300,
).rename(
    columns={
        "confirmation_rate": "approved_confirmation_rate",
        "avg_ranking": "boruta_approved_ranking",
        "final_status": "boruta_approved_status",
    }
)

print("\n✓ Approved processing complete")
display(boruta_approved.head(10))

## Apply Hard Gate (Keep unless BOTH reject)

In [None]:
# Section banner: rule, title, rule — one item per output line.
print("="*70, "APPLYING HARD GATE", "="*70, sep="\n")

# Combine both per-target results on the feature name. A feature missing from
# one side is treated as rejected there, with a confirmation rate of zero.
boruta_merge = pd.merge(
    boruta_applied,
    boruta_approved,
    on="feature",
    how="outer",
).fillna({
    "boruta_applied_status": "rejected",
    "boruta_approved_status": "rejected",
    "applied_confirmation_rate": 0.0,
    "approved_confirmation_rate": 0.0
})

# Hard gate: a feature survives when at least one target did not reject it
# (equivalently, it is dropped only when BOTH targets rejected it).
boruta_merge["keep"] = (
    (boruta_merge["boruta_applied_status"] != "rejected") |
    (boruta_merge["boruta_approved_status"] != "rejected")
)

final_kept_features = boruta_merge["feature"][boruta_merge["keep"]].tolist()
final_dropped_features = boruta_merge["feature"][~boruta_merge["keep"]].tolist()

print(f"\nFeatures KEPT: {len(final_kept_features)}")
print(f"Features DROPPED: {len(final_dropped_features)}")

## Generate Detailed Report

In [None]:
# Add reason column
def gate_reason(row):
    """Describe the hard-gate outcome for one merged feature row.

    Reads ``boruta_applied_status`` and ``boruta_approved_status`` from
    ``row`` and returns a label starting with "DROP" when both are
    "rejected", otherwise a label starting with "KEEP" that names which
    target(s) confirmed the feature.
    """
    applied = row["boruta_applied_status"]
    approved = row["boruta_approved_status"]
    applied_confirmed = applied == "confirmed"
    approved_confirmed = approved == "confirmed"

    if applied_confirmed and approved_confirmed:
        return "KEEP: confirmed by BOTH"
    if applied_confirmed:
        return f"KEEP: applied confirmed, approved {approved}"
    if approved_confirmed:
        return f"KEEP: approved confirmed, applied {applied}"
    # Neither target confirmed: the feature is dropped only on a double rejection.
    if applied == "rejected" and approved == "rejected":
        return "DROP: rejected by BOTH"
    return f"KEEP: applied {applied}, approved {approved}"

# Attach the textual gate explanation to every feature row.
boruta_merge["reason"] = boruta_merge.apply(gate_reason, axis=1)

# Stability score: the mean of the two per-target confirmation rates.
boruta_merge["stability_score"] = (
    boruta_merge["applied_confirmation_rate"]
    .add(boruta_merge["approved_confirmation_rate"])
    .div(2)
)

# Buckets
def bucket(row):
    """Assign one merged feature row to a summary bucket.

    Rows whose ``keep`` flag is falsy land in the dropped bucket; the rest
    are grouped by which of the two targets reported "confirmed".
    """
    confirmed_by = (
        row["boruta_applied_status"] == "confirmed",
        row["boruta_approved_status"] == "confirmed",
    )
    if not row["keep"]:
        return "Dropped (both rejected)"

    # Keyed by (applied confirmed?, approved confirmed?).
    labels = {
        (True, True): "Confirmed by BOTH",
        (True, False): "Applied-only confirmed",
        (False, True): "Approved-only confirmed",
        (False, False): "Kept (other)",
    }
    return labels[confirmed_by]

# Label every feature row with its summary bucket (row-wise call of bucket()).
boruta_merge["bucket"] = boruta_merge.apply(bucket, axis=1)

print("✓ Report columns added")

## View Bucket Distribution

In [None]:
# Show how many features fall into each bucket.
print("Bucket Distribution:")
display(boruta_merge["bucket"].value_counts())

## View Dropped Features

In [None]:
# Rows that failed the hard gate, reduced to the report columns and ordered
# from the lowest stability score upwards.
dropped_view = (
    boruta_merge[~boruta_merge["keep"]][[
        "feature",
        "boruta_applied_status", "applied_confirmation_rate",
        "boruta_approved_status", "approved_confirmation_rate",
        "stability_score",
        "reason"
    ]]
    .sort_values("stability_score")
)

print(f"\nDropped Features ({len(dropped_view)}):")
display(dropped_view)

## View Kept Features (Best First)

In [None]:
# Rows that passed the hard gate, reduced to the report columns and ordered
# from the highest stability score downwards; only the top 20 are displayed.
kept_view = (
    boruta_merge[boruta_merge["keep"]][[
        "feature",
        "boruta_applied_status", "applied_confirmation_rate",
        "boruta_approved_status", "approved_confirmation_rate",
        "stability_score",
        "reason"
    ]]
    .sort_values("stability_score", ascending=False)
)

print(f"\nKept Features ({len(kept_view)}):")
display(kept_view.head(20))

## Save Report

In [None]:
# Write the full merged report to a CSV file (relative path, index omitted).
boruta_merge.to_csv("boruta_stratified_report.csv", index=False)
print("✓ Saved: boruta_stratified_report.csv")

## Filter Your correlation_columns

In [None]:
# Restrict correlation_columns to features that passed the hard gate,
# preserving their original order.
# NOTE(review): a column that was never evaluated by Boruta (not in the
# feature list) is also removed here — confirm that this is intended.
kept_lookup = set(final_kept_features)
correlation_columns_boruta = [
    name for name in correlation_columns if name in kept_lookup
]

print(f"Original correlation_columns: {len(correlation_columns)} features")
print(f"After Boruta filtering: {len(correlation_columns_boruta)} features")
print(f"Reduction: {len(correlation_columns) - len(correlation_columns_boruta)} features removed")

# Independent copies of the filtered columns for the downstream pipeline.
df_corr = df.loc[:, correlation_columns_boruta].copy()
approved_df_corr = approved_df.loc[:, correlation_columns_boruta].copy()

print("\n✓ Ready to use correlation_columns_boruta in your pipeline")