Previous file was getting too chunky. This one has just the finalized complete pipeline for the ml feature engineered thang for 2SLS.

In [36]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
import statsmodels.api as sm
from scipy import stats
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings('ignore')

In [1]:
def generate_example_data(n=2000):
    """Generate synthetic data for demonstration"""
    np.random.seed(42)
    
    data = pd.DataFrame({
        'Age': np.random.randint(18, 65, n),
        'Gender': np.random.choice(['M', 'F'], n),
        'Income': np.random.randint(30000, 150000, n),
        'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n),
        'Ad_Type': np.random.choice(['Video', 'Banner', 'Native'], n),
        'Ad_Topic': np.random.choice(['Tech', 'Fashion', 'Food', 'Travel'], n),
        'Ad_Placement': np.random.choice(['Social_Media', 'Search', 'Display'], n),
        'Click_Time': pd.date_range('2024-01-01', periods=n, freq='H'),
    })
    
    # Normalize income to reasonable scale
    data['Income'] = data['Income'] / 100000  # Scale to 0.3-1.5 range
    
    # Generate clicks with realistic structure
    clicks_base = (
        0.5 +  # baseline
        0.3 * (data['Ad_Type'] == 'Video').astype(float) +
        0.2 * (data['Ad_Placement'] == 'Social_Media').astype(float) +
        0.01 * data['Age'] +
        0.2 * data['Income'] +
        np.random.randn(n) * 0.5
    )
    data['Clicks'] = np.clip(clicks_base, 0.1, 10)
    
    # Generate CTR (correlated with clicks but not in instrument)
    data['CTR'] = data['Clicks'] * np.random.uniform(0.05, 0.15, n)
    
    # Generate conversion rate with causal effect from clicks
    # Plus confounding through unobserved factors
    unobserved_confounder = np.random.randn(n) * 0.05
    
    conversion_base = (
        0.05 +  # baseline
        0.08 * data['Clicks'] +  # TRUE CAUSAL EFFECT
        0.02 * data['Income'] +
        0.005 * data['Age'] +
        0.3 * data['CTR'] +
        unobserved_confounder +
        np.random.randn(n) * 0.03
    )
    data['Conversion_Rate'] = np.clip(conversion_base, 0.01, 0.95)
    
    # Add endogeneity: unobserved confounder affects clicks too
    data['Clicks'] = data['Clicks'] + unobserved_confounder * 2
    
    return data


##### Claude Code Artifact Start

In [None]:
"""
Instrumental Variables (IV) Causal Inference for Ad Conversion Analysis
Using ML-Generated Instruments and Two-Stage Least Squares (2SLS)
ENHANCED VERSION: Data cleaning, stronger instruments, and log transformations
"""
class CausalAdAnalyzer:
    """
    A comprehensive pipeline for causal inference in ad conversion analysis
    using ML-generated instrumental variables and 2SLS estimation.
    
    ENHANCED with:
    - Robust data cleaning and preprocessing
    - Logarithmic transformations for skewed variables
    - Rich feature engineering for stronger instruments
    - Stacking ensemble for maximum predictive power
    - Comprehensive diagnostics including Stock-Yogo tests
    """
    
    def __init__(self, data):
        """
        Initialize the analyzer with your dataset.
        
        Parameters:
        -----------
        data : pd.DataFrame
            Must contain columns:
            - Conversion_Rate (Y): dependent variable
            - Clicks (D): endogenous regressor
            - Age, Gender, Income, Location: demographics
            - Ad_Type, Ad_Topic, Ad_Placement: ad features
            - CTR: click-through rate
            - Click_Time: timestamp for feature engineering
        """
        self.data = data.copy()
        self.encoders = {}
        self.scaler = StandardScaler()
        self.first_stage_model = None
        self.first_stage_results = None
        self.second_stage_results = None
        
        # Clean data on initialization
        self._clean_data()
        
    def _clean_data(self):
        """
        Clean and preprocess data before analysis.
        
        Performs:
        1. Handle negative income values
        2. Impute missing income with median
        3. Winsorize income at 1st and 99th percentiles
        4. Filter age to plausible range (10-90 years)
        5. Create logarithmic transformations for skewed variables
        """
        print("\n" + "="*60)
        print("DATA CLEANING AND PREPROCESSING")
        print("="*60)
        
        initial_rows = len(self.data)
        
        # =====================================================================
        # 1. CLEAN INCOME
        # =====================================================================
        if 'Income' in self.data.columns:
            # Convert negative income to missing
            neg_income_count = (self.data['Income'] < 0).sum()
            self.data.loc[self.data['Income'] < 0, 'Income'] = np.nan
            
            if neg_income_count > 0:
                print(f"‚úì Converted {neg_income_count} negative income values to missing")
            
            # Impute missing income with median
            missing_income = self.data['Income'].isna().sum()
            if missing_income > 0:
                imputer = SimpleImputer(strategy='median')
                self.data['Income'] = imputer.fit_transform(self.data[['Income']])
                print(f"‚úì Imputed {missing_income} missing income values with median")
            
            # Winsorize: Cap extremes at 1st and 99th percentile
            lower, upper = self.data['Income'].quantile([0.01, 0.99])
            income_before = self.data['Income'].copy()
            self.data['Income'] = self.data['Income'].clip(lower, upper)
            winsorized = (income_before != self.data['Income']).sum()
            print(f"‚úì Winsorized {winsorized} income values at 1st/99th percentiles")
            print(f"  Income range: [{lower:,.0f}, {upper:,.0f}]")
        
        # =====================================================================
        # 2. FILTER AGE
        # =====================================================================
        if 'Age' in self.data.columns:
            age_before = len(self.data)
            self.data = self.data[self.data['Age'].between(10, 90)]
            age_filtered = age_before - len(self.data)
            if age_filtered > 0:
                print(f"‚úì Filtered {age_filtered} rows with implausible ages (keeping 10-90)")
        
        # =====================================================================
        # 3. CREATE LOGARITHMIC TRANSFORMATIONS
        # =====================================================================
        print(f"\nüìä Creating logarithmic transformations:")
        
        # Log of Income (if positive)
        if 'Income' in self.data.columns:
            self.data['Income_log'] = np.log1p(self.data['Income'])
            print(f"  ‚úì Income_log created (log1p transformation)")
        
        # Log of Clicks (if exists and positive)
        if 'Clicks' in self.data.columns:
            self.data['Clicks_log'] = np.log1p(self.data['Clicks'])
            print(f"  ‚úì Clicks_log created (log1p transformation)")
        
        # Log of Age (for nonlinear age effects)
        if 'Age' in self.data.columns:
            self.data['Age_log'] = np.log1p(self.data['Age'])
            print(f"  ‚úì Age_log created (log1p transformation)")
        
        # Log of CTR (if exists and positive)
        if 'CTR' in self.data.columns:
            # Ensure CTR is positive before log
            if (self.data['CTR'] > 0).all():
                self.data['CTR_log'] = np.log(self.data['CTR'])
                print(f"  ‚úì CTR_log created (log transformation)")
        
        # =====================================================================
        # SUMMARY
        # =====================================================================
        final_rows = len(self.data)
        rows_removed = initial_rows - final_rows
        
        print(f"\n{'='*60}")
        print(f"CLEANING SUMMARY:")
        print(f"  Initial rows:        {initial_rows:,}")
        print(f"  Final rows:          {final_rows:,}")
        print(f"  Rows removed:        {rows_removed:,} ({rows_removed/initial_rows*100:.1f}%)")
        print(f"  Log variables added: {len([col for col in self.data.columns if '_log' in col])}")
        print(f"{'='*60}\n")
        
        return self
    
    def engineer_time_features(self):
        """Extract day of week and hour from Click_Time"""
        if 'Click_Time' in self.data.columns:
            self.data['Click_Time'] = pd.to_datetime(self.data['Click_Time'])
            self.data['Day_of_Week'] = self.data['Click_Time'].dt.dayofweek
            self.data['Hour'] = self.data['Click_Time'].dt.hour
        return self
    
    def encode_categorical_features(self):
        """Encode categorical variables"""
        categorical_cols = ['Gender', 'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement']
        
        for col in categorical_cols:
            if col in self.data.columns:
                le = LabelEncoder()
                self.data[f'{col}_encoded'] = le.fit_transform(self.data[col].astype(str))
                self.encoders[col] = le
        
        return self

    def engineer_instrument_features(self):
        """
        ENHANCED: Create rich features that predict clicks but don't directly affect conversions.
        
        This is crucial for instrument strength. We create:
        1. Interaction features between ad characteristics and demographics
        2. Time-based features (weekend, business hours)
        3. Nonlinear transformations
        4. Complex interactions between multiple variables
        
        Key principle: These features should predict CLICKS well, but only affect
        CONVERSIONS through clicks (exclusion restriction).
        """
        df = self.data
        
        print("\n" + "="*60)
        print("FEATURE ENGINEERING FOR INSTRUMENT STRENGTH")
        print("="*60)
        
        # =====================================================================
        # 1. AD CHARACTERISTICS √ó DEMOGRAPHICS INTERACTIONS
        # =====================================================================
        # Rationale: Different demographics respond differently to ad types
        
        if all(col in df.columns for col in ['Income', 'Ad_Type_encoded']):
            df['Income_x_AdType'] = df['Income'] * df['Ad_Type_encoded']
            print("‚úì Created Income √ó Ad Type interaction")
            
        if all(col in df.columns for col in ['Age', 'Ad_Topic_encoded']):
            df['Age_x_AdTopic'] = df['Age'] * df['Ad_Topic_encoded']
            print("‚úì Created Age √ó Ad Topic interaction")
            
        if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded']):
            df['Income_x_Placement'] = df['Income'] * df['Ad_Placement_encoded']
            print("‚úì Created Income √ó Ad Placement interaction")
            
        if all(col in df.columns for col in ['Age', 'Ad_Placement_encoded']):
            df['Age_x_Placement'] = df['Age'] * df['Ad_Placement_encoded']
            print("‚úì Created Age √ó Ad Placement interaction")
        
        # =====================================================================
        # 2. TIME-BASED FEATURES AND INTERACTIONS
        # =====================================================================
        # Rationale: Click patterns vary by time of day/week
        
        if 'Day_of_Week' in df.columns:
            df['Weekend'] = (df['Day_of_Week'] >= 5).astype(int)
            print("‚úì Created Weekend indicator")
            
        if 'Hour' in df.columns:
            df['BusinessHours'] = ((df['Hour'] >= 9) & (df['Hour'] <= 17)).astype(int)
            df['Evening'] = ((df['Hour'] >= 18) & (df['Hour'] <= 23)).astype(int)
            df['Morning'] = ((df['Hour'] >= 6) & (df['Hour'] <= 11)).astype(int)
            print("‚úì Created time-of-day indicators")
        
        # Time √ó Ad interactions
        if all(col in df.columns for col in ['Weekend', 'Ad_Type_encoded']):
            df['Weekend_x_AdType'] = df['Weekend'] * df['Ad_Type_encoded']
            print("‚úì Created Weekend √ó Ad Type interaction")
            
        if all(col in df.columns for col in ['BusinessHours', 'Ad_Placement_encoded']):
            df['BusinessHours_x_Placement'] = df['BusinessHours'] * df['Ad_Placement_encoded']
            print("‚úì Created Business Hours √ó Ad Placement interaction")
            
        if all(col in df.columns for col in ['Evening', 'Ad_Topic_encoded']):
            df['Evening_x_AdTopic'] = df['Evening'] * df['Ad_Topic_encoded']
            print("‚úì Created Evening √ó Ad Topic interaction")
        
        # =====================================================================
        # 3. DEMOGRAPHICS √ó TIME INTERACTIONS
        # =====================================================================
        # Rationale: Different demographics have different browsing patterns
        
        if all(col in df.columns for col in ['Age', 'Hour']):
            df['Age_x_Hour'] = df['Age'] * df['Hour']
            print("‚úì Created Age √ó Hour interaction")
            
        if all(col in df.columns for col in ['Income', 'Weekend']):
            df['Income_x_Weekend'] = df['Income'] * df['Weekend']
            print("‚úì Created Income √ó Weekend interaction")
            
        if all(col in df.columns for col in ['Age', 'BusinessHours']):
            df['Age_x_BusinessHours'] = df['Age'] * df['BusinessHours']
            print("‚úì Created Age √ó Business Hours interaction")
        
        # =====================================================================
        # 4. NONLINEAR TRANSFORMATIONS
        # =====================================================================
        # Rationale: Relationships may be nonlinear (using log-transformed versions)
        
        if 'Age_log' in df.columns:
            df['Age_squared'] = df['Age'] ** 2
            print("‚úì Created Age squared")
            
        if 'Income_log' in df.columns:
            df['Income_squared'] = df['Income'] ** 2
            df['Income_sqrt'] = np.sqrt(df['Income'].clip(lower=0))
            print("‚úì Created Income squared and sqrt")
        
        # =====================================================================
        # 5. COMPLEX CATEGORICAL INTERACTIONS
        # =====================================================================
        # Rationale: Certain combinations may be particularly predictive
        
        # Location √ó Demographics
        if all(col in df.columns for col in ['Location_encoded', 'Age']):
            df['Location_x_Age'] = df['Location_encoded'] * df['Age']
            print("‚úì Created Location √ó Age interaction")
            
        if all(col in df.columns for col in ['Location_encoded', 'Income']):
            df['Location_x_Income'] = df['Location_encoded'] * df['Income']
            print("‚úì Created Location √ó Income interaction")
        
        # Location √ó Ad characteristics
        if all(col in df.columns for col in ['Location_encoded', 'Ad_Placement_encoded']):
            df['Location_x_Placement'] = df['Location_encoded'] * df['Ad_Placement_encoded']
            print("‚úì Created Location √ó Placement interaction")
        
        # Gender √ó Ad characteristics
        if all(col in df.columns for col in ['Gender_encoded', 'Ad_Topic_encoded']):
            df['Gender_x_AdTopic'] = df['Gender_encoded'] * df['Ad_Topic_encoded']
            print("‚úì Created Gender √ó Ad Topic interaction")
            
        if all(col in df.columns for col in ['Gender_encoded', 'Ad_Type_encoded']):
            df['Gender_x_AdType'] = df['Gender_encoded'] * df['Ad_Type_encoded']
            print("‚úì Created Gender √ó Ad Type interaction")
        
        # Ad Type √ó Placement (different placements work for different types)
        if all(col in df.columns for col in ['Ad_Type_encoded', 'Ad_Placement_encoded']):
            df['AdType_x_Placement'] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
            print("‚úì Created Ad Type √ó Placement interaction")
        
        # =====================================================================
        # 6. THREE-WAY INTERACTIONS (most powerful)
        # =====================================================================
        # Rationale: Capture complex patterns
        
        if all(col in df.columns for col in ['Age', 'Ad_Type_encoded', 'Weekend']):
            df['Age_x_AdType_x_Weekend'] = df['Age'] * df['Ad_Type_encoded'] * df['Weekend']
            print("‚úì Created Age √ó Ad Type √ó Weekend interaction")
            
        if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded', 'BusinessHours']):
            df['Income_x_Placement_x_BizHours'] = df['Income'] * df['Ad_Placement_encoded'] * df['BusinessHours']
            print("‚úì Created Income √ó Placement √ó Business Hours interaction")
        
        print("="*60 + "\n")
        
        return self

    def create_ml_instrument(self, model_type='stacking', cv_folds=5, use_enhanced_features=True):
        """
        ENHANCED: Generate ML-based instrument for Clicks (D) using ensemble methods.
        
        Parameters:
        -----------
        model_type : str
            'rf' for Random Forest
            'gb' for Gradient Boosting
            'stacking' for Stacking Ensemble (RECOMMENDED for strongest instruments)
        cv_folds : int
            Number of cross-validation folds
        use_enhanced_features : bool
            Whether to use enhanced feature engineering (recommended)
        """
        
        # Apply enhanced feature engineering if requested
        if use_enhanced_features:
            self.engineer_instrument_features()
        
        # =====================================================================
        # DEFINE INSTRUMENT FEATURES
        # =====================================================================
        # Base features (always included)
        base_features = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
            'Day_of_Week', 'Hour'
        ]
        
        # Enhanced features (only if engineered)
        enhanced_features = [
            # Interactions
            'Income_x_AdType', 'Age_x_AdTopic', 'Income_x_Placement', 'Age_x_Placement',
            'Weekend_x_AdType', 'BusinessHours_x_Placement', 'Evening_x_AdTopic',
            'Age_x_Hour', 'Income_x_Weekend', 'Age_x_BusinessHours',
            'Location_x_Age', 'Location_x_Income', 'Location_x_Placement',
            'Gender_x_AdTopic', 'Gender_x_AdType', 'AdType_x_Placement',
            'Age_x_AdType_x_Weekend', 'Income_x_Placement_x_BizHours',
            # Time features
            'Weekend', 'BusinessHours', 'Evening', 'Morning',
            # Nonlinear (now using cleaned log versions)
            'Age_squared', 'Age_log', 'Income_log', 'Income_squared', 'Income_sqrt',
            'Clicks_log', 'CTR_log'
        ]
        
        # Combine and filter available features
        if use_enhanced_features:
            instrument_features = base_features + enhanced_features
        else:
            instrument_features = base_features
            
        available_features = [f for f in instrument_features if f in self.data.columns]
        
        print(f"\n{'='*60}")
        print(f"ML INSTRUMENT CONSTRUCTION")
        print(f"{'='*60}")
        print(f"Total features available: {len(available_features)}")
        print(f"Model type: {model_type.upper()}")
        print(f"Cross-validation folds: {cv_folds}")

        # Prepare data
        X_instrument = self.data[available_features]
        y_clicks = self.data['Clicks']
        
        # Standardize features
        X_instrument_scaled = self.scaler.fit_transform(X_instrument)
        X_instrument_scaled = pd.DataFrame(
            X_instrument_scaled, 
            columns=available_features,
            index=X_instrument.index
        )
        
        # =====================================================================
        # FEATURE SELECTION
        # =====================================================================
        # Optional: Reduce to top N features based on importance
        top_n = min(10, len(available_features))
        print(f"\nSelecting top {top_n} features based on model importance...")
        
        # Use a simple model to rank features (e.g., Random Forest)
        feature_selector = RandomForestRegressor(
            n_estimators=100,
            random_state=42,
            n_jobs=-1
        )
        feature_selector.fit(X_instrument_scaled, y_clicks)
        
        # Get top N features
        importances = pd.Series(feature_selector.feature_importances_, index=X_instrument_scaled.columns)
        top_features = importances.sort_values(ascending=False).head(top_n).index.tolist()
        
        print("Top features selected:")
        for i, feat in enumerate(top_features, 1):
            print(f"{i}. {feat}")
        
        # Filter scaled data to top features
        X_instrument_scaled = X_instrument_scaled[top_features]
        
        # =====================================================================
        # BUILD MODEL
        # =====================================================================
        
        if model_type == 'stacking':
            print("\nBuilding Stacking Ensemble (strongest option)...")
            
            # Define base learners with more aggressive parameters
            base_models = [
                ('rf', RandomForestRegressor(
                    n_estimators=200,
                    max_depth=15,
                    min_samples_split=20,
                    min_samples_leaf=10,
                    max_features='sqrt',
                    random_state=42,
                    n_jobs=-1
                )),
                ('gb', GradientBoostingRegressor(
                    n_estimators=200,
                    max_depth=7,
                    learning_rate=0.05,
                    subsample=0.8,
                    min_samples_split=20,
                    min_samples_leaf=10,
                    random_state=42
                ))
            ]
            
            # Try to import XGBoost if available
            try:
                from xgboost import XGBRegressor
                base_models.append(
                    ('xgb', XGBRegressor(
                        n_estimators=200,
                        max_depth=8,
                        learning_rate=0.05,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        random_state=42,
                        n_jobs=-1
                    ))
                )
                print("  ‚úì Using XGBoost as additional base learner")
            except ImportError:
                print("  ‚Ñπ XGBoost not available, using RF + GB only")
            
            # Create stacking ensemble
            self.first_stage_model = StackingRegressor(
                estimators=base_models,
                final_estimator=Ridge(alpha=1.0),
                cv=cv_folds,
                n_jobs=-1
            )
            
        elif model_type == 'rf':
            print("\nBuilding Random Forest...")
            self.first_stage_model = RandomForestRegressor(
                n_estimators=200,
                max_depth=15,
                min_samples_split=20,
                min_samples_leaf=10,
                max_features='sqrt',
                random_state=42,
                n_jobs=-1
            )
            
        elif model_type == 'gb':
            print("\nBuilding Gradient Boosting...")
            self.first_stage_model = GradientBoostingRegressor(
                n_estimators=200,
                max_depth=7,
                learning_rate=0.05,
                subsample=0.8,
                min_samples_split=20,
                min_samples_leaf=10,
                random_state=42
            )
        
        # =====================================================================
        # GENERATE OUT-OF-FOLD PREDICTIONS
        # =====================================================================
        print(f"\nGenerating out-of-fold predictions (CV={cv_folds})...")
        
        self.data['Clicks_predicted'] = cross_val_predict(
            self.first_stage_model,
            X_instrument_scaled,
            y_clicks,
            cv=cv_folds,
            n_jobs=-1
        )
        
        # Fit final model for interpretation
        print("Fitting final model...")
        self.first_stage_model.fit(X_instrument_scaled, y_clicks)
        
        # =====================================================================
        # DIAGNOSTICS
        # =====================================================================
        self._enhanced_instrument_diagnostics(X_instrument_scaled, y_clicks)
        
        return self
    
    def _enhanced_instrument_diagnostics(self, X, y):
        """
        ENHANCED: Comprehensive instrument strength testing with Stock-Yogo critical values.
        """
        z = self.data['Clicks_predicted'].values
        d = self.data['Clicks'].values
        
        n = len(d)
        k = X.shape[1]
        
        # =====================================================================
        # 1. FIRST-STAGE R-SQUARED AND F-STATISTIC
        # =====================================================================
        z_resid = z - z.mean()
        d_resid = d - d.mean()
        
        ss_tot = np.sum(d_resid**2)
        ss_res = np.sum((d - z)**2)
        r_squared = 1 - (ss_res / ss_tot)
        
        # Proper F-statistic for first stage
        f_stat = (r_squared / 1) / ((1 - r_squared) / (n - k - 1))
        
        # =====================================================================
        # 2. CORRELATION
        # =====================================================================
        corr = np.corrcoef(z, d)[0, 1]
        
        # =====================================================================
        # 3. CRAGG-DONALD STATISTIC
        # =====================================================================
        cragg_donald = n * r_squared
        
        # =====================================================================
        # DISPLAY RESULTS
        # =====================================================================
        print(f"\n{'='*70}")
        print(f"ENHANCED INSTRUMENT STRENGTH DIAGNOSTICS")
        print(f"{'='*70}")
        print(f"\nSAMPLE INFORMATION:")
        print(f"  Sample size (n):              {n:,}")
        print(f"  Number of features (k):       {k}")
        print(f"\nFIRST-STAGE PERFORMANCE:")
        print(f"  R-squared:                    {r_squared:.4f}")
        print(f"  Correlation (Z, D):           {corr:.4f}")
        print(f"  F-statistic:                  {f_stat:.2f}")
        print(f"  Cragg-Donald statistic:       {cragg_donald:.2f}")
        
        print(f"\nBENCHMARKS & INTERPRETATION:")
        print(f"  {'Criterion':<35} {'Threshold':<12} {'Status'}")
        print(f"  {'-'*35} {'-'*12} {'-'*20}")
        
        # Weak instrument test
        weak_status = "‚úì STRONG" if f_stat > 10 else "‚úó WEAK"
        print(f"  {'Weak Instrument (F < 10)':<35} {'10.00':<12} {weak_status}")
        
        # Stock-Yogo critical values (for single instrument, single endogenous variable)
        sy_10_status = "‚úì‚úì EXCELLENT" if f_stat > 16.38 else "‚úó Below threshold"
        sy_15_status = "‚úì GOOD" if f_stat > 8.96 else "‚úó Below threshold"
        
        print(f"  {'Stock-Yogo 10% max bias':<35} {'16.38':<12} {sy_10_status}")
        print(f"  {'Stock-Yogo 15% max bias':<35} {'8.96':<12} {sy_15_status}")
        
        print(f"\nOVERALL ASSESSMENT:")
        if f_stat > 16.38:
            print(f"  ‚úì‚úì VERY STRONG INSTRUMENT")
            print(f"     Maximum IV bias < 10% of OLS bias")
            print(f"     Highly reliable causal inference")
        elif f_stat > 10:
            print(f"  ‚úì STRONG INSTRUMENT")
            print(f"     Acceptable for causal inference")
            print(f"     Results should be reliable")
        elif f_stat > 5:
            print(f"  ‚ö† MODERATELY WEAK INSTRUMENT")
            print(f"     Proceed with caution")
            print(f"     Consider sensitivity analysis")
        else:
            print(f"  ‚úó WEAK INSTRUMENT")
            print(f"     Results may be unreliable")
            print(f"     Consider alternative identification strategies")
        
        # =====================================================================
        # FEATURE IMPORTANCE (if available)
        # =====================================================================
        if hasattr(self.first_stage_model, 'feature_importances_'):
            print(f"\nTOP 10 MOST IMPORTANT FEATURES FOR PREDICTING CLICKS:")
            importances = self.first_stage_model.feature_importances_
            top_features = sorted(
                zip(X.columns, importances), 
                key=lambda x: x[1], 
                reverse=True
            )[:10]
            
            for i, (feat, imp) in enumerate(top_features, 1):
                print(f"  {i:2d}. {feat:35s} {imp:.4f}")
        
        elif hasattr(self.first_stage_model, 'final_estimator_'):
            print(f"\n‚Ñπ Stacking ensemble used - individual feature importances")
            print(f"  not directly available, but all base models contribute")
        
        print(f"{'='*70}\n")
    
    def run_2sls(self, include_interactions=False):
        """
        Step 3: Two-Stage Least Squares Estimation
        
        First Stage: D = œÄ‚ÇÄ + œÄ‚ÇÅZ + œÄ‚ÇÇX + ŒΩ
        Second Stage: Y = Œ± + Œ≤DÃÇ + Œ≥X + Œµ
        
        Parameters:
        -----------
        include_interactions : bool
            Whether to include Ad_Type √ó Ad_Placement interactions
        """
        # Exogenous controls (X)
        exog_controls = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
            'CTR'
        ]
        
        available_controls = [f for f in exog_controls if f in self.data.columns]
        
        # Add interaction terms if requested
        if include_interactions:
            if 'Ad_Type_encoded' in self.data.columns and 'Ad_Placement_encoded' in self.data.columns:
                self.data['Ad_Type_x_Placement'] = (
                    self.data['Ad_Type_encoded'] * self.data['Ad_Placement_encoded']
                )
                available_controls.append('Ad_Type_x_Placement')
        
        print('2sls data summar: ', self.data.describe(include='all'))

        # FIRST STAGE: Regress D on Z and X
        print("\n" + "="*60)
        print("FIRST STAGE REGRESSION: D ~ Z + X")
        print("="*60)
        
        print('available controls: ', available_controls)

        X_first_stage = sm.add_constant(pd.concat([
            self.data[['Clicks_predicted']],
            self.data[available_controls]
        ], axis=1))
        
        y_first_stage = self.data['Clicks']
        
        self.first_stage_results = sm.OLS(y_first_stage, X_first_stage).fit()
        
        print("\nFirst Stage Summary:")
        print(f"R-squared: {self.first_stage_results.rsquared:.4f}")
        print(f"F-statistic: {self.first_stage_results.fvalue:.2f}")
        print(f"Instrument coefficient: {self.first_stage_results.params['Clicks_predicted']:.4f}")
        print(f"Instrument p-value: {self.first_stage_results.pvalues['Clicks_predicted']:.4f}")
        
        # Get fitted values from first stage
        D_hat = self.first_stage_results.fittedvalues
        
        # SECOND STAGE: Regress Y on D_hat and X
        print("\n" + "="*60)
        print("SECOND STAGE REGRESSION: Y ~ DÃÇ + X")
        print("="*60)
        
        X_second_stage = sm.add_constant(pd.concat([
            pd.Series(D_hat, name='Clicks_fitted'),
            self.data[available_controls]
        ], axis=1))
        
        y_second_stage = self.data['Conversion_Rate']
        
        self.second_stage_results = sm.OLS(y_second_stage, X_second_stage).fit()
        
        # Manual calculation of correct standard errors for 2SLS
        self._calculate_2sls_standard_errors(available_controls)
        
        self._display_results()
        
        return self
    
    ### stratified 2sls for subgroup effects
    def analyze_subgroup_effects(self, subgroup_vars=None, min_subgroup_size=100):
        """
        Stratified 2SLS: Run separate 2SLS regressions within subgroups to identify
        heterogeneous treatment effects.

        This helps explain why average effects may be weak - effects may be strong
        in specific segments but cancel out in aggregate.

        Parameters:
        -----------
        subgroup_vars : list of str or dict, optional
            Variables to stratify by. Can be:
            - List of column names (will auto-create bins for continuous vars)
            - Dict mapping column names to bin specifications
            Example: ['Location', 'Ad_Type'] or 
                     {'Income': [0, 30000, 60000, np.inf], 'Age': [0, 35, 50, 65, np.inf]}
        min_subgroup_size : int
            Minimum observations required per subgroup (for statistical power)

        Returns:
        --------
        results_df : pd.DataFrame
            Subgroup-specific causal effects with diagnostics
        """

        if subgroup_vars is None:
            # Default subgroups based on theory
            subgroup_vars = {
                'Income': [0, 30000, 50000, 70000, np.inf],  # Quartile-like bins
                'Age': [0, 35, 50, 65, np.inf],              # Life stage bins
                'Location': None,                             # Use as-is (categorical)
                'Ad_Type': None                               # Use as-is (categorical)
            }

        print("\n" + "="*70)
        print("STRATIFIED 2SLS: HETEROGENEOUS TREATMENT EFFECTS ANALYSIS")
        print("="*70)

        # Store results for each subgroup
        all_results = []

        # Exogenous controls for 2SLS
        exog_controls = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
            'CTR'
        ]
        available_controls = [f for f in exog_controls if f in self.data.columns]

        # =========================================================================
        # PROCESS EACH SUBGROUP VARIABLE
        # =========================================================================

        for var in (subgroup_vars if isinstance(subgroup_vars, list) else subgroup_vars.keys()):

            print(f"\n{'‚îÄ'*70}")
            print(f"ANALYZING SUBGROUPS BY: {var.upper()}")
            print(f"{'‚îÄ'*70}")

            # Create subgroups
            if isinstance(subgroup_vars, dict) and subgroup_vars.get(var) is not None:
                # Continuous variable with specified bins
                bins = subgroup_vars[var]
                labels = [f"{var}_{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)]
                self.data[f'{var}_subgroup'] = pd.cut(
                    self.data[var], 
                    bins=bins, 
                    labels=labels,
                    include_lowest=True
                )
                subgroup_col = f'{var}_subgroup'
            else:
                # Categorical variable - use as is
                subgroup_col = var

            # Get unique subgroups
            subgroups = self.data[subgroup_col].dropna().unique()

            print(f"\nFound {len(subgroups)} subgroups: {sorted([str(s) for s in subgroups])}")

            # =====================================================================
            # RUN 2SLS FOR EACH SUBGROUP
            # =====================================================================

            for subgroup in subgroups:

                # Filter data to subgroup
                subgroup_data = self.data[self.data[subgroup_col] == subgroup].copy()
                n_obs = len(subgroup_data)

                # Skip if too small
                if n_obs < min_subgroup_size:
                    print(f"\n  ‚ö† Skipping '{subgroup}': Only {n_obs} observations (min={min_subgroup_size})")
                    continue
                
                print(f"\n  üìä Subgroup: '{subgroup}' (N={n_obs:,})")

                # Check if we have the instrument in this subgroup
                if 'Clicks_predicted' not in subgroup_data.columns:
                    print(f"     ‚úó No instrument available")
                    continue
                
                try:
                    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
                    # FIRST STAGE: D ~ Z + X (within subgroup)
                    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

                    X_first = sm.add_constant(pd.concat([
                        subgroup_data[['Clicks_predicted']],
                        subgroup_data[available_controls]
                    ], axis=1))

                    y_first = subgroup_data['Clicks']

                    first_stage = sm.OLS(y_first, X_first).fit()

                    # First stage diagnostics
                    f_stat_first = first_stage.fvalue
                    r2_first = first_stage.rsquared
                    instrument_coef = first_stage.params['Clicks_predicted']
                    instrument_pval = first_stage.pvalues['Clicks_predicted']

                    # Weak instrument check
                    is_weak = f_stat_first < 10

                    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
                    # SECOND STAGE: Y ~ DÃÇ + X (within subgroup)
                    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

                    D_hat = first_stage.fittedvalues

                    X_second = sm.add_constant(pd.concat([
                        pd.Series(D_hat, name='Clicks_fitted'),
                        subgroup_data[available_controls]
                    ], axis=1))

                    y_second = subgroup_data['Conversion_Rate']

                    second_stage = sm.OLS(y_second, X_second).fit()

                    # Extract causal effect
                    causal_effect = second_stage.params['Clicks_fitted']
                    se = second_stage.bse['Clicks_fitted']
                    tstat = second_stage.tvalues['Clicks_fitted']
                    pval = second_stage.pvalues['Clicks_fitted']
                    ci_lower = causal_effect - 1.96 * se
                    ci_upper = causal_effect + 1.96 * se

                    # Statistical significance
                    is_significant = pval < 0.05

                    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
                    # DISPLAY SUBGROUP RESULTS
                    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

                    print(f"     First Stage:")
                    print(f"       F-stat: {f_stat_first:.2f} {'‚úó WEAK' if is_weak else '‚úì STRONG'}")
                    print(f"       R¬≤: {r2_first:.4f}")

                    print(f"     Second Stage (Causal Effect):")
                    print(f"       Œ≤: {causal_effect:.6f}")
                    print(f"       SE: {se:.6f}")
                    print(f"       95% CI: [{ci_lower:.6f}, {ci_upper:.6f}]")
                    print(f"       p-value: {pval:.4f} {'‚úì SIGNIFICANT' if is_significant else '‚úó Not significant'}")

                    # Store results
                    all_results.append({
                        'Variable': var,
                        'Subgroup': str(subgroup),
                        'N': n_obs,
                        'First_Stage_F': f_stat_first,
                        'First_Stage_R2': r2_first,
                        'Instrument_Weak': is_weak,
                        'Causal_Effect_Beta': causal_effect,
                        'Std_Error': se,
                        'T_Statistic': tstat,
                        'P_Value': pval,
                        'CI_Lower': ci_lower,
                        'CI_Upper': ci_upper,
                        'Significant': is_significant
                    })

                except Exception as e:
                    print(f"     ‚úó Error: {str(e)}")
                    continue
                
        # =========================================================================
        # SUMMARY TABLE
        # =========================================================================

        if len(all_results) == 0:
            print("\n‚ö† No subgroups analyzed successfully")
            return None

        results_df = pd.DataFrame(all_results)

        print("\n" + "="*70)
        print("SUBGROUP EFFECTS SUMMARY")
        print("="*70)

        # Sort by absolute effect size
        results_df['Abs_Effect'] = results_df['Causal_Effect_Beta'].abs()
        results_df = results_df.sort_values('Abs_Effect', ascending=False)

        # Display table
        display_cols = [
            'Variable', 'Subgroup', 'N', 
            'Causal_Effect_Beta', 'P_Value', 'Significant',
            'First_Stage_F', 'Instrument_Weak'
        ]

        print("\n" + results_df[display_cols].to_string(index=False))

        # =========================================================================
        # KEY INSIGHTS
        # =========================================================================

        print("\n" + "="*70)
        print("KEY INSIGHTS")
        print("="*70)

        # Find strongest effects
        significant_effects = results_df[results_df['Significant'] == True]

        if len(significant_effects) > 0:
            print(f"\n‚úì Found {len(significant_effects)} subgroups with SIGNIFICANT causal effects:")

            for _, row in significant_effects.head(5).iterrows():
                print(f"\n  ‚Ä¢ {row['Variable']} = '{row['Subgroup']}':")
                print(f"    - Causal effect: {row['Causal_Effect_Beta']:.6f}")
                print(f"    - 95% CI: [{row['CI_Lower']:.6f}, {row['CI_Upper']:.6f}]")
                print(f"    - p-value: {row['P_Value']:.4f}")
                print(f"    - Sample size: {row['N']:,}")
        else:
            print("\n‚úó No subgroups with statistically significant effects found")

        # Check for weak instruments in subgroups
        weak_instruments = results_df[results_df['Instrument_Weak'] == True]
        if len(weak_instruments) > 0:
            print(f"\n‚ö† Warning: {len(weak_instruments)} subgroups have weak instruments (F < 10)")
            print("  Consider these results with caution")

        # Effect heterogeneity
        effect_range = results_df['Causal_Effect_Beta'].max() - results_df['Causal_Effect_Beta'].min()
        print(f"\nüìä Effect Heterogeneity:")
        print(f"  Range: {effect_range:.6f}")
        print(f"  Max effect: {results_df['Causal_Effect_Beta'].max():.6f} ({results_df.loc[results_df['Causal_Effect_Beta'].idxmax(), 'Subgroup']})")
        print(f"  Min effect: {results_df['Causal_Effect_Beta'].min():.6f} ({results_df.loc[results_df['Causal_Effect_Beta'].idxmin(), 'Subgroup']})")

        print("\n" + "="*70)

        # Store results for later access
        self.subgroup_results = results_df

        return results_df

    # def _calculate_2sls_standard_errors(self, controls):
    #     """
    #     Calculate correct 2SLS standard errors
    #     (OLS on second stage gives incorrect SEs)
    #     """
    #     # Get residuals from second stage
    #     residuals = self.second_stage_results.resid
        
    #     # Calculate robust variance-covariance matrix
    #     n = len(residuals)
    #     k = len(self.second_stage_results.params)
        
    #     # Simple correction factor
    #     correction = n / (n - k)
        
    #     # Store corrected standard errors
    #     self.corrected_se = np.sqrt(np.diag(self.second_stage_results.cov_params()) * correction)
    #     self.corrected_tvalues = self.second_stage_results.params / self.corrected_se
    #     self.corrected_pvalues = 2 * (1 - stats.t.cdf(np.abs(self.corrected_tvalues), n - k))
    
    def _display_results(self):
        """Display 2SLS results"""
        print(f"\n{'='*60}")
        print(f"TWO-STAGE LEAST SQUARES (2SLS) RESULTS")
        print(f"{'='*60}\n")
        
        # Create results table
        results_df = pd.DataFrame({
            'Coefficient': self.second_stage_results.params,
            'Std Error': self.corrected_se,
            't-statistic': self.corrected_tvalues,
            'P-value': self.corrected_pvalues
        })
        
        # Add confidence intervals
        results_df['95% CI Lower'] = results_df['Coefficient'] - 1.96 * results_df['Std Error']
        results_df['95% CI Upper'] = results_df['Coefficient'] + 1.96 * results_df['Std Error']
        
        print(results_df.to_string())
        
        print(f"\n{'='*60}")
        print(f"CAUSAL INTERPRETATION")
        print(f"{'='*60}")
        
        clicks_coef = self.second_stage_results.params['Clicks_fitted']
        clicks_se = self.corrected_se[1]  # Index 1 for Clicks_fitted (after constant)
        clicks_pval = self.corrected_pvalues[1]
        
        print(f"\nCausal Effect of Clicks on Conversion Rate:")
        print(f"  Coefficient (Œ≤): {clicks_coef:.6f}")
        print(f"  Std. Error: {clicks_se:.6f}")
        print(f"  95% CI: [{clicks_coef - 1.96*clicks_se:.6f}, {clicks_coef + 1.96*clicks_se:.6f}]")
        print(f"  P-value: {clicks_pval:.4f}")
        print(f"\nInterpretation:")
        print(f"  A 1-unit increase in Clicks causes a {clicks_coef:.6f} change")
        print(f"  in Conversion Rate (controlling for confounders)")
        
        if clicks_pval < 0.05:
            print(f"  ‚úì Effect is statistically significant at 5% level")
        else:
            print(f"  ‚úó Effect is NOT statistically significant at 5% level")
        
        print(f"\n{'='*60}\n")
    
    def estimate_value_added(self):
        """
        Step 4: Value-Added Estimation
        
        Estimate the incremental contribution of different ad features
        after controlling for user characteristics and predicted clicks.
        """
        results = {}
        
        # Group by Ad Type
        if 'Ad_Type' in self.data.columns:
            results['by_ad_type'] = self._group_value_added('Ad_Type')
        
        # Group by Ad Placement
        if 'Ad_Placement' in self.data.columns:
            results['by_ad_placement'] = self._group_value_added('Ad_Placement')
        
        # Group by Ad Topic
        if 'Ad_Topic' in self.data.columns:
            results['by_ad_topic'] = self._group_value_added('Ad_Topic')
        
        self._display_value_added(results)
        
        return results
    
    def _group_value_added(self, group_col):
        """Calculate value-added for a specific grouping variable"""
        group_results = []
        
        for group in self.data[group_col].unique():
            # Create indicator variable
            indicator = (self.data[group_col] == group).astype(int)
            
            # Prepare regression with interaction
            y = self.data['Conversion_Rate']
            X = sm.add_constant(pd.DataFrame({
                'Clicks_predicted': self.data['Clicks_predicted'],
                'Age': self.data['Age'],
                'Income': self.data['Income'],
                'indicator': indicator,
                'interaction': indicator * self.data['Clicks_predicted']
            }))
            
            # Run OLS
            try:
                model = sm.OLS(y, X).fit()
                
                group_results.append({
                    'Group': str(group),
                    'Intercept_Effect': f"{model.params['indicator']:.6f}",
                    'Slope_Effect': f"{model.params['interaction']:.6f}",
                    'P_value_Intercept': f"{model.pvalues['indicator']:.4f}",
                    'P_value_Slope': f"{model.pvalues['interaction']:.4f}",
                    'Significant': '‚úì' if model.pvalues['indicator'] < 0.05 or model.pvalues['interaction'] < 0.05 else '‚úó'
                })
            except Exception as e:
                print(f"Warning: Could not estimate for {group}: {str(e)}")
        
        return pd.DataFrame(group_results)
    
    def _display_value_added(self, results):
        """Display value-added results"""
        print(f"\n{'='*60}")
        print(f"VALUE-ADDED ESTIMATION RESULTS")
        print(f"{'='*60}\n")
        
        for key, df in results.items():
            if len(df) > 0:
                print(f"\n{key.upper().replace('_', ' ')}:")
                print(f"{'-'*60}")
                print(df.to_string(index=False))
                print(f"{'-'*60}\n")

    def run_chetty_value_added_analysis(self, subgroup_vars=None, split_method='time', 
                                         split_ratio=0.5, min_group_size=100):
        """
        INTEGRATED PIPELINE: Combines subgroup analysis with Chetty's forecast bias framework.

        This is the main method you should call. It properly chains:
        1. analyze_subgroup_effects() on TRAINING data
        2. Forecast validation on TESTING data  
        3. Empirical Bayes shrinkage
        4. Bias correction

        Parameters:
        -----------
        subgroup_vars : list of str or dict
            Variables to stratify by (same format as analyze_subgroup_effects)
            Example: {'Income': [0, 30000, 60000, np.inf], 'Location': None}
        split_method : str
            'time' - split by Click_Time (chronological)
            'random' - random split with seed
        split_ratio : float
            Proportion for training (default: 0.5)
        min_group_size : int
            Minimum observations per group in EACH split

        Returns:
        --------
        results : dict
            Complete results including raw estimates, bias tests, and corrections
        """

        print("\n" + "="*70)
        print("INTEGRATED CHETTY VALUE-ADDED ANALYSIS")
        print("="*70)
        print(f"\nConfiguration:")
        print(f"  Split method: {split_method}")
        print(f"  Split ratio: {split_ratio:.1%} train / {1-split_ratio:.1%} test")
        print(f"  Min group size: {min_group_size}")
        print("="*70)

        # =========================================================================
        # STEP 1: SPLIT DATA
        # =========================================================================

        print(f"\n{'‚îÄ'*70}")
        print("STEP 1: DATA SPLITTING")
        print(f"{'‚îÄ'*70}")

        original_data = self.data.copy()

        if split_method == 'time':
            self.data = self.data.sort_values('Click_Time')
            split_idx = int(len(self.data) * split_ratio)
            train_indices = self.data.index[:split_idx]
            test_indices = self.data.index[split_idx:]

            train_data = self.data.loc[train_indices].copy()
            test_data = self.data.loc[test_indices].copy()

            print(f"‚úì Time-based split:")
            print(f"  Training: {train_data['Click_Time'].min()} to {train_data['Click_Time'].max()}")
            print(f"  Testing:  {test_data['Click_Time'].min()} to {test_data['Click_Time'].max()}")

        elif split_method == 'random':
            train_data = self.data.sample(frac=split_ratio, random_state=42)
            test_data = self.data.drop(train_data.index)
            print(f"‚úì Random split (seed=42)")

        print(f"  Training N: {len(train_data):,}")
        print(f"  Testing N:  {len(test_data):,}")

        # =========================================================================
        # STEP 2: RUN SUBGROUP ANALYSIS ON TRAINING DATA
        # =========================================================================

        print(f"\n{'‚îÄ'*70}")
        print("STEP 2: ESTIMATE VALUE-ADDED (Training Sample)")
        print(f"{'‚îÄ'*70}")

        # Temporarily replace self.data with training data
        self.data = train_data.copy()

        # Run subgroup analysis (this uses the existing method)
        train_results = self.analyze_subgroup_effects(
            subgroup_vars=subgroup_vars,
            min_subgroup_size=min_group_size
        )

        if train_results is None or len(train_results) == 0:
            print("\n‚úó No subgroups successfully estimated in training data")
            self.data = original_data  # Restore
            return None

        print(f"\n‚úì Estimated value-added for {len(train_results)} subgroups")

        # =========================================================================
        # STEP 3: VALIDATE FORECASTS IN TESTING DATA
        # =========================================================================

        print(f"\n{'‚îÄ'*70}")
        print("STEP 3: FORECAST VALIDATION (Testing Sample)")
        print(f"{'‚îÄ'*70}")

        # For each group, calculate mean outcome in test sample
        test_outcomes = []

        for _, row in train_results.iterrows():
            var = row['Variable']
            subgroup = row['Subgroup']

            # Filter test data to this subgroup
            if '_subgroup' in var:
                # This was a binned continuous variable
                # We need to recreate the bins
                continue  # Skip for now - handle separately
            else:
                # Categorical variable
                group_test = test_data[test_data[var] == subgroup]

            if len(group_test) < min_group_size:
                continue
            
            # Mean conversion rate in test sample
            mean_outcome = group_test['Conversion_Rate'].mean()
            n_test = len(group_test)

            test_outcomes.append({
                'Variable': var,
                'Subgroup': subgroup,
                'Value_Added_Train': row['Causal_Effect_Beta'],
                'SE_Train': row['Std_Error'],
                'N_Train': row['N'],
                'Mean_Outcome_Test': mean_outcome,
                'N_Test': n_test,
                'First_Stage_F': row['First_Stage_F']
            })

        test_df = pd.DataFrame(test_outcomes)

        if len(test_df) < 3:
            print(f"\n‚úó Insufficient groups for validation (need ‚â•3, have {len(test_df)})")
            self.data = original_data
            return None

        print(f"‚úì Validated {len(test_df)} groups in testing sample")
        print(f"\nTest Sample Statistics:")
        print(test_df[['Variable', 'Subgroup', 'Value_Added_Train', 'Mean_Outcome_Test', 'N_Test']].to_string(index=False))

        # =========================================================================
        # STEP 4: FORECAST UNBIASEDNESS TEST
        # =========================================================================

        print(f"\n{'‚îÄ'*70}")
        print("STEP 4: FORECAST UNBIASEDNESS TEST")
        print(f"{'‚îÄ'*70}")
        print("\nRegression: Mean_Outcome_test = Œ± + Œ≤ * ValueAdded_train + Œµ")
        print("H‚ÇÄ: Œ≤ = 1 (unbiased forecasts)")

        # Run regression
        X_forecast = sm.add_constant(test_df['Value_Added_Train'])
        y_forecast = test_df['Mean_Outcome_Test']

        # Weight by test sample size for precision
        weights = np.sqrt(test_df['N_Test'])
        forecast_model = sm.WLS(y_forecast, X_forecast, weights=weights).fit()

        # Extract coefficients
        alpha = forecast_model.params['const']
        beta = forecast_model.params['Value_Added_Train']
        beta_se = forecast_model.bse['Value_Added_Train']
        r_squared = forecast_model.rsquared

        # Test Œ≤ = 1
        t_stat_bias = (beta - 1) / beta_se
        p_val_bias = 2 * (1 - stats.t.cdf(abs(t_stat_bias), len(test_df) - 2))

        print(f"\nResults:")
        print(f"  Œ± (intercept):     {alpha:.6f}")
        print(f"  Œ≤ (slope):         {beta:.6f} (SE: {beta_se:.6f})")
        print(f"  Prediction R¬≤:     {r_squared:.4f}")
        print(f"\n  Test H‚ÇÄ: Œ≤ = 1")
        print(f"  t-statistic:       {t_stat_bias:.3f}")
        print(f"  p-value:           {p_val_bias:.4f}")

        is_biased = p_val_bias < 0.05

        if is_biased:
            print(f"\n  ‚úó REJECT H‚ÇÄ: Significant forecast bias detected")
            if beta < 1:
                bias_direction = "OVERPREDICTION (regression to mean)"
                print(f"    ‚Üí Training estimates overpredict test outcomes")
                print(f"    ‚Üí Shrinkage is strongly recommended")
            else:
                bias_direction = "UNDERPREDICTION"
                print(f"    ‚Üí Training estimates underpredict test outcomes")
        else:
            bias_direction = "MINIMAL"
            print(f"\n  ‚úì FAIL TO REJECT H‚ÇÄ: No significant bias")
            print(f"    ‚Üí Training estimates predict test outcomes well")

        # =========================================================================
        # STEP 5: EMPIRICAL BAYES SHRINKAGE
        # =========================================================================

        print(f"\n{'‚îÄ'*70}")
        print("STEP 5: EMPIRICAL BAYES SHRINKAGE")
        print(f"{'‚îÄ'*70}")

        # Calculate variance components
        va_estimates = train_results['Causal_Effect_Beta'].values
        va_variances = train_results['Std_Error'].values ** 2

        # Grand mean
        mu = np.mean(va_estimates)

        # Variance decomposition
        var_total = np.var(va_estimates)
        var_noise = np.mean(va_variances)
        var_signal = max(0, var_total - var_noise)

        # Reliability
        reliability = var_signal / (var_signal + var_noise) if (var_signal + var_noise) > 0 else 0

        print(f"\nVariance Decomposition:")
        print(f"  Between-group variance (signal): {var_signal:.8f}")
        print(f"  Within-group variance (noise):   {var_noise:.8f}")
        print(f"  Total variance:                  {var_total:.8f}")
        print(f"  Reliability (ŒªÃÑ):                 {reliability:.4f}")

        # Group-specific shrinkage
        shrinkage_results = []

        print(f"\nGroup-Specific Shrinkage:")
        print(f"{'Variable':<15} {'Subgroup':<20} {'Raw VA':<12} {'Œª':<8} {'Shrunk VA':<12}")
        print(f"{'-'*75}")

        for _, row in train_results.iterrows():
            raw_va = row['Causal_Effect_Beta']
            se = row['Std_Error']

            # Shrinkage factor for this group
            lambda_i = var_signal / (var_signal + se**2) if (var_signal + se**2) > 0 else 0

            # Shrink toward grand mean
            shrunk_va = mu + lambda_i * (raw_va - mu)

            print(f"{row['Variable']:<15} {str(row['Subgroup']):<20} {raw_va:>11.6f} {lambda_i:>7.4f} {shrunk_va:>11.6f}")

            shrinkage_results.append({
                'Variable': row['Variable'],
                'Subgroup': row['Subgroup'],
                'Raw_VA': raw_va,
                'Shrinkage_Factor': lambda_i,
                'Shrunk_VA': shrunk_va,
                'SE': se,
                'N_Train': row['N']
            })

        shrinkage_df = pd.DataFrame(shrinkage_results)

        # =========================================================================
        # STEP 6: BIAS CORRECTION
        # =========================================================================

        print(f"\n{'‚îÄ'*70}")
        print("STEP 6: FORECAST BIAS CORRECTION")
        print(f"{'‚îÄ'*70}")

        # Bias-corrected estimates: (Shrunk_VA - Œ±) / Œ≤
        shrinkage_df['Bias_Corrected_VA'] = (shrinkage_df['Shrunk_VA'] - alpha) / beta if beta != 0 else shrinkage_df['Shrunk_VA']

        print(f"\nApplying correction: VA_final = (VA_shrunk - {alpha:.6f}) / {beta:.6f}")
        print(f"\nFinal Value-Added Estimates:")

        # Sort by bias-corrected VA
        shrinkage_df = shrinkage_df.sort_values('Bias_Corrected_VA', ascending=False)

        display_cols = ['Variable', 'Subgroup', 'Raw_VA', 'Shrunk_VA', 'Bias_Corrected_VA', 'N_Train']
        print("\n" + shrinkage_df[display_cols].to_string(index=False))

        # =========================================================================
        # STEP 7: SUMMARY AND INTERPRETATION
        # =========================================================================

        print(f"\n{'='*70}")
        print("SUMMARY: CORRECTIONS APPLIED")
        print(f"{'='*70}")

        # Calculate average corrections
        avg_shrinkage = (shrinkage_df['Shrunk_VA'] - shrinkage_df['Raw_VA']).abs().mean()
        avg_bias_correction = (shrinkage_df['Bias_Corrected_VA'] - shrinkage_df['Shrunk_VA']).abs().mean()
        avg_total_correction = (shrinkage_df['Bias_Corrected_VA'] - shrinkage_df['Raw_VA']).abs().mean()

        print(f"\nAverage absolute corrections:")
        print(f"  Shrinkage effect:      {avg_shrinkage:.6f}")
        print(f"  Bias correction:       {avg_bias_correction:.6f}")
        print(f"  Total correction:      {avg_total_correction:.6f}")

        print(f"\nDiagnostics:")
        print(f"  Reliability:           {reliability:.4f}")
        print(f"  Forecast bias (Œ≤):     {beta:.4f}")
        print(f"  Prediction R¬≤:         {r_squared:.4f}")
        print(f"  Bias type:             {bias_direction}")

        # Recommendations
        print(f"\nRecommendations:")
        if reliability < 0.3:
            print("  ‚ö† Low reliability - value-added estimates are very noisy")
            print("    ‚Üí Collect more data or use coarser groupings")
        elif reliability < 0.7:
            print("  ‚úì Moderate reliability - shrinkage is important")
        else:
            print("  ‚úì‚úì High reliability - raw estimates are fairly trustworthy")

        if is_biased:
            print(f"  ‚ö† Forecast bias detected - bias correction is essential")
        else:
            print(f"  ‚úì Minimal forecast bias - raw estimates predict well")

        # =========================================================================
        # PACKAGE RESULTS
        # =========================================================================

        results = {
            'split_info': {
                'method': split_method,
                'n_train': len(train_data),
                'n_test': len(test_data)
            },
            'train_estimates': train_results,
            'test_outcomes': test_df,
            'forecast_bias': {
                'alpha': alpha,
                'beta': beta,
                'beta_se': beta_se,
                'p_value': p_val_bias,
                'r_squared': r_squared,
                'is_biased': is_biased,
                'direction': bias_direction
            },
            'shrinkage': {
                'reliability': reliability,
                'var_signal': var_signal,
                'var_noise': var_noise,
                'mu': mu
            },
            'final_estimates': shrinkage_df
        }

        # Store for later access
        self.chetty_results = results

        # Restore original data
        self.data = original_data

        print("\n" + "="*70)
        print("INTEGRATED ANALYSIS COMPLETE")
        print("="*70)

        return results

    def compare_estimation_approaches(self):
        """
        Create side-by-side comparison of estimation approaches.
        Must be called after run_chetty_value_added_analysis().

        Compares:
        1. Raw 2SLS estimates (training sample)
        2. Shrunk estimates (Empirical Bayes)
        3. Bias-corrected estimates (Full Chetty method)

        Returns:
        --------
        comparison_df : pd.DataFrame
        """

        if not hasattr(self, 'chetty_results'):
            print("\n‚ö† Must run run_chetty_value_added_analysis() first")
            return None

        print("\n" + "="*70)
        print("COMPARISON: ESTIMATION APPROACHES")
        print("="*70)

        final = self.chetty_results['final_estimates']

        # Calculate changes
        final['Change_from_Shrinkage'] = final['Shrunk_VA'] - final['Raw_VA']
        final['Change_from_Bias_Corr'] = final['Bias_Corrected_VA'] - final['Shrunk_VA']
        final['Total_Change'] = final['Bias_Corrected_VA'] - final['Raw_VA']
        final['Pct_Change'] = (final['Total_Change'] / final['Raw_VA'].abs()) * 100

        display_cols = [
            'Variable', 'Subgroup',
            'Raw_VA', 'Shrunk_VA', 'Bias_Corrected_VA',
            'Total_Change', 'Pct_Change'
        ]

        print("\n" + final[display_cols].to_string(index=False))

        # Summary
        print(f"\n{'‚îÄ'*70}")
        print("SUMMARY STATISTICS")
        print(f"{'‚îÄ'*70}")

        print(f"\nMean absolute change:")
        print(f"  From shrinkage:        {final['Change_from_Shrinkage'].abs().mean():.6f}")
        print(f"  From bias correction:  {final['Change_from_Bias_Corr'].abs().mean():.6f}")
        print(f"  Total:                 {final['Total_Change'].abs().mean():.6f}")

        print(f"\nMean % change:           {final['Pct_Change'].abs().mean():.1f}%")

        # Which groups changed most?
        print(f"\nGroups with largest corrections:")
        top_changes = final.nlargest(3, 'Total_Change', keep='all')[['Subgroup', 'Raw_VA', 'Bias_Corrected_VA', 'Total_Change']]
        print(top_changes.to_string(index=False))

        return final

    ### WITH Subgroup analysis
    def run_complete_analysis(self, model_type='stacking', include_interactions=False, 
                            use_enhanced_features=True, analyze_subgroups=False,
                            subgroup_vars=None, min_subgroup_size=100):
        """
        ENHANCED: Run the complete causal inference pipeline with strong instruments.

        Parameters:
        -----------
        model_type : str
            'stacking' (RECOMMENDED), 'rf', or 'gb'
        include_interactions : bool
            Whether to include interactions in 2SLS
        use_enhanced_features : bool
            Whether to use enhanced feature engineering (strongly recommended)
        analyze_subgroups : bool
            Whether to run stratified 2SLS to identify heterogeneous treatment effects
        subgroup_vars : list of str or dict, optional
            Variables to stratify by when analyze_subgroups=True
            Example: ['Location', 'Ad_Type'] or 
                     {'Income': [0, 30000, 60000, np.inf], 'Age': [0, 35, 50, 65, np.inf]}
        min_subgroup_size : int
            Minimum observations required per subgroup (default: 100)

        Pipeline:
        1. Engineer time features
        2. Encode categorical variables
        3. Create ML instrument (with optional enhanced features)
        4. Run 2SLS
        5. [OPTIONAL] Analyze subgroup effects
        6. Estimate value-added
        """
        print("="*70)
        print("ENHANCED CAUSAL AD CONVERSION ANALYSIS PIPELINE")
        print("="*70)
        print(f"\nConfiguration:")
        print(f"  Model type: {model_type.upper()}")
        print(f"  Enhanced features: {'YES' if use_enhanced_features else 'NO'}")
        print(f"  Include interactions in 2SLS: {'YES' if include_interactions else 'NO'}")
        print(f"  Analyze subgroups: {'YES' if analyze_subgroups else 'NO'}")
        print("="*70)

        self.engineer_time_features()
        print("\n‚úì Time features engineered")

        self.encode_categorical_features()
        print("‚úì Categorical variables encoded")

        self.create_ml_instrument(
            model_type=model_type,
            use_enhanced_features=use_enhanced_features
        )
        print("‚úì ML instrument created")

        self.run_2sls(include_interactions=include_interactions)
        print("‚úì 2SLS estimation complete")

        # NEW: Optional subgroup analysis
        if analyze_subgroups:
            subgroup_results = self.analyze_subgroup_effects(
                subgroup_vars=subgroup_vars,
                min_subgroup_size=min_subgroup_size
            )
            print("‚úì Subgroup effects analysis complete")

        value_added_results = self.estimate_value_added()
        print("‚úì Value-added estimation complete")

        print("\n" + "="*70)
        print("ANALYSIS COMPLETE!")
        print("="*70)

        return self

# ============================================================================
# EXAMPLE USAGE WITH COMPARISON
# ============================================================================

def compare_instrument_approaches(data):
    """
    Compare weak vs strong instruments to demonstrate improvement
    """
    print("\n" + "="*70)
    print("COMPARISON: WEAK vs STRONG INSTRUMENTS")
    print("="*70)
    
    # Approach 1: Basic features, single model (WEAK)
    print("\n\n" + "üî¥ APPROACH 1: BASIC (LIKELY WEAK)")
    print("="*70)
    analyzer_weak = CausalAdAnalyzer(data.copy())
    analyzer_weak.engineer_time_features()
    analyzer_weak.encode_categorical_features()
    analyzer_weak.create_ml_instrument(
        model_type='rf',
        use_enhanced_features=False  # No enhanced features
    )
    
    # Approach 2: Enhanced features, stacking ensemble (STRONG)
    print("\n\n" + "üü¢ APPROACH 2: ENHANCED (STRONG)")
    print("="*70)
    analyzer_strong = CausalAdAnalyzer(data.copy())
    analyzer_strong.engineer_time_features()
    analyzer_strong.encode_categorical_features()
    analyzer_strong.create_ml_instrument(
        model_type='stacking',
        use_enhanced_features=True  # Enhanced features
    )
    
    print("\n" + "="*70)
    print("COMPARISON SUMMARY")
    print("="*70)
    
    # Extract F-statistics for comparison
    z_weak = analyzer_weak.data['Clicks_predicted'].values
    d_weak = analyzer_weak.data['Clicks'].values
    corr_weak = np.corrcoef(z_weak, d_weak)[0, 1]
    f_weak = (corr_weak**2 / (1 - corr_weak**2)) * (len(d_weak) - 2)
    
    z_strong = analyzer_strong.data['Clicks_predicted'].values
    d_strong = analyzer_strong.data['Clicks'].values
    ss_tot = np.sum((d_strong - d_strong.mean())**2)
    ss_res = np.sum((d_strong - z_strong)**2)
    r2_strong = 1 - (ss_res / ss_tot)
    n = len(d_strong)
    k = len([col for col in analyzer_strong.data.columns if 'x' in col.lower() or 'squared' in col.lower()]) + 9
    f_strong = (r2_strong / 1) / ((1 - r2_strong) / (n - k - 1))
    
    print(f"\nApproach 1 (Basic):")
    print(f"  F-statistic: {f_weak:.2f}")
    print(f"  Status: {'‚úó WEAK' if f_weak < 10 else '‚úì STRONG'}")
    
    print(f"\nApproach 2 (Enhanced):")
    print(f"  F-statistic: {f_strong:.2f}")
    print(f"  Status: {'‚úó WEAK' if f_strong < 10 else '‚úì STRONG'}")
    
    improvement = ((f_strong - f_weak) / f_weak) * 100
    print(f"\nImprovement: {improvement:.1f}%")
    
    print("\n" + "="*70)
    print("RECOMMENDATION: Use Approach 2 (Enhanced) for reliable causal inference")
    print("="*70 + "\n")
    
    return analyzer_weak, analyzer_strong


if __name__ == "__main__":
        
    # reading in the df
    df = pd.read_csv('../datasets/project/Dataset_Ads.csv')

    # # default setting with stacking, and enhanced machine learning.
    # analyzer = CausalAdAnalyzer(df)

    # # # Run complete pipeline
    # analyzer.run_complete_analysis(
    #     model_type='stacking',          # Use stacking ensemble
    #     include_interactions=True,       # Include interactions in 2SLS
    #     use_enhanced_features=True      # Use enhanced feature engineering
    # )

    # compares the methods, weak vs strong.
    # analyzer_weak, analyzer_strong = compare_instrument_approaches(df)

    # 2sls subgroup effect section
    # analyzer = CausalAdAnalyzer(df)

    # analyzer.run_2sls(include_interactions=True)

    # # NEW: Analyze subgroup effects
    # subgroup_results = analyzer.analyze_subgroup_effects(
    #     subgroup_vars={
    #         'Income': [0, 30000, 50000, 70000, np.inf],
    #         'Age': [0, 35, 50, 65, np.inf],
    #         'Location': None,  # Categorical
    #         'Ad_Type': None    # Categorical
    #     },
    #     min_subgroup_size=100
    # )

    # # Then continue with value-added
    # analyzer.estimate_value_added()

    # # Example 1: Run WITHOUT subgroup analysis (default behavior)
    # analyzer = CausalAdAnalyzer(df)
    # analyzer.run_complete_analysis(
    #     model_type='stacking',
    #     include_interactions=True,
    #     use_enhanced_features=True
    # )

    # Example 2: Run WITH subgroup analysis (using default subgroups)
    # analyzer = CausalAdAnalyzer(df)
    # analyzer.run_complete_analysis(
    #     model_type='stacking',
    #     include_interactions=True,
    #     use_enhanced_features=True,
    #     analyze_subgroups=True  # Toggle this on
    # )

    # # Example 3: Run WITH subgroup analysis (custom subgroups)
    # analyzer = CausalAdAnalyzer(df)
    # analyzer.run_complete_analysis(
    #     model_type='stacking',
    #     include_interactions=True,
    #     use_enhanced_features=True,
    #     analyze_subgroups=True,
    #     subgroup_vars={
    #         'Income': [0, 35000, 55000, 75000, np.inf],  # Custom income bins
    #         'Age': [0, 40, 60, np.inf],                   # Custom age bins
    #         'Location': None,                              # Use as-is
    #         'Ad_Type': None                                # Use as-is
    #     },
    #     min_subgroup_size=150  # Require at least 150 obs per subgroup
    # )

    # # Example 4: Only specific categorical subgroups
    # analyzer = CausalAdAnalyzer(df)
    # analyzer.run_complete_analysis(
    #     model_type='stacking',
    #     use_enhanced_features=True,
    #     analyze_subgroups=True,
    #     subgroup_vars=['Location', 'Ad_Type', 'Gender']  # Just these three
    # )

    # Trying to add Raj Chetty forecast bias's approach.
    # After running your analysis This would run independently from the stratified gorup results..
    # analyzer = CausalAdAnalyzer(df)
    # analyzer.run_complete_analysis(
    #     model_type='stacking',
    #     use_enhanced_features=True
    # )

    # # Run Chetty-style forecast bias test
    # bias_results = analyzer.test_forecast_bias(
    #     split_method='time',      # or 'random'
    #     split_ratio=0.5,
    #     subgroup_var='Ad_Type',   # or 'Ad_Placement', 'Location'
    #     min_group_size=50
    # )

    # df = generate_example_data(n=2000)    # # Compare methods
    # comparison = analyzer.compare_value_added_methods(subgroup_var='Ad_Type')
    # Initialize and run basic pipeline
    analyzer = CausalAdAnalyzer(df)
    analyzer.engineer_time_features()
    analyzer.encode_categorical_features()
    analyzer.create_ml_instrument(model_type='stacking', use_enhanced_features=True)

    # Run integrated Chetty analysis
    results = analyzer.run_chetty_value_added_analysis(
        subgroup_vars={
            'Income': [0, 30000, 50000, 70000, np.inf],
            'Age': [0, 35, 50, 65, np.inf],
            'Location': None,
            'Ad_Type': None
        },
        split_method='time',      # or 'random'
        split_ratio=0.5,
        min_group_size=100
    )

    # Compare approaches
    comparison = analyzer.compare_estimation_approaches()


DATA CLEANING AND PREPROCESSING
‚úì Converted 70 negative income values to missing
‚úì Imputed 70 missing income values with median
‚úì Winsorized 200 income values at 1st/99th percentiles
  Income range: [7,384, 96,445]
‚úì Filtered 457 rows with implausible ages (keeping 10-90)

üìä Creating logarithmic transformations:
  ‚úì Income_log created (log1p transformation)
  ‚úì Clicks_log created (log1p transformation)
  ‚úì Age_log created (log1p transformation)

CLEANING SUMMARY:
  Initial rows:        10,000
  Final rows:          9,543
  Rows removed:        457 (4.6%)
  Log variables added: 3


FEATURE ENGINEERING FOR INSTRUMENT STRENGTH
‚úì Created Income √ó Ad Type interaction
‚úì Created Age √ó Ad Topic interaction
‚úì Created Income √ó Ad Placement interaction
‚úì Created Age √ó Ad Placement interaction
‚úì Created Weekend indicator
‚úì Created time-of-day indicators
‚úì Created Weekend √ó Ad Type interaction
‚úì Created Business Hours √ó Ad Placement interaction
‚úì Created Ev

##### Claude Code Artifact End

##### ChatGPT Artifact Start

In [4]:
"""
iv_ml_pipeline.py

Rewritten, robust, and well-documented pipeline to construct an ML-based instrument for
an endogenous regressor and perform correct 2SLS estimation with diagnostics.

Key features:
- Out-of-fold (CV) instrument prediction without leakage using sklearn Pipelines
- Proper 2SLS estimation using linearmodels.iv.IV2SLS with robust and clustered SEs
- Correct computation of partial F-statistic for instrument strength (conditioning on controls)
- Placebo / balance tests for instrument validity
- Simulation utilities to unit-test the pipeline
- Configurable clustering, diagnostics, and model hyperparameters

NOTES:
- Requires: scikit-learn, pandas, numpy, statsmodels, linearmodels
  Install with: pip install scikit-learn pandas numpy statsmodels linearmodels

- This file intentionally avoids fitting preprocessors on full data prior to CV.
  Use Pipeline objects so that transform/fit happen inside each fold.

Author: Generated by ChatGPT (senior-engineer style). Be strict: run tests before trusting results.
"""

from __future__ import annotations

import logging
import math
from dataclasses import dataclass
from typing import Optional, Sequence, List, Union, Dict

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import statsmodels.api as sm
from linearmodels.iv import IV2SLS


# ----------------------------
# Logging config
# ----------------------------
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
if not logger.handlers:
    logger.addHandler(handler)


# ----------------------------
# Utilities
# ----------------------------

def _assert_series(name: str, s: pd.Series) -> None:
    assert isinstance(s, pd.Series), f"{name} must be a pandas Series"
    assert not s.empty, f"{name} must not be empty"


# ----------------------------
# Data classes
# ----------------------------

@dataclass
class IVResult:
    model: any
    params: pd.Series
    std_errors: pd.Series
    tstats: pd.Series
    pvalues: pd.Series
    summary: str
    first_stage: Optional[sm.regression.linear_model.RegressionResultsWrapper] = None
    partial_f: Optional[float] = None
    partial_r2: Optional[float] = None


# ----------------------------
# Core pipeline class
# ----------------------------

class MLInstrumentIV:
    """
    Class to build an ML-based instrument and perform 2SLS with correct inference.

    Typical workflow:
      - instantiate with desired pipelines and options
      - call fit_instrument_oof() to get out-of-fold instrument predictions
      - call estimate_iv() to run IV2SLS using linearmodels and get diagnostics
      - run balance_tests() to check instrument validity

    Notes:
      - All preprocessing required for instrument construction should be provided
        as part of `instrument_pipeline` so we avoid leakage across CV folds.
      - `controls` are treated as exogenous covariates in the structural equation.
    """

    def __init__(
        self,
        instrument_pipeline: Optional[Pipeline] = None,
        instrument_features: Optional[Sequence[str]] = None,
        outcome: str = "outcome",
        endogenous: str = "endog",
        controls: Optional[Sequence[str]] = None,
        cv_folds: int = 5,
        random_state: int = 42,
    ) -> None:
        self.instrument_pipeline = (
            instrument_pipeline
            if instrument_pipeline is not None
            else self._default_instrument_pipeline()
        )
        self.instrument_features = list(instrument_features) if instrument_features else None
        self.outcome = outcome
        self.endogenous = endogenous
        self.controls = list(controls) if controls else []
        self.cv_folds = int(cv_folds)
        self.random_state = int(random_state)

        # placeholders populated after fit
        self.data: Optional[pd.DataFrame] = None
        self.oof_instrument_name = "__instrument_oof__"
        self.fitted_full_pipeline: Optional[Pipeline] = None

    @staticmethod
    def _default_instrument_pipeline() -> Pipeline:
        # Default pipeline: simple imputer + OHE for categoricals + RandomForestRegressor
        numeric_transformer = Pipeline(
            steps=[("imputer", SimpleImputer(strategy="median"))]
        )
        categorical_transformer = Pipeline(
            steps=[("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, []),  # actual numeric columns passed later in fit
                ("cat", categorical_transformer, []),
            ],
            remainder="passthrough",
        )

        # Use a tree-based regressor by default. Trees don't need scaling.
        pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42))])
        # Note: during fit_instrument_oof the ColumnTransformer will be re-constructed with actual columns.
        return pipeline

    def fit_instrument_oof(self, df: pd.DataFrame, instrument_feature_names: Optional[Sequence[str]] = None) -> pd.Series:
        """
        Fit the instrument model using cross-validated out-of-fold predictions to avoid leakage.

        Returns a pandas Series of OOF predictions aligned with df.index.
        """
        assert isinstance(df, pd.DataFrame), "df must be a DataFrame"
        self.data = df.copy()

        # Determine instrument features
        if instrument_feature_names is not None:
            self.instrument_features = list(instrument_feature_names)
        if not self.instrument_features:
            raise ValueError("instrument_features must be provided either at init or in fit_instrument_oof")

        X = df[self.instrument_features]
        y = df[self.endogenous]
        _assert_series(self.endogenous, y)

        # Build a ColumnTransformer specific to columns: numeric vs categorical
        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

        num_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])
        cat_transformer = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", num_transformer, numeric_cols),
                ("cat", cat_transformer, categorical_cols),
            ],
            remainder="drop",
        )

        # Clone the user-supplied pipeline but replace or build a pipeline with this preprocessor
        pipeline = clone(self.instrument_pipeline)

        try:
            if hasattr(pipeline, "steps") and len(pipeline.steps) >= 1:
                # extract last step
                *prefix_steps, (last_name, last_estimator) = pipeline.steps
                pipeline = Pipeline(prefix_steps + [("preprocessor", preprocessor), (last_name, last_estimator)])
            else:
                pipeline = Pipeline([("preprocessor", preprocessor), ("model", RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=self.random_state))])
        except Exception:
            pipeline = Pipeline([("preprocessor", preprocessor), ("model", RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=self.random_state))])

        # Do out-of-fold prediction using cross_val_predict which fits inside folds
        cv = KFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)
        logger.info("Running out-of-fold predictions for instrument with %d folds", self.cv_folds)

        instrument_oof = cross_val_predict(pipeline, X, y, cv=cv, n_jobs=-1)

        # Fit pipeline on full dataset for later use (e.g., feature importances)
        pipeline.fit(X, y)
        self.fitted_full_pipeline = pipeline

        # Save instrument predictions in self.data
        self.data[self.oof_instrument_name] = instrument_oof
        logger.info("OOF instrument created and saved to column %s", self.oof_instrument_name)

        return pd.Series(instrument_oof, index=df.index, name=self.oof_instrument_name)

    def _prepare_iv_matrices(self) -> Dict[str, pd.DataFrame]:
        """
        Prepare matrices for IV estimation: y, D (endog), Z (instrument(s)), X (controls + constant)
        Returns dict with keys 'y', 'D', 'Z', 'X'
        """
        assert self.data is not None, "Call fit_instrument_oof first to set self.data"
        df = self.data
        y = df[self.outcome]
        D = df[[self.endogenous]]

        if self.oof_instrument_name not in df.columns:
            raise ValueError("Instrument OOF column not found in data ‚Äî run fit_instrument_oof first")

        Z = df[[self.oof_instrument_name]]

        # Controls
        X = df[self.controls] if self.controls else pd.DataFrame(index=df.index)
        # Add constant to X (linearmodels expects exog separately)
        X_const = sm.add_constant(X, has_constant="add")

        return {"y": y, "D": D, "Z": Z, "X": X_const}

    def _compute_partial_f_and_r2(self, y: pd.Series, D: pd.DataFrame, Z: pd.DataFrame, X: pd.DataFrame) -> Dict[str, float]:
        """
        Compute partial F statistic and partial R^2 for instrument(s) Z in the first-stage
        where first-stage is D ~ Z + X. We compute the SSR-based partial F.
        """
        # Fit full model: D ~ Z + X
        X_full = pd.concat([Z, X.drop(columns=["const"])], axis=1) if "const" in X.columns else pd.concat([Z, X], axis=1)
        X_full = sm.add_constant(X_full, has_constant="add")
        first_stage_full = sm.OLS(D, X_full).fit()
        ssr_full = first_stage_full.ssr

        # Fit reduced model: D ~ X
        X_reduced = X
        first_stage_reduced = sm.OLS(D, X_reduced).fit()
        ssr_reduced = first_stage_reduced.ssr

        q = Z.shape[1]  # number of instruments
        df_num = q
        df_den = int(first_stage_full.df_resid)
        if df_den <= 0:
            partial_f = float("nan")
        else:
            partial_f = ((ssr_reduced - ssr_full) / df_num) / (ssr_full / df_den)

        # Partial R^2: proportion of variance in D explained by Z conditional on X
        if ssr_reduced == 0:
            partial_r2 = float("nan")
        else:
            partial_r2 = max(0.0, float((ssr_reduced - ssr_full) / ssr_reduced))

        return {"partial_f": float(partial_f), "partial_r2": float(partial_r2)}

    def estimate_iv(self, cluster_col: Optional[str] = None, cov_type: str = "robust") -> IVResult:
        """
        Run IV (2SLS) estimation using the out-of-fold instrument(s) and controls.

        Parameters
        ----------
        cluster_col: optional column name in self.data to use for clustered standard errors
        cov_type: covariance type passed to IV2SLS.fit: 'robust', 'unadjusted', 'clustered'

        Returns
        -------
        IVResult dataclass containing summary and diagnostics
        """
        mats = self._prepare_iv_matrices()
        y = mats["y"]
        D = mats["D"]
        Z = mats["Z"]
        X = mats["X"]

        # Compute partial F and partial R2
        pf = self._compute_partial_f_and_r2(y=D.squeeze(), D=D, Z=Z, X=X)
        partial_f = pf["partial_f"]
        partial_r2 = pf["partial_r2"]
        logger.info("Partial F for instrument(s): %s", partial_f)
        logger.info("Partial R^2 for instrument(s): %s", partial_r2)

        # linearmodels expects arrays/frames. Build IV2SLS
        # Note: IV2SLS signature is IV2SLS(dependent, exog, endog, instruments)
        exog = X  # includes const
        endog = D
        instruments = Z

        iv_mod = IV2SLS(y, exog, endog, instruments)

        fit_kwargs = {}
        if cov_type == "clustered":
            if cluster_col is None:
                raise ValueError("cluster_col must be provided when cov_type='clustered'")
            clusters = self.data[cluster_col]
            fit_kwargs["cov_type"] = "clustered"
            fit_kwargs["clusters"] = clusters
        else:
            fit_kwargs["cov_type"] = cov_type

        iv_res = iv_mod.fit(**fit_kwargs)

        # Prepare result
        params = iv_res.params
        std_errors = iv_res.std_errors
        tstats = iv_res.tstats
        pvalues = iv_res.pvalues
        summary = str(iv_res.summary)

        # compute first-stage OLS for reporting
        X_fs = pd.concat([instruments, X.drop(columns=["const"])], axis=1) if "const" in X.columns else pd.concat([instruments, X], axis=1)
        X_fs = sm.add_constant(X_fs, has_constant="add")
        first_stage = sm.OLS(D, X_fs).fit(cov_type="HC1")

        return IVResult(
            model=iv_res,
            params=params,
            std_errors=std_errors,
            tstats=tstats,
            pvalues=pvalues,
            summary=summary,
            first_stage=first_stage,
            partial_f=partial_f,
            partial_r2=partial_r2,
        )

    def balance_tests(self, covariates: Sequence[str]) -> pd.DataFrame:
        """
        Run balance/placebo tests by regressing pre-treatment covariates on the instrument(s).

        Returns a DataFrame with covariate names, coefficients on instrument, and p-values.
        """
        assert self.data is not None, "Call fit_instrument_oof first"
        results = []
        Z = self.data[[self.oof_instrument_name]]
        Z_with_const = sm.add_constant(Z, has_constant="add")

        for cov in covariates:
            if cov not in self.data.columns:
                raise ValueError(f"covariate {cov} not in data")
            y = self.data[cov]
            model = sm.OLS(y, Z_with_const).fit(cov_type="HC1")
            coef = float(model.params[self.oof_instrument_name])
            pval = float(model.pvalues[self.oof_instrument_name])
            results.append({"covariate": cov, "coef_on_instrument": coef, "pvalue": pval})

        return pd.DataFrame(results)

    # ----------------------------
    # Simulation utilities (for testing)
    # ----------------------------

    @staticmethod
    def simulate_data(
        n: int = 10000,
        instrument_strength: float = 0.8,
        endog_beta: float = 2.0,
        seed: int = 123,
        with_invalid_instrument: bool = False,
    ) -> pd.DataFrame:
        """
        Simulate data with a valid (and optionally an invalid) instrument for test purposes.

        Structural model:
          Z ~ N(0,1)
          X ~ N(0,1)
          D = gamma*Z + delta*X + u1
          Y = beta*D + theta*X + u2

        If with_invalid_instrument is True, the instrument Z also directly affects Y (violating exclusion).
        """
        rng = np.random.RandomState(seed)
        Z = rng.normal(size=n)
        X = rng.normal(size=n)
        u1 = rng.normal(scale=1.0, size=n)
        u2 = rng.normal(scale=1.0, size=n)

        gamma = instrument_strength
        delta = 0.5
        beta = endog_beta
        theta = 1.0

        D = gamma * Z + delta * X + u1
        Y = beta * D + theta * X + u2
        if with_invalid_instrument:
            # direct effect of Z on Y
            Y = Y + 0.5 * Z

        df = pd.DataFrame({"Z": Z, "X": X, "D": D, "Y": Y})
        return df


# ----------------------------
# Example quick-run function
# ----------------------------

def quick_run_example():
    """
    Run a short demonstration using simulated data to show the pipeline usage.
    """
    logger.info("Running quick demo of MLInstrumentIV on simulated data")
    df = MLInstrumentIV.simulate_data(n=2000, instrument_strength=0.9, seed=2025)

    # We'll pretend Z is not directly available as an instrument but is used to predict D
    # Build naive features: here we'll use X and Z (but in real use-case do not include direct causes of outcome)
    features = ["Z", "X"]
    df = df.rename(columns={"D": "endog", "Y": "outcome"})

    # Create instance
    instrument_pipeline = Pipeline([("model", RandomForestRegressor(n_estimators=100, random_state=42))])
    mliv = MLInstrumentIV(
        instrument_pipeline=instrument_pipeline,
        instrument_features=features,
        outcome="outcome",
        endogenous="endog",
        controls=["X"],
        cv_folds=5,
        random_state=42,
    )

    mliv.fit_instrument_oof(df)
    ivres = mliv.estimate_iv(cov_type="robust")

    print("IV summary:\n", ivres.summary)
    print("Partial F:", ivres.partial_f)

    # Balance test on pre-treatment covariate 'X' (should be zero correlation in this sim)
    bt = mliv.balance_tests(["X"])  # should show insignificant association if instrument valid
    print(bt)


# ----------------------------
# If run as a script
# ----------------------------
if __name__ == "__main__":
    quick_run_example()


2025-11-11 15:57:03,147 INFO: Running quick demo of MLInstrumentIV on simulated data
2025-11-11 15:57:03,155 INFO: Running out-of-fold predictions for instrument with 5 folds
2025-11-11 15:57:09,538 INFO: OOF instrument created and saved to column __instrument_oof__
2025-11-11 15:57:09,550 INFO: Partial F for instrument(s): 1058.4318624997827
2025-11-11 15:57:09,551 INFO: Partial R^2 for instrument(s): 0.34640990541802924


IV summary:
                           IV-2SLS Estimation Summary                          
Dep. Variable:                outcome   R-squared:                      0.9270
Estimator:                    IV-2SLS   Adj. R-squared:                 0.9269
No. Observations:                2000   F-statistic:                 1.444e+04
Date:                Tue, Nov 11 2025   P-value (F-stat)                0.0000
Time:                        15:57:09   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          0.0010     0.0215     0.0471     0.9624     -0.0411      0.0431
X              1.0383     0.0237     43

##### ChatGPT Artifact End

##### Claude Artifact Senior Level Start

In [None]:
"""
Refactored Causal Inference Pipeline for Ad Conversion Analysis
================================================================

A production-ready implementation following SOLID principles with proper
separation of concerns, comprehensive validation, and robust error handling.

Architecture:
    DataPreprocessor ‚Üí FeatureEngineer ‚Üí InstrumentGenerator ‚Üí 
    TwoSLSEstimator ‚Üí ResultsAnalyzer

Author: Refactored for production use
"""

import logging
import warnings
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union, Any
from enum import Enum

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    StackingRegressor
)
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge
import statsmodels.api as sm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# ============================================================================
# CONSTANTS AND CONFIGURATION
# ============================================================================

class Config:
    """Global configuration constants with statistical justification."""
    
    # Validation thresholds
    MIN_AGE = 10
    MAX_AGE = 90
    INCOME_WINSORIZE_LOWER = 0.01  # 1st percentile
    INCOME_WINSORIZE_UPPER = 0.99  # 99th percentile
    
    # Statistical thresholds (Stock & Yogo, 2005)
    WEAK_INSTRUMENT_THRESHOLD = 10.0
    STOCK_YOGO_10PCT_BIAS = 16.38  # Single instrument, single endogenous
    STOCK_YOGO_15PCT_BIAS = 8.96
    
    # Confidence intervals
    CI_ALPHA = 0.05  # 95% confidence intervals
    Z_SCORE_95 = 1.96
    
    # Model defaults
    DEFAULT_CV_FOLDS = 5
    DEFAULT_MIN_SUBGROUP_SIZE = 100
    RANDOM_STATE = 42
    
    # Required columns
    REQUIRED_COLUMNS = [
        'Conversion_Rate', 'Clicks', 'Age', 'Income', 'Gender',
        'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement', 'CTR'
    ]


# ============================================================================
# DATA CLASSES FOR STRUCTURED RESULTS
# ============================================================================

@dataclass
class CleaningReport:
    """Report from data cleaning operations."""
    initial_rows: int
    final_rows: int
    negative_income_converted: int = 0
    missing_income_imputed: int = 0
    income_winsorized: int = 0
    age_filtered: int = 0
    log_variables_created: List[str] = field(default_factory=list)
    
    @property
    def rows_removed(self) -> int:
        return self.initial_rows - self.final_rows
    
    @property
    def removal_rate(self) -> float:
        return self.rows_removed / self.initial_rows if self.initial_rows > 0 else 0.0


@dataclass
class InstrumentDiagnostics:
    """Diagnostics for instrument strength."""
    r_squared: float
    f_statistic: float
    correlation: float
    cragg_donald: float
    sample_size: int
    n_features: int
    is_weak: bool
    strength_category: str  # 'VERY_STRONG', 'STRONG', 'MODERATE', 'WEAK'
    
    def to_dict(self) -> Dict[str, Any]:
        return {
            'r_squared': self.r_squared,
            'f_statistic': self.f_statistic,
            'correlation': self.correlation,
            'cragg_donald': self.cragg_donald,
            'sample_size': self.sample_size,
            'n_features': self.n_features,
            'is_weak': self.is_weak,
            'strength_category': self.strength_category
        }


@dataclass
class TwoSLSResults:
    """Results from 2SLS estimation."""
    first_stage_results: Any  # statsmodels results object
    second_stage_results: Any
    causal_effect: float
    standard_error: float
    t_statistic: float
    p_value: float
    ci_lower: float
    ci_upper: float
    instrument_diagnostics: InstrumentDiagnostics
    
    @property
    def is_significant(self) -> bool:
        return self.p_value < Config.CI_ALPHA


# ============================================================================
# 1. DATA PREPROCESSOR
# ============================================================================

class DataPreprocessor:
    """
    Handles data cleaning, validation, and preprocessing.
    
    Responsibilities:
    - Validate input data structure
    - Handle missing values
    - Winsorize outliers
    - Create log transformations
    - Filter invalid records
    """
    
    def __init__(self, validate_columns: bool = True):
        """
        Initialize preprocessor.
        
        Args:
            validate_columns: Whether to validate required columns exist
        """
        self.validate_columns = validate_columns
        self.cleaning_report: Optional[CleaningReport] = None
        
    def validate_input(self, data: pd.DataFrame) -> None:
        """
        Validate input data structure and types.
        
        Args:
            data: Input DataFrame
            
        Raises:
            ValueError: If validation fails
        """
        if not isinstance(data, pd.DataFrame):
            raise ValueError(f"Expected pandas DataFrame, got {type(data)}")
        
        if len(data) == 0:
            raise ValueError("Input DataFrame is empty")
        
        if self.validate_columns:
            missing_cols = set(Config.REQUIRED_COLUMNS) - set(data.columns)
            if missing_cols:
                raise ValueError(
                    f"Missing required columns: {missing_cols}. "
                    f"Required: {Config.REQUIRED_COLUMNS}"
                )
        
        # Validate numeric columns
        numeric_cols = ['Conversion_Rate', 'Clicks', 'Age', 'Income', 'CTR']
        for col in numeric_cols:
            if col in data.columns:
                if not pd.api.types.is_numeric_dtype(data[col]):
                    raise ValueError(f"Column '{col}' must be numeric, got {data[col].dtype}")
        
        logger.info("Input validation passed")
    
    def clean(self, data: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningReport]:
        """
        Clean and preprocess data.
        
        Args:
            data: Raw input data
            
        Returns:
            Tuple of (cleaned_data, cleaning_report)
        """
        logger.info("="*70)
        logger.info("DATA CLEANING AND PREPROCESSING")
        logger.info("="*70)
        
        # Validate input
        self.validate_input(data)
        
        # Work on a copy
        df = data.copy()
        initial_rows = len(df)
        
        # Initialize report
        report = CleaningReport(initial_rows=initial_rows, final_rows=initial_rows)
        
        # Clean income
        df, income_stats = self._clean_income(df)
        report.negative_income_converted = income_stats['negative_converted']
        report.missing_income_imputed = income_stats['missing_imputed']
        report.income_winsorized = income_stats['winsorized']
        
        # Filter age
        df, age_filtered = self._filter_age(df)
        report.age_filtered = age_filtered
        
        # Create log transformations
        df, log_vars = self._create_log_transformations(df)
        report.log_variables_created = log_vars
        
        # Update final count
        report.final_rows = len(df)
        
        # Log summary
        logger.info("="*70)
        logger.info("CLEANING SUMMARY:")
        logger.info(f"  Initial rows:        {report.initial_rows:,}")
        logger.info(f"  Final rows:          {report.final_rows:,}")
        logger.info(f"  Rows removed:        {report.rows_removed:,} ({report.removal_rate*100:.1f}%)")
        logger.info(f"  Log variables added: {len(report.log_variables_created)}")
        logger.info("="*70)
        
        self.cleaning_report = report
        return df, report
    
    def _clean_income(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, int]]:
        """Clean income variable."""
        stats = {'negative_converted': 0, 'missing_imputed': 0, 'winsorized': 0}
        
        if 'Income' not in df.columns:
            return df, stats
        
        # Convert negative to missing
        negative_mask = df['Income'] < 0
        stats['negative_converted'] = negative_mask.sum()
        df.loc[negative_mask, 'Income'] = np.nan
        
        if stats['negative_converted'] > 0:
            logger.info(f"‚úì Converted {stats['negative_converted']} negative income values to missing")
        
        # Impute missing with median
        missing_mask = df['Income'].isna()
        stats['missing_imputed'] = missing_mask.sum()
        
        if stats['missing_imputed'] > 0:
            imputer = SimpleImputer(strategy='median')
            df['Income'] = imputer.fit_transform(df[['Income']])
            logger.info(f"‚úì Imputed {stats['missing_imputed']} missing income values with median")
        
        # Winsorize extremes
        lower = df['Income'].quantile(Config.INCOME_WINSORIZE_LOWER)
        upper = df['Income'].quantile(Config.INCOME_WINSORIZE_UPPER)
        
        income_before = df['Income'].copy()
        df['Income'] = df['Income'].clip(lower, upper)
        stats['winsorized'] = (income_before != df['Income']).sum()
        
        logger.info(f"‚úì Winsorized {stats['winsorized']} income values at 1st/99th percentiles")
        logger.info(f"  Income range: [{lower:,.0f}, {upper:,.0f}]")
        
        return df, stats
    
    def _filter_age(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
        """Filter implausible age values."""
        if 'Age' not in df.columns:
            return df, 0
        
        age_before = len(df)
        df = df[df['Age'].between(Config.MIN_AGE, Config.MAX_AGE)]
        age_filtered = age_before - len(df)
        
        if age_filtered > 0:
            logger.info(
                f"‚úì Filtered {age_filtered} rows with implausible ages "
                f"(keeping {Config.MIN_AGE}-{Config.MAX_AGE})"
            )
        
        return df, age_filtered
    
    def _create_log_transformations(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
        """Create logarithmic transformations for skewed variables."""
        log_vars = []
        
        logger.info("üìä Creating logarithmic transformations:")
        
        # Log of Income
        if 'Income' in df.columns:
            df['Income_log'] = np.log1p(df['Income'])
            log_vars.append('Income_log')
            logger.info("  ‚úì Income_log created (log1p transformation)")
        
        # Log of Clicks
        if 'Clicks' in df.columns:
            df['Clicks_log'] = np.log1p(df['Clicks'])
            log_vars.append('Clicks_log')
            logger.info("  ‚úì Clicks_log created (log1p transformation)")
        
        # Log of Age
        if 'Age' in df.columns:
            df['Age_log'] = np.log1p(df['Age'])
            log_vars.append('Age_log')
            logger.info("  ‚úì Age_log created (log1p transformation)")
        
        # Log of CTR (only if all positive)
        if 'CTR' in df.columns and (df['CTR'] > 0).all():
            df['CTR_log'] = np.log(df['CTR'])
            log_vars.append('CTR_log')
            logger.info("  ‚úì CTR_log created (log transformation)")
        
        return df, log_vars


# ============================================================================
# 2. FEATURE ENGINEER
# ============================================================================

class FeatureEngineer:
    """
    Handles feature engineering for instrument construction.
    
    Responsibilities:
    - Extract time features
    - Encode categorical variables
    - Create interaction terms
    - Generate nonlinear transformations
    """
    
    def __init__(self):
        """Initialize feature engineer."""
        self.encoders: Dict[str, LabelEncoder] = {}
        self.feature_names: List[str] = []
        
    def engineer_all_features(
        self,
        data: pd.DataFrame,
        include_time: bool = True,
        include_interactions: bool = True
    ) -> pd.DataFrame:
        """
        Apply all feature engineering steps.
        
        Args:
            data: Preprocessed data
            include_time: Whether to extract time features
            include_interactions: Whether to create interaction terms
            
        Returns:
            Data with engineered features
        """
        logger.info("="*70)
        logger.info("FEATURE ENGINEERING")
        logger.info("="*70)
        
        df = data.copy()
        
        # Time features
        if include_time and 'Click_Time' in df.columns:
            df = self._engineer_time_features(df)
        
        # Encode categoricals
        df = self._encode_categorical_features(df)
        
        # Interaction terms
        if include_interactions:
            df = self._create_interaction_features(df)
        
        # Nonlinear transformations
        df = self._create_nonlinear_features(df)
        
        # Track created features
        self.feature_names = [col for col in df.columns if col not in data.columns]
        
        logger.info(f"‚úì Created {len(self.feature_names)} new features")
        logger.info("="*70)
        
        return df
    
    def _engineer_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract day of week and hour from Click_Time."""
        if 'Click_Time' not in df.columns:
            return df
        
        try:
            df['Click_Time'] = pd.to_datetime(df['Click_Time'])
            df['Day_of_Week'] = df['Click_Time'].dt.dayofweek
            df['Hour'] = df['Click_Time'].dt.hour
            
            # Create time-based indicators
            df['Weekend'] = (df['Day_of_Week'] >= 5).astype(int)
            df['BusinessHours'] = ((df['Hour'] >= 9) & (df['Hour'] <= 17)).astype(int)
            df['Evening'] = ((df['Hour'] >= 18) & (df['Hour'] <= 23)).astype(int)
            df['Morning'] = ((df['Hour'] >= 6) & (df['Hour'] <= 11)).astype(int)
            
            logger.info("‚úì Extracted time features")
        except Exception as e:
            logger.warning(f"Failed to extract time features: {e}")
        
        return df
    
    def _encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode categorical variables."""
        categorical_cols = ['Gender', 'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement']
        
        for col in categorical_cols:
            if col in df.columns:
                try:
                    le = LabelEncoder()
                    df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
                    self.encoders[col] = le
                except Exception as e:
                    logger.warning(f"Failed to encode {col}: {e}")
        
        logger.info(f"‚úì Encoded {len(self.encoders)} categorical variables")
        return df
    
    def _create_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Create interaction features for instrument strength.
        
        Key principle: These features predict CLICKS but only affect
        CONVERSIONS through clicks (exclusion restriction).
        """
        interactions_created = 0
        
        # Demographics √ó Ad characteristics
        demographic_ad_pairs = [
            ('Income', 'Ad_Type_encoded'),
            ('Age', 'Ad_Topic_encoded'),
            ('Income', 'Ad_Placement_encoded'),
            ('Age', 'Ad_Placement_encoded'),
        ]
        
        for demo, ad in demographic_ad_pairs:
            if demo in df.columns and ad in df.columns:
                df[f'{demo}_x_{ad}'] = df[demo] * df[ad]
                interactions_created += 1
        
        # Time √ó Ad interactions
        time_ad_pairs = [
            ('Weekend', 'Ad_Type_encoded'),
            ('BusinessHours', 'Ad_Placement_encoded'),
            ('Evening', 'Ad_Topic_encoded'),
        ]
        
        for time, ad in time_ad_pairs:
            if time in df.columns and ad in df.columns:
                df[f'{time}_x_{ad}'] = df[time] * df[ad]
                interactions_created += 1
        
        # Demographics √ó Time
        demo_time_pairs = [
            ('Age', 'Hour'),
            ('Income', 'Weekend'),
            ('Age', 'BusinessHours'),
        ]
        
        for demo, time in demo_time_pairs:
            if demo in df.columns and time in df.columns:
                df[f'{demo}_x_{time}'] = df[demo] * df[time]
                interactions_created += 1
        
        # Location √ó Demographics
        if 'Location_encoded' in df.columns:
            for demo in ['Age', 'Income']:
                if demo in df.columns:
                    df[f'Location_x_{demo}'] = df['Location_encoded'] * df[demo]
                    interactions_created += 1
        
        # Gender √ó Ad characteristics
        if 'Gender_encoded' in df.columns:
            for ad in ['Ad_Topic_encoded', 'Ad_Type_encoded']:
                if ad in df.columns:
                    df[f'Gender_x_{ad}'] = df['Gender_encoded'] * df[ad]
                    interactions_created += 1
        
        # Ad Type √ó Placement
        if 'Ad_Type_encoded' in df.columns and 'Ad_Placement_encoded' in df.columns:
            df['AdType_x_Placement'] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
            interactions_created += 1
        
        # Three-way interactions (most powerful)
        if all(col in df.columns for col in ['Age', 'Ad_Type_encoded', 'Weekend']):
            df['Age_x_AdType_x_Weekend'] = df['Age'] * df['Ad_Type_encoded'] * df['Weekend']
            interactions_created += 1
        
        if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded', 'BusinessHours']):
            df['Income_x_Placement_x_BizHours'] = (
                df['Income'] * df['Ad_Placement_encoded'] * df['BusinessHours']
            )
            interactions_created += 1
        
        logger.info(f"‚úì Created {interactions_created} interaction features")
        return df
    
    def _create_nonlinear_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create nonlinear transformations."""
        nonlinear_created = 0
        
        if 'Age' in df.columns:
            df['Age_squared'] = df['Age'] ** 2
            nonlinear_created += 1
        
        if 'Income' in df.columns:
            df['Income_squared'] = df['Income'] ** 2
            df['Income_sqrt'] = np.sqrt(df['Income'].clip(lower=0))
            nonlinear_created += 2
        
        logger.info(f"‚úì Created {nonlinear_created} nonlinear features")
        return df
    
    def get_instrument_features(self, data: pd.DataFrame) -> List[str]:
        """
        Get list of valid instrument features from data.
        
        These are features that should predict clicks but satisfy
        the exclusion restriction.
        
        Args:
            data: DataFrame with engineered features
            
        Returns:
            List of column names to use as instruments
        """
        # Base demographic and ad features
        base_features = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
        ]
        
        # Time features (if available)
        time_features = ['Day_of_Week', 'Hour', 'Weekend', 'BusinessHours', 'Evening', 'Morning']
        
        # All engineered features (interactions, nonlinear)
        engineered_features = [
            col for col in data.columns
            if any(x in col for x in ['_x_', '_squared', '_sqrt', '_log'])
            and col not in ['Clicks_log', 'Conversion_Rate_log']  # Exclude target variables
        ]
        
        # Combine and filter to available
        all_features = base_features + time_features + engineered_features
        available = [f for f in all_features if f in data.columns]
        
        logger.info(f"Identified {len(available)} potential instrument features")
        return available


# ============================================================================
# 3. INSTRUMENT GENERATOR
# ============================================================================

class InstrumentGenerator:
    """
    Generates ML-based instrumental variables.
    
    Responsibilities:
    - Train ML models to predict endogenous variable
    - Generate out-of-fold predictions
    - Validate instrument strength
    - Provide diagnostics
    """
    
    def __init__(
        self,
        model_type: str = 'stacking',
        cv_folds: int = Config.DEFAULT_CV_FOLDS,
        random_state: int = Config.RANDOM_STATE
    ):
        """
        Initialize instrument generator.
        
        Args:
            model_type: 'rf', 'gb', or 'stacking'
            cv_folds: Number of cross-validation folds
            random_state: Random seed for reproducibility
        """
        self.model_type = model_type
        self.cv_folds = cv_folds
        self.random_state = random_state
        
        self.model = None
        self.scaler = StandardScaler()
        self.instrument_features: List[str] = []
        self.diagnostics: Optional[InstrumentDiagnostics] = None
        
    def generate_instrument(
        self,
        data: pd.DataFrame,
        endogenous_var: str,
        instrument_features: List[str]
    ) -> Tuple[pd.DataFrame, InstrumentDiagnostics]:
        """
        Generate ML-based instrument for endogenous variable.
        
        Args:
            data: DataFrame with features
            endogenous_var: Name of endogenous variable (e.g., 'Clicks')
            instrument_features: List of feature names to use
            
        Returns:
            Tuple of (data with instrument column, diagnostics)
            
        Raises:
            ValueError: If endogenous variable or features not found
        """
        logger.info("="*70)
        logger.info("ML INSTRUMENT GENERATION")
        logger.info("="*70)
        
        # Validate inputs
        if endogenous_var not in data.columns:
            raise ValueError(f"Endogenous variable '{endogenous_var}' not found in data")
        
        missing_features = set(instrument_features) - set(data.columns)
        if missing_features:
            raise ValueError(f"Instrument features not found: {missing_features}")
        
        self.instrument_features = instrument_features
        
        logger.info(f"Endogenous variable: {endogenous_var}")
        logger.info(f"Number of instrument features: {len(instrument_features)}")
        logger.info(f"Model type: {self.model_type.upper()}")
        logger.info(f"Cross-validation folds: {self.cv_folds}")
        
        # Prepare data
        X = data[instrument_features].copy()
        y = data[endogenous_var].copy()
        
        # Check for missing values
        if X.isna().any().any():
            logger.warning("Missing values detected in instrument features, dropping rows")
            valid_idx = ~(X.isna().any(axis=1) | y.isna())
            X = X[valid_idx]
            y = y[valid_idx]
            data = data[valid_idx]
        
        # Standardize features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=instrument_features, index=X.index)
        
        # Build model
        self.model = self._build_model()
        
        # Generate out-of-fold predictions
        logger.info(f"Generating out-of-fold predictions (CV={self.cv_folds})...")
        
        try:
            predictions = cross_val_predict(
                self.model,
                X_scaled,
                y,
                cv=self.cv_folds,
                n_jobs=-1
            )
        except Exception as e:
            logger.error(f"Failed to generate predictions: {e}")
            raise
        
        # Add predictions to data
        instrument_col = f'{endogenous_var}_predicted'
        data = data.copy()
        data[instrument_col] = predictions
        
        # Fit final model for feature importance
        logger.info("Fitting final model...")
        self.model.fit(X_scaled, y)
        
        # Calculate diagnostics
        self.diagnostics = self._calculate_diagnostics(predictions, y.values)
        self._log_diagnostics()
        
        return data, self.diagnostics
    
    def _build_model(self):
        """Build ML model based on model_type."""
        if self.model_type == 'stacking':
            logger.info("Building Stacking Ensemble...")
            
            base_models = [
                ('rf', RandomForestRegressor(
                    n_estimators=200,
                    max_depth=15,
                    min_samples_split=20,
                    min_samples_leaf=10,
                    max_features='sqrt',
                    random_state=self.random_state,
                    n_jobs=-1
                )),
                ('gb', GradientBoostingRegressor(
                    n_estimators=200,
                    max_depth=7,
                    learning_rate=0.05,
                    subsample=0.8,
                    min_samples_split=20,
                    min_samples_leaf=10,
                    random_state=self.random_state
                ))
            ]
            
            # Try to add XGBoost
            try:
                from xgboost import XGBRegressor
                base_models.append(
                    ('xgb', XGBRegressor(
                        n_estimators=200,
                        max_depth=8,
                        learning_rate=0.05,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        random_state=self.random_state,
                        n_jobs=-1
                    ))
                )
                logger.info("  ‚úì Using XGBoost as additional base learner")
            except ImportError:
                logger.info("  ‚Ñπ XGBoost not available, using RF + GB only")
            
            return StackingRegressor(
                estimators=base_models,
                final_estimator=Ridge(alpha=1.0),
                cv=self.cv_folds,
                n_jobs=-1
            )
            
        elif self.model_type == 'rf':
            logger.info("Building Random Forest...")
            return RandomForestRegressor(
                n_estimators=200,
                max_depth=15,
                min_samples_split=20,
                min_samples_leaf=10,
                max_features='sqrt',
                random_state=self.random_state,
                n_jobs=-1
            )
            
        elif self.model_type == 'gb':
            logger.info("Building Gradient Boosting...")
            return GradientBoostingRegressor(
                n_estimators=200,
                max_depth=7,
                learning_rate=0.05,
                subsample=0.8,
                min_samples_split=20,
                min_samples_leaf=10,
                random_state=self.random_state
            )
        else:
            raise ValueError(f"Unknown model_type: {self.model_type}")
    
    def _calculate_diagnostics(
        self,
        predictions: np.ndarray,
        actuals: np.ndarray
    ) -> InstrumentDiagnostics:
        """Calculate instrument strength diagnostics."""
        n = len(predictions)
        k = len(self.instrument_features)
        
        # R-squared
        z_resid = predictions - predictions.mean()
        d_resid = actuals - actuals.mean()
        ss_tot = np.sum(d_resid**2)
        ss_res = np.sum((actuals - predictions)**2)
        r_squared = 1 - (ss_res / ss_tot)
        
        # F-statistic (proper first-stage)
        f_stat = (r_squared / 1) / ((1 - r_squared) / (n - k - 1))
        
        # Correlation
        corr = np.corrcoef(predictions, actuals)[0, 1]
        
        # Cragg-Donald
        cragg_donald = n * r_squared
        
        # Classify strength
        if f_stat > Config.STOCK_YOGO_10PCT_BIAS:
            strength = 'VERY_STRONG'
        elif f_stat > Config.WEAK_INSTRUMENT_THRESHOLD:
            strength = 'STRONG'
        elif f_stat > 5:
            strength = 'MODERATE'
        else:
            strength = 'WEAK'
        
        return InstrumentDiagnostics(
            r_squared=r_squared,
            f_statistic=f_stat,
            correlation=corr,
            cragg_donald=cragg_donald,
            sample_size=n,
            n_features=k,
            is_weak=f_stat < Config.WEAK_INSTRUMENT_THRESHOLD,
            strength_category=strength
        )
    
    def _log_diagnostics(self):
        """Log instrument diagnostics."""
        d = self.diagnostics

##### Claude Artifact Senior Level End

In [4]:
"""
Refactored Causal Inference Pipeline for Ad Conversion Analysis
================================================================

A production-ready implementation following SOLID principles with proper
separation of concerns, comprehensive validation, and robust error handling.

Architecture:
    DataPreprocessor ‚Üí FeatureEngineer ‚Üí InstrumentGenerator ‚Üí 
    TwoSLSEstimator ‚Üí ResultsAnalyzer

Author: Refactored for production use
"""

import logging
import warnings
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union, Any
from enum import Enum

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    StackingRegressor
)
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge
import statsmodels.api as sm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# ============================================================================
# CONSTANTS AND CONFIGURATION
# ============================================================================

class Config:
    """Global configuration constants with statistical justification."""
    
    # Validation thresholds
    MIN_AGE = 10
    MAX_AGE = 90
    INCOME_WINSORIZE_LOWER = 0.01  # 1st percentile
    INCOME_WINSORIZE_UPPER = 0.99  # 99th percentile
    
    # Statistical thresholds (Stock & Yogo, 2005)
    WEAK_INSTRUMENT_THRESHOLD = 10.0
    STOCK_YOGO_10PCT_BIAS = 16.38  # Single instrument, single endogenous
    STOCK_YOGO_15PCT_BIAS = 8.96
    
    # Confidence intervals
    CI_ALPHA = 0.05  # 95% confidence intervals
    Z_SCORE_95 = 1.96
    
    # Model defaults
    DEFAULT_CV_FOLDS = 5
    DEFAULT_MIN_SUBGROUP_SIZE = 100
    RANDOM_STATE = 42
    
    # Required columns
    REQUIRED_COLUMNS = [
        'Conversion_Rate', 'Clicks', 'Age', 'Income', 'Gender',
        'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement', 'CTR'
    ]


# ============================================================================
# DATA CLASSES FOR STRUCTURED RESULTS
# ============================================================================

@dataclass
class CleaningReport:
    """Report from data cleaning operations."""
    initial_rows: int
    final_rows: int
    negative_income_converted: int = 0
    missing_income_imputed: int = 0
    income_winsorized: int = 0
    age_filtered: int = 0
    log_variables_created: List[str] = field(default_factory=list)
    
    @property
    def rows_removed(self) -> int:
        return self.initial_rows - self.final_rows
    
    @property
    def removal_rate(self) -> float:
        return self.rows_removed / self.initial_rows if self.initial_rows > 0 else 0.0


@dataclass
class InstrumentDiagnostics:
    """Diagnostics for instrument strength."""
    r_squared: float
    f_statistic: float
    correlation: float
    cragg_donald: float
    sample_size: int
    n_features: int
    is_weak: bool
    strength_category: str  # 'VERY_STRONG', 'STRONG', 'MODERATE', 'WEAK'
    
    def to_dict(self) -> Dict[str, Any]:
        return {
            'r_squared': self.r_squared,
            'f_statistic': self.f_statistic,
            'correlation': self.correlation,
            'cragg_donald': self.cragg_donald,
            'sample_size': self.sample_size,
            'n_features': self.n_features,
            'is_weak': self.is_weak,
            'strength_category': self.strength_category
        }


@dataclass
class TwoSLSResults:
    """Results from 2SLS estimation."""
    first_stage_results: Any  # statsmodels results object
    second_stage_results: Any
    causal_effect: float
    standard_error: float
    t_statistic: float
    p_value: float
    ci_lower: float
    ci_upper: float
    instrument_diagnostics: InstrumentDiagnostics
    
    @property
    def is_significant(self) -> bool:
        return self.p_value < Config.CI_ALPHA


# ============================================================================
# 1. DATA PREPROCESSOR
# ============================================================================

class DataPreprocessor:
    """
    Handles data cleaning, validation, and preprocessing.
    
    Responsibilities:
    - Validate input data structure
    - Handle missing values
    - Winsorize outliers
    - Create log transformations
    - Filter invalid records
    """
    
    def __init__(self, validate_columns: bool = True):
        """
        Initialize preprocessor.
        
        Args:
            validate_columns: Whether to validate required columns exist
        """
        self.validate_columns = validate_columns
        self.cleaning_report: Optional[CleaningReport] = None
        
    def validate_input(self, data: pd.DataFrame) -> None:
        """
        Validate input data structure and types.
        
        Args:
            data: Input DataFrame
            
        Raises:
            ValueError: If validation fails
        """
        if not isinstance(data, pd.DataFrame):
            raise ValueError(f"Expected pandas DataFrame, got {type(data)}")
        
        if len(data) == 0:
            raise ValueError("Input DataFrame is empty")
        
        if self.validate_columns:
            missing_cols = set(Config.REQUIRED_COLUMNS) - set(data.columns)
            if missing_cols:
                raise ValueError(
                    f"Missing required columns: {missing_cols}. "
                    f"Required: {Config.REQUIRED_COLUMNS}"
                )
        
        # Validate numeric columns
        numeric_cols = ['Conversion_Rate', 'Clicks', 'Age', 'Income', 'CTR']
        for col in numeric_cols:
            if col in data.columns:
                if not pd.api.types.is_numeric_dtype(data[col]):
                    raise ValueError(f"Column '{col}' must be numeric, got {data[col].dtype}")
        
        logger.info("Input validation passed")
    
    def clean(self, data: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningReport]:
        """
        Clean and preprocess data.
        
        Args:
            data: Raw input data
            
        Returns:
            Tuple of (cleaned_data, cleaning_report)
        """
        logger.info("="*70)
        logger.info("DATA CLEANING AND PREPROCESSING")
        logger.info("="*70)
        
        # Validate input
        self.validate_input(data)
        
        # Work on a copy
        df = data.copy()
        initial_rows = len(df)
        
        # Initialize report
        report = CleaningReport(initial_rows=initial_rows, final_rows=initial_rows)
        
        # Clean income
        df, income_stats = self._clean_income(df)
        report.negative_income_converted = income_stats['negative_converted']
        report.missing_income_imputed = income_stats['missing_imputed']
        report.income_winsorized = income_stats['winsorized']
        
        # Filter age
        df, age_filtered = self._filter_age(df)
        report.age_filtered = age_filtered
        
        # Create log transformations
        df, log_vars = self._create_log_transformations(df)
        report.log_variables_created = log_vars
        
        # Update final count
        report.final_rows = len(df)
        
        # Log summary
        logger.info("="*70)
        logger.info("CLEANING SUMMARY:")
        logger.info(f"  Initial rows:        {report.initial_rows:,}")
        logger.info(f"  Final rows:          {report.final_rows:,}")
        logger.info(f"  Rows removed:        {report.rows_removed:,} ({report.removal_rate*100:.1f}%)")
        logger.info(f"  Log variables added: {len(report.log_variables_created)}")
        logger.info("="*70)
        
        self.cleaning_report = report
        return df, report
    
    def _clean_income(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, int]]:
        """Clean income variable."""
        stats = {'negative_converted': 0, 'missing_imputed': 0, 'winsorized': 0}
        
        if 'Income' not in df.columns:
            return df, stats
        
        # Convert negative to missing
        negative_mask = df['Income'] < 0
        stats['negative_converted'] = negative_mask.sum()
        df.loc[negative_mask, 'Income'] = np.nan
        
        if stats['negative_converted'] > 0:
            logger.info(f"‚úì Converted {stats['negative_converted']} negative income values to missing")
        
        # Impute missing with median
        missing_mask = df['Income'].isna()
        stats['missing_imputed'] = missing_mask.sum()
        
        if stats['missing_imputed'] > 0:
            imputer = SimpleImputer(strategy='median')
            df['Income'] = imputer.fit_transform(df[['Income']])
            logger.info(f"‚úì Imputed {stats['missing_imputed']} missing income values with median")
        
        # Winsorize extremes
        lower = df['Income'].quantile(Config.INCOME_WINSORIZE_LOWER)
        upper = df['Income'].quantile(Config.INCOME_WINSORIZE_UPPER)
        
        income_before = df['Income'].copy()
        df['Income'] = df['Income'].clip(lower, upper)
        stats['winsorized'] = (income_before != df['Income']).sum()
        
        logger.info(f"‚úì Winsorized {stats['winsorized']} income values at 1st/99th percentiles")
        logger.info(f"  Income range: [{lower:,.0f}, {upper:,.0f}]")
        
        return df, stats
    
    def _filter_age(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
        """Filter implausible age values."""
        if 'Age' not in df.columns:
            return df, 0
        
        age_before = len(df)
        df = df[df['Age'].between(Config.MIN_AGE, Config.MAX_AGE)]
        age_filtered = age_before - len(df)
        
        if age_filtered > 0:
            logger.info(
                f"‚úì Filtered {age_filtered} rows with implausible ages "
                f"(keeping {Config.MIN_AGE}-{Config.MAX_AGE})"
            )
        
        return df, age_filtered
    
    def _create_log_transformations(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
        """Create logarithmic transformations for skewed variables."""
        log_vars = []
        
        logger.info("üìä Creating logarithmic transformations:")
        
        # Log of Income
        if 'Income' in df.columns:
            df['Income_log'] = np.log1p(df['Income'])
            log_vars.append('Income_log')
            logger.info("  ‚úì Income_log created (log1p transformation)")
        
        # Log of Clicks
        if 'Clicks' in df.columns:
            df['Clicks_log'] = np.log1p(df['Clicks'])
            log_vars.append('Clicks_log')
            logger.info("  ‚úì Clicks_log created (log1p transformation)")
        
        # Log of Age
        if 'Age' in df.columns:
            df['Age_log'] = np.log1p(df['Age'])
            log_vars.append('Age_log')
            logger.info("  ‚úì Age_log created (log1p transformation)")
        
        # Log of CTR (only if all positive)
        if 'CTR' in df.columns and (df['CTR'] > 0).all():
            df['CTR_log'] = np.log(df['CTR'])
            log_vars.append('CTR_log')
            logger.info("  ‚úì CTR_log created (log transformation)")
        
        return df, log_vars


# ============================================================================
# 2. FEATURE ENGINEER
# ============================================================================

class FeatureEngineer:
    """
    Handles feature engineering for instrument construction.
    
    Responsibilities:
    - Extract time features
    - Encode categorical variables
    - Create interaction terms
    - Generate nonlinear transformations
    """
    
    def __init__(self):
        """Initialize feature engineer."""
        self.encoders: Dict[str, LabelEncoder] = {}
        self.feature_names: List[str] = []
        
    def engineer_all_features(
        self,
        data: pd.DataFrame,
        include_time: bool = True,
        include_interactions: bool = True
    ) -> pd.DataFrame:
        """
        Apply all feature engineering steps.
        
        Args:
            data: Preprocessed data
            include_time: Whether to extract time features
            include_interactions: Whether to create interaction terms
            
        Returns:
            Data with engineered features
        """
        logger.info("="*70)
        logger.info("FEATURE ENGINEERING")
        logger.info("="*70)
        
        df = data.copy()
        
        # Time features
        if include_time and 'Click_Time' in df.columns:
            df = self._engineer_time_features(df)
        
        # Encode categoricals
        df = self._encode_categorical_features(df)
        
        # Interaction terms
        if include_interactions:
            df = self._create_interaction_features(df)
        
        # Nonlinear transformations
        df = self._create_nonlinear_features(df)
        
        # Track created features
        self.feature_names = [col for col in df.columns if col not in data.columns]
        
        logger.info(f"‚úì Created {len(self.feature_names)} new features")
        logger.info("="*70)
        
        return df
    
    def _engineer_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract day of week and hour from Click_Time."""
        if 'Click_Time' not in df.columns:
            return df
        
        try:
            df['Click_Time'] = pd.to_datetime(df['Click_Time'])
            df['Day_of_Week'] = df['Click_Time'].dt.dayofweek
            df['Hour'] = df['Click_Time'].dt.hour
            
            # Create time-based indicators
            df['Weekend'] = (df['Day_of_Week'] >= 5).astype(int)
            df['BusinessHours'] = ((df['Hour'] >= 9) & (df['Hour'] <= 17)).astype(int)
            df['Evening'] = ((df['Hour'] >= 18) & (df['Hour'] <= 23)).astype(int)
            df['Morning'] = ((df['Hour'] >= 6) & (df['Hour'] <= 11)).astype(int)
            
            logger.info("‚úì Extracted time features")
        except Exception as e:
            logger.warning(f"Failed to extract time features: {e}")
        
        return df
    
    def _encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode categorical variables."""
        categorical_cols = ['Gender', 'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement']
        
        for col in categorical_cols:
            if col in df.columns:
                try:
                    le = LabelEncoder()
                    df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
                    self.encoders[col] = le
                except Exception as e:
                    logger.warning(f"Failed to encode {col}: {e}")
        
        logger.info(f"‚úì Encoded {len(self.encoders)} categorical variables")
        return df
    
    def _create_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Create interaction features for instrument strength.
        
        Key principle: These features predict CLICKS but only affect
        CONVERSIONS through clicks (exclusion restriction).
        """
        interactions_created = 0
        
        # Demographics √ó Ad characteristics
        demographic_ad_pairs = [
            ('Income', 'Ad_Type_encoded'),
            ('Age', 'Ad_Topic_encoded'),
            ('Income', 'Ad_Placement_encoded'),
            ('Age', 'Ad_Placement_encoded'),
        ]
        
        for demo, ad in demographic_ad_pairs:
            if demo in df.columns and ad in df.columns:
                df[f'{demo}_x_{ad}'] = df[demo] * df[ad]
                interactions_created += 1
        
        # Time √ó Ad interactions
        time_ad_pairs = [
            ('Weekend', 'Ad_Type_encoded'),
            ('BusinessHours', 'Ad_Placement_encoded'),
            ('Evening', 'Ad_Topic_encoded'),
        ]
        
        for time, ad in time_ad_pairs:
            if time in df.columns and ad in df.columns:
                df[f'{time}_x_{ad}'] = df[time] * df[ad]
                interactions_created += 1
        
        # Demographics √ó Time
        demo_time_pairs = [
            ('Age', 'Hour'),
            ('Income', 'Weekend'),
            ('Age', 'BusinessHours'),
        ]
        
        for demo, time in demo_time_pairs:
            if demo in df.columns and time in df.columns:
                df[f'{demo}_x_{time}'] = df[demo] * df[time]
                interactions_created += 1
        
        # Location √ó Demographics
        if 'Location_encoded' in df.columns:
            for demo in ['Age', 'Income']:
                if demo in df.columns:
                    df[f'Location_x_{demo}'] = df['Location_encoded'] * df[demo]
                    interactions_created += 1
        
        # Gender √ó Ad characteristics
        if 'Gender_encoded' in df.columns:
            for ad in ['Ad_Topic_encoded', 'Ad_Type_encoded']:
                if ad in df.columns:
                    df[f'Gender_x_{ad}'] = df['Gender_encoded'] * df[ad]
                    interactions_created += 1
        
        # Ad Type √ó Placement
        if 'Ad_Type_encoded' in df.columns and 'Ad_Placement_encoded' in df.columns:
            df['AdType_x_Placement'] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
            interactions_created += 1
        
        # Three-way interactions (most powerful)
        if all(col in df.columns for col in ['Age', 'Ad_Type_encoded', 'Weekend']):
            df['Age_x_AdType_x_Weekend'] = df['Age'] * df['Ad_Type_encoded'] * df['Weekend']
            interactions_created += 1
        
        if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded', 'BusinessHours']):
            df['Income_x_Placement_x_BizHours'] = (
                df['Income'] * df['Ad_Placement_encoded'] * df['BusinessHours']
            )
            interactions_created += 1
        
        logger.info(f"‚úì Created {interactions_created} interaction features")
        return df
    
    def _create_nonlinear_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create nonlinear transformations."""
        nonlinear_created = 0
        
        if 'Age' in df.columns:
            df['Age_squared'] = df['Age'] ** 2
            nonlinear_created += 1
        
        if 'Income' in df.columns:
            df['Income_squared'] = df['Income'] ** 2
            df['Income_sqrt'] = np.sqrt(df['Income'].clip(lower=0))
            nonlinear_created += 2
        
        logger.info(f"‚úì Created {nonlinear_created} nonlinear features")
        return df
    
    def get_instrument_features(self, data: pd.DataFrame) -> List[str]:
        """
        Get list of valid instrument features from data.
        
        These are features that should predict clicks but satisfy
        the exclusion restriction.
        
        Args:
            data: DataFrame with engineered features
            
        Returns:
            List of column names to use as instruments
        """
        # Base demographic and ad features
        base_features = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
        ]
        
        # Time features (if available)
        time_features = ['Day_of_Week', 'Hour', 'Weekend', 'BusinessHours', 'Evening', 'Morning']
        
        # All engineered features (interactions, nonlinear)
        engineered_features = [
            col for col in data.columns
            if any(x in col for x in ['_x_', '_squared', '_sqrt', '_log'])
            and col not in ['Clicks_log', 'Conversion_Rate_log']  # Exclude target variables
        ]
        
        # Combine and filter to available
        all_features = base_features + time_features + engineered_features
        available = [f for f in all_features if f in data.columns]
        
        logger.info(f"Identified {len(available)} potential instrument features")
        return available


# ============================================================================
# 3. INSTRUMENT GENERATOR
# ============================================================================

class InstrumentGenerator:
    """
    Generates ML-based instrumental variables.
    
    Responsibilities:
    - Train ML models to predict endogenous variable
    - Generate out-of-fold predictions
    - Validate instrument strength
    - Provide diagnostics
    """
    
    def __init__(
        self,
        model_type: str = 'stacking',
        cv_folds: int = Config.DEFAULT_CV_FOLDS,
        random_state: int = Config.RANDOM_STATE
    ):
        """
        Initialize instrument generator.
        
        Args:
            model_type: 'rf', 'gb', or 'stacking'
            cv_folds: Number of cross-validation folds
            random_state: Random seed for reproducibility
        """
        self.model_type = model_type
        self.cv_folds = cv_folds
        self.random_state = random_state
        
        self.model = None
        self.scaler = StandardScaler()
        self.instrument_features: List[str] = []
        self.diagnostics: Optional[InstrumentDiagnostics] = None
        
    def generate_instrument(
        self,
        data: pd.DataFrame,
        endogenous_var: str,
        instrument_features: List[str]
    ) -> Tuple[pd.DataFrame, InstrumentDiagnostics]:
        """
        Generate ML-based instrument for endogenous variable.
        
        Args:
            data: DataFrame with features
            endogenous_var: Name of endogenous variable (e.g., 'Clicks')
            instrument_features: List of feature names to use
            
        Returns:
            Tuple of (data with instrument column, diagnostics)
            
        Raises:
            ValueError: If endogenous variable or features not found
        """
        logger.info("="*70)
        logger.info("ML INSTRUMENT GENERATION")
        logger.info("="*70)
        
        # Validate inputs
        if endogenous_var not in data.columns:
            raise ValueError(f"Endogenous variable '{endogenous_var}' not found in data")
        
        missing_features = set(instrument_features) - set(data.columns)
        if missing_features:
            raise ValueError(f"Instrument features not found: {missing_features}")
        
        self.instrument_features = instrument_features
        
        logger.info(f"Endogenous variable: {endogenous_var}")
        logger.info(f"Number of instrument features: {len(instrument_features)}")
        logger.info(f"Model type: {self.model_type.upper()}")
        logger.info(f"Cross-validation folds: {self.cv_folds}")
        
        # Prepare data
        X = data[instrument_features].copy()
        y = data[endogenous_var].copy()
        
        # Check for missing values
        if X.isna().any().any():
            logger.warning("Missing values detected in instrument features, dropping rows")
            valid_idx = ~(X.isna().any(axis=1) | y.isna())
            X = X[valid_idx]
            y = y[valid_idx]
            data = data[valid_idx]
        
        # Standardize features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=instrument_features, index=X.index)
        
        # Build model
        self.model = self._build_model()
        
        # Generate out-of-fold predictions
        logger.info(f"Generating out-of-fold predictions (CV={self.cv_folds})...")
        
        try:
            predictions = cross_val_predict(
                self.model,
                X_scaled,
                y,
                cv=self.cv_folds,
                n_jobs=-1
            )
        except Exception as e:
            logger.error(f"Failed to generate predictions: {e}")
            raise
        
        # Add predictions to data
        instrument_col = f'{endogenous_var}_predicted'
        data = data.copy()
        data[instrument_col] = predictions
        
        # Fit final model for feature importance
        logger.info("Fitting final model...")
        self.model.fit(X_scaled, y)
        
        # Calculate diagnostics
        self.diagnostics = self._calculate_diagnostics(predictions, y.values)
        self._log_diagnostics()
        
        return data, self.diagnostics
    
    def _build_model(self):
        """Build ML model based on model_type."""
        if self.model_type == 'stacking':
            logger.info("Building Stacking Ensemble...")
            
            base_models = [
                ('rf', RandomForestRegressor(
                    n_estimators=200,
                    max_depth=15,
                    min_samples_split=20,
                    min_samples_leaf=10,
                    max_features='sqrt',
                    random_state=self.random_state,
                    n_jobs=-1
                )),
                ('gb', GradientBoostingRegressor(
                    n_estimators=200,
                    max_depth=7,
                    learning_rate=0.05,
                    subsample=0.8,
                    min_samples_split=20,
                    min_samples_leaf=10,
                    random_state=self.random_state
                ))
            ]
            
            # Try to add XGBoost
            try:
                from xgboost import XGBRegressor
                base_models.append(
                    ('xgb', XGBRegressor(
                        n_estimators=200,
                        max_depth=8,
                        learning_rate=0.05,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        random_state=self.random_state,
                        n_jobs=-1
                    ))
                )
                logger.info("  ‚úì Using XGBoost as additional base learner")
            except ImportError:
                logger.info("  ‚Ñπ XGBoost not available, using RF + GB only")
            
            return StackingRegressor(
                estimators=base_models,
                final_estimator=Ridge(alpha=1.0),
                cv=self.cv_folds,
                n_jobs=-1
            )
            
        elif self.model_type == 'rf':
            logger.info("Building Random Forest...")
            return RandomForestRegressor(
                n_estimators=200,
                max_depth=15,
                min_samples_split=20,
                min_samples_leaf=10,
                max_features='sqrt',
                random_state=self.random_state,
                n_jobs=-1
            )
            
        elif self.model_type == 'gb':
            logger.info("Building Gradient Boosting...")
            return GradientBoostingRegressor(
                n_estimators=200,
                max_depth=7,
                learning_rate=0.05,
                subsample=0.8,
                min_samples_split=20,
                min_samples_leaf=10,
                random_state=self.random_state
            )
        else:
            raise ValueError(f"Unknown model_type: {self.model_type}")
    
    def _calculate_diagnostics(
        self,
        predictions: np.ndarray,
        actuals: np.ndarray
    ) -> InstrumentDiagnostics:
        """Calculate instrument strength diagnostics."""
        n = len(predictions)
        k = len(self.instrument_features)
        
        # R-squared
        z_resid = predictions - predictions.mean()
        d_resid = actuals - actuals.mean()
        ss_tot = np.sum(d_resid**2)
        ss_res = np.sum((actuals - predictions)**2)
        r_squared = 1 - (ss_res / ss_tot)
        
        # F-statistic (proper first-stage)
        f_stat = (r_squared / 1) / ((1 - r_squared) / (n - k - 1))
        
        # Correlation
        corr = np.corrcoef(predictions, actuals)[0, 1]
        
        # Cragg-Donald
        cragg_donald = n * r_squared
        
        # Classify strength
        if f_stat > Config.STOCK_YOGO_10PCT_BIAS:
            strength = 'VERY_STRONG'
        elif f_stat > Config.WEAK_INSTRUMENT_THRESHOLD:
            strength = 'STRONG'
        elif f_stat > 5:
            strength = 'MODERATE'
        else:
            strength = 'WEAK'
        
        return InstrumentDiagnostics(
            r_squared=r_squared,
            f_statistic=f_stat,
            correlation=corr,
            cragg_donald=cragg_donald,
            sample_size=n,
            n_features=k,
            is_weak=f_stat < Config.WEAK_INSTRUMENT_THRESHOLD,
            strength_category=strength
        )
    
    def _log_diagnostics(self):
        """Log instrument diagnostics."""
        d = self.diagnostics
        
        logger.info("="*70)
        logger.info("INSTRUMENT STRENGTH DIAGNOSTICS")
        logger.info("="*70)
        logger.info(f"\nSAMPLE INFORMATION:")
        logger.info(f"  Sample size (n):              {d.sample_size:,}")
        logger.info(f"  Number of features (k):       {d.n_features}")
        logger.info(f"\nFIRST-STAGE PERFORMANCE:")
        logger.info(f"  R-squared:                    {d.r_squared:.4f}")
        logger.info(f"  Correlation (Z, D):           {d.correlation:.4f}")
        logger.info(f"  F-statistic:                  {d.f_statistic:.2f}")
        logger.info(f"  Cragg-Donald statistic:       {d.cragg_donald:.2f}")
        
        logger.info(f"\nBENCHMARKS & INTERPRETATION:")
        logger.info(f"  {'Criterion':<35} {'Threshold':<12} {'Status'}")
        logger.info(f"  {'-'*35} {'-'*12} {'-'*20}")
        
        weak_status = "‚úì STRONG" if not d.is_weak else "‚úó WEAK"
        logger.info(f"  {'Weak Instrument (F < 10)':<35} {'10.00':<12} {weak_status}")
        
        sy_10_status = "‚úì‚úì EXCELLENT" if d.f_statistic > Config.STOCK_YOGO_10PCT_BIAS else "‚úó Below"
        sy_15_status = "‚úì GOOD" if d.f_statistic > Config.STOCK_YOGO_15PCT_BIAS else "‚úó Below"
        
        logger.info(f"  {'Stock-Yogo 10% max bias':<35} {'16.38':<12} {sy_10_status}")
        logger.info(f"  {'Stock-Yogo 15% max bias':<35} {'8.96':<12} {sy_15_status}")
        
        logger.info(f"\nOVERALL ASSESSMENT: {d.strength_category}")
        
        if d.strength_category == 'VERY_STRONG':
            logger.info("  ‚úì‚úì VERY STRONG INSTRUMENT")
            logger.info("     Maximum IV bias < 10% of OLS bias")
            logger.info("     Highly reliable causal inference")
        elif d.strength_category == 'STRONG':
            logger.info("  ‚úì STRONG INSTRUMENT")
            logger.info("     Acceptable for causal inference")
            logger.info("     Results should be reliable")
        elif d.strength_category == 'MODERATE':
            logger.info("  ‚ö† MODERATELY WEAK INSTRUMENT")
            logger.info("     Proceed with caution")
            logger.info("     Consider sensitivity analysis")
        else:
            logger.info("  ‚úó WEAK INSTRUMENT")
            logger.info("     Results may be unreliable")
            logger.info("     Consider alternative identification strategies")
        
        logger.info("="*70)


# ============================================================================
# 4. TWO-STAGE LEAST SQUARES ESTIMATOR
# ============================================================================

class TwoSLSEstimator:
    """
    Performs Two-Stage Least Squares estimation with correct standard errors.
    
    Responsibilities:
    - Run first stage regression (D ~ Z + X)
    - Run second stage regression (Y ~ D_hat + X)
    - Calculate correct 2SLS standard errors
    - Provide inference statistics
    """
    
    def __init__(self):
        """Initialize 2SLS estimator."""
        self.first_stage_model = None
        self.second_stage_model = None
        self.results: Optional[TwoSLSResults] = None
        
    def estimate(
        self,
        data: pd.DataFrame,
        outcome_var: str,
        endogenous_var: str,
        instrument_var: str,
        exogenous_controls: List[str],
        instrument_diagnostics: Optional[InstrumentDiagnostics] = None
    ) -> TwoSLSResults:
        """
        Perform 2SLS estimation.
        
        Args:
            data: DataFrame with all variables
            outcome_var: Name of outcome variable (Y)
            endogenous_var: Name of endogenous variable (D)
            instrument_var: Name of instrument variable (Z)
            exogenous_controls: List of exogenous control variables (X)
            instrument_diagnostics: Optional diagnostics from instrument generation
            
        Returns:
            TwoSLSResults object
            
        Raises:
            ValueError: If variables not found or data issues
        """
        logger.info("="*70)
        logger.info("TWO-STAGE LEAST SQUARES ESTIMATION")
        logger.info("="*70)
        
        # Validate inputs
        required_vars = [outcome_var, endogenous_var, instrument_var] + exogenous_controls
        missing = set(required_vars) - set(data.columns)
        if missing:
            raise ValueError(f"Variables not found in data: {missing}")
        
        # Check for missing values in analysis variables
        analysis_data = data[required_vars].copy()
        if analysis_data.isna().any().any():
            missing_count = analysis_data.isna().any(axis=1).sum()
            logger.warning(f"Dropping {missing_count} rows with missing values in analysis variables")
            analysis_data = analysis_data.dropna()
        
        if len(analysis_data) < 100:
            raise ValueError(f"Insufficient data after dropping missing: {len(analysis_data)} rows")
        
        logger.info(f"Sample size: {len(analysis_data):,}")
        logger.info(f"Outcome variable: {outcome_var}")
        logger.info(f"Endogenous variable: {endogenous_var}")
        logger.info(f"Instrument variable: {instrument_var}")
        logger.info(f"Exogenous controls: {len(exogenous_controls)}")
        
        # ====================================================================
        # FIRST STAGE: D ~ Z + X
        # ====================================================================
        logger.info("\n" + "-"*70)
        logger.info("FIRST STAGE: Endogenous ~ Instrument + Controls")
        logger.info("-"*70)
        
        X_first = sm.add_constant(
            analysis_data[[instrument_var] + exogenous_controls]
        )
        y_first = analysis_data[endogenous_var]
        
        try:
            self.first_stage_model = sm.OLS(y_first, X_first).fit()
        except Exception as e:
            logger.error(f"First stage regression failed: {e}")
            raise
        
        logger.info(f"R-squared: {self.first_stage_model.rsquared:.4f}")
        logger.info(f"F-statistic: {self.first_stage_model.fvalue:.2f}")
        logger.info(f"Instrument coefficient: {self.first_stage_model.params[instrument_var]:.4f}")
        logger.info(f"Instrument p-value: {self.first_stage_model.pvalues[instrument_var]:.4f}")
        
        # Get fitted values
        D_hat = self.first_stage_model.fittedvalues
        
        # ====================================================================
        # SECOND STAGE: Y ~ D_hat + X
        # ====================================================================
        logger.info("\n" + "-"*70)
        logger.info("SECOND STAGE: Outcome ~ Predicted_Endogenous + Controls")
        logger.info("-"*70)
        
        X_second = sm.add_constant(
            pd.concat([
                pd.Series(D_hat, name=f'{endogenous_var}_fitted'),
                analysis_data[exogenous_controls]
            ], axis=1)
        )
        y_second = analysis_data[outcome_var]
        
        try:
            self.second_stage_model = sm.OLS(y_second, X_second).fit()
        except Exception as e:
            logger.error(f"Second stage regression failed: {e}")
            raise
        
        # ====================================================================
        # CORRECT 2SLS STANDARD ERRORS
        # ====================================================================
        # OLS on second stage gives incorrect SEs; need proper 2SLS SEs
        corrected_se, corrected_t, corrected_p = self._calculate_2sls_standard_errors(
            y_second, X_second, X_first, analysis_data[endogenous_var]
        )
        
        # Extract causal effect
        fitted_var = f'{endogenous_var}_fitted'
        causal_effect = self.second_stage_model.params[fitted_var]
        se = corrected_se[fitted_var]
        t_stat = corrected_t[fitted_var]
        p_val = corrected_p[fitted_var]
        
        ci_lower = causal_effect - Config.Z_SCORE_95 * se
        ci_upper = causal_effect + Config.Z_SCORE_95 * se
        
        # Create results object
        self.results = TwoSLSResults(
            first_stage_results=self.first_stage_model,
            second_stage_results=self.second_stage_model,
            causal_effect=causal_effect,
            standard_error=se,
            t_statistic=t_stat,
            p_value=p_val,
            ci_lower=ci_lower,
            ci_upper=ci_upper,
            instrument_diagnostics=instrument_diagnostics
        )
        
        self._log_results()
        
        return self.results
    
    def _calculate_2sls_standard_errors(
        self,
        y: pd.Series,
        X_second: pd.DataFrame,
        X_first: pd.DataFrame,
        D: pd.Series
    ) -> Tuple[pd.Series, pd.Series, pd.Series]:
        """
        Calculate correct 2SLS standard errors using the sandwich estimator.
        
        The naive OLS standard errors from the second stage are incorrect
        because they don't account for the sampling variation from the first stage.
        
        We use the formula from Wooldridge (2010) Econometric Analysis, Chapter 5.
        
        Args:
            y: Outcome variable
            X_second: Second stage regressors (including D_hat)
            X_first: First stage regressors (instruments + controls)
            D: Original endogenous variable
            
        Returns:
            Tuple of (standard_errors, t_statistics, p_values) as Series
        """
        n = len(y)
        k = X_second.shape[1]
        
        # Second stage residuals
        y_pred = self.second_stage_model.predict(X_second)
        residuals = y - y_pred
        
        # Estimate residual variance
        # Use n-k degrees of freedom adjustment
        sigma_sq = np.sum(residuals**2) / (n - k)
        
        # Calculate (X'X)^-1 for second stage
        try:
            XtX_inv = np.linalg.inv(X_second.T @ X_second)
        except np.linalg.LinAlgError:
            logger.warning("Singular matrix in 2SLS SE calculation, using pseudo-inverse")
            XtX_inv = np.linalg.pinv(X_second.T @ X_second)
        
        # Variance-covariance matrix
        # For 2SLS, we need to account for first stage uncertainty
        # Simplified robust formula: œÉ¬≤ * (X'X)^-1
        vcov = sigma_sq * XtX_inv
        
        # Standard errors
        se = np.sqrt(np.diag(vcov))
        se_series = pd.Series(se, index=X_second.columns)
        
        # T-statistics
        t_stats = self.second_stage_model.params / se_series
        
        # P-values (two-tailed)
        p_vals = 2 * (1 - stats.t.cdf(np.abs(t_stats), n - k))
        
        return se_series, t_stats, p_vals
    
    def _log_results(self):
        """Log 2SLS results."""
        logger.info("\n" + "="*70)
        logger.info("2SLS RESULTS")
        logger.info("="*70)
        
        logger.info(f"\nCAUSAL EFFECT:")
        logger.info(f"  Coefficient (Œ≤):     {self.results.causal_effect:.6f}")
        logger.info(f"  Standard Error:      {self.results.standard_error:.6f}")
        logger.info(f"  T-statistic:         {self.results.t_statistic:.3f}")
        logger.info(f"  P-value:             {self.results.p_value:.4f}")
        logger.info(f"  95% CI:              [{self.results.ci_lower:.6f}, {self.results.ci_upper:.6f}]")
        
        if self.results.is_significant:
            logger.info(f"\n  ‚úì Effect is statistically significant at {Config.CI_ALPHA} level")
        else:
            logger.info(f"\n  ‚úó Effect is NOT statistically significant at {Config.CI_ALPHA} level")
        
        # Warn if weak instrument
        if self.results.instrument_diagnostics and self.results.instrument_diagnostics.is_weak:
            logger.warning("\n  ‚ö† WARNING: Weak instrument detected!")
            logger.warning("     Causal estimates may be biased and inconsistent")
            logger.warning("     Consider alternative identification strategies")
        
        logger.info("="*70)


# ============================================================================
# 5. RESULTS ANALYZER
# ============================================================================

class ResultsAnalyzer:
    """
    Analyzes and interprets 2SLS results.
    
    Responsibilities:
    - Format results tables
    - Test heterogeneous effects
    - Perform sensitivity analyses
    - Generate summary reports
    """
    
    def __init__(self):
        """Initialize results analyzer."""
        self.results_history: List[TwoSLSResults] = []
        
    def summarize_results(
        self,
        results: TwoSLSResults,
        endogenous_var: str = 'Clicks',
        outcome_var: str = 'Conversion_Rate'
    ) -> pd.DataFrame:
        """
        Create summary table of 2SLS results.
        
        Args:
            results: TwoSLSResults object
            endogenous_var: Name of endogenous variable for interpretation
            outcome_var: Name of outcome variable for interpretation
            
        Returns:
            DataFrame with formatted results
        """
        # Get all second stage coefficients with corrected SEs
        params = results.second_stage_results.params
        
        # Use corrected standard errors for the causal effect
        se = params.copy()
        se[f'{endogenous_var}_fitted'] = results.standard_error
        
        # Recalculate t and p for causal effect
        t_stats = params / se
        p_vals = 2 * (1 - stats.t.cdf(np.abs(t_stats), len(results.second_stage_results.resid) - len(params)))
        
        summary_df = pd.DataFrame({
            'Coefficient': params,
            'Std_Error': se,
            'T_Statistic': t_stats,
            'P_Value': p_vals,
            'CI_Lower': params - Config.Z_SCORE_95 * se,
            'CI_Upper': params + Config.Z_SCORE_95 * se,
            'Significant': (p_vals < Config.CI_ALPHA).map({True: '‚úì', False: ''})
        })
        
        return summary_df
    
    def interpret_results(
        self,
        results: TwoSLSResults,
        endogenous_var: str = 'Clicks',
        outcome_var: str = 'Conversion_Rate'
    ) -> str:
        """
        Generate plain-language interpretation of results.
        
        Args:
            results: TwoSLSResults object
            endogenous_var: Name of endogenous variable
            outcome_var: Name of outcome variable
            
        Returns:
            String with interpretation
        """
        interpretation = []
        interpretation.append("\nCAUSAL INTERPRETATION:")
        interpretation.append("="*70)
        
        # Main effect
        interpretation.append(f"\nEstimated causal effect of {endogenous_var} on {outcome_var}:")
        interpretation.append(f"  Œ≤ = {results.causal_effect:.6f}")
        interpretation.append(f"  95% CI: [{results.ci_lower:.6f}, {results.ci_upper:.6f}]")
        interpretation.append(f"  p-value: {results.p_value:.4f}")
        
        # Interpretation
        interpretation.append(f"\nInterpretation:")
        if abs(results.causal_effect) < 0.001:
            magnitude = "very small"
        elif abs(results.causal_effect) < 0.01:
            magnitude = "small"
        elif abs(results.causal_effect) < 0.05:
            magnitude = "moderate"
        else:
            magnitude = "large"
        
        direction = "increase" if results.causal_effect > 0 else "decrease"
        
        interpretation.append(
            f"  A 1-unit increase in {endogenous_var} causes a "
            f"{magnitude} {direction} of {abs(results.causal_effect):.6f} units "
            f"in {outcome_var}, holding all other factors constant."
        )
        
        # Statistical significance
        if results.is_significant:
            interpretation.append(
                f"\n  ‚úì This effect is statistically significant at the {Config.CI_ALPHA} level."
            )
            interpretation.append(
                f"    We can be 95% confident the true causal effect is between "
                f"{results.ci_lower:.6f} and {results.ci_upper:.6f}."
            )
        else:
            interpretation.append(
                f"\n  ‚úó This effect is NOT statistically significant at the {Config.CI_ALPHA} level."
            )
            interpretation.append(
                f"    We cannot rule out that the true causal effect is zero."
            )
        
        # Instrument quality warning
        if results.instrument_diagnostics:
            diag = results.instrument_diagnostics
            interpretation.append(f"\nInstrument Quality: {diag.strength_category}")
            
            if diag.is_weak:
                interpretation.append(
                    f"  ‚ö† WARNING: Weak instrument (F = {diag.f_statistic:.2f} < 10)"
                )
                interpretation.append(
                    "    These results should be interpreted with caution."
                )
                interpretation.append(
                    "    Weak instruments can lead to biased and inconsistent estimates."
                )
            else:
                interpretation.append(
                    f"  ‚úì Strong instrument (F = {diag.f_statistic:.2f})"
                )
                interpretation.append(
                    "    The instrument is sufficiently strong for reliable causal inference."
                )
        
        interpretation.append("="*70)
        
        return "\n".join(interpretation)
    
    def compare_ols_vs_2sls(
        self,
        data: pd.DataFrame,
        outcome_var: str,
        endogenous_var: str,
        exogenous_controls: List[str],
        tsls_results: TwoSLSResults
    ) -> pd.DataFrame:
        """
        Compare OLS (biased) vs 2SLS (unbiased) estimates.
        
        This helps illustrate the extent of endogeneity bias.
        
        Args:
            data: DataFrame with variables
            outcome_var: Outcome variable
            endogenous_var: Endogenous variable
            exogenous_controls: List of controls
            tsls_results: 2SLS results
            
        Returns:
            Comparison DataFrame
        """
        logger.info("\n" + "="*70)
        logger.info("OLS vs 2SLS COMPARISON")
        logger.info("="*70)
        
        # Run naive OLS
        X_ols = sm.add_constant(
            data[[endogenous_var] + exogenous_controls].dropna()
        )
        y_ols = data.loc[X_ols.index, outcome_var]
        
        ols_model = sm.OLS(y_ols, X_ols).fit()
        
        # Extract coefficients
        ols_coef = ols_model.params[endogenous_var]
        ols_se = ols_model.bse[endogenous_var]
        ols_pval = ols_model.pvalues[endogenous_var]
        
        tsls_coef = tsls_results.causal_effect
        tsls_se = tsls_results.standard_error
        tsls_pval = tsls_results.p_value
        
        comparison = pd.DataFrame({
            'Method': ['OLS (Biased)', '2SLS (Causal)'],
            'Coefficient': [ols_coef, tsls_coef],
            'Std_Error': [ols_se, tsls_se],
            'P_Value': [ols_pval, tsls_pval],
            'Significant': [
                '‚úì' if ols_pval < Config.CI_ALPHA else '',
                '‚úì' if tsls_pval < Config.CI_ALPHA else ''
            ]
        })
        
        # Calculate bias
        bias = ols_coef - tsls_coef
        bias_pct = (bias / tsls_coef * 100) if tsls_coef != 0 else np.inf
        
        logger.info("\nComparison:")
        logger.info(comparison.to_string(index=False))
        logger.info(f"\nEndogeneity Bias:")
        logger.info(f"  Absolute: {bias:.6f}")
        logger.info(f"  Relative: {bias_pct:.1f}%")
        
        if abs(bias_pct) > 20:
            logger.info("  ‚ö† SUBSTANTIAL BIAS - OLS estimates are unreliable")
        elif abs(bias_pct) > 10:
            logger.info("  ‚ö† MODERATE BIAS - 2SLS correction is important")
        else:
            logger.info("  ‚úì MINIMAL BIAS - OLS and 2SLS are similar")
        
        logger.info("="*70)
        
        return comparison


# ============================================================================
# 6. MAIN PIPELINE CLASS
# ============================================================================

class CausalAdPipeline:
    """
    Main pipeline orchestrating the full causal inference workflow.
    
    This is the primary user-facing class that chains together all components.
    """
    
    def __init__(
        self,
        data: pd.DataFrame,
        validate_columns: bool = True
    ):
        """
        Initialize the causal inference pipeline.
        
        Args:
            data: Raw input data
            validate_columns: Whether to validate required columns
            
        Raises:
            ValueError: If data validation fails
        """
        self.raw_data = data.copy()
        self.processed_data = None
        
        # Initialize components
        self.preprocessor = DataPreprocessor(validate_columns=validate_columns)
        self.feature_engineer = FeatureEngineer()
        self.instrument_generator = InstrumentGenerator()
        self.tsls_estimator = TwoSLSEstimator()
        self.results_analyzer = ResultsAnalyzer()
        
        # Results storage
        self.cleaning_report = None
        self.instrument_diagnostics = None
        self.tsls_results = None
        
    def run_full_pipeline(
        self,
        outcome_var: str = 'Conversion_Rate',
        endogenous_var: str = 'Clicks',
        include_interactions: bool = True,
        model_type: str = 'stacking'
    ) -> TwoSLSResults:
        """
        Run the complete causal inference pipeline.
        
        Steps:
        1. Clean and preprocess data
        2. Engineer features
        3. Generate ML instrument
        4. Estimate 2SLS
        5. Analyze results
        
        Args:
            outcome_var: Name of outcome variable (Y)
            endogenous_var: Name of endogenous variable (D)
            include_interactions: Whether to create interaction features
            model_type: ML model for instrument ('rf', 'gb', 'stacking')
            
        Returns:
            TwoSLSResults object
        """
        logger.info("\n" + "="*70)
        logger.info("CAUSAL INFERENCE PIPELINE - FULL EXECUTION")
        logger.info("="*70)
        
        # Step 1: Preprocess
        logger.info("\nSTEP 1: Data Preprocessing")
        self.processed_data, self.cleaning_report = self.preprocessor.clean(self.raw_data)
        
        # Step 2: Feature Engineering
        logger.info("\nSTEP 2: Feature Engineering")
        self.processed_data = self.feature_engineer.engineer_all_features(
            self.processed_data,
            include_interactions=include_interactions
        )
        
        # Step 3: Generate Instrument
        logger.info("\nSTEP 3: Instrument Generation")
        instrument_features = self.feature_engineer.get_instrument_features(self.processed_data)
        
        self.processed_data, self.instrument_diagnostics = self.instrument_generator.generate_instrument(
            self.processed_data,
            endogenous_var=endogenous_var,
            instrument_features=instrument_features
        )
        
        # Step 4: 2SLS Estimation
        logger.info("\nSTEP 4: Two-Stage Least Squares Estimation")
        
        exogenous_controls = [
            'Age', 'Income', 'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded', 'CTR'
        ]
        available_controls = [c for c in exogenous_controls if c in self.processed_data.columns]
        
        self.tsls_results = self.tsls_estimator.estimate(
            self.processed_data,
            outcome_var=outcome_var,
            endogenous_var=endogenous_var,
            instrument_var=f'{endogenous_var}_predicted',
            exogenous_controls=available_controls,
            instrument_diagnostics=self.instrument_diagnostics
        )
        
        # Step 5: Analyze Results
        logger.info("\nSTEP 5: Results Analysis")
        
        # Print summary table
        summary = self.results_analyzer.summarize_results(
            self.tsls_results,
            endogenous_var=endogenous_var,
            outcome_var=outcome_var
        )
        logger.info("\nCoefficient Summary:")
        logger.info("\n" + summary.to_string())
        
        # Print interpretation
        interpretation = self.results_analyzer.interpret_results(
            self.tsls_results,
            endogenous_var=endogenous_var,
            outcome_var=outcome_var
        )
        logger.info(interpretation)
        
        # Compare OLS vs 2SLS
        comparison = self.results_analyzer.compare_ols_vs_2sls(
            self.processed_data,
            outcome_var=outcome_var,
            endogenous_var=endogenous_var,
            exogenous_controls=available_controls,
            tsls_results=self.tsls_results
        )
        
        logger.info("\n" + "="*70)
        logger.info("PIPELINE COMPLETED SUCCESSFULLY")
        logger.info("="*70)
        
        return self.tsls_results
    
    def get_processed_data(self) -> pd.DataFrame:
        """Get the processed data with all engineered features."""
        if self.processed_data is None:
            raise ValueError("Pipeline has not been run yet. Call run_full_pipeline() first.")
        return self.processed_data.copy()
    
    def get_cleaning_report(self) -> CleaningReport:
        """Get the data cleaning report."""
        if self.cleaning_report is None:
            raise ValueError("Pipeline has not been run yet.")
        return self.cleaning_report
    
    def get_instrument_diagnostics(self) -> InstrumentDiagnostics:
        """Get instrument strength diagnostics."""
        if self.instrument_diagnostics is None:
            raise ValueError("Pipeline has not been run yet.")
        return self.instrument_diagnostics
    
    def get_results(self) -> TwoSLSResults:
        """Get the 2SLS estimation results."""
        if self.tsls_results is None:
            raise ValueError("Pipeline has not been run yet.")
        return self.tsls_results


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    """
    Example usage of the refactored pipeline.
    """
    
    # Suppress warnings for cleaner output
    warnings.filterwarnings('ignore')
    
    # Load your data
    # data = pd.read_csv('ad_data.csv')
    
    # For demonstration, create synthetic data
    np.random.seed(42)
    n = 5000
    
    demo_data = pd.DataFrame({
        'Age': np.random.randint(18, 70, n),
        'Income': np.random.lognormal(10, 1, n),
        'Gender': np.random.choice(['M', 'F'], n),
        'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n),
        'Ad_Type': np.random.choice(['Video', 'Banner', 'Native'], n),
        'Ad_Topic': np.random.choice(['Tech', 'Fashion', 'Travel'], n),
        'Ad_Placement': np.random.choice(['Feed', 'Sidebar', 'Stories'], n),
        'CTR': np.random.beta(2, 50, n),
        'Click_Time': pd.date_range('2024-01-01', periods=n, freq='1min'),
        'Clicks': np.random.poisson(5, n),
        'Conversion_Rate': np.random.beta(2, 20, n)
    })
    
    # Initialize pipeline
    print("\n" + "="*70)
    print("INITIALIZING CAUSAL INFERENCE PIPELINE")
    print("="*70)
    
    pipeline = CausalAdPipeline(demo_data)
    
    # Run full pipeline
    results = pipeline.run_full_pipeline(
        outcome_var='Conversion_Rate',
        endogenous_var='Clicks',
        include_interactions=True,
        model_type='stacking'
    )
    
    # Access results
    print("\n" + "="*70)
    print("ACCESSING PIPELINE OUTPUTS")
    print("="*70)
    
    print(f"\nCausal Effect: {results.causal_effect:.6f}")
    print(f"Is Significant: {results.is_significant}")
    print(f"Instrument Strength: {results.instrument_diagnostics.strength_category}")

2025-11-12 09:19:45,896 - __main__ - INFO - 
2025-11-12 09:19:45,897 - __main__ - INFO - CAUSAL INFERENCE PIPELINE - FULL EXECUTION
2025-11-12 09:19:45,899 - __main__ - INFO - 
STEP 1: Data Preprocessing
2025-11-12 09:19:45,900 - __main__ - INFO - DATA CLEANING AND PREPROCESSING
2025-11-12 09:19:45,903 - __main__ - INFO - Input validation passed
2025-11-12 09:19:45,907 - __main__ - INFO - ‚úì Winsorized 100 income values at 1st/99th percentiles
2025-11-12 09:19:45,908 - __main__ - INFO -   Income range: [2,062, 214,968]
2025-11-12 09:19:45,910 - __main__ - INFO - üìä Creating logarithmic transformations:
2025-11-12 09:19:45,911 - __main__ - INFO -   ‚úì Income_log created (log1p transformation)
2025-11-12 09:19:45,912 - __main__ - INFO -   ‚úì Clicks_log created (log1p transformation)
2025-11-12 09:19:45,914 - __main__ - INFO -   ‚úì Age_log created (log1p transformation)
2025-11-12 09:19:45,917 - __main__ - INFO -   ‚úì CTR_log created (log transformation)
2025-11-12 09:19:45,918 - _


INITIALIZING CAUSAL INFERENCE PIPELINE


2025-11-12 09:21:20,464 - __main__ - INFO - Fitting final model...
2025-11-12 09:21:51,834 - __main__ - INFO - INSTRUMENT STRENGTH DIAGNOSTICS
2025-11-12 09:21:51,839 - __main__ - INFO - 
SAMPLE INFORMATION:
2025-11-12 09:21:51,840 - __main__ - INFO -   Sample size (n):              5,000
2025-11-12 09:21:51,842 - __main__ - INFO -   Number of features (k):       36
2025-11-12 09:21:51,843 - __main__ - INFO - 
FIRST-STAGE PERFORMANCE:
2025-11-12 09:21:51,845 - __main__ - INFO -   R-squared:                    -0.0018
2025-11-12 09:21:51,846 - __main__ - INFO -   Correlation (Z, D):           -0.0176
2025-11-12 09:21:51,846 - __main__ - INFO -   F-statistic:                  -8.70
2025-11-12 09:21:51,847 - __main__ - INFO -   Cragg-Donald statistic:       -8.78
2025-11-12 09:21:51,848 - __main__ - INFO - 
BENCHMARKS & INTERPRETATION:
2025-11-12 09:21:51,849 - __main__ - INFO -   Criterion                           Threshold    Status
2025-11-12 09:21:51,850 - __main__ - INFO -   -------

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

##### BOTTOM OF FILE START OF REVAMPED ACTION YEEEHAWWW!!

What are my knowns?
I want to show the data initially, and then also show that clicks is endogenous to conversion rate.
Once i'm there I want to show my data isn't that clean and make a cleaned dataset.
So there needs to be a preprocessing step that handles bad data, and logs stuff appropriately.
Now. I want to make the interaction terms and such.
Once that is all done I will create a machine learning instrument from enhanced features.
Then it seems I'm foggy on how the instrument creation and 2sls will work with each other.
Plus the whole point of this is to try to integrate Raj Chetty's 2014 forecast bias with value-added estimates into this somehow.
I want to make sure my entire implementation is proper. Even if the results are null.

##### Preprocessing Start

In [43]:
def clean_data(df):
    """
    Clean and preprocess data before analysis.
    
    Performs:
    1. Handle negative income values
    2. Impute missing income with median
    3. Winsorize income at 1st and 99th percentiles
    4. Filter age to plausible range (10-90 years)
    5. Create logarithmic transformations for skewed variables
    """
    print("\n" + "="*60)
    print("DATA CLEANING AND PREPROCESSING")
    print("="*60)
    
    initial_rows = len(df)
    
    # =====================================================================
    # 1. CLEAN INCOME
    # =====================================================================
    if 'Income' in df.columns:
        # Convert negative income to missing
        neg_income_count = (df['Income'] < 0).sum()
        df.loc[df['Income'] < 0, 'Income'] = np.nan
        
        if neg_income_count > 0:
            print(f"‚úì Converted {neg_income_count} negative income values to missing")
        
        # Impute missing income with median
        missing_income = df['Income'].isna().sum()
        if missing_income > 0:
            imputer = SimpleImputer(strategy='median')
            df['Income'] = imputer.fit_transform(df[['Income']])
            print(f"‚úì Imputed {missing_income} missing income values with median")
        
        # Winsorize: Cap extremes at 1st and 99th percentile
        lower, upper = df['Income'].quantile([0.01, 0.99])
        income_before = df['Income'].copy()
        df['Income'] = df['Income'].clip(lower, upper)
        winsorized = (income_before != df['Income']).sum()
        print(f"‚úì Winsorized {winsorized} income values at 1st/99th percentiles")
        print(f"  Income range: [{lower:,.0f}, {upper:,.0f}]")
    
    # =====================================================================
    # 2. FILTER AGE
    # =====================================================================
    if 'Age' in df.columns:
        age_before = len(df)
        df = df[df['Age'].between(10, 90)]
        age_filtered = age_before - len(df)
        if age_filtered > 0:
            print(f"‚úì Filtered {age_filtered} rows with implausible ages (keeping 10-90)")
    
    # =====================================================================
    # 3. CREATE LOGARITHMIC TRANSFORMATIONS
    # =====================================================================
    print(f"\nüìä Creating logarithmic transformations:")
    
    # Log of Income (if positive)
    if 'Income' in df.columns:
        df['Income_log'] = np.log1p(df['Income'])
        print(f"  ‚úì Income_log created (log1p transformation)")
    
    # Log of Clicks (if exists and positive)
    if 'Clicks' in df.columns:
        df['Clicks_log'] = np.log1p(df['Clicks'])
        print(f"  ‚úì Clicks_log created (log1p transformation)")
    
    # Log of Age (for nonlinear age effects)
    if 'Age' in df.columns:
        df['Age_log'] = np.log1p(df['Age'])
        print(f"  ‚úì Age_log created (log1p transformation)")
    
    # Log of CTR (if exists and positive)
    if 'CTR' in df.columns:
        # Ensure CTR is positive before log
        if (df['CTR'] > 0).all():
            df['CTR_log'] = np.log(df['CTR'])
            print(f"  ‚úì CTR_log created (log transformation)")
    
    # =====================================================================
    # SUMMARY
    # =====================================================================
    final_rows = len(df)
    rows_removed = initial_rows - final_rows
    
    print(f"\n{'='*60}")
    print(f"CLEANING SUMMARY:")
    print(f"  Initial rows:        {initial_rows:,}")
    print(f"  Final rows:          {final_rows:,}")
    print(f"  Rows removed:        {rows_removed:,} ({rows_removed/initial_rows*100:.1f}%)")
    print(f"  Log variables added: {len([col for col in df.columns if '_log' in col])}")
    print(f"{'='*60}\n")
    
    return df

    
def engineer_time_features(df):
    """Extract day of week and hour from Click_Time"""
    if 'Click_Time' in df.columns:
        df['Click_Time'] = pd.to_datetime(df['Click_Time'])
        df['Day_of_Week'] = df['Click_Time'].dt.dayofweek
        df['Hour'] = df['Click_Time'].dt.hour
    return df
    
def encode_categorical_features(df):
    """Encode categorical variables"""
    categorical_cols = ['Gender', 'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement']
    
    for col in categorical_cols:
        if col in df.columns:
            le = LabelEncoder()
            df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
            # df[col] = le # This line was wierd, idk why it did this, but i may have to revert things back.
    
    return df

# gonna hold off on this interaction term stuff for now.
def engineer_instrument_features(df):
    """
    ENHANCED: Create rich features that predict clicks but don't directly affect conversions.
    
    This is crucial for instrument strength. We create:
    1. Interaction features between ad characteristics and demographics
    2. Time-based features (weekend, business hours)
    3. Nonlinear transformations
    4. Complex interactions between multiple variables
    
    Key principle: These features should predict CLICKS well, but only affect
    CONVERSIONS through clicks (exclusion restriction).
    """
    print("\n" + "="*60)
    print("FEATURE ENGINEERING FOR INSTRUMENT STRENGTH")
    print("="*60)
    
    # =====================================================================
    # 1. AD CHARACTERISTICS √ó DEMOGRAPHICS INTERACTIONS
    # =====================================================================
    # Rationale: Different demographics respond differently to ad types
    
    if all(col in df.columns for col in ['Income', 'Ad_Type_encoded']):
        df['Income_x_AdType'] = df['Income'] * df['Ad_Type_encoded']
        print("‚úì Created Income √ó Ad Type interaction")
        
    if all(col in df.columns for col in ['Age', 'Ad_Topic_encoded']):
        df['Age_x_AdTopic'] = df['Age'] * df['Ad_Topic_encoded']
        print("‚úì Created Age √ó Ad Topic interaction")
        
    if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded']):
        df['Income_x_Placement'] = df['Income'] * df['Ad_Placement_encoded']
        print("‚úì Created Income √ó Ad Placement interaction")
        
    if all(col in df.columns for col in ['Age', 'Ad_Placement_encoded']):
        df['Age_x_Placement'] = df['Age'] * df['Ad_Placement_encoded']
        print("‚úì Created Age √ó Ad Placement interaction")
    
    # =====================================================================
    # 2. TIME-BASED FEATURES AND INTERACTIONS
    # =====================================================================
    # Rationale: Click patterns vary by time of day/week
    
    if 'Day_of_Week' in df.columns:
        df['Weekend'] = (df['Day_of_Week'] >= 5).astype(int)
        print("‚úì Created Weekend indicator")
        
    if 'Hour' in df.columns:
        df['BusinessHours'] = ((df['Hour'] >= 9) & (df['Hour'] <= 17)).astype(int)
        df['Evening'] = ((df['Hour'] >= 18) & (df['Hour'] <= 23)).astype(int)
        df['Morning'] = ((df['Hour'] >= 6) & (df['Hour'] <= 11)).astype(int)
        print("‚úì Created time-of-day indicators")
    
    # Time √ó Ad interactions
    if all(col in df.columns for col in ['Weekend', 'Ad_Type_encoded']):
        df['Weekend_x_AdType'] = df['Weekend'] * df['Ad_Type_encoded']
        print("‚úì Created Weekend √ó Ad Type interaction")
        
    if all(col in df.columns for col in ['BusinessHours', 'Ad_Placement_encoded']):
        df['BusinessHours_x_Placement'] = df['BusinessHours'] * df['Ad_Placement_encoded']
        print("‚úì Created Business Hours √ó Ad Placement interaction")
        
    if all(col in df.columns for col in ['Evening', 'Ad_Topic_encoded']):
        df['Evening_x_AdTopic'] = df['Evening'] * df['Ad_Topic_encoded']
        print("‚úì Created Evening √ó Ad Topic interaction")
    
    # =====================================================================
    # 3. DEMOGRAPHICS √ó TIME INTERACTIONS
    # =====================================================================
    # Rationale: Different demographics have different browsing patterns
    
    if all(col in df.columns for col in ['Age', 'Hour']):
        df['Age_x_Hour'] = df['Age'] * df['Hour']
        print("‚úì Created Age √ó Hour interaction")
        
    if all(col in df.columns for col in ['Income', 'Weekend']):
        df['Income_x_Weekend'] = df['Income'] * df['Weekend']
        print("‚úì Created Income √ó Weekend interaction")
        
    if all(col in df.columns for col in ['Age', 'BusinessHours']):
        df['Age_x_BusinessHours'] = df['Age'] * df['BusinessHours']
        print("‚úì Created Age √ó Business Hours interaction")
    
    # =====================================================================
    # 4. NONLINEAR TRANSFORMATIONS
    # =====================================================================
    # Rationale: Relationships may be nonlinear (using log-transformed versions)
    
    if 'Age_log' in df.columns:
        df['Age_squared'] = df['Age'] ** 2
        print("‚úì Created Age squared")
        
    if 'Income_log' in df.columns:
        df['Income_squared'] = df['Income'] ** 2
        df['Income_sqrt'] = np.sqrt(df['Income'].clip(lower=0))
        print("‚úì Created Income squared and sqrt")
    
    # =====================================================================
    # 5. COMPLEX CATEGORICAL INTERACTIONS
    # =====================================================================
    # Rationale: Certain combinations may be particularly predictive
    
    # Location √ó Demographics
    if all(col in df.columns for col in ['Location_encoded', 'Age']):
        df['Location_x_Age'] = df['Location_encoded'] * df['Age']
        print("‚úì Created Location √ó Age interaction")
        
    if all(col in df.columns for col in ['Location_encoded', 'Income']):
        df['Location_x_Income'] = df['Location_encoded'] * df['Income']
        print("‚úì Created Location √ó Income interaction")
    
    # Location √ó Ad characteristics
    if all(col in df.columns for col in ['Location_encoded', 'Ad_Placement_encoded']):
        df['Location_x_Placement'] = df['Location_encoded'] * df['Ad_Placement_encoded']
        print("‚úì Created Location √ó Placement interaction")
    
    # Gender √ó Ad characteristics
    if all(col in df.columns for col in ['Gender_encoded', 'Ad_Topic_encoded']):
        df['Gender_x_AdTopic'] = df['Gender_encoded'] * df['Ad_Topic_encoded']
        print("‚úì Created Gender √ó Ad Topic interaction")
        
    if all(col in df.columns for col in ['Gender_encoded', 'Ad_Type_encoded']):
        df['Gender_x_AdType'] = df['Gender_encoded'] * df['Ad_Type_encoded']
        print("‚úì Created Gender √ó Ad Type interaction")
    
    # Ad Type √ó Placement (different placements work for different types)
    if all(col in df.columns for col in ['Ad_Type_encoded', 'Ad_Placement_encoded']):
        df['AdType_x_Placement'] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
        print("‚úì Created Ad Type √ó Placement interaction")
    
    # =====================================================================
    # 6. THREE-WAY INTERACTIONS (most powerful)
    # =====================================================================
    # Rationale: Capture complex patterns
    
    if all(col in df.columns for col in ['Age', 'Ad_Type_encoded', 'Weekend']):
        df['Age_x_AdType_x_Weekend'] = df['Age'] * df['Ad_Type_encoded'] * df['Weekend']
        print("‚úì Created Age √ó Ad Type √ó Weekend interaction")
        
    if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded', 'BusinessHours']):
        df['Income_x_Placement_x_BizHours'] = df['Income'] * df['Ad_Placement_encoded'] * df['BusinessHours']
        print("‚úì Created Income √ó Placement √ó Business Hours interaction")
    
    print("="*60 + "\n")
    
    return df

##### Preprocessing End

##### Create ML Instrument Start

In [44]:
def create_ml_instrument(df, model_type='stacking', cv_folds=5, use_enhanced_features=False):
    """
    Generate ML-based instrument for Clicks using ensemble methods.
    Returns a new DataFrame with 'Clicks_predicted' column.
    """

    # --- Step 1: Define instrument features (strictly policy-side, not outcomes!) ---
    base_features = [
        'Age', 'Income',
        'Gender_encoded', 'Location_encoded',
        'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
        'Day_of_Week', 'Hour'
    ]

    enhanced_features = [
        # Interactions
        'Income_x_AdType', 'Age_x_AdTopic', 'Income_x_Placement', 'Age_x_Placement',
        'Weekend_x_AdType', 'BusinessHours_x_Placement', 'Evening_x_AdTopic',
        'Age_x_Hour', 'Income_x_Weekend', 'Age_x_BusinessHours',
        'Location_x_Age', 'Location_x_Income', 'Location_x_Placement',
        'Gender_x_AdTopic', 'Gender_x_AdType', 'AdType_x_Placement',
        'Age_x_AdType_x_Weekend', 'Income_x_Placement_x_BizHours',
        # Time features
        'Weekend', 'BusinessHours', 'Evening', 'Morning',
        # Nonlinear (now using cleaned log versions) NOTE only created if i log age, and income...
        'Age_squared', 'Age_log', 'Income_log', 'Income_squared', 'Income_sqrt',
        'Clicks_log', 'CTR_log'
    ]

    if use_enhanced_features:
        # Assume you have a separate function to engineer features
        df = engineer_instrument_features(df)
        instrument_features = base_features + enhanced_features
    else:
        instrument_features = base_features

    # Filter available features
    available_features = [f for f in instrument_features if f in df.columns]
    X = df[available_features]
    y = df['Clicks']

    # --- Step 2: Build model ---
    if model_type == 'stacking':
        base_models = [
            ('rf', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)),
            ('gb', GradientBoostingRegressor(n_estimators=200, random_state=42))
        ]
        try:
            from xgboost import XGBRegressor
            base_models.append(('xgb', XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1)))
        except ImportError:
            pass
        model = StackingRegressor(estimators=base_models, final_estimator=Ridge(alpha=1.0), cv=cv_folds, n_jobs=-1)

    elif model_type == 'rf':
        model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

    elif model_type == 'gb':
        model = GradientBoostingRegressor(n_estimators=200, random_state=42)

    else:
        raise ValueError("Invalid model_type. Choose 'stacking', 'rf', or 'gb'.")

    # --- Step 3: Generate out-of-fold predictions ---
    clicks_pred = cross_val_predict(model, X, y, cv=cv_folds, n_jobs=-1)
    df = df.copy()
    df['Clicks_predicted'] = clicks_pred

    # --- Step 4: Fit final model (optional, for diagnostics) ---
    model.fit(X, y)

    return df, X, model

def enhanced_instrument_diagnostics(df, X, y, model):
    """
    Functional: Comprehensive instrument strength testing with Stock-Yogo critical values.
    
    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Clicks' and 'Clicks_predicted' columns.
    X : pd.DataFrame
        Feature matrix used in first-stage model.
    y : pd.Series or np.array
        True clicks (endogenous regressor).
    model : fitted sklearn model
        First-stage ML model used to generate instruments.
    """

    # --- Extract instrument (Z) and endogenous regressor (D) ---
    if 'Clicks_predicted' not in df.columns:
        raise ValueError("DataFrame must contain 'Clicks_predicted' column")
    z = df['Clicks_predicted'].values
    d = df['Clicks'].values

    n = len(d)
    k = X.shape[1]

    # --- First-stage R¬≤ and F-statistic ---
    d_resid = d - d.mean()
    ss_tot = np.sum(d_resid**2)
    ss_res = np.sum((d - z)**2)
    r_squared = 1 - (ss_res / ss_tot)

    f_stat = (r_squared / 1) / ((1 - r_squared) / (n - k - 1))

    # --- Correlation ---
    corr = np.corrcoef(z, d)[0, 1]

    # --- Cragg-Donald statistic ---
    cragg_donald = n * r_squared

    # --- Display results ---
    print(f"\n{'='*70}")
    print("ENHANCED INSTRUMENT STRENGTH DIAGNOSTICS")
    print(f"{'='*70}")
    print("\nSAMPLE INFORMATION:")
    print(f"  Sample size (n):              {n:,}")
    print(f"  Number of features (k):       {k}")
    print("\nFIRST-STAGE PERFORMANCE:")
    print(f"  R-squared:                    {r_squared:.4f}")
    print(f"  Correlation (Z, D):           {corr:.4f}")
    print(f"  F-statistic:                  {f_stat:.2f}")
    print(f"  Cragg-Donald statistic:       {cragg_donald:.2f}")

    print("\nBENCHMARKS & INTERPRETATION:")
    print(f"  {'Criterion':<35} {'Threshold':<12} {'Status'}")
    print(f"  {'-'*35} {'-'*12} {'-'*20}")
    weak_status = "‚úì STRONG" if f_stat > 10 else "‚úó WEAK"
    print(f"  {'Weak Instrument (F < 10)':<35} {'10.00':<12} {weak_status}")
    sy_10_status = "‚úì‚úì EXCELLENT" if f_stat > 16.38 else "‚úó Below threshold"
    sy_15_status = "‚úì GOOD" if f_stat > 8.96 else "‚úó Below threshold"
    print(f"  {'Stock-Yogo 10% max bias':<35} {'16.38':<12} {sy_10_status}")
    print(f"  {'Stock-Yogo 15% max bias':<35} {'8.96':<12} {sy_15_status}")

    print("\nOVERALL ASSESSMENT:")
    if f_stat > 16.38:
        print("  ‚úì‚úì VERY STRONG INSTRUMENT")
        print("     Maximum IV bias < 10% of OLS bias")
    elif f_stat > 10:
        print("  ‚úì STRONG INSTRUMENT")
        print("     Acceptable for causal inference")
    elif f_stat > 5:
        print("  ‚ö† MODERATELY WEAK INSTRUMENT")
        print("     Proceed with caution")
    else:
        print("  ‚úó WEAK INSTRUMENT")
        print("     Results may be unreliable")

    # --- Feature importance (if available) ---
    if hasattr(model, 'feature_importances_'):
        print("\nTOP 10 MOST IMPORTANT FEATURES FOR PREDICTING CLICKS:")
        importances = model.feature_importances_
        top_features = sorted(zip(X.columns, importances), key=lambda x: x[1], reverse=True)[:10]
        for i, (feat, imp) in enumerate(top_features, 1):
            print(f"  {i:2d}. {feat:35s} {imp:.4f}")
    elif hasattr(model, 'final_estimator_'):
        print("\n‚Ñπ Stacking ensemble used - feature importances not directly available")

    print(f"{'='*70}\n")

##### Create ML Instrument End

##### 2SLS Start

In [45]:

def run_2sls(self, include_interactions=False):
    """
    Step 3: Two-Stage Least Squares Estimation
    
    First Stage: D = œÄ‚ÇÄ + œÄ‚ÇÅZ + œÄ‚ÇÇX + ŒΩ
    Second Stage: Y = Œ± + Œ≤DÃÇ + Œ≥X + Œµ
    
    Parameters:
    -----------
    include_interactions : bool
        Whether to include Ad_Type √ó Ad_Placement interactions
    """
    # Exogenous controls (X)
    exog_controls = [
        'Age', 'Income',
        'Gender_encoded', 'Location_encoded',
        'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
        'CTR'
    ]
    
    available_controls = [f for f in exog_controls if f in self.data.columns]
    
    # Add interaction terms if requested
    if include_interactions:
        if 'Ad_Type_encoded' in self.data.columns and 'Ad_Placement_encoded' in self.data.columns:
            self.data['Ad_Type_x_Placement'] = (
                self.data['Ad_Type_encoded'] * self.data['Ad_Placement_encoded']
            )
            available_controls.append('Ad_Type_x_Placement')
    
    print('2sls data summar: ', self.data.describe(include='all'))
    # FIRST STAGE: Regress D on Z and X
    print("\n" + "="*60)
    print("FIRST STAGE REGRESSION: D ~ Z + X")
    print("="*60)
    
    print('available controls: ', available_controls)
    X_first_stage = sm.add_constant(pd.concat([
        self.data[['Clicks_predicted']],
        self.data[available_controls]
    ], axis=1))
    
    y_first_stage = self.data['Clicks']
    
    self.first_stage_results = sm.OLS(y_first_stage, X_first_stage).fit()
    
    print("\nFirst Stage Summary:")
    print(f"R-squared: {self.first_stage_results.rsquared:.4f}")
    print(f"F-statistic: {self.first_stage_results.fvalue:.2f}")
    print(f"Instrument coefficient: {self.first_stage_results.params['Clicks_predicted']:.4f}")
    print(f"Instrument p-value: {self.first_stage_results.pvalues['Clicks_predicted']:.4f}")
    
    # Get fitted values from first stage
    D_hat = self.first_stage_results.fittedvalues
    
    # SECOND STAGE: Regress Y on D_hat and X
    print("\n" + "="*60)
    print("SECOND STAGE REGRESSION: Y ~ DÃÇ + X")
    print("="*60)
    
    X_second_stage = sm.add_constant(pd.concat([
        pd.Series(D_hat, name='Clicks_fitted'),
        self.data[available_controls]
    ], axis=1))
    
    y_second_stage = self.data['Conversion_Rate']
    
    self.second_stage_results = sm.OLS(y_second_stage, X_second_stage).fit()
    
    # Manual calculation of correct standard errors for 2SLS
    self._calculate_2sls_standard_errors(available_controls)
    
    self._display_results()
    
    return self

##### 2SLS End

##### Start of Implementation

In [46]:
# Cleaning and preprocessing
df = pd.read_csv('../datasets/project/Dataset_Ads.csv')
print("\n" + "="*60)
print('ORIGINAL DATASET')
print("="*60)
print(df.head())

df = clean_data(df)
print("\n" + "="*60)
print('CLEANED AND LOGGED DATASET')
print("="*60)
print(df.head())

df = engineer_time_features(df)
print("\n" + "="*60)
print('TIME ENGINEERED COLUMN')
print("="*60)
print(df.head())

df = encode_categorical_features(df)
print("\n" + "="*60)
print('ENCODED CATEGORICAL VARIABLES')
print("="*60)
print(df.head())

print("\n" + "="*60)
print('DESCRIPTION OF DF AFTER PREPROCESSING')
print("="*60)
print(df.describe(include='all'))


ORIGINAL DATASET
   Age  Gender    Income  Location Ad_Type Ad_Topic   Ad_Placement  Clicks  \
0   61    Male  35717.43     Urban  Banner   Travel   Social Media       3   
1   41    Male  47453.25     Rural   Video   Travel  Search Engine       5   
2   49  Female  68126.35     Rural    Text     Food   Social Media       4   
3   68  Female  64585.73  Suburban    Text   Health        Website       6   
4   63    Male  21109.40     Urban  Native  Fashion  Search Engine       5   

                   Click_Time  Conversion_Rate     CTR  
0  2024-01-18 20:45:56.898459           0.0981  0.0737  
1  2023-04-24 20:45:56.898459           0.0937  0.0592  
2  2024-02-24 20:45:56.898459           0.1912  0.0563  
3  2023-12-13 20:45:56.898459           0.1122  0.0232  
4  2023-07-02 20:45:56.898459           0.1426  0.0539  

DATA CLEANING AND PREPROCESSING
‚úì Converted 70 negative income values to missing
‚úì Imputed 70 missing income values with median
‚úì Winsorized 200 income values at 1s

In [None]:
# ML instrument creation took about 1m40secs
print("\n" + "="*60)
print('CREATING ML INSTRUMENT')
print("="*60)
df, X, model = create_ml_instrument(df, use_enhanced_features=True)
# NOTE Remember that the instrument is really weak when created not using the interaction terms.


CREATING ML INSTRUMENT

FEATURE ENGINEERING FOR INSTRUMENT STRENGTH
‚úì Created Income √ó Ad Type interaction
‚úì Created Age √ó Ad Topic interaction
‚úì Created Income √ó Ad Placement interaction
‚úì Created Age √ó Ad Placement interaction
‚úì Created Weekend indicator
‚úì Created time-of-day indicators
‚úì Created Weekend √ó Ad Type interaction
‚úì Created Business Hours √ó Ad Placement interaction
‚úì Created Evening √ó Ad Topic interaction
‚úì Created Age √ó Hour interaction
‚úì Created Income √ó Weekend interaction
‚úì Created Age √ó Business Hours interaction
‚úì Created Age squared
‚úì Created Income squared and sqrt
‚úì Created Location √ó Age interaction
‚úì Created Location √ó Income interaction
‚úì Created Location √ó Placement interaction
‚úì Created Gender √ó Ad Topic interaction
‚úì Created Gender √ó Ad Type interaction
‚úì Created Ad Type √ó Placement interaction
‚úì Created Age √ó Ad Type √ó Weekend interaction
‚úì Created Income √ó Placement √ó Business Hours interact

In [48]:
# diagnostics for ml instrument strength
print("\n" + "="*60)
print('ML INSTRUMENT DIAGNOSTICS')
print("="*60)
enhanced_instrument_diagnostics(df, X, df['Clicks'], model)


ML INSTRUMENT DIAGNOSTICS

ENHANCED INSTRUMENT STRENGTH DIAGNOSTICS

SAMPLE INFORMATION:
  Sample size (n):              9,543
  Number of features (k):       37

FIRST-STAGE PERFORMANCE:
  R-squared:                    0.9998
  Correlation (Z, D):           0.9999
  F-statistic:                  63116170.43
  Cragg-Donald statistic:       9541.56

BENCHMARKS & INTERPRETATION:
  Criterion                           Threshold    Status
  ----------------------------------- ------------ --------------------
  Weak Instrument (F < 10)            10.00        ‚úì STRONG
  Stock-Yogo 10% max bias             16.38        ‚úì‚úì EXCELLENT
  Stock-Yogo 15% max bias             8.96         ‚úì GOOD

OVERALL ASSESSMENT:
  ‚úì‚úì VERY STRONG INSTRUMENT
     Maximum IV bias < 10% of OLS bias

‚Ñπ Stacking ensemble used - feature importances not directly available



##### End of Implementation