Previous file was getting too chunky. This one has just the finalized complete pipeline for the ml feature engineered thang for 2SLS.

##### Python Libraries

In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
import statsmodels.api as sm
from scipy import stats
from xgboost import XGBRegressor
from linearmodels.iv import IV2SLS
import warnings

warnings.filterwarnings('ignore')

Thought vomit
What are my knowns?

I want to show the data initially, and then also show that clicks is endogenous to conversion rate.

Once i'm there I want to show my data isn't that clean and make a cleaned dataset.

So there needs to be a preprocessing step that handles bad data, and logs stuff appropriately.

Now. I want to make the interaction terms and such.

Once that is all done I will create a machine learning instrument from enhanced features.

Then it seems I'm foggy on how the instrument creation and 2sls will work with each other.

Plus the whole point of this is to try to integrate Raj Chetty's 2014 forecast bias with value-added estimates into this somehow.

I want to make sure my entire implementation is proper. Even if the results are null.


##### Preprocessing Start

In [13]:
def clean_data(df):
    """
    Clean and preprocess data before analysis.
    
    Performs:
    1. Handle negative income values
    2. Impute missing income with median
    3. Winsorize income at 1st and 99th percentiles
    4. Filter age to plausible range (10-90 years)
    5. Create logarithmic transformations for skewed variables
    """
    print("\n" + "="*60)
    print("DATA CLEANING AND PREPROCESSING")
    print("="*60)
    
    initial_rows = len(df)
    
    # =====================================================================
    # 1. CLEAN INCOME
    # =====================================================================
    if 'Income' in df.columns:
        # Convert negative income to missing
        neg_income_count = (df['Income'] < 0).sum()
        df.loc[df['Income'] < 0, 'Income'] = np.nan
        
        if neg_income_count > 0:
            print(f"âœ“ Converted {neg_income_count} negative income values to missing")
        
        # Impute missing income with median
        missing_income = df['Income'].isna().sum()
        if missing_income > 0:
            imputer = SimpleImputer(strategy='median')
            df['Income'] = imputer.fit_transform(df[['Income']])
            print(f"âœ“ Imputed {missing_income} missing income values with median")
        
        # Winsorize: Cap extremes at 1st and 99th percentile
        lower, upper = df['Income'].quantile([0.01, 0.99])
        income_before = df['Income'].copy()
        df['Income'] = df['Income'].clip(lower, upper)
        winsorized = (income_before != df['Income']).sum()
        print(f"âœ“ Winsorized {winsorized} income values at 1st/99th percentiles")
        print(f"  Income range: [{lower:,.0f}, {upper:,.0f}]")
    
    # =====================================================================
    # 2. FILTER AGE
    # =====================================================================
    if 'Age' in df.columns:
        age_before = len(df)
        df = df[df['Age'].between(10, 90)]
        age_filtered = age_before - len(df)
        if age_filtered > 0:
            print(f"âœ“ Filtered {age_filtered} rows with implausible ages (keeping 10-90)")
    
    # =====================================================================
    # 3. CREATE LOGARITHMIC TRANSFORMATIONS
    # =====================================================================
    print(f"\nðŸ“Š Creating logarithmic transformations:")
    
    # Log of Income (if positive)
    if 'Income' in df.columns:
        df['Income_log'] = np.log1p(df['Income'])
        print(f"  âœ“ Income_log created (log1p transformation)")
    
    # Log of Clicks (if exists and positive)
    if 'Clicks' in df.columns:
        df['Clicks_log'] = np.log1p(df['Clicks'])
        print(f"  âœ“ Clicks_log created (log1p transformation)")
    
    # Log of Age (for nonlinear age effects)
    if 'Age' in df.columns:
        df['Age_log'] = np.log1p(df['Age'])
        print(f"  âœ“ Age_log created (log1p transformation)")
    
    # Log of CTR (if exists and positive)
    if 'CTR' in df.columns:
        # Ensure CTR is positive before log
        if (df['CTR'] > 0).all():
            df['CTR_log'] = np.log(df['CTR'])
            print(f"  âœ“ CTR_log created (log transformation)")
    
    # =====================================================================
    # SUMMARY
    # =====================================================================
    final_rows = len(df)
    rows_removed = initial_rows - final_rows
    
    print(f"\n{'='*60}")
    print(f"CLEANING SUMMARY:")
    print(f"  Initial rows:        {initial_rows:,}")
    print(f"  Final rows:          {final_rows:,}")
    print(f"  Rows removed:        {rows_removed:,} ({rows_removed/initial_rows*100:.1f}%)")
    print(f"  Log variables added: {len([col for col in df.columns if '_log' in col])}")
    print(f"{'='*60}\n")
    
    return df

    
def engineer_time_features(df):
    """Extract day of week and hour from Click_Time"""
    if 'Click_Time' in df.columns:
        df['Click_Time'] = pd.to_datetime(df['Click_Time'])
        df['Day_of_Week'] = df['Click_Time'].dt.dayofweek
        df['Hour'] = df['Click_Time'].dt.hour
    return df
    
def encode_categorical_features(df):
    """Encode categorical variables"""
    categorical_cols = ['Gender', 'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement']
    
    for col in categorical_cols:
        if col in df.columns:
            le = LabelEncoder()
            df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
            # df[col] = le # This line was wierd, idk why it did this, but i may have to revert things back.
    
    return df

# gonna hold off on this interaction term stuff for now.
def engineer_instrument_features(df):
    """
    ENHANCED: Create rich features that predict clicks but don't directly affect conversions.
    
    This is crucial for instrument strength. We create:
    1. Interaction features between ad characteristics and demographics
    2. Time-based features (weekend, business hours)
    3. Nonlinear transformations
    4. Complex interactions between multiple variables
    
    Key principle: These features should predict CLICKS well, but only affect
    CONVERSIONS through clicks (exclusion restriction).
    """
    print("\n" + "="*60)
    print("FEATURE ENGINEERING FOR INSTRUMENT STRENGTH")
    print("="*60)
    
    # =====================================================================
    # 1. AD CHARACTERISTICS Ã— DEMOGRAPHICS INTERACTIONS
    # =====================================================================
    # Rationale: Different demographics respond differently to ad types
    
    if all(col in df.columns for col in ['Income', 'Ad_Type_encoded']):
        df['Income_x_AdType'] = df['Income'] * df['Ad_Type_encoded']
        print("âœ“ Created Income Ã— Ad Type interaction")
        
    if all(col in df.columns for col in ['Age', 'Ad_Topic_encoded']):
        df['Age_x_AdTopic'] = df['Age'] * df['Ad_Topic_encoded']
        print("âœ“ Created Age Ã— Ad Topic interaction")
        
    if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded']):
        df['Income_x_Placement'] = df['Income'] * df['Ad_Placement_encoded']
        print("âœ“ Created Income Ã— Ad Placement interaction")
        
    if all(col in df.columns for col in ['Age', 'Ad_Placement_encoded']):
        df['Age_x_Placement'] = df['Age'] * df['Ad_Placement_encoded']
        print("âœ“ Created Age Ã— Ad Placement interaction")
    
    # =====================================================================
    # 2. TIME-BASED FEATURES AND INTERACTIONS
    # =====================================================================
    # Rationale: Click patterns vary by time of day/week
    
    if 'Day_of_Week' in df.columns:
        df['Weekend'] = (df['Day_of_Week'] >= 5).astype(int)
        print("âœ“ Created Weekend indicator")
        
    if 'Hour' in df.columns:
        df['BusinessHours'] = ((df['Hour'] >= 9) & (df['Hour'] <= 17)).astype(int)
        df['Evening'] = ((df['Hour'] >= 18) & (df['Hour'] <= 23)).astype(int)
        df['Morning'] = ((df['Hour'] >= 6) & (df['Hour'] <= 11)).astype(int)
        print("âœ“ Created time-of-day indicators")
    
    # Time Ã— Ad interactions
    if all(col in df.columns for col in ['Weekend', 'Ad_Type_encoded']):
        df['Weekend_x_AdType'] = df['Weekend'] * df['Ad_Type_encoded']
        print("âœ“ Created Weekend Ã— Ad Type interaction")
        
    if all(col in df.columns for col in ['BusinessHours', 'Ad_Placement_encoded']):
        df['BusinessHours_x_Placement'] = df['BusinessHours'] * df['Ad_Placement_encoded']
        print("âœ“ Created Business Hours Ã— Ad Placement interaction")
        
    if all(col in df.columns for col in ['Evening', 'Ad_Topic_encoded']):
        df['Evening_x_AdTopic'] = df['Evening'] * df['Ad_Topic_encoded']
        print("âœ“ Created Evening Ã— Ad Topic interaction")
    
    # =====================================================================
    # 3. DEMOGRAPHICS Ã— TIME INTERACTIONS
    # =====================================================================
    # Rationale: Different demographics have different browsing patterns
    
    if all(col in df.columns for col in ['Age', 'Hour']):
        df['Age_x_Hour'] = df['Age'] * df['Hour']
        print("âœ“ Created Age Ã— Hour interaction")
        
    if all(col in df.columns for col in ['Income', 'Weekend']):
        df['Income_x_Weekend'] = df['Income'] * df['Weekend']
        print("âœ“ Created Income Ã— Weekend interaction")
        
    if all(col in df.columns for col in ['Age', 'BusinessHours']):
        df['Age_x_BusinessHours'] = df['Age'] * df['BusinessHours']
        print("âœ“ Created Age Ã— Business Hours interaction")
    
    # =====================================================================
    # 4. NONLINEAR TRANSFORMATIONS
    # =====================================================================
    # Rationale: Relationships may be nonlinear (using log-transformed versions)
    
    if 'Age_log' in df.columns:
        df['Age_squared'] = df['Age'] ** 2
        print("âœ“ Created Age squared")
        
    if 'Income_log' in df.columns:
        df['Income_squared'] = df['Income'] ** 2
        df['Income_sqrt'] = np.sqrt(df['Income'].clip(lower=0))
        print("âœ“ Created Income squared and sqrt")
    
    # =====================================================================
    # 5. COMPLEX CATEGORICAL INTERACTIONS
    # =====================================================================
    # Rationale: Certain combinations may be particularly predictive
    
    # Location Ã— Demographics
    if all(col in df.columns for col in ['Location_encoded', 'Age']):
        df['Location_x_Age'] = df['Location_encoded'] * df['Age']
        print("âœ“ Created Location Ã— Age interaction")
        
    if all(col in df.columns for col in ['Location_encoded', 'Income']):
        df['Location_x_Income'] = df['Location_encoded'] * df['Income']
        print("âœ“ Created Location Ã— Income interaction")
    
    # Location Ã— Ad characteristics
    if all(col in df.columns for col in ['Location_encoded', 'Ad_Placement_encoded']):
        df['Location_x_Placement'] = df['Location_encoded'] * df['Ad_Placement_encoded']
        print("âœ“ Created Location Ã— Placement interaction")
    
    # Gender Ã— Ad characteristics
    if all(col in df.columns for col in ['Gender_encoded', 'Ad_Topic_encoded']):
        df['Gender_x_AdTopic'] = df['Gender_encoded'] * df['Ad_Topic_encoded']
        print("âœ“ Created Gender Ã— Ad Topic interaction")
        
    if all(col in df.columns for col in ['Gender_encoded', 'Ad_Type_encoded']):
        df['Gender_x_AdType'] = df['Gender_encoded'] * df['Ad_Type_encoded']
        print("âœ“ Created Gender Ã— Ad Type interaction")
    
    # Ad Type Ã— Placement (different placements work for different types)
    if all(col in df.columns for col in ['Ad_Type_encoded', 'Ad_Placement_encoded']):
        df['AdType_x_Placement'] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
        print("âœ“ Created Ad Type Ã— Placement interaction")
    
    # =====================================================================
    # 6. THREE-WAY INTERACTIONS (most powerful)
    # =====================================================================
    # Rationale: Capture complex patterns
    
    if all(col in df.columns for col in ['Age', 'Ad_Type_encoded', 'Weekend']):
        df['Age_x_AdType_x_Weekend'] = df['Age'] * df['Ad_Type_encoded'] * df['Weekend']
        print("âœ“ Created Age Ã— Ad Type Ã— Weekend interaction")
        
    if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded', 'BusinessHours']):
        df['Income_x_Placement_x_BizHours'] = df['Income'] * df['Ad_Placement_encoded'] * df['BusinessHours']
        print("âœ“ Created Income Ã— Placement Ã— Business Hours interaction")
    
    print("="*60 + "\n")
    
    return df

##### Preprocessing End

##### Create ML Instrument Start

In [14]:
def create_ml_instrument(df, model_type='stacking', cv_folds=5, use_enhanced_features=False):
    """
    Generate ML-based instrument for Clicks using ensemble methods.
    Returns a new DataFrame with 'Clicks_predicted' column.
    """

    # --- Step 1: Define instrument features (strictly policy-side, not outcomes!) ---
    base_features = [
        'Age', 'Income',
        'Gender_encoded', 'Location_encoded',
        'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
        'Day_of_Week', 'Hour'
    ]

    enhanced_features = [
        # Interactions
        'Income_x_AdType', 'Age_x_AdTopic', 'Income_x_Placement', 'Age_x_Placement',
        'Weekend_x_AdType', 'BusinessHours_x_Placement', 'Evening_x_AdTopic',
        'Age_x_Hour', 'Income_x_Weekend', 'Age_x_BusinessHours',
        'Location_x_Age', 'Location_x_Income', 'Location_x_Placement',
        'Gender_x_AdTopic', 'Gender_x_AdType', 'AdType_x_Placement',
        'Age_x_AdType_x_Weekend', 'Income_x_Placement_x_BizHours',
        # Time features
        'Weekend', 'BusinessHours', 'Evening', 'Morning',
        # Nonlinear (now using cleaned log versions) NOTE only created if i log age, and income...
        'Age_squared', 'Age_log', 'Income_log', 'Income_squared', 'Income_sqrt',
        'Clicks_log', 'CTR_log'
    ]

    if use_enhanced_features:
        # Assume you have a separate function to engineer features
        df = engineer_instrument_features(df)
        instrument_features = base_features + enhanced_features
    else:
        instrument_features = base_features

    # Filter available features
    available_features = [f for f in instrument_features if f in df.columns]
    X = df[available_features]
    y = df['Clicks']

    # --- Step 2: Build model ---
    if model_type == 'stacking':
        base_models = [
            ('rf', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)),
            ('gb', GradientBoostingRegressor(n_estimators=200, random_state=42))
        ]
        try:
            from xgboost import XGBRegressor
            base_models.append(('xgb', XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1)))
        except ImportError:
            pass
        model = StackingRegressor(estimators=base_models, final_estimator=Ridge(alpha=1.0), cv=cv_folds, n_jobs=-1)

    elif model_type == 'rf':
        model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

    elif model_type == 'gb':
        model = GradientBoostingRegressor(n_estimators=200, random_state=42)

    else:
        raise ValueError("Invalid model_type. Choose 'stacking', 'rf', or 'gb'.")

    # --- Step 3: Generate out-of-fold predictions ---
    clicks_pred = cross_val_predict(model, X, y, cv=cv_folds, n_jobs=-1)
    df = df.copy()
    df['Clicks_predicted'] = clicks_pred

    # --- Step 4: Fit final model (optional, for diagnostics) ---
    model.fit(X, y)

    return df, X, model

def enhanced_instrument_diagnostics(df, X, y, model):
    """
    Functional: Comprehensive instrument strength testing with Stock-Yogo critical values.
    
    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Clicks' and 'Clicks_predicted' columns.
    X : pd.DataFrame
        Feature matrix used in first-stage model.
    y : pd.Series or np.array
        True clicks (endogenous regressor).
    model : fitted sklearn model
        First-stage ML model used to generate instruments.
    """

    # --- Extract instrument (Z) and endogenous regressor (D) ---
    if 'Clicks_predicted' not in df.columns:
        raise ValueError("DataFrame must contain 'Clicks_predicted' column")
    z = df['Clicks_predicted'].values
    d = df['Clicks'].values

    n = len(d)
    k = X.shape[1]

    # --- First-stage RÂ² and F-statistic ---
    d_resid = d - d.mean()
    ss_tot = np.sum(d_resid**2)
    ss_res = np.sum((d - z)**2)
    r_squared = 1 - (ss_res / ss_tot)

    f_stat = (r_squared / 1) / ((1 - r_squared) / (n - k - 1))

    # --- Correlation ---
    corr = np.corrcoef(z, d)[0, 1]

    # --- Cragg-Donald statistic ---
    cragg_donald = n * r_squared

    # --- Display results ---
    print(f"\n{'='*70}")
    print("ENHANCED INSTRUMENT STRENGTH DIAGNOSTICS")
    print(f"{'='*70}")
    print("\nSAMPLE INFORMATION:")
    print(f"  Sample size (n):              {n:,}")
    print(f"  Number of features (k):       {k}")
    print("\nFIRST-STAGE PERFORMANCE:")
    print(f"  R-squared:                    {r_squared:.4f}")
    print(f"  Correlation (Z, D):           {corr:.4f}")
    print(f"  F-statistic:                  {f_stat:.2f}")
    print(f"  Cragg-Donald statistic:       {cragg_donald:.2f}")

    print("\nBENCHMARKS & INTERPRETATION:")
    print(f"  {'Criterion':<35} {'Threshold':<12} {'Status'}")
    print(f"  {'-'*35} {'-'*12} {'-'*20}")
    weak_status = "âœ“ STRONG" if f_stat > 10 else "âœ— WEAK"
    print(f"  {'Weak Instrument (F < 10)':<35} {'10.00':<12} {weak_status}")
    sy_10_status = "âœ“âœ“ EXCELLENT" if f_stat > 16.38 else "âœ— Below threshold"
    sy_15_status = "âœ“ GOOD" if f_stat > 8.96 else "âœ— Below threshold"
    print(f"  {'Stock-Yogo 10% max bias':<35} {'16.38':<12} {sy_10_status}")
    print(f"  {'Stock-Yogo 15% max bias':<35} {'8.96':<12} {sy_15_status}")

    print("\nOVERALL ASSESSMENT:")
    if f_stat > 16.38:
        print("  âœ“âœ“ VERY STRONG INSTRUMENT")
        print("     Maximum IV bias < 10% of OLS bias")
    elif f_stat > 10:
        print("  âœ“ STRONG INSTRUMENT")
        print("     Acceptable for causal inference")
    elif f_stat > 5:
        print("  âš  MODERATELY WEAK INSTRUMENT")
        print("     Proceed with caution")
    else:
        print("  âœ— WEAK INSTRUMENT")
        print("     Results may be unreliable")

    # --- Feature importance (if available) ---
    if hasattr(model, 'feature_importances_'):
        print("\nTOP 10 MOST IMPORTANT FEATURES FOR PREDICTING CLICKS:")
        importances = model.feature_importances_
        top_features = sorted(zip(X.columns, importances), key=lambda x: x[1], reverse=True)[:10]
        for i, (feat, imp) in enumerate(top_features, 1):
            print(f"  {i:2d}. {feat:35s} {imp:.4f}")
    elif hasattr(model, 'final_estimator_'):
        print("\nâ„¹ Stacking ensemble used - feature importances not directly available")

    print(f"{'='*70}\n")

##### Create ML Instrument End

##### 2SLS Start

In [15]:
def run_2sls(
    df,
    y_col='Conversion_Rate',
    d_col='Clicks',
    z_col='Clicks_predicted',
    base_controls=None,
    include_interactions=False,
    add_constant=True,
    cluster_col=None,
    cov_type='robust'  # 'robust' for HC, or 'cluster' if cluster_col is set
):
    """
    Functional 2SLS using linearmodels.iv.IV2SLS.

    Model:
        First stage: D = Ï€0 + Ï€1 Z + Î“ X + Î½
        Second stage: Y = Î± + Î² D + Î˜ X + Îµ

    Parameters
    ----------
    df : pd.DataFrame
        Must contain y_col, d_col, z_col, and any control columns.
    y_col : str
        Outcome column (e.g., 'Conversion_Rate').
    d_col : str
        Endogenous regressor (e.g., 'Clicks').
    z_col : str
        Instrument column (e.g., 'Clicks_predicted' from ML first stage).
    base_controls : list[str] or None
        Exogenous controls. Do NOT include outcome-adjacent variables like CTR.
        Recommended: ['Age', 'Income', 'Gender_encoded', 'Location_encoded',
                      'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'].
    include_interactions : bool
        If True, include Ad_Type Ã— Ad_Placement interaction (exogenous).
    add_constant : bool
        If True, add a constant term automatically.
    cluster_col : str or None
        Column name for cluster-robust SEs (e.g., 'UserID', 'CampaignID').
    cov_type : str
        'robust' (HC), 'cluster' (requires cluster_col), or 'unadjusted'.

    Returns
    -------
    results : IV2SLSResults
        Fitted IV results object from linearmodels.
    data_used : pd.DataFrame
        DataFrame with columns actually used in estimation.
    """

    # --- Validate required columns ---
    required = [y_col, d_col, z_col]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # --- Controls: enforce exogeneity discipline ---
    if base_controls is None:
        base_controls = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
        ]

    # Strict: exclude CTR or any post-click/outcome-adjacent metrics from controls
    invalid_controls = [c for c in base_controls if c not in df.columns]
    if invalid_controls:
        # Warn strictly but proceed with available subset
        print(f"âš  Skipping missing controls: {invalid_controls}")
    controls = [c for c in base_controls if c in df.columns]

    # --- Optional exogenous interaction ---
    interaction_col = None
    if include_interactions:
        if ('Ad_Type_encoded' in df.columns) and ('Ad_Placement_encoded' in df.columns):
            interaction_col = 'Ad_Type_x_Placement'
            if interaction_col not in df.columns:
                df = df.copy()
                df[interaction_col] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
                print("âœ“ Added exogenous interaction: Ad_Type_x_Placement")
            controls.append(interaction_col)
        else:
            print("âš  Interaction requested but required columns not present; skipping.")

    # --- Build data frame used in estimation ---
    cols_needed = [y_col, d_col, z_col] + controls
    data = df[cols_needed].dropna().copy()
    if data.empty:
        raise ValueError("After dropping NA, no rows remain for estimation.")

    # --- Build formula for IV2SLS ---
    # dependent ~ exog + [endog ~ instruments]
    exog_formula = ' + '.join(controls) if controls else '1'
    if add_constant and exog_formula != '1':
        exog_formula = '1 + ' + exog_formula  # linearmodels adds constant via '1 +'
    elif add_constant and exog_formula == '1':
        # '1' already denotes constant in linearmodels formula
        pass
    else:
        # No constant: use '-1' to suppress intercept if you have exog terms
        if controls:
            exog_formula = '-1 + ' + ' + '.join(controls)

    formula = f"{y_col} ~ {exog_formula} + [{d_col} ~ {z_col}]"

    # --- Fit IV2SLS ---
    if cov_type == 'cluster' and (cluster_col is not None) and (cluster_col in df.columns):
        clusters = df.loc[data.index, cluster_col]
        results = IV2SLS.from_formula(formula, data=data).fit(cov_type='clustered', clusters=clusters)
    else:
        if cov_type == 'cluster' and cluster_col is None:
            print("âš  cov_type='cluster' requested but no cluster_col provided; defaulting to robust.")
        results = IV2SLS.from_formula(formula, data=data).fit(cov_type='robust' if cov_type != 'unadjusted' else 'unadjusted')

    # --- Strict reporting ---
    print("\n" + "="*70)
    print("2SLS ESTIMATION SUMMARY (linearmodels.iv.IV2SLS)")
    print("="*70)
    print(results.summary)

    # Optional first-stage diagnostics available via .first_stage (dict of RegressionResults)
    # Example:
    try:
        fs = results.first_stage[d_col]
        print("\nFirst-stage summary (endogenous regressor: {}):".format(d_col))
        print(f"  R-squared: {fs.rsquared:.4f}")
        print(f"  F-statistic (excluded instrument): {getattr(fs, 'f_statistic', None)}")
    except Exception:
        pass

    return results, data

##### 2SLS End

##### Stratified 2SLS Start

In [None]:
import numpy as np
import pandas as pd
from linearmodels.iv import IV2SLS

def analyze_subgroup_effects_iv(
    df,
    subgroup_vars=None,
    min_subgroup_size=100,
    y_col='Conversion_Rate',
    d_col='Clicks',
    z_col='Clicks_predicted',
    base_controls=None,
    add_constant=True,
    cov_type='robust',         # 'robust', 'unadjusted', or 'cluster'
    cluster_col=None,
    verbose=True
):
    """
    Stratified IV2SLS with robust input validation and rank-deficiency pruning.
    """

    # --- Required columns ---
    required = [y_col, d_col, z_col]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # --- Controls: strictly exogenous, no post-treatment metrics ---
    if base_controls is None:
        base_controls = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
        ]
    controls_all = [c for c in base_controls if c in df.columns]
    if verbose:
        missing_controls = [c for c in base_controls if c not in df.columns]
        if missing_controls:
            print(f"âš  Skipping missing controls: {missing_controls}")

    # --- Default subgroup spec ---
    if subgroup_vars is None:
        subgroup_vars = {
            'Income': [0, 30000, 50000, 70000, np.inf],
            'Age': [0, 35, 50, 65, np.inf],
            'Location': None,
            'Ad_Type': None
        }

    # --- Helper: build formula ---
    def build_formula(controls_list):
        if controls_list:
            exog = ' + '.join(controls_list)
            exog = ('1 + ' + exog) if add_constant else ('-1 + ' + exog)
        else:
            exog = '1' if add_constant else '-1'
        return f"{y_col} ~ {exog} + [{d_col} ~ {z_col}]"

    # --- Helper: prune zero-variance controls in subgroup ---
    def prune_controls(sub_df, ctrl_cols):
        pruned = []
        for c in ctrl_cols:
            if c not in sub_df.columns:
                continue
            # Drop if constant (zero variance) in subgroup
            if sub_df[c].nunique() <= 1:
                continue
            pruned.append(c)
        return pruned

    rows = []
    subgroup_keys = subgroup_vars if isinstance(subgroup_vars, list) else list(subgroup_vars.keys())

    for var in subgroup_keys:
        df_local = df.copy()

        # Build subgroup column
        if isinstance(subgroup_vars, dict) and subgroup_vars.get(var) is not None:
            bins = subgroup_vars[var]
            if not isinstance(bins, (list, tuple)) or len(bins) < 2:
                if verbose: print(f"âš  Invalid bins for {var}; skipping.")
                continue
            labels = [f"{var}_{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)]
            try:
                df_local[f'{var}_subgroup'] = pd.cut(
                    df_local[var], bins=bins, labels=labels, include_lowest=True
                )
                subgroup_col = f'{var}_subgroup'
            except Exception as e:
                if verbose: print(f"âš  Failed to bin {var}: {e}; skipping.")
                continue
        else:
            subgroup_col = var
            if subgroup_col not in df_local.columns:
                if verbose: print(f"âš  Subgroup column {subgroup_col} missing; skipping.")
                continue

        subgroups = df_local[subgroup_col].dropna().unique()
        if len(subgroups) == 0:
            if verbose: print(f"âš  No valid subgroups for {var}; skipping.")
            continue

        for sg in subgroups:
            dsg = df_local[df_local[subgroup_col] == sg]
            n_obs = len(dsg)
            if n_obs < min_subgroup_size:
                continue

            # Ensure variation in instrument and endogenous regressor
            if dsg[z_col].nunique() <= 1 or dsg[d_col].nunique() <= 1:
                # No first-stage or second-stage variation â†’ skip
                continue

            # Build and prune controls for this subgroup
            controls = prune_controls(dsg, controls_all)

            # Build estimation data
            cols_needed = [y_col, d_col, z_col] + controls
            data = dsg[cols_needed].dropna()
            n_used = len(data)
            if n_used < min_subgroup_size:
                continue

            # Final sanity after NA drop: instrument/endog still vary?
            if data[z_col].nunique() <= 1 or data[d_col].nunique() <= 1:
                continue

            formula = build_formula(controls)

            try:
                if cov_type == 'cluster' and cluster_col and (cluster_col in dsg.columns):
                    clusters = dsg.loc[data.index, cluster_col]
                    res = IV2SLS.from_formula(formula, data=data).fit(
                        cov_type='clustered', clusters=clusters
                    )
                else:
                    if cov_type == 'cluster' and not cluster_col and verbose:
                        print("âš  cov_type='cluster' requested without cluster_col; defaulting to robust.")
                    res = IV2SLS.from_formula(formula, data=data).fit(
                        cov_type='robust' if cov_type != 'unadjusted' else 'unadjusted'
                    )

                # Extract second-stage estimates
                beta = res.params.get(d_col, np.nan)
                se = res.std_errors.get(d_col, np.nan)
                pval = res.pvalues.get(d_col, np.nan)
                ci_lower = beta - 1.96 * se if pd.notnull(se) else np.nan
                ci_upper = beta + 1.96 * se if pd.notnull(se) else np.nan
                significant = bool(pd.notnull(pval) and pval < 0.05)

                # First-stage diagnostics
                fs = res.first_stage
                # If multiple endogenous regressors, res.first_stage is a dict
                if isinstance(fs, dict):
                    fs = fs[d_col]  # select Clicks' first-stage results
                fs_r2 = getattr(fs, 'rsquared', np.nan)
                fs_fstat_obj = getattr(fs, 'f_statistic', None)
                if fs_fstat_obj is not None and hasattr(fs_fstat_obj, 'stat'):
                    fs_f = float(fs_fstat_obj.stat)
                else:
                    fs_f = np.nan
                instrument_weak = bool(pd.notnull(fs_f) and fs_f < 10)

                rows.append({
                    'Variable': var,
                    'Subgroup': str(sg),
                    'N': n_obs,
                    'N_Used': n_used,
                    'Controls_Used': ','.join(controls) if controls else '(none)',
                    'First_Stage_R2': fs_r2,
                    'First_Stage_F': fs_f,
                    'Instrument_Weak': instrument_weak,
                    'Beta': beta,
                    'Std_Error': se,
                    'P_Value': pval,
                    'CI_Lower': ci_lower,
                    'CI_Upper': ci_upper,
                    'Significant': significant
                })
            except Exception as e:
                if verbose:
                    print(f"âœ— Error in subgroup '{var}={sg}': {e}")
                continue

    if not rows:
        if verbose: print("âš  No subgroups estimated successfully.")
        return None

    out = pd.DataFrame(rows)
    out['Abs_Effect'] = out['Beta'].abs()
    out = out.sort_values('Abs_Effect', ascending=False)
    return out


##### Stratified 2SLS End

In [17]:
def generate_example_data(n=2000):
    """Generate synthetic data for demonstration"""
    np.random.seed(42)
    
    data = pd.DataFrame({
        'Age': np.random.randint(18, 65, n),
        'Gender': np.random.choice(['M', 'F'], n),
        'Income': np.random.randint(30000, 150000, n),
        'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n),
        'Ad_Type': np.random.choice(['Video', 'Banner', 'Native'], n),
        'Ad_Topic': np.random.choice(['Tech', 'Fashion', 'Food', 'Travel'], n),
        'Ad_Placement': np.random.choice(['Social_Media', 'Search', 'Display'], n),
        'Click_Time': pd.date_range('2024-01-01', periods=n, freq='H'),
    })
    
    # Normalize income to reasonable scale
    data['Income'] = data['Income'] / 100000  # Scale to 0.3-1.5 range
    
    # Generate clicks with realistic structure
    clicks_base = (
        0.5 +  # baseline
        0.3 * (data['Ad_Type'] == 'Video').astype(float) +
        0.2 * (data['Ad_Placement'] == 'Social_Media').astype(float) +
        0.01 * data['Age'] +
        0.2 * data['Income'] +
        np.random.randn(n) * 0.5
    )
    data['Clicks'] = np.clip(clicks_base, 0.1, 10)
    
    # Generate CTR (correlated with clicks but not in instrument)
    data['CTR'] = data['Clicks'] * np.random.uniform(0.05, 0.15, n)
    
    # Generate conversion rate with causal effect from clicks
    # Plus confounding through unobserved factors
    unobserved_confounder = np.random.randn(n) * 0.05
    
    conversion_base = (
        0.05 +  # baseline
        0.08 * data['Clicks'] +  # TRUE CAUSAL EFFECT
        0.02 * data['Income'] +
        0.005 * data['Age'] +
        0.3 * data['CTR'] +
        unobserved_confounder +
        np.random.randn(n) * 0.03
    )
    data['Conversion_Rate'] = np.clip(conversion_base, 0.01, 0.95)
    
    # Add endogeneity: unobserved confounder affects clicks too
    data['Clicks'] = data['Clicks'] + unobserved_confounder * 2
    
    return data

##### Start of Implementation

In [18]:
# Cleaning and preprocessing
df = pd.read_csv('../datasets/project/Dataset_Ads.csv')
# df = generate_example_data(n=5000)
print("\n" + "="*60)
print('ORIGINAL DATASET')
print("="*60)
print(df.head())

df = clean_data(df)
print("\n" + "="*60)
print('CLEANED AND LOGGED DATASET')
print("="*60)
print(df.head())

df = engineer_time_features(df)
print("\n" + "="*60)
print('TIME ENGINEERED COLUMN')
print("="*60)
print(df.head())

df = encode_categorical_features(df)
print("\n" + "="*60)
print('ENCODED CATEGORICAL VARIABLES')
print("="*60)
print(df.head())

print("\n" + "="*60)
print('DESCRIPTION OF DF AFTER PREPROCESSING')
print("="*60)
print(df.describe(include='all'))


ORIGINAL DATASET
   Age  Gender    Income  Location Ad_Type Ad_Topic   Ad_Placement  Clicks  \
0   61    Male  35717.43     Urban  Banner   Travel   Social Media       3   
1   41    Male  47453.25     Rural   Video   Travel  Search Engine       5   
2   49  Female  68126.35     Rural    Text     Food   Social Media       4   
3   68  Female  64585.73  Suburban    Text   Health        Website       6   
4   63    Male  21109.40     Urban  Native  Fashion  Search Engine       5   

                   Click_Time  Conversion_Rate     CTR  
0  2024-01-18 20:45:56.898459           0.0981  0.0737  
1  2023-04-24 20:45:56.898459           0.0937  0.0592  
2  2024-02-24 20:45:56.898459           0.1912  0.0563  
3  2023-12-13 20:45:56.898459           0.1122  0.0232  
4  2023-07-02 20:45:56.898459           0.1426  0.0539  

DATA CLEANING AND PREPROCESSING
âœ“ Converted 70 negative income values to missing
âœ“ Imputed 70 missing income values with median
âœ“ Winsorized 200 income values at 1s

In [19]:
# ML instrument creation took about 1m40secs with real data
# ML instrument creation took about 3m8secs with synthetic data
print("\n" + "="*60)
print('CREATING ML INSTRUMENT')
print("="*60)
df, X, model = create_ml_instrument(df, use_enhanced_features=True)
# NOTE Remember that the instrument is really weak when created not using the interaction terms.


CREATING ML INSTRUMENT

FEATURE ENGINEERING FOR INSTRUMENT STRENGTH
âœ“ Created Income Ã— Ad Type interaction
âœ“ Created Age Ã— Ad Topic interaction
âœ“ Created Income Ã— Ad Placement interaction
âœ“ Created Age Ã— Ad Placement interaction
âœ“ Created Weekend indicator
âœ“ Created time-of-day indicators
âœ“ Created Weekend Ã— Ad Type interaction
âœ“ Created Business Hours Ã— Ad Placement interaction
âœ“ Created Evening Ã— Ad Topic interaction
âœ“ Created Age Ã— Hour interaction
âœ“ Created Income Ã— Weekend interaction
âœ“ Created Age Ã— Business Hours interaction
âœ“ Created Age squared
âœ“ Created Income squared and sqrt
âœ“ Created Location Ã— Age interaction
âœ“ Created Location Ã— Income interaction
âœ“ Created Location Ã— Placement interaction
âœ“ Created Gender Ã— Ad Topic interaction
âœ“ Created Gender Ã— Ad Type interaction
âœ“ Created Ad Type Ã— Placement interaction
âœ“ Created Age Ã— Ad Type Ã— Weekend interaction
âœ“ Created Income Ã— Placement Ã— Business Hours interact

In [20]:
# diagnostics for ml instrument strength
print("\n" + "="*60)
print('ML INSTRUMENT DIAGNOSTICS')
print("="*60)
enhanced_instrument_diagnostics(df, X, df['Clicks'], model)


ML INSTRUMENT DIAGNOSTICS

ENHANCED INSTRUMENT STRENGTH DIAGNOSTICS

SAMPLE INFORMATION:
  Sample size (n):              9,543
  Number of features (k):       37

FIRST-STAGE PERFORMANCE:
  R-squared:                    0.9998
  Correlation (Z, D):           0.9999
  F-statistic:                  63116170.43
  Cragg-Donald statistic:       9541.56

BENCHMARKS & INTERPRETATION:
  Criterion                           Threshold    Status
  ----------------------------------- ------------ --------------------
  Weak Instrument (F < 10)            10.00        âœ“ STRONG
  Stock-Yogo 10% max bias             16.38        âœ“âœ“ EXCELLENT
  Stock-Yogo 15% max bias             8.96         âœ“ GOOD

OVERALL ASSESSMENT:
  âœ“âœ“ VERY STRONG INSTRUMENT
     Maximum IV bias < 10% of OLS bias

â„¹ Stacking ensemble used - feature importances not directly available



In [21]:
# 2SLS with ml featured instrument
# The r-squared value is super tiny with real data for whatever reason.
# However, with my synthetic data I actually get a good r-squared score and a small p-value.
# Perhpas the move is to end the real data stuff there, but continue with Raj Chetty methodlogies with the
# synthetic data for reaserch sake.

controls = [
    'Age', 'Income',
    'Gender_encoded', 'Location_encoded',
    'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
]

results, data_used = run_2sls(
    df,
    y_col='Conversion_Rate',
    d_col='Clicks',
    z_col='Clicks_predicted',
    base_controls=controls,
    include_interactions=True,   # optional
    add_constant=True,
    cov_type='robust'            # or 'cluster' with cluster_col='CampaignID'
)

âœ“ Added exogenous interaction: Ad_Type_x_Placement

2SLS ESTIMATION SUMMARY (linearmodels.iv.IV2SLS)
                          IV-2SLS Estimation Summary                          
Dep. Variable:        Conversion_Rate   R-squared:                      0.0006
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0004
No. Observations:                9543   F-statistic:                    5.2175
Date:                Wed, Nov 12 2025   P-value (F-stat)                0.8149
Time:                        12:17:54   Distribution:                  chi2(9)
Cov. Estimator:                robust                                         
                                                                              
                                  Parameter Estimates                                   
                      Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------

In [29]:
# stratified 2sls time
controls = [
    'Age', 'Income',
    'Gender_encoded', 'Location_encoded',
    'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
]

results_df = analyze_subgroup_effects_iv(
    df,
    subgroup_vars={'Income': [0, 30000, 50000, 70000, np.inf], 'Age': [0, 35, 50, 65, np.inf], 'Location': None, 'Ad_Type': None},
    min_subgroup_size=200,
    y_col='Conversion_Rate',
    d_col='Clicks',
    z_col='Clicks_predicted',
    base_controls=controls,
    add_constant=True,
    cov_type='robust'
)

# Inspect strongest effects
if results_df is not None:
    print(results_df[['Variable','Subgroup','N_Used','Beta','P_Value','First_Stage_F','Instrument_Weak']].head(10).to_string(index=False))


Variable           Subgroup  N_Used     Beta  P_Value  First_Stage_F  Instrument_Weak
  Income Income_30000-50000    3248 0.001916 0.054200            NaN            False
 Ad_Type               Text    2356 0.001605 0.170903            NaN            False
 Ad_Type             Banner    2460 0.001221 0.261627            NaN            False
     Age          Age_35-50    3261 0.001156 0.224826            NaN            False
Location           Suburban    3161 0.001114 0.256089            NaN            False
     Age          Age_50-65    1185 0.000766 0.625189            NaN            False
Location              Urban    3143 0.000761 0.427995            NaN            False
     Age           Age_0-35    4918 0.000730 0.350965            NaN            False
 Ad_Type             Native    2325 0.000670 0.560855            NaN            False
Location              Rural    3239 0.000470 0.625379            NaN            False


##### End of Implementation