Previous file was getting too chunky. This one has just the finalized complete pipeline for the ml feature engineered thang for 2SLS.

##### Python Libraries

In [59]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
import statsmodels.api as sm
from scipy import stats
from xgboost import XGBRegressor
from linearmodels.iv import IV2SLS
import statsmodels.api as sm
import warnings

warnings.filterwarnings('ignore')

Thought vomit
What are my knowns?

I want to show the data initially, and then also show that clicks is endogenous to conversion rate.

Once i'm there I want to show my data isn't that clean and make a cleaned dataset.

So there needs to be a preprocessing step that handles bad data, and logs stuff appropriately.

Now. I want to make the interaction terms and such.

Once that is all done I will create a machine learning instrument from enhanced features.

Then it seems I'm foggy on how the instrument creation and 2sls will work with each other.

Plus the whole point of this is to try to integrate Raj Chetty's 2014 forecast bias with value-added estimates into this somehow.

I want to make sure my entire implementation is proper. Even if the results are null.


##### Preprocessing Start

In [60]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

def clean_data_strict(df):
    """
    Clean ONLY exogenous variables. Do NOT touch endogenous variables.
    
    Rule: If it's measured AFTER ad exposure, don't transform it here.
    """
    print("\n" + "="*60)
    print("DATA CLEANING (EXOGENOUS VARIABLES ONLY)")
    print("="*60)
    
    initial_rows = len(df)
    df = df.copy()
    
    # ===== INCOME =====
    if 'Income' in df.columns:
        neg_income = (df['Income'] < 0).sum()
        df.loc[df['Income'] < 0, 'Income'] = np.nan
        if neg_income > 0:
            print(f"‚úì Converted {neg_income} negative income values to missing")
        
        missing_income = df['Income'].isna().sum()
        if missing_income > 0:
            imputer = SimpleImputer(strategy='median')
            df['Income'] = imputer.fit_transform(df[['Income']])
            print(f"‚úì Imputed {missing_income} missing income values")
        
        lower, upper = df['Income'].quantile([0.01, 0.99])
        df['Income'] = df['Income'].clip(lower, upper)
        print(f"‚úì Winsorized income: [{lower:,.0f}, {upper:,.0f}]")
    
    # ===== AGE =====
    if 'Age' in df.columns:
        age_before = len(df)
        df = df[df['Age'].between(10, 90)]
        age_filtered = age_before - len(df)
        if age_filtered > 0:
            print(f"‚úì Filtered {age_filtered} implausible ages")
    
    # ===== LOG TRANSFORMS (EXOGENOUS ONLY) =====
    print(f"\nüìä Log transformations:")
    if 'Income' in df.columns:
        df['Income_log'] = np.log1p(df['Income'])
        print(f"  ‚úì Income_log")
    
    if 'Age' in df.columns:
        df['Age_log'] = np.log1p(df['Age'])
        print(f"  ‚úì Age_log")
    
    # ===== SUMMARY =====
    final_rows = len(df)
    print(f"\n{'='*60}")
    print(f"Rows: {initial_rows:,} ‚Üí {final_rows:,} (removed {initial_rows-final_rows:,})")
    print(f"{'='*60}\n")
    
    return df


def engineer_time_features_enhanced(df):
    """
    Extract ALL temporal features that might predict clicks.
    """
    if 'Click_Time' not in df.columns:
        return df
    
    print("\n" + "="*60)
    print("TIME FEATURE ENGINEERING")
    print("="*60)
    
    df['Click_Time'] = pd.to_datetime(df['Click_Time'])
    
    # Basic time features
    df['Day_of_Week'] = df['Click_Time'].dt.dayofweek
    df['Hour'] = df['Click_Time'].dt.hour
    df['Month'] = df['Click_Time'].dt.month
    df['Day_of_Month'] = df['Click_Time'].dt.day
    df['Week_of_Year'] = df['Click_Time'].dt.isocalendar().week
    
    # Binary indicators
    df['Weekend'] = (df['Day_of_Week'] >= 5).astype(int)
    df['BusinessHours'] = ((df['Hour'] >= 9) & (df['Hour'] <= 17)).astype(int)
    df['Evening'] = ((df['Hour'] >= 18) & (df['Hour'] <= 23)).astype(int)
    df['Morning'] = ((df['Hour'] >= 6) & (df['Hour'] <= 11)).astype(int)
    df['Night'] = ((df['Hour'] >= 0) & (df['Hour'] <= 5)).astype(int)
    df['Lunch'] = ((df['Hour'] >= 12) & (df['Hour'] <= 13)).astype(int)
    
    # Seasonal indicators
    df['Is_Summer'] = df['Month'].isin([6, 7, 8]).astype(int)
    df['Is_Winter'] = df['Month'].isin([12, 1, 2]).astype(int)
    df['Is_Holiday_Season'] = df['Month'].isin([11, 12]).astype(int)
    
    # Paycheck cycles
    df['Beginning_of_Month'] = (df['Day_of_Month'] <= 5).astype(int)
    df['End_of_Month'] = (df['Day_of_Month'] >= 25).astype(int)
    
    # Specific high-traffic days
    df['Is_Monday'] = (df['Day_of_Week'] == 0).astype(int)
    df['Is_Friday'] = (df['Day_of_Week'] == 4).astype(int)
    
    print(f"‚úì Created {len([c for c in df.columns if c not in ['Click_Time']])} time features")
    print("="*60 + "\n")
    
    return df


def encode_categorical_features(df):
    """Encode categorical variables."""
    categorical_cols = ['Gender', 'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement']
    
    print("\n" + "="*60)
    print("CATEGORICAL ENCODING")
    print("="*60)
    
    for col in categorical_cols:
        if col in df.columns:
            le = LabelEncoder()
            df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
            print(f"‚úì Encoded {col}")
    
    print("="*60 + "\n")
    return df


def engineer_instrument_features_comprehensive(df):
    """
    Create ALL possible exogenous interactions for maximum instrument strength.
    
    Categories:
    1. Demographics √ó Ad Characteristics
    2. Demographics √ó Time
    3. Ad Characteristics √ó Time
    4. Location-specific interactions
    5. Nonlinear transformations
    6. Complex three-way interactions
    """
    print("\n" + "="*60)
    print("COMPREHENSIVE INSTRUMENT FEATURE ENGINEERING")
    print("="*60)
    
    feature_count = 0
    
    # ================================================================
    # 1. DEMOGRAPHICS √ó AD CHARACTERISTICS
    # ================================================================
    print("\n[1] Demographics √ó Ad Characteristics:")
    
    demo_cols = ['Age', 'Income', 'Gender_encoded']
    ad_cols = ['Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded']
    
    for demo in demo_cols:
        for ad in ad_cols:
            if all(c in df.columns for c in [demo, ad]):
                feat_name = f'{demo}_x_{ad}'
                df[feat_name] = df[demo] * df[ad]
                feature_count += 1
    
    print(f"  Created {feature_count} interactions")
    
    # ================================================================
    # 2. DEMOGRAPHICS √ó TIME
    # ================================================================
    print("\n[2] Demographics √ó Time:")
    prev_count = feature_count
    
    time_cols = ['Weekend', 'BusinessHours', 'Evening', 'Morning', 'Hour', 
                 'Is_Monday', 'Is_Friday', 'Beginning_of_Month', 'End_of_Month']
    
    for demo in ['Age', 'Income']:
        for time in time_cols:
            if all(c in df.columns for c in [demo, time]):
                feat_name = f'{demo}_x_{time}'
                df[feat_name] = df[demo] * df[time]
                feature_count += 1
    
    print(f"  Created {feature_count - prev_count} interactions")
    
    # ================================================================
    # 3. AD CHARACTERISTICS √ó TIME
    # ================================================================
    print("\n[3] Ad Characteristics √ó Time:")
    prev_count = feature_count
    
    for ad in ad_cols:
        for time in time_cols:
            if all(c in df.columns for c in [ad, time]):
                feat_name = f'{ad}_x_{time}'
                df[feat_name] = df[ad] * df[time]
                feature_count += 1
    
    print(f"  Created {feature_count - prev_count} interactions")
    
    # ================================================================
    # 4. LOCATION-SPECIFIC INTERACTIONS
    # ================================================================
    print("\n[4] Location-specific interactions:")
    prev_count = feature_count
    
    if 'Location_encoded' in df.columns:
        # Location √ó Demographics
        for demo in ['Age', 'Income', 'Gender_encoded']:
            if demo in df.columns:
                df[f'Location_x_{demo}'] = df['Location_encoded'] * df[demo]
                feature_count += 1
        
        # Location √ó Ad characteristics
        for ad in ad_cols:
            if ad in df.columns:
                df[f'Location_x_{ad}'] = df['Location_encoded'] * df[ad]
                feature_count += 1
        
        # Location √ó Time
        for time in ['Weekend', 'BusinessHours', 'Evening']:
            if time in df.columns:
                df[f'Location_x_{time}'] = df['Location_encoded'] * df[time]
                feature_count += 1
    
    print(f"  Created {feature_count - prev_count} interactions")
    
    # ================================================================
    # 5. NONLINEAR TRANSFORMATIONS
    # ================================================================
    print("\n[5] Nonlinear transformations:")
    prev_count = feature_count
    
    if 'Age' in df.columns:
        df['Age_squared'] = df['Age'] ** 2
        df['Age_cubed'] = df['Age'] ** 3
        df['Age_sqrt'] = np.sqrt(df['Age'])
        feature_count += 3
    
    if 'Income' in df.columns:
        df['Income_squared'] = df['Income'] ** 2
        df['Income_sqrt'] = np.sqrt(df['Income'].clip(lower=0))
        feature_count += 2
    
    if 'Hour' in df.columns:
        df['Hour_squared'] = df['Hour'] ** 2
        # Cyclical encoding (hour 23 is close to hour 0)
        df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
        df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)
        feature_count += 3
    
    print(f"  Created {feature_count - prev_count} transformations")
    
    # ================================================================
    # 6. THREE-WAY INTERACTIONS (HIGH-ORDER)
    # ================================================================
    print("\n[6] Three-way interactions:")
    prev_count = feature_count
    
    # Age √ó Ad Type √ó Time
    if all(c in df.columns for c in ['Age', 'Ad_Type_encoded', 'Weekend']):
        df['Age_x_AdType_x_Weekend'] = df['Age'] * df['Ad_Type_encoded'] * df['Weekend']
        feature_count += 1
    
    if all(c in df.columns for c in ['Age', 'Ad_Type_encoded', 'Evening']):
        df['Age_x_AdType_x_Evening'] = df['Age'] * df['Ad_Type_encoded'] * df['Evening']
        feature_count += 1
    
    # Income √ó Ad Placement √ó Time
    if all(c in df.columns for c in ['Income', 'Ad_Placement_encoded', 'BusinessHours']):
        df['Income_x_Placement_x_BizHours'] = (
            df['Income'] * df['Ad_Placement_encoded'] * df['BusinessHours']
        )
        feature_count += 1
    
    if all(c in df.columns for c in ['Income', 'Ad_Placement_encoded', 'Weekend']):
        df['Income_x_Placement_x_Weekend'] = (
            df['Income'] * df['Ad_Placement_encoded'] * df['Weekend']
        )
        feature_count += 1
    
    # Location √ó Ad √ó Time
    if all(c in df.columns for c in ['Location_encoded', 'Ad_Type_encoded', 'Evening']):
        df['Location_x_AdType_x_Evening'] = (
            df['Location_encoded'] * df['Ad_Type_encoded'] * df['Evening']
        )
        feature_count += 1
    
    # Gender √ó Ad Topic √ó Time
    if all(c in df.columns for c in ['Gender_encoded', 'Ad_Topic_encoded', 'Weekend']):
        df['Gender_x_Topic_x_Weekend'] = (
            df['Gender_encoded'] * df['Ad_Topic_encoded'] * df['Weekend']
        )
        feature_count += 1
    
    print(f"  Created {feature_count - prev_count} three-way interactions")
    
    # ================================================================
    # 7. AD CHARACTERISTIC INTERACTIONS
    # ================================================================
    print("\n[7] Ad characteristic interactions:")
    prev_count = feature_count
    
    if all(c in df.columns for c in ['Ad_Type_encoded', 'Ad_Placement_encoded']):
        df['AdType_x_Placement'] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
        feature_count += 1
    
    if all(c in df.columns for c in ['Ad_Type_encoded', 'Ad_Topic_encoded']):
        df['AdType_x_Topic'] = df['Ad_Type_encoded'] * df['Ad_Topic_encoded']
        feature_count += 1
    
    if all(c in df.columns for c in ['Ad_Topic_encoded', 'Ad_Placement_encoded']):
        df['AdTopic_x_Placement'] = df['Ad_Topic_encoded'] * df['Ad_Placement_encoded']
        feature_count += 1
    
    print(f"  Created {feature_count - prev_count} ad interactions")
    
    # ================================================================
    # SUMMARY
    # ================================================================
    print(f"\n{'='*60}")
    print(f"TOTAL NEW FEATURES CREATED: {feature_count}")
    print(f"{'='*60}\n")
    # print(df.describe(include='all'))
    # Select only object (string) columns
    # string_cols = df.select_dtypes(include=['object']).columns
    # print(string_cols)

    return df


# def create_leave_one_out_aggregates(df):
#     """
#     Create group-level aggregates using leave-one-out encoding.
    
#     These capture "propensity to click" at group level, which is exogenous
#     for an individual observation.
    
#     WARNING: This is advanced and borderline. Use only if desperate for
#     instrument strength.
#     """
#     print("\n" + "="*60)
#     print("LEAVE-ONE-OUT AGGREGATE FEATURES")
#     print("="*60)
    
#     if 'Clicks' not in df.columns:
#         print("‚ö†Ô∏è  'Clicks' column not found, skipping aggregates")
#         return df
    
#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
#     # Location-level average
#     if 'Location' in df.columns:
#         df['Location_AvgClicks'] = np.nan
#         for train_idx, test_idx in kf.split(df):
#             loc_avg = df.iloc[train_idx].groupby('Location')['Clicks'].mean()
#             df.loc[test_idx, 'Location_AvgClicks'] = (
#                 df.loc[test_idx, 'Location'].map(loc_avg)
#             )
#         print("‚úì Location_AvgClicks (leave-one-out)")
    
#     # Age-group-level average
#     if 'Age' in df.columns:
#         df['Age_Group'] = pd.cut(df['Age'], bins=[0, 25, 35, 50, 100])
#         df['AgeGroup_AvgClicks'] = np.nan
#         for train_idx, test_idx in kf.split(df):
#             age_avg = df.iloc[train_idx].groupby('

In [61]:
def clean_data(df):
    """
    Clean and preprocess data before analysis.
    
    Performs:
    1. Handle negative income values
    2. Impute missing income with median
    3. Winsorize income at 1st and 99th percentiles
    4. Filter age to plausible range (10-90 years)
    5. Create logarithmic transformations for skewed variables
    """
    print("\n" + "="*60)
    print("DATA CLEANING AND PREPROCESSING")
    print("="*60)
    
    initial_rows = len(df)
    
    # =====================================================================
    # 1. CLEAN INCOME
    # =====================================================================
    if 'Income' in df.columns:
        # Convert negative income to missing
        neg_income_count = (df['Income'] < 0).sum()
        df.loc[df['Income'] < 0, 'Income'] = np.nan
        
        if neg_income_count > 0:
            print(f"‚úì Converted {neg_income_count} negative income values to missing")
        
        # Impute missing income with median
        missing_income = df['Income'].isna().sum()
        if missing_income > 0:
            imputer = SimpleImputer(strategy='median')
            df['Income'] = imputer.fit_transform(df[['Income']])
            print(f"‚úì Imputed {missing_income} missing income values with median")
        
        # Winsorize: Cap extremes at 1st and 99th percentile
        lower, upper = df['Income'].quantile([0.01, 0.99])
        income_before = df['Income'].copy()
        df['Income'] = df['Income'].clip(lower, upper)
        winsorized = (income_before != df['Income']).sum()
        print(f"‚úì Winsorized {winsorized} income values at 1st/99th percentiles")
        print(f"  Income range: [{lower:,.0f}, {upper:,.0f}]")
    
    # =====================================================================
    # 2. FILTER AGE
    # =====================================================================
    if 'Age' in df.columns:
        age_before = len(df)
        df = df[df['Age'].between(10, 90)]
        age_filtered = age_before - len(df)
        if age_filtered > 0:
            print(f"‚úì Filtered {age_filtered} rows with implausible ages (keeping 10-90)")
    
    # =====================================================================
    # 3. CREATE LOGARITHMIC TRANSFORMATIONS
    # =====================================================================
    print(f"\nüìä Creating logarithmic transformations:")
    
    # Log of Income (if positive)
    if 'Income' in df.columns:
        df['Income_log'] = np.log1p(df['Income'])
        print(f"  ‚úì Income_log created (log1p transformation)")
    
    # # Log of Clicks (if exists and positive)
    # if 'Clicks' in df.columns:
    #     df['Clicks_log'] = np.log1p(df['Clicks'])
    #     print(f"  ‚úì Clicks_log created (log1p transformation)")
    
    # Log of Age (for nonlinear age effects)
    if 'Age' in df.columns:
        df['Age_log'] = np.log1p(df['Age'])
        print(f"  ‚úì Age_log created (log1p transformation)")
    
    # # Log of CTR (if exists and positive)
    # if 'CTR' in df.columns:
    #     # Ensure CTR is positive before log
    #     if (df['CTR'] > 0).all():
    #         df['CTR_log'] = np.log(df['CTR'])
    #         print(f"  ‚úì CTR_log created (log transformation)")
    
    # =====================================================================
    # SUMMARY
    # =====================================================================
    final_rows = len(df)
    rows_removed = initial_rows - final_rows
    
    print(f"\n{'='*60}")
    print(f"CLEANING SUMMARY:")
    print(f"  Initial rows:        {initial_rows:,}")
    print(f"  Final rows:          {final_rows:,}")
    print(f"  Rows removed:        {rows_removed:,} ({rows_removed/initial_rows*100:.1f}%)")
    print(f"  Log variables added: {len([col for col in df.columns if '_log' in col])}")
    print(f"{'='*60}\n")
    
    return df

    
def engineer_time_features(df):
    """Extract day of week and hour from Click_Time"""
    if 'Click_Time' in df.columns:
        df['Click_Time'] = pd.to_datetime(df['Click_Time'])
        df['Day_of_Week'] = df['Click_Time'].dt.dayofweek
        df['Hour'] = df['Click_Time'].dt.hour
    return df
    
def encode_categorical_features(df):
    """Encode categorical variables"""
    categorical_cols = ['Gender', 'Location', 'Ad_Type', 'Ad_Topic', 'Ad_Placement']
    
    for col in categorical_cols:
        if col in df.columns:
            le = LabelEncoder()
            df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
            # df[col] = le # This line was wierd, idk why it did this, but i may have to revert things back.
    
    return df

# gonna hold off on this interaction term stuff for now.
def engineer_instrument_features(df):
    """
    ENHANCED: Create rich features that predict clicks but don't directly affect conversions.
    
    This is crucial for instrument strength. We create:
    1. Interaction features between ad characteristics and demographics
    2. Time-based features (weekend, business hours)
    3. Nonlinear transformations
    4. Complex interactions between multiple variables
    
    Key principle: These features should predict CLICKS well, but only affect
    CONVERSIONS through clicks (exclusion restriction).
    """
    print("\n" + "="*60)
    print("FEATURE ENGINEERING FOR INSTRUMENT STRENGTH")
    print("="*60)
    
    # =====================================================================
    # 1. AD CHARACTERISTICS √ó DEMOGRAPHICS INTERACTIONS
    # =====================================================================
    # Rationale: Different demographics respond differently to ad types
    
    if all(col in df.columns for col in ['Income', 'Ad_Type_encoded']):
        df['Income_x_AdType'] = df['Income'] * df['Ad_Type_encoded']
        print("‚úì Created Income √ó Ad Type interaction")
        
    if all(col in df.columns for col in ['Age', 'Ad_Topic_encoded']):
        df['Age_x_AdTopic'] = df['Age'] * df['Ad_Topic_encoded']
        print("‚úì Created Age √ó Ad Topic interaction")
        
    if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded']):
        df['Income_x_Placement'] = df['Income'] * df['Ad_Placement_encoded']
        print("‚úì Created Income √ó Ad Placement interaction")
        
    if all(col in df.columns for col in ['Age', 'Ad_Placement_encoded']):
        df['Age_x_Placement'] = df['Age'] * df['Ad_Placement_encoded']
        print("‚úì Created Age √ó Ad Placement interaction")
    
    # =====================================================================
    # 2. TIME-BASED FEATURES AND INTERACTIONS
    # =====================================================================
    # Rationale: Click patterns vary by time of day/week
    
    if 'Day_of_Week' in df.columns:
        df['Weekend'] = (df['Day_of_Week'] >= 5).astype(int)
        print("‚úì Created Weekend indicator")
        
    if 'Hour' in df.columns:
        df['BusinessHours'] = ((df['Hour'] >= 9) & (df['Hour'] <= 17)).astype(int)
        df['Evening'] = ((df['Hour'] >= 18) & (df['Hour'] <= 23)).astype(int)
        df['Morning'] = ((df['Hour'] >= 6) & (df['Hour'] <= 11)).astype(int)
        print("‚úì Created time-of-day indicators")
    
    # Time √ó Ad interactions
    if all(col in df.columns for col in ['Weekend', 'Ad_Type_encoded']):
        df['Weekend_x_AdType'] = df['Weekend'] * df['Ad_Type_encoded']
        print("‚úì Created Weekend √ó Ad Type interaction")
        
    if all(col in df.columns for col in ['BusinessHours', 'Ad_Placement_encoded']):
        df['BusinessHours_x_Placement'] = df['BusinessHours'] * df['Ad_Placement_encoded']
        print("‚úì Created Business Hours √ó Ad Placement interaction")
        
    if all(col in df.columns for col in ['Evening', 'Ad_Topic_encoded']):
        df['Evening_x_AdTopic'] = df['Evening'] * df['Ad_Topic_encoded']
        print("‚úì Created Evening √ó Ad Topic interaction")
    
    # =====================================================================
    # 3. DEMOGRAPHICS √ó TIME INTERACTIONS
    # =====================================================================
    # Rationale: Different demographics have different browsing patterns
    
    if all(col in df.columns for col in ['Age', 'Hour']):
        df['Age_x_Hour'] = df['Age'] * df['Hour']
        print("‚úì Created Age √ó Hour interaction")
        
    if all(col in df.columns for col in ['Income', 'Weekend']):
        df['Income_x_Weekend'] = df['Income'] * df['Weekend']
        print("‚úì Created Income √ó Weekend interaction")
        
    if all(col in df.columns for col in ['Age', 'BusinessHours']):
        df['Age_x_BusinessHours'] = df['Age'] * df['BusinessHours']
        print("‚úì Created Age √ó Business Hours interaction")
    
    # =====================================================================
    # 4. NONLINEAR TRANSFORMATIONS
    # =====================================================================
    # Rationale: Relationships may be nonlinear (using log-transformed versions)
    
    if 'Age_log' in df.columns:
        df['Age_squared'] = df['Age'] ** 2
        print("‚úì Created Age squared")
        
    if 'Income_log' in df.columns:
        df['Income_squared'] = df['Income'] ** 2
        df['Income_sqrt'] = np.sqrt(df['Income'].clip(lower=0))
        print("‚úì Created Income squared and sqrt")
    
    # =====================================================================
    # 5. COMPLEX CATEGORICAL INTERACTIONS
    # =====================================================================
    # Rationale: Certain combinations may be particularly predictive
    
    # Location √ó Demographics
    if all(col in df.columns for col in ['Location_encoded', 'Age']):
        df['Location_x_Age'] = df['Location_encoded'] * df['Age']
        print("‚úì Created Location √ó Age interaction")
        
    if all(col in df.columns for col in ['Location_encoded', 'Income']):
        df['Location_x_Income'] = df['Location_encoded'] * df['Income']
        print("‚úì Created Location √ó Income interaction")
    
    # Location √ó Ad characteristics
    if all(col in df.columns for col in ['Location_encoded', 'Ad_Placement_encoded']):
        df['Location_x_Placement'] = df['Location_encoded'] * df['Ad_Placement_encoded']
        print("‚úì Created Location √ó Placement interaction")
    
    # Gender √ó Ad characteristics
    if all(col in df.columns for col in ['Gender_encoded', 'Ad_Topic_encoded']):
        df['Gender_x_AdTopic'] = df['Gender_encoded'] * df['Ad_Topic_encoded']
        print("‚úì Created Gender √ó Ad Topic interaction")
        
    if all(col in df.columns for col in ['Gender_encoded', 'Ad_Type_encoded']):
        df['Gender_x_AdType'] = df['Gender_encoded'] * df['Ad_Type_encoded']
        print("‚úì Created Gender √ó Ad Type interaction")
    
    # Ad Type √ó Placement (different placements work for different types)
    if all(col in df.columns for col in ['Ad_Type_encoded', 'Ad_Placement_encoded']):
        df['AdType_x_Placement'] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
        print("‚úì Created Ad Type √ó Placement interaction")
    
    # =====================================================================
    # 6. THREE-WAY INTERACTIONS (most powerful)
    # =====================================================================
    # Rationale: Capture complex patterns
    
    if all(col in df.columns for col in ['Age', 'Ad_Type_encoded', 'Weekend']):
        df['Age_x_AdType_x_Weekend'] = df['Age'] * df['Ad_Type_encoded'] * df['Weekend']
        print("‚úì Created Age √ó Ad Type √ó Weekend interaction")
        
    if all(col in df.columns for col in ['Income', 'Ad_Placement_encoded', 'BusinessHours']):
        df['Income_x_Placement_x_BizHours'] = df['Income'] * df['Ad_Placement_encoded'] * df['BusinessHours']
        print("‚úì Created Income √ó Placement √ó Business Hours interaction")
    
    print("="*60 + "\n")
    
    return df

##### Preprocessing End

##### Create ML Instrument Start

In [62]:
def create_ml_instrument(df, model_type='stacking', cv_folds=5, use_enhanced_features=False):
    """
    Generate ML-based instrument for Clicks using ensemble methods.
    Returns a new DataFrame with 'Clicks_predicted' column.
    """

    # --- Step 1: Define instrument features (strictly policy-side, not outcomes!) ---
    base_features = [
        'Age', 'Income',
        'Gender_encoded', 'Location_encoded',
        'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded',
        'Day_of_Week', 'Hour'
    ]

    enhanced_features = [
        # Interactions
        'Income_x_AdType', 'Age_x_AdTopic', 'Income_x_Placement', 'Age_x_Placement',
        'Weekend_x_AdType', 'BusinessHours_x_Placement', 'Evening_x_AdTopic',
        'Age_x_Hour', 'Income_x_Weekend', 'Age_x_BusinessHours',
        'Location_x_Age', 'Location_x_Income', 'Location_x_Placement',
        'Gender_x_AdTopic', 'Gender_x_AdType', 'AdType_x_Placement',
        'Age_x_AdType_x_Weekend', 'Income_x_Placement_x_BizHours',
        # Time features
        'Weekend', 'BusinessHours', 'Evening', 'Morning',
        # Nonlinear (now using cleaned log versions) NOTE only created if i log age, and income...
        'Age_squared', 'Age_log', 'Income_log', 'Income_squared', 'Income_sqrt'
    ]

    # NOTE so this totally failed! But! Let's just use Lasso regularization at the end! Boom.
    # i dont want to rename variables and idk if this works so for now it is renamed to 
    # NOTE advanced_enhanced features, but is name to enhanced_features for ease of use.
    # enhanced_features = [
    #     'Income_log', 'Age_log', 'Month', 'Day_of_Month', 'Week_of_Year', 
    #     'Weekend', 'BusinessHours', 
    #     'Evening', 'Morning', 'Night', 'Lunch', 'Is_Summer', 'Is_Winter', 'Is_Holiday_Season', 'Beginning_of_Month', 'End_of_Month', 
    #     'Is_Monday', 'Is_Friday',
    #     'Age_x_Ad_Type_encoded', 'Age_x_Ad_Topic_encoded', 'Age_x_Ad_Placement_encoded', 'Income_x_Ad_Type_encoded', 
    #     'Income_x_Ad_Topic_encoded', 'Income_x_Ad_Placement_encoded', 'Gender_encoded_x_Ad_Type_encoded', 
    #     'Gender_encoded_x_Ad_Topic_encoded', 'Gender_encoded_x_Ad_Placement_encoded', 'Age_x_Weekend', 'Age_x_BusinessHours', 
    #     'Age_x_Evening', 'Age_x_Morning', 'Age_x_Hour', 'Age_x_Is_Monday', 'Age_x_Is_Friday', 'Age_x_Beginning_of_Month', 
    #     'Age_x_End_of_Month', 'Income_x_Weekend', 'Income_x_BusinessHours', 'Income_x_Evening', 'Income_x_Morning', 'Income_x_Hour', 
    #     'Income_x_Is_Monday', 'Income_x_Is_Friday', 'Income_x_Beginning_of_Month', 'Income_x_End_of_Month', 'Ad_Type_encoded_x_Weekend', 
    #     'Ad_Type_encoded_x_BusinessHours', 'Ad_Type_encoded_x_Evening', 'Ad_Type_encoded_x_Morning', 'Ad_Type_encoded_x_Hour', 
    #     'Ad_Type_encoded_x_Is_Monday', 'Ad_Type_encoded_x_Is_Friday', 'Ad_Type_encoded_x_Beginning_of_Month', 
    #     'Ad_Type_encoded_x_End_of_Month', 'Ad_Topic_encoded_x_Weekend', 'Ad_Topic_encoded_x_BusinessHours', 
    #     'Ad_Topic_encoded_x_Evening', 'Ad_Topic_encoded_x_Morning', 'Ad_Topic_encoded_x_Hour', 'Ad_Topic_encoded_x_Is_Monday', 
    #     'Ad_Topic_encoded_x_Is_Friday', 'Ad_Topic_encoded_x_Beginning_of_Month', 'Ad_Topic_encoded_x_End_of_Month', 
    #     'Ad_Placement_encoded_x_Weekend', 'Ad_Placement_encoded_x_BusinessHours', 'Ad_Placement_encoded_x_Evening', 
    #     'Ad_Placement_encoded_x_Morning', 'Ad_Placement_encoded_x_Hour', 'Ad_Placement_encoded_x_Is_Monday', 
    #     'Ad_Placement_encoded_x_Is_Friday', 'Ad_Placement_encoded_x_Beginning_of_Month', 'Ad_Placement_encoded_x_End_of_Month', 
    #     'Location_x_Age', 'Location_x_Income', 'Location_x_Gender_encoded', 'Location_x_Ad_Type_encoded', 'Location_x_Ad_Topic_encoded',
    #     'Location_x_Ad_Placement_encoded', 'Location_x_Weekend', 'Location_x_BusinessHours', 'Location_x_Evening', 'Age_squared', 
    #     'Age_cubed', 'Age_sqrt', 'Income_squared', 'Income_sqrt', 'Hour_squared', 'Hour_sin', 'Hour_cos', 'Age_x_AdType_x_Weekend', 
    #     'Age_x_AdType_x_Evening', 'Income_x_Placement_x_BizHours', 'Income_x_Placement_x_Weekend',
    #     'Location_x_AdType_x_Evening', 'Gender_x_Topic_x_Weekend', 'AdType_x_Placement', 'AdType_x_Topic', 'AdTopic_x_Placement'
    # ]
    
    if use_enhanced_features:
        # Assume you have a separate function to engineer features
        # df = engineer_instrument_features(df)
        df = engineer_instrument_features_comprehensive(df)

        instrument_features = base_features + enhanced_features
    else:
        instrument_features = base_features

    # Filter available features
    available_features = [f for f in instrument_features if f in df.columns]
    # print('\n FLAG', available_features)
    # print(df.describe(include='all'))
    print(df.dtypes)

    X = df[available_features]
    y = df['Clicks']

    # --- Step 2: Build model ---
    if model_type == 'stacking':
        base_models = [
            ('rf', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)),
            ('gb', GradientBoostingRegressor(n_estimators=200, random_state=42))
        ]
        try:
            from xgboost import XGBRegressor
            base_models.append(('xgb', XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1)))
        except ImportError:
            pass
        model = StackingRegressor(estimators=base_models, final_estimator=Ridge(alpha=1.0), cv=cv_folds, n_jobs=-1)

    elif model_type == 'rf':
        model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

    elif model_type == 'gb':
        model = GradientBoostingRegressor(n_estimators=200, random_state=42)

    else:
        raise ValueError("Invalid model_type. Choose 'stacking', 'rf', or 'gb'.")

    # --- Step 3: Generate out-of-fold predictions ---
    clicks_pred = cross_val_predict(model, X, y, cv=cv_folds, n_jobs=-1)
    df = df.copy()
    df['Clicks_predicted'] = clicks_pred

    # --- Step 4: Fit final model (optional, for diagnostics) ---
    model.fit(X, y)

    return df, X, model

def enhanced_instrument_diagnostics(df, X, y, model):
    """
    Functional: Comprehensive instrument strength testing with Stock-Yogo critical values.
    
    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Clicks' and 'Clicks_predicted' columns.
    X : pd.DataFrame
        Feature matrix used in first-stage model.
    y : pd.Series or np.array
        True clicks (endogenous regressor).
    model : fitted sklearn model
        First-stage ML model used to generate instruments.
    """

    # --- Extract instrument (Z) and endogenous regressor (D) ---
    if 'Clicks_predicted' not in df.columns:
        raise ValueError("DataFrame must contain 'Clicks_predicted' column")
    z = df['Clicks_predicted'].values
    d = df['Clicks'].values

    n = len(d)
    k = X.shape[1]

    # --- First-stage R¬≤ and F-statistic ---
    d_resid = d - d.mean()
    ss_tot = np.sum(d_resid**2)
    ss_res = np.sum((d - z)**2)
    r_squared = 1 - (ss_res / ss_tot)

    f_stat = (r_squared / 1) / ((1 - r_squared) / (n - k - 1))

    # --- Correlation ---
    corr = np.corrcoef(z, d)[0, 1]

    # --- Cragg-Donald statistic ---
    cragg_donald = n * r_squared

    # --- Display results ---
    print(f"\n{'='*70}")
    print("ENHANCED INSTRUMENT STRENGTH DIAGNOSTICS")
    print(f"{'='*70}")
    print("\nSAMPLE INFORMATION:")
    print(f"  Sample size (n):              {n:,}")
    print(f"  Number of features (k):       {k}")
    print("\nFIRST-STAGE PERFORMANCE:")
    print(f"  R-squared:                    {r_squared:.4f}")
    print(f"  Correlation (Z, D):           {corr:.4f}")
    print(f"  F-statistic:                  {f_stat:.2f}")
    print(f"  Cragg-Donald statistic:       {cragg_donald:.2f}")

    print("\nBENCHMARKS & INTERPRETATION:")
    print(f"  {'Criterion':<35} {'Threshold':<12} {'Status'}")
    print(f"  {'-'*35} {'-'*12} {'-'*20}")
    weak_status = "‚úì STRONG" if f_stat > 10 else "‚úó WEAK"
    print(f"  {'Weak Instrument (F < 10)':<35} {'10.00':<12} {weak_status}")
    sy_10_status = "‚úì‚úì EXCELLENT" if f_stat > 16.38 else "‚úó Below threshold"
    sy_15_status = "‚úì GOOD" if f_stat > 8.96 else "‚úó Below threshold"
    print(f"  {'Stock-Yogo 10% max bias':<35} {'16.38':<12} {sy_10_status}")
    print(f"  {'Stock-Yogo 15% max bias':<35} {'8.96':<12} {sy_15_status}")

    print("\nOVERALL ASSESSMENT:")
    if f_stat > 16.38:
        print("  ‚úì‚úì VERY STRONG INSTRUMENT")
        print("     Maximum IV bias < 10% of OLS bias")
    elif f_stat > 10:
        print("  ‚úì STRONG INSTRUMENT")
        print("     Acceptable for causal inference")
    elif f_stat > 5:
        print("  ‚ö† MODERATELY WEAK INSTRUMENT")
        print("     Proceed with caution")
    else:
        print("  ‚úó WEAK INSTRUMENT")
        print("     Results may be unreliable")

    # --- Feature importance (if available) ---
    if hasattr(model, 'feature_importances_'):
        print("\nTOP 10 MOST IMPORTANT FEATURES FOR PREDICTING CLICKS:")
        importances = model.feature_importances_
        top_features = sorted(zip(X.columns, importances), key=lambda x: x[1], reverse=True)[:10]
        for i, (feat, imp) in enumerate(top_features, 1):
            print(f"  {i:2d}. {feat:35s} {imp:.4f}")
    elif hasattr(model, 'final_estimator_'):
        print("\n‚Ñπ Stacking ensemble used - feature importances not directly available")

    print(f"{'='*70}\n")

##### Create ML Instrument End

##### OLS Start

In [63]:
def run_ols(
    df,
    y_col='Conversion_Rate',
    d_col='Clicks',
    base_controls=None,
    include_interactions=False,
    add_constant=True,
    cluster_col=None,
    cov_type='robust'  # 'robust' for HC, or 'cluster' if cluster_col is set
):
    """
    Na√Øve OLS regression treating Clicks as exogenous.
    Model:
        Y = Œ± + Œ≤ D + Œò X + Œµ

    Parameters
    ----------
    df : pd.DataFrame
        Must contain y_col, d_col, and any control columns.
    y_col : str
        Outcome column (e.g., 'Conversion_Rate').
    d_col : str
        Regressor (treated as exogenous here).
    base_controls : list[str] or None
        Exogenous controls.
    include_interactions : bool
        If True, include Ad_Type √ó Ad_Placement interaction (exogenous).
    add_constant : bool
        If True, add a constant term automatically.
    cluster_col : str or None
        Column name for cluster-robust SEs.
    cov_type : str
        'robust' (HC), 'cluster' (requires cluster_col), or 'unadjusted'.

    Returns
    -------
    results : RegressionResults
        Fitted OLS results object from statsmodels.
    data_used : pd.DataFrame
        DataFrame with columns actually used in estimation.
    """

    # --- Controls ---
    if base_controls is None:
        base_controls = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
        ]

    invalid_controls = [c for c in base_controls if c not in df.columns]
    if invalid_controls:
        print(f"‚ö† Skipping missing controls: {invalid_controls}")
    controls = [c for c in base_controls if c in df.columns]

    # --- Optional interaction ---
    if include_interactions:
        if ('Ad_Type_encoded' in df.columns) and ('Ad_Placement_encoded' in df.columns):
            interaction_col = 'Ad_Type_x_Placement'
            if interaction_col not in df.columns:
                df = df.copy()
                df[interaction_col] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
                print("‚úì Added exogenous interaction: Ad_Type_x_Placement")
            controls.append(interaction_col)

    # --- Build data ---
    cols_needed = [y_col, d_col] + controls
    data = df[cols_needed].dropna().copy()
    if data.empty:
        raise ValueError("After dropping NA, no rows remain for estimation.")

    y = data[y_col]
    X = data[[d_col] + controls]

    if add_constant:
        X = sm.add_constant(X)

    # --- Fit OLS ---
    if cov_type == 'cluster' and (cluster_col is not None) and (cluster_col in df.columns):
        clusters = df.loc[data.index, cluster_col]
        results = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': clusters})
    else:
        if cov_type == 'cluster' and cluster_col is None:
            print("‚ö† cov_type='cluster' requested but no cluster_col provided; defaulting to robust.")
            cov_type = 'robust'
        results = sm.OLS(y, X).fit(cov_type='HC1' if cov_type == 'robust' else 'nonrobust')

    # --- Reporting ---
    print("\n" + "="*70)
    print("OLS ESTIMATION SUMMARY (statsmodels.api.OLS)")
    print("="*70)
    print(results.summary())

    return results, data


##### OLS End

##### 2SLS Start

In [64]:
def run_2sls(
    df,
    y_col='Conversion_Rate',
    d_col='Clicks',
    z_col='Clicks_predicted',
    base_controls=None,
    include_interactions=False,
    add_constant=True,
    cluster_col=None,
    cov_type='robust'  # 'robust' for HC, or 'cluster' if cluster_col is set
):
    """
    Functional 2SLS using linearmodels.iv.IV2SLS.

    Model:
        First stage: D = œÄ0 + œÄ1 Z + Œì X + ŒΩ
        Second stage: Y = Œ± + Œ≤ D + Œò X + Œµ

    Parameters
    ----------
    df : pd.DataFrame
        Must contain y_col, d_col, z_col, and any control columns.
    y_col : str
        Outcome column (e.g., 'Conversion_Rate').
    d_col : str
        Endogenous regressor (e.g., 'Clicks').
    z_col : str
        Instrument column (e.g., 'Clicks_predicted' from ML first stage).
    base_controls : list[str] or None
        Exogenous controls. Do NOT include outcome-adjacent variables like CTR.
        Recommended: ['Age', 'Income', 'Gender_encoded', 'Location_encoded',
                      'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'].
    include_interactions : bool
        If True, include Ad_Type √ó Ad_Placement interaction (exogenous).
    add_constant : bool
        If True, add a constant term automatically.
    cluster_col : str or None
        Column name for cluster-robust SEs (e.g., 'UserID', 'CampaignID').
    cov_type : str
        'robust' (HC), 'cluster' (requires cluster_col), or 'unadjusted'.

    Returns
    -------
    results : IV2SLSResults
        Fitted IV results object from linearmodels.
    data_used : pd.DataFrame
        DataFrame with columns actually used in estimation.
    """

    # --- Validate required columns ---
    required = [y_col, d_col, z_col]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # --- Controls: enforce exogeneity discipline ---
    if base_controls is None:
        base_controls = [
            'Age', 'Income',
            'Gender_encoded', 'Location_encoded',
            'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
        ]

    # Strict: exclude CTR or any post-click/outcome-adjacent metrics from controls
    invalid_controls = [c for c in base_controls if c not in df.columns]
    if invalid_controls:
        # Warn strictly but proceed with available subset
        print(f"‚ö† Skipping missing controls: {invalid_controls}")
    controls = [c for c in base_controls if c in df.columns]

    # --- Optional exogenous interaction ---
    interaction_col = None
    if include_interactions:
        if ('Ad_Type_encoded' in df.columns) and ('Ad_Placement_encoded' in df.columns):
            interaction_col = 'Ad_Type_x_Placement'
            if interaction_col not in df.columns:
                df = df.copy()
                df[interaction_col] = df['Ad_Type_encoded'] * df['Ad_Placement_encoded']
                print("‚úì Added exogenous interaction: Ad_Type_x_Placement")
            controls.append(interaction_col)
        else:
            print("‚ö† Interaction requested but required columns not present; skipping.")

    # --- Build data frame used in estimation ---
    cols_needed = [y_col, d_col, z_col] + controls
    data = df[cols_needed].dropna().copy()
    if data.empty:
        raise ValueError("After dropping NA, no rows remain for estimation.")

    # --- Build formula for IV2SLS ---
    # dependent ~ exog + [endog ~ instruments]
    exog_formula = ' + '.join(controls) if controls else '1'
    if add_constant and exog_formula != '1':
        exog_formula = '1 + ' + exog_formula  # linearmodels adds constant via '1 +'
    elif add_constant and exog_formula == '1':
        # '1' already denotes constant in linearmodels formula
        pass
    else:
        # No constant: use '-1' to suppress intercept if you have exog terms
        if controls:
            exog_formula = '-1 + ' + ' + '.join(controls)

    formula = f"{y_col} ~ {exog_formula} + [{d_col} ~ {z_col}]"

    # --- Fit IV2SLS ---
    if cov_type == 'cluster' and (cluster_col is not None) and (cluster_col in df.columns):
        clusters = df.loc[data.index, cluster_col]
        results = IV2SLS.from_formula(formula, data=data).fit(cov_type='clustered', clusters=clusters)
    else:
        if cov_type == 'cluster' and cluster_col is None:
            print("‚ö† cov_type='cluster' requested but no cluster_col provided; defaulting to robust.")
        results = IV2SLS.from_formula(formula, data=data).fit(cov_type='robust' if cov_type != 'unadjusted' else 'unadjusted')

    # --- Strict reporting ---
    print("\n" + "="*70)
    print("2SLS ESTIMATION SUMMARY (linearmodels.iv.IV2SLS)")
    print("="*70)
    print(results.summary)

    # Optional first-stage diagnostics available via .first_stage (dict of RegressionResults)
    # Example:
    try:
        fs = results.first_stage[d_col]
        print("\nFirst-stage summary (endogenous regressor: {}):".format(d_col))
        print(f"  R-squared: {fs.rsquared:.4f}")
        print(f"  F-statistic (excluded instrument): {getattr(fs, 'f_statistic', None)}")
    except Exception:
        pass

    return results, data

##### 2SLS End

##### Stratified 2SLS Start

In [65]:
# import numpy as np
# import pandas as pd
# from linearmodels.iv import IV2SLS

# def analyze_subgroup_effects_iv(
#     df,
#     subgroup_vars=None,
#     min_subgroup_size=100,
#     y_col='Conversion_Rate',
#     d_col='Clicks',
#     z_col='Clicks_predicted',
#     base_controls=None,
#     add_constant=True,
#     cov_type='robust',         # 'robust', 'unadjusted', or 'cluster'
#     cluster_col=None,
#     verbose=True
# ):
#     """
#     Stratified IV2SLS with robust input validation and rank-deficiency pruning.
#     """

#     # --- Required columns ---
#     required = [y_col, d_col, z_col]
#     missing = [c for c in required if c not in df.columns]
#     if missing:
#         raise ValueError(f"Missing required columns: {missing}")

#     # --- Controls: strictly exogenous, no post-treatment metrics ---
#     if base_controls is None:
#         base_controls = [
#             'Age', 'Income',
#             'Gender_encoded', 'Location_encoded',
#             'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
#         ]
#     controls_all = [c for c in base_controls if c in df.columns]
#     if verbose:
#         missing_controls = [c for c in base_controls if c not in df.columns]
#         if missing_controls:
#             print(f"‚ö† Skipping missing controls: {missing_controls}")

#     # --- Default subgroup spec ---
#     if subgroup_vars is None:
#         subgroup_vars = {
#             'Income': [0, 30000, 50000, 70000, np.inf],
#             'Age': [0, 35, 50, 65, np.inf],
#             'Location': None,
#             'Ad_Type': None
#         }

#     # --- Helper: build formula ---
#     def build_formula(controls_list):
#         if controls_list:
#             exog = ' + '.join(controls_list)
#             exog = ('1 + ' + exog) if add_constant else ('-1 + ' + exog)
#         else:
#             exog = '1' if add_constant else '-1'
#         return f"{y_col} ~ {exog} + [{d_col} ~ {z_col}]"

#     # --- Helper: prune zero-variance and duplicate columns in subgroup ---
#     def prune_controls(sub_df, ctrl_cols):
#         pruned = []
#         for c in ctrl_cols:
#             if c not in sub_df.columns:
#                 continue
#             # drop if constant (zero variance)
#             if sub_df[c].nunique() <= 1:
#                 continue
#             pruned.append(c)
#         # Optional: drop perfectly duplicated columns
#         # (cheap check: drop cols identical to the intercept vector)
#         return pruned

#     rows = []
#     subgroup_keys = subgroup_vars if isinstance(subgroup_vars, list) else list(subgroup_vars.keys())

#     for var in subgroup_keys:
#         df_local = df.copy()

#         # Build subgroup column
#         if isinstance(subgroup_vars, dict) and subgroup_vars.get(var) is not None:
#             bins = subgroup_vars[var]
#             if not isinstance(bins, (list, tuple)) or len(bins) < 2:
#                 if verbose: print(f"‚ö† Invalid bins for {var}; skipping.")
#                 continue
#             labels = [f"{var}_{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)]
#             try:
#                 df_local[f'{var}_subgroup'] = pd.cut(df_local[var], bins=bins, labels=labels, include_lowest=True)
#                 subgroup_col = f'{var}_subgroup'
#             except Exception as e:
#                 if verbose: print(f"‚ö† Failed to bin {var}: {e}; skipping.")
#                 continue
#         else:
#             subgroup_col = var
#             if subgroup_col not in df_local.columns:
#                 if verbose: print(f"‚ö† Subgroup column {subgroup_col} missing; skipping.")
#                 continue

#         subgroups = df_local[subgroup_col].dropna().unique()
#         if len(subgroups) == 0:
#             if verbose: print(f"‚ö† No valid subgroups for {var}; skipping.")
#             continue

#         for sg in subgroups:
#             dsg = df_local[df_local[subgroup_col] == sg]
#             n_obs = len(dsg)
#             if n_obs < min_subgroup_size:
#                 continue

#             # Ensure variation in instrument and endogenous regressor
#             if dsg[z_col].nunique() <= 1 or dsg[d_col].nunique() <= 1:
#                 # No first-stage or second-stage variation ‚Üí skip
#                 continue

#             # Build and prune controls for this subgroup
#             controls = prune_controls(dsg, controls_all)

#             # Build estimation data
#             cols_needed = [y_col, d_col, z_col] + controls
#             data = dsg[cols_needed].dropna()
#             n_used = len(data)
#             if n_used < min_subgroup_size:
#                 continue

#             # Final sanity: instrument/endog still vary after NA drop
#             if data[z_col].nunique() <= 1 or data[d_col].nunique() <= 1:
#                 continue

#             formula = build_formula(controls)

#             try:
#                 if cov_type == 'cluster' and cluster_col and (cluster_col in dsg.columns):
#                     clusters = dsg.loc[data.index, cluster_col]
#                     res = IV2SLS.from_formula(formula, data=data).fit(
#                         cov_type='clustered', clusters=clusters
#                     )
#                 else:
#                     if cov_type == 'cluster' and not cluster_col and verbose:
#                         print("‚ö† cov_type='cluster' requested without cluster_col; defaulting to robust.")
#                     res = IV2SLS.from_formula(formula, data=data).fit(
#                         cov_type='robust' if cov_type != 'unadjusted' else 'unadjusted'
#                     )

#                 # Extract estimates (second stage)
#                 beta = res.params.get(d_col, np.nan)
#                 se = res.std_errors.get(d_col, np.nan)
#                 pval = res.pvalues.get(d_col, np.nan)
#                 ci_lower = beta - 1.96 * se if pd.notnull(se) else np.nan
#                 ci_upper = beta + 1.96 * se if pd.notnull(se) else np.nan
#                 significant = bool(pd.notnull(pval) and pval < 0.05)

#                 # First-stage diagnostics (single endogenous ‚Üí FirstStageResults object)
#                 fs = res.first_stage
#                 fs_r2 = getattr(fs, 'rsquared', np.nan)
#                 fs_fstat_obj = getattr(fs, 'f_statistic', None)
#                 if hasattr(fs_fstat_obj, 'stat'):
#                     fs_f = float(fs_fstat_obj.stat)
#                 elif isinstance(fs_fstat_obj, (int, float)):
#                     fs_f = float(fs_fstat_obj)
#                 else:
#                     fs_f = np.nan
#                 instrument_weak = bool(pd.notnull(fs_f) and fs_f < 10)

#                 rows.append({
#                     'Variable': var,
#                     'Subgroup': str(sg),
#                     'N': n_obs,
#                     'N_Used': n_used,
#                     'Controls_Used': ','.join(controls) if controls else '(none)',
#                     'First_Stage_R2': fs_r2,
#                     'First_Stage_F': fs_f,
#                     'Instrument_Weak': instrument_weak,
#                     'Beta': beta,
#                     'Std_Error': se,
#                     'P_Value': pval,
#                     'CI_Lower': ci_lower,
#                     'CI_Upper': ci_upper,
#                     'Significant': significant
#                 })
#             except Exception as e:
#                 if verbose:
#                     print(f"‚úó Error in subgroup '{var}={sg}': {e}")
#                 continue

#     if not rows:
#         if verbose: print("‚ö† No subgroups estimated successfully.")
#         return None

#     out = pd.DataFrame(rows)
#     out['Abs_Effect'] = out['Beta'].abs()
#     out = out.sort_values('Abs_Effect', ascending=False)
#     return out


##### Stratified 2SLS End

In [66]:
# Making sure that I clicks is indeed endogenous.
def generate_example_data(n=2000):
    """Generate synthetic data with endogenous clicks"""
    np.random.seed(42)
    
    data = pd.DataFrame({
        'Age': np.random.randint(18, 65, n),
        'Gender': np.random.choice(['M', 'F'], n),
        'Income': np.random.randint(30000, 150000, n),
        'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n),
        'Ad_Type': np.random.choice(['Video', 'Banner', 'Native'], n),
        'Ad_Topic': np.random.choice(['Tech', 'Fashion', 'Food', 'Travel'], n),
        'Ad_Placement': np.random.choice(['Social_Media', 'Search', 'Display'], n),
        'Click_Time': pd.date_range('2024-01-01', periods=n, freq='H'),
    })
    
    # Normalize income
    data['Income'] = data['Income'] / 100000
    
    # Unobserved confounder (correlated with both clicks and conversion error)
    unobserved_confounder = np.random.randn(n)
    
    # Generate clicks (endogenous regressor)
    clicks_base = (
        0.5
        + 0.3 * (data['Ad_Type'] == 'Video').astype(float)
        + 0.2 * (data['Ad_Placement'] == 'Social_Media').astype(float)
        + 0.01 * data['Age']
        + 0.2 * data['Income']
        + 0.8 * unobserved_confounder   # <-- confounder drives clicks
        + np.random.randn(n) * 0.5
    )
    data['Clicks'] = np.clip(clicks_base, 0.1, 10)
    
    # CTR (correlated with clicks)
    data['CTR'] = data['Clicks'] * np.random.uniform(0.05, 0.15, n)
    
    # Conversion rate: causal effect + controls + error term
    # Error term includes the SAME confounder that drives clicks ‚Üí endogeneity
    epsilon = 0.05 * unobserved_confounder + np.random.randn(n) * 0.03
    
    conversion_base = (
        0.05
        + 0.08 * data['Clicks']        # true causal effect
        + 0.02 * data['Income']
        + 0.005 * data['Age']
        + 0.3 * data['CTR']
        + epsilon                      # endogenous error
    )
    data['Conversion_Rate'] = np.clip(conversion_base, 0.01, 0.95)
    
    return data

# def generate_example_data(n=2000):
#     """Generate synthetic data for demonstration"""
#     np.random.seed(42)
    
#     data = pd.DataFrame({
#         'Age': np.random.randint(18, 65, n),
#         'Gender': np.random.choice(['M', 'F'], n),
#         'Income': np.random.randint(30000, 150000, n),
#         'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n),
#         'Ad_Type': np.random.choice(['Video', 'Banner', 'Native'], n),
#         'Ad_Topic': np.random.choice(['Tech', 'Fashion', 'Food', 'Travel'], n),
#         'Ad_Placement': np.random.choice(['Social_Media', 'Search', 'Display'], n),
#         'Click_Time': pd.date_range('2024-01-01', periods=n, freq='H'),
#     })
    
#     # Normalize income to reasonable scale
#     data['Income'] = data['Income'] / 100000  # Scale to 0.3-1.5 range
    
#     # Generate clicks with realistic structure
#     clicks_base = (
#         0.5 +  # baseline
#         0.3 * (data['Ad_Type'] == 'Video').astype(float) +
#         0.2 * (data['Ad_Placement'] == 'Social_Media').astype(float) +
#         0.01 * data['Age'] +
#         0.2 * data['Income'] +
#         np.random.randn(n) * 0.5
#     )
#     data['Clicks'] = np.clip(clicks_base, 0.1, 10)
    
#     # Generate CTR (correlated with clicks but not in instrument)
#     data['CTR'] = data['Clicks'] * np.random.uniform(0.05, 0.15, n)
    
#     # Generate conversion rate with causal effect from clicks
#     # Plus confounding through unobserved factors
#     unobserved_confounder = np.random.randn(n) * 0.05
    
#     conversion_base = (
#         0.05 +  # baseline
#         0.08 * data['Clicks'] +  # TRUE CAUSAL EFFECT
#         0.02 * data['Income'] +
#         0.005 * data['Age'] +
#         0.3 * data['CTR'] +
#         unobserved_confounder +
#         np.random.randn(n) * 0.03
#     )
#     data['Conversion_Rate'] = np.clip(conversion_base, 0.01, 0.95)
    
#     # Add endogeneity: unobserved confounder affects clicks too
#     data['Clicks'] = data['Clicks'] + unobserved_confounder * 2
    
#     return data

##### Start of Implementation

In [67]:
# Cleaning and preprocessing
# df = pd.read_csv('../datasets/project/Dataset_Ads.csv')
df = generate_example_data(n=5000)
print("\n" + "="*60)
print('ORIGINAL DATASET')
print("="*60)
print(df.head())
print(df.describe(include='all'))

df = clean_data(df)
print("\n" + "="*60)
print('CLEANED AND LOGGED DATASET')
print("="*60)
print(df.head())

df = engineer_time_features(df)
print("\n" + "="*60)
print('TIME ENGINEERED COLUMN')
print("="*60)
print(df.head())

df = encode_categorical_features(df)
print("\n" + "="*60)
print('ENCODED CATEGORICAL VARIABLES')
print("="*60)
print(df.head())

print("\n" + "="*60)
print('DESCRIPTION OF DF AFTER PREPROCESSING')
print("="*60)
print(df.describe(include='all'))


ORIGINAL DATASET
   Age Gender   Income  Location Ad_Type Ad_Topic  Ad_Placement  \
0   56      M  0.87676  Suburban  Native   Travel        Search   
1   46      F  0.41560     Rural  Native   Travel       Display   
2   32      M  0.84401     Rural  Banner     Tech       Display   
3   60      M  0.80468  Suburban  Native     Food  Social_Media   
4   25      M  1.45884     Rural  Native  Fashion       Display   

           Click_Time    Clicks       CTR  Conversion_Rate  
0 2024-01-01 00:00:00  1.266525  0.120705         0.439875  
1 2024-01-01 01:00:00  1.240839  0.090825         0.289356  
2 2024-01-01 02:00:00  1.255371  0.178216         0.344477  
3 2024-01-01 03:00:00  0.829583  0.073841         0.421605  
4 2024-01-01 04:00:00  3.401662  0.354961         0.701755  
               Age Gender       Income  Location Ad_Type Ad_Topic  \
count   5000.00000   5000  5000.000000      5000    5000     5000   
unique         NaN      2          NaN         3       3        4   
top   

In [68]:
# # Advanced cleaning and instrument stuff.
# # Cleaning and preprocessing
# # df = pd.read_csv('../datasets/project/Dataset_Ads.csv')
# df = generate_example_data(n=5000)
# print("\n" + "="*60)
# print('ORIGINAL DATASET')
# print("="*60)
# print(df.head())

# df = clean_data_strict(df)
# print("\n" + "="*60)
# print('CLEANED AND LOGGED DATASET')
# print("="*60)
# print(df.head())

# df = engineer_time_features_enhanced(df)
# print("\n" + "="*60)
# print('TIME ENGINEERED COLUMN')
# print("="*60)
# print(df.head())

# df = encode_categorical_features(df)
# print("\n" + "="*60)
# print('ENCODED CATEGORICAL VARIABLES')
# print("="*60)
# print(df.head())

# # df = engineer_instrument_features_comprehensive(df)
# # print("\n" + "="*60)
# # print('ADVANCED INSTRUMENT FEATURE VARIABLES')
# # print("="*60)
# # print(df.head())

# print("\n" + "="*60)
# print('DESCRIPTION OF DF AFTER PREPROCESSING')
# print("="*60)
# print(df.describe(include='all'))

In [69]:
# ML instrument creation took about 1m40secs with real data
# ML instrument creation took about 3m8secs with synthetic data
# ML instrument creation took about 5m40secs with real data
# with advanced instrument variables it took 11m51secs and still has a very bad score.
print("\n" + "="*60)
print('CREATING ML INSTRUMENT')
print("="*60)
df, X, model = create_ml_instrument(df, model_type='stacking', use_enhanced_features=True)
# NOTE Remember that the instrument is really weak when created not using the interaction terms.


CREATING ML INSTRUMENT

COMPREHENSIVE INSTRUMENT FEATURE ENGINEERING

[1] Demographics √ó Ad Characteristics:
  Created 9 interactions

[2] Demographics √ó Time:
  Created 2 interactions

[3] Ad Characteristics √ó Time:
  Created 3 interactions

[4] Location-specific interactions:
  Created 6 interactions

[5] Nonlinear transformations:
  Created 8 transformations

[6] Three-way interactions:
  Created 0 three-way interactions

[7] Ad characteristic interactions:
  Created 3 ad interactions

TOTAL NEW FEATURES CREATED: 31

Age                                               int32
Gender                                           object
Income                                          float64
Location                                         object
Ad_Type                                          object
Ad_Topic                                         object
Ad_Placement                                     object
Click_Time                               datetime64[ns]
Clicks                

In [70]:
# diagnostics for ml (complex) instrument strength
print("\n" + "="*60)
print('ML INSTRUMENT DIAGNOSTICS')
print("="*60)
enhanced_instrument_diagnostics(df, X, df['Clicks'], model)


ML INSTRUMENT DIAGNOSTICS

ENHANCED INSTRUMENT STRENGTH DIAGNOSTICS

SAMPLE INFORMATION:
  Sample size (n):              5,000
  Number of features (k):       18

FIRST-STAGE PERFORMANCE:
  R-squared:                    0.0199
  Correlation (Z, D):           0.1412
  F-statistic:                  101.24
  Cragg-Donald statistic:       99.61

BENCHMARKS & INTERPRETATION:
  Criterion                           Threshold    Status
  ----------------------------------- ------------ --------------------
  Weak Instrument (F < 10)            10.00        ‚úì STRONG
  Stock-Yogo 10% max bias             16.38        ‚úì‚úì EXCELLENT
  Stock-Yogo 15% max bias             8.96         ‚úì GOOD

OVERALL ASSESSMENT:
  ‚úì‚úì VERY STRONG INSTRUMENT
     Maximum IV bias < 10% of OLS bias

‚Ñπ Stacking ensemble used - feature importances not directly available



In [71]:
# ols

controls = [
    'Age', 'Income',
    'Gender_encoded', 'Location_encoded',
    'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
]

results, data_used = run_ols(
    df,
    y_col='Conversion_Rate',
    d_col='Clicks',
    base_controls=controls,
    # include_interactions=True,   # optional
    add_constant=True,
    cov_type='robust'            # or 'cluster' with cluster_col='CampaignID'
)


OLS ESTIMATION SUMMARY (statsmodels.api.OLS)
                            OLS Regression Results                            
Dep. Variable:        Conversion_Rate   R-squared:                       0.930
Model:                            OLS   Adj. R-squared:                  0.929
Method:                 Least Squares   F-statistic:                     7726.
Date:                Thu, 13 Nov 2025   Prob (F-statistic):               0.00
Time:                        16:20:18   Log-Likelihood:                 8679.7
No. Observations:                5000   AIC:                        -1.734e+04
Df Residuals:                    4991   BIC:                        -1.728e+04
Df Model:                           8                                         
Covariance Type:                  HC1                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------

In [72]:
# 2SLS with ml featured instrument
# The r-squared value is super tiny with real data for whatever reason.
# However, with my synthetic data I actually get a good r-squared score and a small p-value.
# Perhpas the move is to end the real data stuff there, but continue with Raj Chetty methodlogies with the
# synthetic data for reaserch sake.

controls = [
    'Age', 'Income',
    'Gender_encoded', 'Location_encoded',
    'Ad_Type_encoded', 'Ad_Topic_encoded', 'Ad_Placement_encoded'
]

results, data_used = run_2sls(
    df,
    y_col='Conversion_Rate',
    d_col='Clicks',
    z_col='Clicks_predicted',
    base_controls=controls,
    include_interactions=True,   # optional
    add_constant=True,
    cov_type='robust'            # or 'cluster' with cluster_col='CampaignID'
)

‚úì Added exogenous interaction: Ad_Type_x_Placement

2SLS ESTIMATION SUMMARY (linearmodels.iv.IV2SLS)
                          IV-2SLS Estimation Summary                          
Dep. Variable:        Conversion_Rate   R-squared:                     -0.5283
Estimator:                    IV-2SLS   Adj. R-squared:                -0.5311
No. Observations:                5000   F-statistic:                    819.72
Date:                Thu, Nov 13 2025   P-value (F-stat)                0.0000
Time:                        16:20:18   Distribution:                  chi2(9)
Cov. Estimator:                robust                                         
                                                                              
                                  Parameter Estimates                                   
                      Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------

##### End of Implementation

##### Spitballing...

In [73]:
# Cleaning and preprocessing
df = pd.read_csv('../datasets/project/Dataset_Ads.csv')
# df = generate_example_data(n=5000)
print("\n" + "="*60)
print('ORIGINAL DATASET')
print("="*60)
print(df.head())
print(df.describe(include='all'))

df = clean_data(df)
print("\n" + "="*60)
print('CLEANED AND LOGGED DATASET')
print("="*60)
print(df.head())

df = engineer_time_features(df)
print("\n" + "="*60)
print('TIME ENGINEERED COLUMN')
print("="*60)
print(df.head())

df = encode_categorical_features(df)
print("\n" + "="*60)
print('ENCODED CATEGORICAL VARIABLES')
print("="*60)
print(df.head())

print("\n" + "="*60)
print('DESCRIPTION OF DF AFTER PREPROCESSING')
print("="*60)
print(df.describe(include='all'))


ORIGINAL DATASET
   Age  Gender    Income  Location Ad_Type Ad_Topic   Ad_Placement  Clicks  \
0   61    Male  35717.43     Urban  Banner   Travel   Social Media       3   
1   41    Male  47453.25     Rural   Video   Travel  Search Engine       5   
2   49  Female  68126.35     Rural    Text     Food   Social Media       4   
3   68  Female  64585.73  Suburban    Text   Health        Website       6   
4   63    Male  21109.40     Urban  Native  Fashion  Search Engine       5   

                   Click_Time  Conversion_Rate     CTR  
0  2024-01-18 20:45:56.898459           0.0981  0.0737  
1  2023-04-24 20:45:56.898459           0.0937  0.0592  
2  2024-02-24 20:45:56.898459           0.1912  0.0563  
3  2023-12-13 20:45:56.898459           0.1122  0.0232  
4  2023-07-02 20:45:56.898459           0.1426  0.0539  
                 Age Gender         Income Location Ad_Type Ad_Topic  \
count   10000.000000  10000   10000.000000    10000   10000    10000   
unique           NaN      3

In [74]:
import pandas as pd
import numpy as np

# Core packages
import statsmodels.api as sm
from linearmodels.iv import IV2SLS

# 1) Prepare data columns
# Encoded dummies are assumed present; otherwise create pandas get_dummies for raw categorical columns.

# Instruments (Z)
Z_cols = [
    'Ad_Type_encoded',        # categorical encoding (0..3)
    'Ad_Placement_encoded',   # categorical encoding (0..2)
    'Ad_Topic_encoded',       # categorical encoding (0..5)
    'Day_of_Week'             # 0..6
    # 'Hour'  # include only if it varies; drop if constant
]

# Exogenous controls (X)
X_cols = [
    'Age_log',
    'Income_log',
    'Gender_encoded',
    'Location_encoded'
]

# Outcome (Y) and endogenous regressor (D)
Y_col = 'Conversion_Rate'
D_col = 'Clicks'

# Drop rows with missing values in required columns
cols_needed = Z_cols + X_cols + [Y_col, D_col]
df2 = df.dropna(subset=cols_needed).copy()

# Optional: add interactions to strengthen first-stage relevance (comment/uncomment)
# df2['Type_x_Placement'] = df2['Ad_Type_encoded'] * df2['Ad_Placement_encoded']
# df2['Topic_x_DOW'] = df2['Ad_Topic_encoded'] * df2['Day_of_Week']
# Z_cols += ['Type_x_Placement', 'Topic_x_DOW']

# 2) First-stage OLS: Clicks on instruments + controls
Z = df2[Z_cols]
X = df2[X_cols]
D = df2[D_col].astype(float)

# Add constant
FS_design = sm.add_constant(pd.concat([Z, X], axis=1))
fs_model = sm.OLS(D, FS_design).fit(cov_type='HC1')  # robust SE (White)

print("FIRST-STAGE SUMMARY (Clicks ~ Z + X):")
print(fs_model.summary())

# First-stage F-stat for instruments (partial F): compute using nested models
FS_design_ZX = FS_design
FS_design_X_only = sm.add_constant(X)

restricted = sm.OLS(D, FS_design_X_only).fit(cov_type='HC1')
unrestricted = fs_model

SSR_r = np.sum(restricted.resid**2)
SSR_ur = np.sum(unrestricted.resid**2)
q = FS_design_ZX.shape[1] - FS_design_X_only.shape[1]  # number of instrument parameters
n = FS_design_ZX.shape[0]
k = FS_design_ZX.shape[1]  # total parameters

F_partial = ((SSR_r - SSR_ur) / q) / (SSR_ur / (n - k))
print(f"\nPartial F-statistic for instruments (relevance): {F_partial:.2f}")

# Predicted Clicks
df2['Clicks_hat'] = unrestricted.predict(FS_design)

# 3) Second-stage IV: Conversion_Rate on Clicks instrumented by (Z), with controls (X)
Y = df2[Y_col].astype(float)

# Build matrices for IV2SLS
iv_exog = sm.add_constant(df2[X_cols])            # controls + constant
iv_endog = df2[[D_col]]                           # endogenous regressor
iv_instr = df2[Z_cols]                            # instruments

iv_model = IV2SLS(Y, iv_exog, iv_endog, iv_instr).fit(cov_type='robust')

print("\nSECOND-STAGE IV SUMMARY (Conversion_Rate ~ Clicks_hat + X):")
print(iv_model.summary)

# 4) Sanity checks
corr_clicks_hat = np.corrcoef(df2['Clicks'], df2['Clicks_hat'])[0, 1]
print(f"\nCorrelation(Clicks, Clicks_hat): {corr_clicks_hat:.3f}")
print(f"N: {len(df2)} | Instruments: {len(Z_cols)} | Controls: {len(X_cols)}")


FIRST-STAGE SUMMARY (Clicks ~ Z + X):
                            OLS Regression Results                            
Dep. Variable:                 Clicks   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.8155
Date:                Thu, 13 Nov 2025   Prob (F-statistic):              0.589
Time:                        16:33:58   Log-Likelihood:                -21311.
No. Observations:                9543   AIC:                         4.264e+04
Df Residuals:                    9534   BIC:                         4.270e+04
Df Model:                           8                                         
Covariance Type:                  HC1                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------