# Monte Carlo Testing for Proxy Finder Algorithm


In [None]:
from proxy_finder import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def prepare_dataset(df, target, target_correlation, train_frac=0.5, random_state=42):
    """Split ``df`` into train/test parts and plant a synthetic proxy in the test part.

    Args:
        df: pandas DataFrame containing the ``target`` column.
        target: Name of the target column.
        target_correlation: Correlation passed on to
            ``generate_synthetic_proxies`` for the synthetic proxy.
        train_frac: Fraction of rows sampled into the training part
            (default 0.5, i.e. an even split).
        random_state: Seed forwarded to ``DataFrame.sample`` so the split is
            reproducible (default 42). Pass ``None`` for a fresh split on
            every call.

    Returns:
        Tuple ``(df_train, df_test)``. ``df_train`` is the sampled part of
        ``df`` unchanged; ``df_test`` holds the remaining rows with the
        synthetic proxy column(s) added and the ``target`` column removed.
    """
    # Rows not sampled into the training part form the test part.
    df_train = df.sample(frac=train_frac, random_state=random_state)
    df_test = df.drop(df_train.index)

    # The synthetic proxies are derived from the test part's own target values.
    synthetic_proxies = generate_synthetic_proxies(df_test[target], target_correlation)
    for name, proxy in synthetic_proxies.items():
        df_test[name] = proxy

    # The test part must not expose the target itself.
    df_test = df_test.drop(columns=[target])

    return df_train, df_test

In [None]:
def generate_synthetic_proxies(target_column, target_correlation, noise_level=0.1):
    """Build a standardized synthetic variable correlated with ``target_column``.

    The target is z-scored and mixed with an independent standard-normal draw
    as ``rho * target + sqrt(1 - rho**2) * z``; Gaussian noise with standard
    deviation ``noise_level`` is then added and the result is z-scored again.

    Note:
        Because the extra noise is added on top of the mixing formula, the
        expected correlation with the target is roughly
        ``target_correlation / sqrt(1 + noise_level**2)``, i.e. slightly below
        the nominal value used in the proxy's name.

    Args:
        target_column: One-dimensional numeric values (array-like). Missing
            values (NaN) are ignored when computing mean and standard
            deviation and stay NaN in the proxy.
        target_correlation: Nominal correlation ``rho``; must lie in [-1, 1].
        noise_level: Standard deviation of the additional Gaussian noise.

    Returns:
        Dict with a single entry mapping ``'proxy_<rho with two decimals>'``
        to a numpy array of the same length as ``target_column``.

    Raises:
        ValueError: If ``target_correlation`` is outside [-1, 1], or if the
            target has no non-missing values or no variation.
    """
    if not -1.0 <= target_correlation <= 1.0:
        # Outside this range sqrt(1 - rho**2) is NaN and would silently
        # poison the whole proxy.
        raise ValueError(
            f"target_correlation must be within [-1, 1], got {target_correlation}"
        )

    target = np.asarray(target_column, dtype=float)

    # Standardize using observed values only so a single missing entry does
    # not turn every value into NaN.
    observed = target[~np.isnan(target)]
    if observed.size == 0:
        raise ValueError("target_column has no non-missing values")
    target_std = np.std(observed)
    if not target_std > 0:
        raise ValueError("target_column must vary to be standardized")
    target = (target - np.mean(observed)) / target_std

    # Independent standard normal component.
    z = np.random.standard_normal(len(target))

    # Mix target and independent component according to the nominal correlation.
    proxy = target_correlation * target + np.sqrt(1 - target_correlation**2) * z

    # Add controlled noise.
    proxy = proxy + np.random.normal(0, noise_level, len(target))

    # Standardize the final proxy, again ignoring missing positions.
    proxy = (proxy - np.nanmean(proxy)) / np.nanstd(proxy)

    return {f'proxy_{target_correlation:.2f}': proxy}

# Stage 1: Testing Mean Penalty Approach with Several Target Correlations

In [12]:
def run_and_visualize_monte_carlo(df, weights, num_iterations, target, target_correlations, predictors):
    """Plot how often each synthetic proxy is the top pick per orthogonality weight.

    For every value in ``target_correlations`` a train/test split containing a
    synthetic proxy is prepared with ``prepare_dataset``. ``proxy_finder`` is
    then run ``num_iterations`` times for each orthogonality weight, and the
    first-ranked result of every run is tallied. One line per target
    correlation is drawn, showing the percentage of runs in which that
    correlation's synthetic proxy was ranked first.

    Args:
        df: pandas DataFrame with the target and predictor columns.
        weights: Iterable of orthogonality weights to test.
        num_iterations: Number of ``proxy_finder`` runs per weight.
        target: Name of the target column.
        target_correlations: Iterable of nominal correlations for the
            synthetic proxies.
        predictors: Column names passed to ``proxy_finder`` both as
            ``predictors`` and as ``orthogonal_vars``.

    Returns:
        None. Progress and the per-correlation frequency tables are printed
        and a matplotlib figure is shown.
    """
    selection_trackers = []
    proxy_names = []

    # Run Monte Carlo for each target correlation
    for target_correlation in target_correlations:
        df_train, df_test = prepare_dataset(df, target, target_correlation)
        selection_tracker = {orth_weight: {} for orth_weight in weights}

        for orth_weight in weights:
            print(f"Testing with orthogonality weight: {orth_weight}")
            print(f"Testing with target correlation: {target_correlation}")
            top_pick_counts = selection_tracker[orth_weight]

            for i in range(num_iterations):
                print(f"Running iteration {i+1}/{num_iterations}")
                top_proxies = proxy_finder(df_train=df_train,
                                           df_test=df_test,
                                           target=target,
                                           predictors=predictors,
                                           num_proxies=5,
                                           orth_weight=orth_weight,
                                           orthogonal_vars=predictors)

                # Only the first-ranked proxy is tallied; an empty result
                # records nothing.
                for top_pick in top_proxies:
                    top_pick_counts[top_pick] = top_pick_counts.get(top_pick, 0) + 1
                    break

        selection_trackers.append(selection_tracker)
        proxy_names.append(f'proxy_{target_correlation:.2f}')

    # Visualization
    plt.figure(figsize=(10, 6))

    for name, tracker in zip(proxy_names, selection_trackers):
        # Always include the synthetic proxy as a column so that a proxy that
        # was never ranked first is plotted at 0% instead of raising KeyError.
        proxies_seen = sorted(
            {proxy for counts in tracker.values() for proxy in counts} | {name}
        )
        # A proxy present in ``counts`` implies num_iterations >= 1, so the
        # division below cannot hit zero.
        frequency_rows = [
            [
                (counts[proxy] / num_iterations) * 100 if proxy in counts else 0.0
                for proxy in proxies_seen
            ]
            for counts in tracker.values()
        ]
        pivot_data = pd.DataFrame(
            frequency_rows,
            index=pd.Index(list(tracker), name='orth_weight'),
            columns=pd.Index(proxies_seen, name='proxy'),
            dtype=float,
        ).sort_index()
        print(pivot_data)

        # One line per target correlation: selection rate of its synthetic proxy
        plt.plot(pivot_data.index, pivot_data[name], marker='o', label=name, linewidth=2)

    plt.xlabel('Orthogonality Weight')
    plt.ylabel('Selection Frequency')
    plt.title('Selection Frequency vs Orthogonality Weight')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.show()

In [None]:
# Stage 1 configuration -- change parameters as needed.
df = pd.read_stata("/content/temp_yougov.dta")

target = 'christian_nationalism'
target_correlations = [0.95, 0.90, 0.80]
num_iterations = 30

# Orthogonality weights from 0.40 up to (but excluding) 0.85 in steps of 0.05.
weights = np.arange(0.4, 0.85, 0.05)

# Columns handed to the Monte Carlo run as predictors.
predictors = [
    'presvote20post', 'housevote20post', 'senvote20post',
    'pff_jb', 'pff_dt', 'pid7',
    'election_fairnness', 'educ', 'hispanic',
    'partisan_violence',
    'immigrant_citizenship', 'immigrant_deport',
    'auth_grid_1', 'auth_grid_3', 'auth_grid_2',
    'faminc_new',
]

run_and_visualize_monte_carlo(
    df, weights, num_iterations, target, target_correlations, predictors
)

# Stage 2: Testing Threshold-Based Penalty Approach with Several Tau Values 


In [None]:
from proxy_finder_threshold_orthogonalization import proxy_finder_threshold_ortho as pf

In [None]:
def run_and_visualize_monte_carlo2(df, weights, num_iterations, target, target_correlation, predictors, taus):
    """Plot the synthetic proxy's top-pick rate per orthogonality weight, one line per tau.

    For every value in ``taus`` a train/test split containing a synthetic
    proxy with nominal correlation ``target_correlation`` is prepared with
    ``prepare_dataset``. ``pf`` is then run ``num_iterations`` times for each
    orthogonality weight, and the first-ranked result of every run is tallied.

    NOTE(review): ``tau`` is currently NOT forwarded to ``pf``, so every tau
    runs the same configuration and the curves differ only through
    randomness (a new synthetic proxy is drawn for each tau). Confirm the
    name of ``pf``'s threshold parameter and pass ``tau`` in the call below.

    Args:
        df: pandas DataFrame with the target and predictor columns.
        weights: Iterable of orthogonality weights to test.
        num_iterations: Number of ``pf`` runs per weight.
        target: Name of the target column.
        target_correlation: Nominal correlation of the synthetic proxy.
        predictors: Column names passed to ``pf`` both as ``predictors`` and
            as ``orthogonal_vars``.
        taus: Iterable of tau values; used for the loop and the line labels.

    Returns:
        None. Progress and the per-tau frequency tables are printed and a
        matplotlib figure is shown.
    """
    selection_trackers = []
    proxy_name = f'proxy_{target_correlation:.2f}'
    tau_names = [f'tau_{tau:.2f}' for tau in taus]

    # Run Monte Carlo for each tau
    for tau in taus:
        df_train, df_test = prepare_dataset(df, target, target_correlation)
        selection_tracker = {orth_weight: {} for orth_weight in weights}

        for orth_weight in weights:
            print(f"Testing with orthogonality weight: {orth_weight}")
            print(f"Testing with target correlation: {target_correlation}")
            top_pick_counts = selection_tracker[orth_weight]

            for i in range(num_iterations):
                print(f"Running iteration {i+1}/{num_iterations}")
                # NOTE(review): tau is not passed here -- see docstring.
                top_proxies = pf(df_train=df_train,
                                 df_test=df_test,
                                 target=target,
                                 predictors=predictors,
                                 num_proxies=5,
                                 orth_weight=orth_weight,
                                 orthogonal_vars=predictors)

                # Only the first-ranked proxy is tallied; an empty result
                # records nothing.
                for top_pick in top_proxies:
                    top_pick_counts[top_pick] = top_pick_counts.get(top_pick, 0) + 1
                    break

        selection_trackers.append(selection_tracker)

    # Visualization
    plt.figure(figsize=(10, 6))

    for name, tracker in zip(tau_names, selection_trackers):
        # Always include the synthetic proxy as a column so that a proxy that
        # was never ranked first is plotted at 0% instead of raising KeyError.
        proxies_seen = sorted(
            {proxy for counts in tracker.values() for proxy in counts} | {proxy_name}
        )
        # A proxy present in ``counts`` implies num_iterations >= 1, so the
        # division below cannot hit zero.
        frequency_rows = [
            [
                (counts[proxy] / num_iterations) * 100 if proxy in counts else 0.0
                for proxy in proxies_seen
            ]
            for counts in tracker.values()
        ]
        pivot_data = pd.DataFrame(
            frequency_rows,
            index=pd.Index(list(tracker), name='orth_weight'),
            columns=pd.Index(proxies_seen, name='proxy'),
            dtype=float,
        ).sort_index()
        print(pivot_data)

        # One line per tau: selection rate of the synthetic proxy
        plt.plot(pivot_data.index, pivot_data[proxy_name], marker='o', label=name, linewidth=2)

    plt.xlabel('Orthogonality Weight')
    plt.ylabel('Selection Frequency')
    plt.title('Selection Frequency vs Orthogonality Weight')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.show()

In [None]:
# Stage 2 configuration: a small smoke-test sized run.
df = pd.read_stata("/content/temp_yougov.dta")

target = 'christian_nationalism'
target_correlation = 0.90
num_iterations = 2
taus = [0.00, 0.05]
weights = [0.35, 0.65]

# Columns handed to the Monte Carlo run as predictors.
predictors = [
    'presvote20post', 'housevote20post', 'senvote20post',
    'pff_jb', 'pff_dt', 'pid7',
    'election_fairnness', 'educ', 'hispanic',
    'partisan_violence',
    'immigrant_citizenship', 'immigrant_deport',
    'auth_grid_1', 'auth_grid_3', 'auth_grid_2',
    'faminc_new',
]

run_and_visualize_monte_carlo2(
    df, weights, num_iterations, target, target_correlation, predictors, taus
)