# Monte Carlo Testing for Proxy Finder Algorithm


In [7]:
from proxy_finder import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Stage 1: Testing Orthogonality Weight

Steps:
- Split the YouGov dataset in half into a training set (`df_train`) and a test set (`df_test`)
- Train the model on `df_train` with christian_nationalism as the target and the 17 predictors
- Prepare `df_test` by deleting christian_nationalism and adding several synthetic proxies with DIFFERENT levels of correlation with the target (in this case, CN)
- Call the `proxy_finder` function with many levels of `orth_weight` and see at which weights these synthetic proxies are chosen

In [12]:
def prepare_dataset(df, target, train_frac=0.5, random_state=42):
    """Split ``df`` into train/test parts and plant synthetic proxies in the test part.

    The training part is a random sample of ``df``; the test part is every
    row whose index label was not sampled. Synthetic proxy columns are built
    from the test part's ``target`` column, after which ``target`` itself is
    removed from the test part.

    Args:
        df: Source DataFrame containing the ``target`` column.
        target: Name of the column the synthetic proxies are derived from.
        train_frac: Fraction of rows sampled into the training part.
            Defaults to 0.5 (an even split).
        random_state: Seed forwarded to ``DataFrame.sample`` so the split is
            reproducible. Defaults to 42.

    Returns:
        Tuple ``(df_train, df_test)``. ``df_train`` still contains ``target``;
        ``df_test`` has the synthetic proxy columns instead of ``target``.
    """
    df_train = df.sample(frac=train_frac, random_state=random_state)
    # The complement is taken by index label, so rows that share a label
    # with a sampled row are dropped as well.
    df_test = df.drop(df_train.index)

    # Proxies must be derived before the target column is removed.
    synthetic_proxies = generate_synthetic_proxies(df_test[target])
    for name, proxy in synthetic_proxies.items():
        df_test[name] = proxy

    df_test = df_test.drop(columns=[target])

    return df_train, df_test

In [None]:
def generate_synthetic_proxies(target_column, target_correlations=(0.95, 0.9, 0.8, 0.7), noise_level=0.1, random_state=None):
    """Build standardized synthetic variables correlated with a target.

    For each requested correlation ``corr`` the proxy is
    ``corr * target + sqrt(1 - corr**2) * z`` with ``z`` standard normal,
    plus extra Gaussian noise of standard deviation ``noise_level``, and is
    then rescaled to zero mean and unit standard deviation.

    Because of the extra noise term, the expected correlation with the target
    is ``corr / sqrt(1 + noise_level**2)``, i.e. slightly below the nominal
    value used in the proxy's name; the sample correlation also varies with
    the random draw.

    Args:
        target_column: 1-D numeric array-like (e.g. a pandas Series). NaN
            values or a constant column propagate NaN into every proxy.
        target_correlations: Nominal correlations, one proxy per entry. Each
            should lie in [-1, 1]; outside that range the square root is NaN.
        noise_level: Standard deviation of the additional Gaussian noise.
        random_state: Optional seed for reproducible draws. When ``None``
            (default) the global ``np.random`` state is used, exactly as
            before this parameter existed.

    Returns:
        dict mapping ``'proxy_<corr, two decimals>'`` to a 1-D numpy array of
        the same length as ``target_column``.
    """
    # Both the np.random module and a Generator expose standard_normal/normal,
    # so the default path keeps drawing from the global stream.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Standardize the target so the mixing formula below yields unit variance.
    target = np.asarray(target_column)
    target = (target - np.mean(target)) / np.std(target)
    n_obs = len(target)

    synthetic_proxies = {}
    for corr in target_correlations:
        # Independent standard normal component.
        z = rng.standard_normal(n_obs)

        # Mix target and independent noise to hit the nominal correlation.
        proxy = corr * target + np.sqrt(1 - corr**2) * z

        # Additional noise on top; this pulls the correlation slightly down.
        proxy = proxy + rng.normal(0, noise_level, n_obs)

        # Standardize the final proxy.
        proxy = (proxy - np.mean(proxy)) / np.std(proxy)

        synthetic_proxies[f'proxy_{corr:.2f}'] = proxy

    return synthetic_proxies

In [None]:
# Running the Monte Carlo Simulations
#
# For every orthogonality weight, proxy_finder is called num_iterations times
# and the candidate it ranks first is tallied. The tallies are then turned
# into percentages and plotted per synthetic proxy.
df = pd.read_stata("/content/temp_yougov.dta")
weights = [0.45, 0.47, 0.49, 0.51, 0.53, 0.55, 0.57, 0.59, 0.61, 0.63, 0.65, 0.67, 0.69, 0.71]
num_iterations = 5
target = 'christian_nationalism'
predictors = [
                   'presvote20post',
                   'housevote20post',
                   'senvote20post',
                   'pff_jb',
                   'pff_dt',
                   'pid7',
                   'election_fairnness',
                   'educ',
                   'hispanic',
                   'partisan_violence',
                   'immigrant_citizenship',
                   'immigrant_deport',
                   'auth_grid_1',
                   'auth_grid_3',
                   'auth_grid_2',
                   'faminc_new']

# NOTE(review): the split and the synthetic proxies are generated once, so
# repeated iterations only differ if proxy_finder itself is stochastic --
# confirm this is the intended Monte Carlo design.
df_train, df_test = prepare_dataset(df, target)

# selection_tracker[weight][candidate] -> number of iterations won (rank 1).
selection_tracker = {orth_weight: {} for orth_weight in weights}

for orth_weight in weights:
    print(f"Testing with orthogonality weight: {orth_weight}")
    top_pick_counts = selection_tracker[orth_weight]

    for i in range(num_iterations):
        print(f"Running iteration {i+1}/{num_iterations}")
        top_proxies = proxy_finder(df_train=df_train, df_test=df_test, target=target, predictors=predictors, num_proxies=5, orth_weight=orth_weight, orthogonal_vars=predictors)

        # Only the first-ranked candidate is counted; stop after one element.
        for top_pick in top_proxies:
            top_pick_counts[top_pick] = top_pick_counts.get(top_pick, 0) + 1
            break

# Visualize results
results = []
for orth_weight, proxies in selection_tracker.items():
    for proxy, frequency in proxies.items():
        results.append({'orth_weight': orth_weight, 'proxy': proxy, 'frequency': (frequency / num_iterations) * 100})

results_df = pd.DataFrame(results, columns=['orth_weight', 'proxy', 'frequency'])

# A (weight, proxy) pair that never won has no row in results_df, which the
# pivot turns into NaN; its true selection frequency is 0 %.
pivot_data = (
    results_df.pivot(index='orth_weight', columns='proxy', values='frequency')
    .reindex(index=weights)
    .fillna(0)
)

# A proxy that was never the top pick at any weight has no column at all;
# add it as all-zero so the plotting below cannot raise KeyError.
plotted_proxies = ['proxy_0.95', 'proxy_0.90', 'proxy_0.80']
for proxy_name in plotted_proxies:
    if proxy_name not in pivot_data.columns:
        pivot_data[proxy_name] = 0.0

# Create the line plot
plt.figure(figsize=(10, 6))

# Plot each proxy as a separate line
for proxy_name in plotted_proxies:
    plt.plot(pivot_data.index, pivot_data[proxy_name], marker='o', label=proxy_name, linewidth=2)


# Customize the plot
plt.xlabel('Orthogonal Weight')
plt.ylabel('Frequency')
plt.ylim(0, 100)
plt.title('Proxy Frequency vs Orthogonal Weight')
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()

# Show the plot
plt.show()