# Monte Carlo Testing for Proxy Finder Algorithm


In [7]:
from proxy_finder import proxy_finder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Stage 1: Testing Orthogonality Weight

In [12]:
def generate_synthetic_data(n_samples=1000, seed=None):
  """Generate one synthetic data set for exercising the proxy finder.

  Three independent standard-normal predictors are drawn, the target is a
  noisy linear combination of them, and three candidate proxies of differing
  quality are derived.

  Args:
    n_samples: Number of rows in every returned frame.
    seed: Optional seed for a private ``np.random.RandomState``. When None
      (the default) the draws come from NumPy's global random state, in the
      same order as before, so ``np.random.seed`` keeps controlling the
      output.

  Returns:
    Tuple ``(predictors_df, target_df, candidates_df)`` where
      predictors_df has columns pred_1, pred_2, pred_3;
      target_df has the single column target;
      candidates_df has columns good_proxy, okay_proxy, bad_proxy.
  """
  # The np.random module and RandomState both expose `normal`, so either can
  # serve as the generator; `is None` keeps seed=0 usable.
  rng = np.random if seed is None else np.random.RandomState(seed)

  # Generate predictors
  pred_1 = rng.normal(0, 1, n_samples)
  pred_2 = rng.normal(0, 1, n_samples)
  pred_3 = rng.normal(0, 1, n_samples)
  predictors_df = pd.DataFrame({'pred_1': pred_1, 'pred_2': pred_2, 'pred_3': pred_3})

  # Generate target (linear combination of the predictors plus small noise)
  target = 0.2 * pred_1 + 0.3 * pred_2 + 0.5 * pred_3 + rng.normal(0, 0.1, n_samples)
  target_df = pd.DataFrame({'target': target})

  # Generate several candidates of varying quality
  good_proxy = 0.7 * target + 0.3 * rng.normal(0, 1, n_samples) # good candidate: mostly target
  okay_proxy = 0.5 * target + 0.5 * pred_1 + rng.normal(0, 0.1, n_samples) # okay candidate: half target, half pred_1
  bad_proxy = rng.normal(0, 1, n_samples) # bad candidate: pure noise

  # TODO: add an 'ortho_proxy' candidate (0.8 * target + 0.2 * noise) made
  # orthogonal, via Gram-Schmidt, to a separate frame of orthogonal variables
  # by subtracting (dot(proxy, v) / dot(v, v)) * v for each variable v, and
  # return that frame as a fourth element.

  candidates_df = pd.DataFrame({'good_proxy': good_proxy, 'okay_proxy': okay_proxy, 'bad_proxy': bad_proxy})

  return predictors_df, target_df, candidates_df

In [None]:
# Run n_iterations trials of the proxy finder algorithm with the provided orthogonality weight
def test_orth_weight(orth_weight, n_iterations):
  """Run the proxy finder on fresh synthetic data for n_iterations trials.

  Every trial draws two independent data sets from
  ``generate_synthetic_data``: one supplies the training predictors and
  target, the other supplies the test predictors and candidates (its target
  is discarded). ``proxy_finder`` is asked for a single proxy.

  Args:
    orth_weight: Value forwarded to ``proxy_finder`` as ``orth_weight``.
    n_iterations: Number of independent trials to run.

  Returns:
    List with one entry per trial: element 0 of that trial's
    ``proxy_finder`` result.
  """
  results = []

  # The loop index is not needed; `_` avoids shadowing the builtin `iter`.
  for _ in range(n_iterations):
    train_predictors, train_target, train_candidates = generate_synthetic_data()
    test_predictors, _, test_candidates = generate_synthetic_data()

    df_train = pd.concat([train_predictors, train_target], axis=1) # training set
    df_test = pd.concat([test_predictors, test_candidates], axis=1) # test set

    # The predictor columns are passed both as predictors and as the
    # variables the proxy should be orthogonal to.
    top_proxies = proxy_finder(df_train=df_train,
                 df_test=df_test,
                 target='target',
                 predictors = train_predictors.columns.tolist(),
                 num_proxies = 1,
                 orth_weight = orth_weight,
                 candidates = test_candidates.columns.tolist(),
                 orthogonal_vars = train_predictors.columns.tolist()
    )

    results.append(top_proxies[0])

  return results

In [None]:
# Run the simulations, display results
def monte_carlo(n_iterations=100, ortho_weights=np.arange(0, 1.1, 0.1)):
  """Sweep orthogonality weights and record how often each candidate wins.

  For every weight, ``test_orth_weight`` is run with ``n_iterations`` trials
  and the fraction of trials that returned each candidate name is stored.
  The good-proxy fraction is then plotted against the weight.

  Args:
    n_iterations: Trials per weight; also the denominator of each ratio.
    ortho_weights: Iterable of weights handed to ``test_orth_weight``.

  Returns:
    DataFrame indexed by weight with the columns good_proxy_selection,
    okay_proxy_selection and bad_proxy_selection.
  """
  candidate_names = ('good_proxy', 'okay_proxy', 'bad_proxy')
  performance = {}

  for weight in ortho_weights:
    results = test_orth_weight(weight, n_iterations)
    performance[weight] = {
      name + '_selection': results.count(name) / n_iterations
      for name in candidate_names
    }

  # Outer keys (weights) become the index after the transpose.
  performance_df = pd.DataFrame(performance).T

  # Must be *called*: a bare `plt.figure` only references the function and
  # does not create a new figure.
  plt.figure()
  plt.plot(performance_df.index, performance_df['good_proxy_selection'], label='good proxy')
  plt.xlabel('Orthogonality Weight')
  plt.ylabel('Selection Ratio')
  plt.title('Proxy Selection Performance vs Orthogonality Weight')
  plt.legend()
  plt.grid(True)
  plt.show()

  return performance_df

In [None]:
# Quick run: 5 trials per weight over np.arange(0.0, 0.9, 0.1).
# Left as the cell's final expression so the returned frame is displayed.
monte_carlo(
  n_iterations=5,
  ortho_weights=np.arange(0.0, 0.9, 0.1),
)