# Monte Carlo Testing for Proxy Finder Algorithm


In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sys
from gain import GAIN # download gain from https://github.com/evolext/GAIN
from usage_example import * # download this from https://github.com/evolext/GAIN
import utils # download this from https://github.com/evolext/GAIN
import models # download this from https://github.com/evolext/GAIN
import matplotlib.pyplot as plt

In [None]:
def proxy_finder_validate(item, candidates, df1, df2, predictors, orthogonal_vars):
    # validate proxies and st item
    assert item in df1.columns, f'AssertionError: item {item} not in df1.columns'

    assert predictors, f'AssertionError: missing predictors. If you would prefer to not specify predictors, do not pass in a variable.'

    for c in predictors:
        assert c in df1.columns, f'AssertionError: predictor {c} not in df1.columns'
        assert c in df2.columns, f'AssertionError: predictor {c} not in df2.columns' # we need same variable in second dataset
        assert c in df1.select_dtypes(include=['number']).columns, f'predictor {c} is not a numeric column in df1'
        assert c in df2.select_dtypes(include=['number']).columns, f'predictor {c} is not a numeric column in df2'

    for c in candidates:
        assert c in df2.columns, f'AssertionError: candidate {c} not in df2.columns'

    if (orthogonal_vars != None):
        for c in orthogonal_vars:
            assert c in df2.columns, f'AssertionError: orthogonal variable {c} not in df2.columns'

In [None]:
def get_predictions(df_train, df_test, predictors, target, epochs=50, learning_rate=0.001, l2_lambda=0.001):
  # CODE IMPLEMENTATION ASSISTED BY GENERATIVE AI

  # Set parameters
  SEED = 13
  DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
  TRAIN_SIZE = 1.0  # Using all of df1 for training
  np.random.seed(SEED)

  df1 = df_train.copy()
  df2 = df_test.copy()

  # drop everything but predictors and target from df1
  target_col_df1 = df1[target]
  df1 = df1[predictors]
  df1[target] = target_col_df1

  # drop everything but predictors from df2
  df2 = df2[predictors]
  # add missing target
  df2[target] = np.nan

  combined_df = pd.concat([df1, df2])

  # Step 3: Normalize the data
  scaler = MinMaxScaler()
  combined_data_std = scaler.fit_transform(combined_df)

  # Split back into df1 (training) and df2 (prediction)
  df1_std = combined_data_std[:len(df1)]
  df2_std = combined_data_std[len(df1):]

  # Create tensors and masks
  X_train_tensor = torch.tensor(df1_std).float()
  M_train_tensor = get_mask(X_train_tensor)  # This creates mask with 0s for observed values, 1s for missing values
  train_dataset = TensorDataset(X_train_tensor, M_train_tensor)
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=False)

  X_test_tensor = torch.tensor(df2_std).float()
  M_test_tensor = get_mask(X_test_tensor)  # This will mark all values in the target column as missing
  test_dataset = TensorDataset(X_test_tensor, M_test_tensor)
  test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False)

  # Step 4: Initialize and train the GAIN model
  stopper = EarlyStopper(patience=2, min_delta=0.001)
  model = GAIN(train_loader=train_loader, seed=SEED)

  optimizer_G = torch.optim.Adam(model.G.parameters())
  optimizer_D = torch.optim.Adam(model.D.parameters())
  model.set_optimizer(optimizer=optimizer_G, generator=True)
  model.set_optimizer(optimizer=optimizer_D, generator=False)

  model.to(DEVICE)
  model.train(n_epoches=100, verbose=True, stopper=stopper)

  # Step 5: Use the trained model to predict (impute) target values for df2
  predictions = []

  for x_test_batch, m_batch in test_loader:
      x_batch_imputed = model.imputation(x=x_test_batch, m=m_batch)
      x_batch_imputed = x_batch_imputed.cpu().numpy()
      predictions.append(x_batch_imputed)

  # Combine predictions and inverse transform
  predictions_combined = np.vstack(predictions)
  predictions_original_scale = scaler.inverse_transform(predictions_combined)

  # Extract the target column predictions
  target_column_index = df1.columns.get_loc(target)
  df2_predictions = predictions_original_scale[:, target_column_index]

  return df2_predictions

In [None]:
 # orthogonalization method
# all data is preprocessed and df test has been appended target preds
def orthogonalize(candidates, df_test, orthogonal_vars):
        orth_scores = {}
        for c in candidates:
            candset = df_test[[c, 'predicted_target']].copy().dropna() # assumes candidate has mostly non-NaN entries
            candcol = candset[c]

            X = sm.add_constant(candcol)
            temp_orth_scores = []
            for orth_var in orthogonal_vars:
                orthset = df_test[[orth_var]].copy().dropna()
                common_indices = candset.index.intersection(orthset.index)
                if common_indices.empty:
                    continue
                orth_col = orthset.loc[common_indices, orth_var]
                if np.var(orth_col) == 0:
                    print("ortho:", orth_var, "candidate", c)
                    continue # zero variance leads to divide by zero error
                candcol_common = candset.loc[common_indices, c]

                X_common = sm.add_constant(candcol_common)
                model = sm.OLS(orth_col, X_common).fit()
                temp_orth_scores.append(model.rsquared)

            if temp_orth_scores:
                orth_scores[c] = sum(temp_orth_scores) / len(temp_orth_scores)
            else:
                orth_scores[c] = 0
        return orth_scores

In [None]:
def proxy_finder(df_train, df_test, target, predictors, num_proxies=1, orth_weight=0.65, candidates=None, orthogonal_vars=None, neural_net="original", drop=True):
    if candidates is None:
        candidates = list(df_test.select_dtypes(include='number').columns) #only numerical data (don't encode categories, make user do that)


    proxy_finder_validate(target, candidates, df_train, df_test, predictors, orthogonal_vars)

    #print(f"Predictors: {predictors}") #DEBUGDEBUGDEBUG------------------------------------------------------------
    #print(f"Candidates: {candidates}")

    df_train = data_rescale(df_train, predictors, target, drop)
    df_test = data_rescale(df_test, predictors, target, drop)
    # drop any rows that are missing data from target
    df_train = df_train.dropna(subset=target)

    if neural_net == "torch":
      predicted_scores = get_predictionsTorch(df_train, df_test, predictors, target)
    elif neural_net == "tiered":
      predicted_scores = get_predictionsTiered(df_train, df_test, predictors, target)
    else:
      predicted_scores = get_predictions(df_train, df_test, predictors, target)


    df_test['predicted_target'] = predicted_scores
    #print(f"Predicted scores: {predicted_scores[:10]}")  #DEBUG DEBUG------------------------------------------------------------

    results = {}

    for c in candidates:
        candset = df_test[[c, 'predicted_target']].copy().dropna()
        if candset.empty:
            continue

        pred_scores = candset['predicted_target']
        candcol = candset[c]

        X_pred = sm.add_constant(candcol)
        model_pred = sm.OLS(pred_scores, X_pred).fit()
        results[c] = {
            'R_squared': model_pred.rsquared,
            'p_value': model_pred.pvalues.iloc[1],
            'coef': model_pred.params.iloc[1]
        }
        #print(f"candidate {c}: Results: {results}")  # Debug statement------------------------------------------------------------

    best_proxies = []

    if orthogonal_vars:
        orth_scores = orthogonalize(candidates, df_test, orthogonal_vars)
        proxy_scores = {}
        for c in candidates:
            try:
                proxy_scores[c] = (c, (1 - orth_weight) * results[c]['R_squared'] - orth_weight * orth_scores[c])
            except KeyError as e:
                continue

        sorted_results = sorted(proxy_scores.values(), key=lambda x: x[1], reverse=True)

        for i in range(min(num_proxies, len(sorted_results))):
            proxy, score = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with score: {score}")
    else:
        sorted_results = sorted(results.items(), key=lambda x: (-x[1]['R_squared'], x[1]['p_value']))

        for i in range(min(num_proxies, len(sorted_results))):
            proxy, metrics = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")

    return best_proxies

In [None]:
# return a new df that is a copy of df, with: rescale all columns to be
#  between 0 and 1, inclusive. Drop any non-numeric columns. Drop any
# rows that are missing at least one predictor.
def data_rescale(df, predictors, target, drop=True):
    df = df.copy() # preserve immutability

    # Select only the numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns

    if drop:
      # drop any rows that are missing at least one predictor
      df = df.dropna(subset=predictors)

    # print('the dataframe we\'re rescaling is size: ') # debug
    # Initialize the scaler
    scaler = MinMaxScaler()

    # Fit the scaler to the data and transform it
    scaled_values = scaler.fit_transform(df[numeric_cols])

    # Create a new DataFrame with the scaled values, maintaining the original column names
    scaled_df = pd.DataFrame(scaled_values, columns=numeric_cols, index=df.index)

    return scaled_df

In [None]:
def generate_synthetic_proxies(target_column, target_correlation, noise_level=0.1):

   # Convert target_column to numpy array and standardize
    target = np.array(target_column)
    length = len(target)

    target = (target - np.mean(target)) / np.std(target)

    synthetic_proxies = {}

    # Generate independent standard normal variable
    z = np.random.standard_normal(length)

    # Create correlated variable using the correlation formula
    proxy = target_correlation * target + np.sqrt(1 - target_correlation**2) * z

    # Add controlled noise
    proxy = proxy + np.random.normal(0, noise_level, length)

    # Standardize final proxy
    proxy = (proxy - np.mean(proxy)) / np.std(proxy)

    synthetic_proxies[f'proxy_{target_correlation:.2f}'] = proxy

    return synthetic_proxies

In [None]:
def prepare_dataset(df, target, target_correlation):

  # add synthetic proxies to test set
  target_column = df[target]
  synthetic_proxies = generate_synthetic_proxies(target_column, target_correlation)
  for name, proxy in synthetic_proxies.items():
    df[name] = proxy

  # drop target from test set
  df = df.drop(columns=[target])

  return df

# Stage 1: Testing Mean Penalty Approach with Several Target Correlations

In [None]:
def run_and_visualize_monte_carlo(df1, df2, weights, num_iterations, target, target_correlations, predictors):
    selection_trackers = []
    proxy_names = []


    # Run Monte Carlo for each target correlation
    for target_correlation in target_correlations:
        # Add synthetic proxy
        df2 = df2.dropna(subset=target)

        df2 = prepare_dataset(df2, target, target_correlation)

        selection_tracker = {orth_weight: {} for orth_weight in weights}

        # Run iterations for each weight
        for orth_weight in weights:
            print(f"Testing with orthogonality weight: {orth_weight}")
            print(f"Testing with target correlation: {target_correlation}")

            for i in range(num_iterations):
                print(f"Running iteration {i+1}/{num_iterations}")
                top_proxies = proxy_finder(df_train=df1,
                                         df_test=df2,
                                         target=target,
                                         predictors=predictors,
                                         num_proxies=50,
                                         orth_weight=orth_weight,
                                         orthogonal_vars=predictors,
                                         )

                # Update selection tracker for top pick
                for rank, proxy in enumerate(top_proxies, 1):
                    if rank == 1:
                        selection_tracker[orth_weight][proxy] = selection_tracker[orth_weight].get(proxy, 0) + 1

        selection_trackers.append(selection_tracker)
        proxy_names.append(f'proxy_{target_correlation:.2f}')



    # SAVE TO CSV --------------------
    data = []

    for target_correlation, selection_tracker in zip(proxy_names, selection_trackers):
        for orth_weight, proxies in selection_tracker.items():
            for proxy, count in proxies.items():
                data.append({
                    'Target Correlation': target_correlation,
                    'Orthogonality Weight': orth_weight,
                    'Proxy': proxy,
                    'Count': count
                })

    df_selection_tracker = pd.DataFrame(data)
    df_selection_tracker.to_csv('selection_tracker.csv', index=False)
    # SAVE TO CSV --------------------


    # Visualization
    plt.figure(figsize=(10, 6))

    # Plot results for each target correlation
    for index, tracker in enumerate(selection_trackers):
        results = []
        for orth_weight, proxies in tracker.items():
            for proxy, frequency in proxies.items():
                results.append({
                    'orth_weight': orth_weight,
                    'proxy': proxy,
                    'frequency': (frequency / num_iterations) * 100
                })

        results_df = pd.DataFrame(results)
        pivot_data = results_df.pivot(index='orth_weight', columns='proxy', values='frequency')
        pivot_data.fillna(0, inplace=True)
        print(pivot_data)

        # Plot each proxy as a separate line
        name = proxy_names[index]
        plt.plot(pivot_data.index, pivot_data[name], marker='o', label=name, linewidth=2)

    # Create the line plot
    plt.xlabel('Orthogonality Weight')
    plt.ylabel('Selection Frequency')
    plt.title('Selection Frequency vs Orthogonality Weight')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.show()

In [170]:
# change parameters as needed
df1 = pd.read_stata("/content/yougov_recoded.dta")
df2 = pd.read_stata("/content/anes_recoded.dta")
weights = [0.75]
target_correlations = [0.99]
num_iterations = 1
target = 'educ'  # The target variable in the training set
predictors = [ # predictors in both training and test set
                   'presvote20post',
                   'housevote20post',
                   'senvote20post',
                   'pff_jb',
                   'election_fairnness',
                   'pid7',
                   'hispanic',
                   'partisan_violence',
                   'immigrant_citizenship',
                   'immigrant_deport',
                   'auth_grid_1',
                   'auth_grid_3',
                   'auth_grid_2',
                   ]
run_and_visualize_monte_carlo(df1, df2, weights, num_iterations, target, target_correlations, predictors)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = proxy


Testing with orthogonality weight: 0.75
Testing with target correlation: 0.99
Running iteration 1/1


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[

ortho: hispanic candidate V200014a
ortho: hispanic candidate V200014b
ortho: hispanic candidate V200014c
ortho: hispanic candidate V200014d
Proxy 1 for educ: V202355 with score: 0.009483457069082192
Proxy 2 for educ: V201407 with score: 0.007433819315819497
Proxy 3 for educ: V202346 with score: 0.006362889440264379
Proxy 4 for educ: V201516 with score: 0.006215695502157144
Proxy 5 for educ: V202207y7 with score: 0.005903668649364924
Proxy 6 for educ: V203513 with score: 0.005819693551683925
Proxy 7 for educ: V203514 with score: 0.005819693551683925
Proxy 8 for educ: V201435 with score: 0.005751354488060398
Proxy 9 for educ: V203169 with score: 0.005682736688141912
Proxy 10 for educ: V203270 with score: 0.00561284238527638
Proxy 11 for educ: V202150 with score: 0.005570923649441341
Proxy 12 for educ: V202207y6 with score: 0.005464206151554892
Proxy 13 for educ: V201629e with score: 0.00527757992107882
Proxy 14 for educ: V202155 with score: 0.004105383874831843
Proxy 15 for educ: V201233

KeyError: 'proxy_0.99'

<Figure size 1000x600 with 0 Axes>