# Monte Carlo Testing for Proxy Finder Algorithm


In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sys
from gain import GAIN
from usage_example import *
import utils
import models
import matplotlib.pyplot as plt

In [None]:
def proxy_finder_validate(item, candidates, df1, df2, predictors, orthogonal_vars):
    """Check that the inputs to the proxy search are mutually consistent.

    Args:
        item: Name of the target column; must be a column of ``df1``.
        candidates: Column names that must all exist in ``df2``.
        df1: Training DataFrame.
        df2: Test DataFrame.
        predictors: Non-empty collection of column names that must exist,
            and be numeric, in both ``df1`` and ``df2``.
        orthogonal_vars: Optional collection of column names that must exist
            in ``df2``. ``None`` skips the check.

    Raises:
        AssertionError: If any of the checks above fails.
    """
    # validate proxies and target item
    assert item in df1.columns, f'AssertionError: item {item} not in df1.columns'

    assert predictors, 'AssertionError: missing predictors. If you would prefer to not specify predictors, do not pass in a variable.'

    # The numeric column sets do not depend on the predictor, so compute them once.
    numeric_cols_df1 = df1.select_dtypes(include=['number']).columns
    numeric_cols_df2 = df2.select_dtypes(include=['number']).columns

    for c in predictors:
        assert c in df1.columns, f'AssertionError: predictor {c} not in df1.columns'
        assert c in df2.columns, f'AssertionError: predictor {c} not in df2.columns' # we need same variable in second dataset
        assert c in numeric_cols_df1, f'predictor {c} is not a numeric column in df1'
        assert c in numeric_cols_df2, f'predictor {c} is not a numeric column in df2'

    for c in candidates:
        assert c in df2.columns, f'AssertionError: candidate {c} not in df2.columns'

    # Identity test: `!= None` is evaluated element-wise on array-likes
    # (pandas Index, ndarray) and makes the `if` raise "ambiguous truth value".
    if orthogonal_vars is not None:
        for c in orthogonal_vars:
            assert c in df2.columns, f'AssertionError: orthogonal variable {c} not in df2.columns'

In [None]:
def get_predictions(df_train, df_test, predictors, target, epochs=250, learning_rate=0.001, l2_lambda=0.001):
    """Impute ``target`` for every row of ``df_test`` with a GAIN model.

    The model is trained on the ``predictors`` + ``target`` columns of
    ``df_train``. ``df_test`` gets an all-NaN ``target`` column, which the
    trained model then fills in.

    Args:
        df_train: DataFrame containing ``predictors`` and ``target``.
        df_test: DataFrame containing ``predictors``.
        predictors: List of predictor column names present in both frames.
        target: Name of the column to impute.
        epochs: Maximum number of training epochs (early stopping may end
            training sooner).
        learning_rate: Learning rate for both Adam optimizers.
        l2_lambda: Currently unused; kept so the signature matches
            ``get_predictionsTorch``.

    Returns:
        1-D numpy array with one imputed target value per row of ``df_test``,
        in row order.
    """
    # CODE IMPLEMENTATION ASSISTED BY GENERATIVE AI

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Keep only predictors + target for training (target is the last column).
    df1 = df_train[predictors].copy()
    df1[target] = df_train[target]

    # Keep only predictors for prediction and add the missing target column.
    df2 = df_test[predictors].copy()
    df2[target] = np.nan

    # No normalisation happens here (the MinMaxScaler step was disabled), so
    # inputs are used on whatever scale the caller supplies. The concat/split
    # round trip is kept so both frames share the same column dtypes.
    combined_df = pd.concat([df1, df2])
    df1_std = combined_df[:len(df1)]
    df2_std = combined_df[len(df1):]

    # Create tensors and masks.
    # Per the original author's note get_mask marks observed values with 0 and
    # missing values with 1 -- TODO confirm against usage_example.
    X_train_tensor = torch.tensor(df1_std.values).float()
    M_train_tensor = get_mask(X_train_tensor)
    train_dataset = TensorDataset(X_train_tensor, M_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=False)

    X_test_tensor = torch.tensor(df2_std.values).float()
    M_test_tensor = get_mask(X_test_tensor)  # the whole target column is NaN here
    test_dataset = TensorDataset(X_test_tensor, M_test_tensor)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False)

    # Initialize and train the GAIN model
    stopper = EarlyStopper(patience=2, min_delta=0.001)
    model = GAIN(train_loader=train_loader)

    # learning_rate used to be ignored; its default equals Adam's default
    # (1e-3), so default behaviour is unchanged.
    optimizer_G = torch.optim.Adam(model.G.parameters(), lr=learning_rate)
    optimizer_D = torch.optim.Adam(model.D.parameters(), lr=learning_rate)
    model.set_optimizer(optimizer=optimizer_G, generator=True)
    model.set_optimizer(optimizer=optimizer_D, generator=False)

    model.to(DEVICE)
    model.train(n_epoches=epochs, verbose=True, stopper=stopper)

    # Use the trained model to impute the target values for df2, batch by batch.
    predictions = []
    for x_test_batch, m_batch in test_loader:
        x_batch_imputed = model.imputation(x=x_test_batch, m=m_batch)
        predictions.append(x_batch_imputed.cpu().numpy())

    predictions_combined = np.vstack(predictions)

    # Extract the target column from the imputed matrix.
    target_column_index = df1.columns.get_loc(target)
    return predictions_combined[:, target_column_index]

In [None]:
# get predictions from the Torch neural network
def get_predictionsTorch(df_train, df_test, predictors, target, epochs=100, learning_rate=0.001, l2_lambda=0.001):
    """Predict ``target`` for ``df_test`` with a small feed-forward network.

    ``df_train`` is split 80/20 into training and validation parts. A
    64-32-1 MLP with ReLU and dropout is trained full-batch on the training
    part, and the validation part is used for a sanity check on the fit.

    Args:
        df_train: DataFrame containing ``predictors`` and ``target``.
        df_test: DataFrame containing ``predictors``.
        predictors: List of predictor column names.
        target: Name of the column to predict.
        epochs: Number of full-batch training steps.
        learning_rate: Adam learning rate.
        l2_lambda: Adam ``weight_decay`` (L2 penalty).

    Returns:
        1-D numpy array of predictions, one per row of ``df_test``.

    Raises:
        SystemExit: With code -1 when the validation MSE exceeds 0.03.
    """
    import torch
    import torch.nn as nn
    import torch.optim as optim

    # split data for training and validation.
    training_features, validation_features, training_target, validation_target = train_test_split(
        df_train[predictors].to_numpy(),
        df_train[target].to_numpy(),
        test_size=0.2,
        random_state=42,
    )

    training_features = torch.FloatTensor(training_features)
    training_target = torch.FloatTensor(training_target)
    validation_features = torch.FloatTensor(validation_features)
    validation_target = torch.FloatTensor(validation_target)

    model = nn.Sequential(
        nn.Linear(len(predictors), 64),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(32, 1)
    )

    # Adam optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_lambda)

    # MSE loss
    loss_func = nn.MSELoss()

    # train the model (one full-batch gradient step per epoch)
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()

        # Forward pass; the target is unsqueezed to match the (N, 1) output
        prediction = model(training_features)
        loss = loss_func(prediction, training_target.unsqueeze(1))

        # Backward pass
        loss.backward()
        optimizer.step()

    # get predictions (eval mode disables dropout)
    model.eval()
    test_data = torch.FloatTensor(df_test[predictors].to_numpy())

    with torch.no_grad():
        predictions = model(test_data)
        predictions = predictions.numpy().flatten()

        val_predictions = model(validation_features)
        val_predictions = val_predictions.numpy().flatten()

    # exit if the held-out validation error is too large
    # NOTE(review): the 0.03 threshold presumably assumes the target was
    # rescaled to [0, 1] -- confirm against the caller.
    mse = mean_squared_error(val_predictions, validation_target)
    print(f"Debug statement: Neural Net test MSE = {mse}") ####DEBUG
    if (mse > 0.03):
        # The f prefix was missing, so the message printed a literal "{target}".
        print(f'Input Error: Predictors cannot predict {target} in df1', file=sys.stderr)
        print('Aborting program')
        sys.exit(-1)

    return predictions

In [None]:
 # orthogonalization method
# all data is preprocessed and df test has been appended target preds
def orthogonalize(candidates, df_test, orthogonal_vars):
    """Score how strongly each candidate explains the orthogonal variables.

    For every candidate, each orthogonal variable is regressed (OLS with an
    intercept) on the candidate, using the rows where the candidate,
    ``'predicted_target'`` and the orthogonal variable are all non-missing.
    The candidate's score is the mean R-squared of those regressions.

    Args:
        candidates: Candidate column names in ``df_test``.
        df_test: DataFrame that already contains a ``'predicted_target'``
            column.
        orthogonal_vars: Column names the chosen proxy should be unrelated to.

    Returns:
        Dict mapping each candidate to its mean R-squared, or 0 when no
        regression could be run for it.
    """
    orth_scores = {}
    for c in candidates:
        # assumes candidate has mostly non-NaN entries
        candset = df_test[[c, 'predicted_target']].dropna()

        temp_orth_scores = []
        for orth_var in orthogonal_vars:
            orth_observed = df_test[orth_var].dropna()
            common_indices = candset.index.intersection(orth_observed.index)
            if common_indices.empty:
                continue
            orth_col = orth_observed.loc[common_indices]
            if np.var(orth_col) == 0:
                print("ortho:", orth_var, "candidate", c)
                continue # zero variance leads to divide by zero error

            X_common = sm.add_constant(candset.loc[common_indices, c])
            model = sm.OLS(orth_col, X_common).fit()
            temp_orth_scores.append(model.rsquared)

        if temp_orth_scores:
            orth_scores[c] = sum(temp_orth_scores) / len(temp_orth_scores)
        else:
            orth_scores[c] = 0
    return orth_scores

In [None]:
def proxy_finder(df_train, df_test, target, predictors, num_proxies=1, orth_weight=0.65, candidates=None, orthogonal_vars=None, neural_net="original", drop=True):
    """Rank columns of ``df_test`` as proxies for ``target``.

    The target is first predicted for ``df_test`` from ``predictors`` with a
    model trained on ``df_train``. Each candidate is then scored by the
    R-squared of regressing the predicted target on it. When
    ``orthogonal_vars`` is given, the score becomes
    ``(1 - orth_weight) * R_squared - orth_weight * orth_score``.

    Side effect: a ``'predicted_target'`` column is written onto ``df_test``.

    Args:
        df_train: DataFrame containing ``target`` and ``predictors``.
        df_test: DataFrame containing ``predictors`` and the candidates.
        target: Target column name in ``df_train``.
        predictors: Predictor column names present in both frames.
        num_proxies: Maximum number of proxies to return.
        orth_weight: Weight of the orthogonality penalty.
        candidates: Candidate column names; defaults to every numeric column
            of ``df_test``.
        orthogonal_vars: Optional columns the proxy should be unrelated to.
        neural_net: ``"torch"`` selects ``get_predictionsTorch``; any other
            value selects ``get_predictions``.
        drop: Currently unused.

    Returns:
        List of the best proxy column names, best first.
    """
    if candidates is None:
        # only numerical data (don't encode categories, make user do that).
        # 'predicted_target' is this function's own output column; it is left
        # on df_test by an earlier call and must not be treated as a candidate.
        candidates = [
            col for col in df_test.select_dtypes(include='number').columns
            if col != 'predicted_target'
        ]

    proxy_finder_validate(target, candidates, df_train, df_test, predictors, orthogonal_vars)

    # drop any rows that are missing data from target
    df_train = df_train.dropna(subset=[target])

    if neural_net == "torch":
        predicted_scores = get_predictionsTorch(df_train, df_test, predictors, target)
    else:
        predicted_scores = get_predictions(df_train, df_test, predictors, target)

    df_test['predicted_target'] = predicted_scores

    results = {}
    for c in candidates:
        candset = df_test[[c, 'predicted_target']].copy().dropna()
        if candset.empty:
            continue

        X_pred = sm.add_constant(candset[c])
        model_pred = sm.OLS(candset['predicted_target'], X_pred).fit()
        # index 1 is the candidate's slope; index 0 is the intercept
        results[c] = {
            'R_squared': model_pred.rsquared,
            'p_value': model_pred.pvalues.iloc[1],
            'coef': model_pred.params.iloc[1]
        }

    best_proxies = []
    limit = max(num_proxies, 0)

    if orthogonal_vars:
        orth_scores = orthogonalize(candidates, df_test, orthogonal_vars)
        proxy_scores = {}
        for c in candidates:
            # candidates with no usable rows have no regression result
            if c not in results or c not in orth_scores:
                continue
            proxy_scores[c] = (c, (1 - orth_weight) * results[c]['R_squared'] - orth_weight * orth_scores[c])

        sorted_results = sorted(proxy_scores.values(), key=lambda x: x[1], reverse=True)

        for i, (proxy, score) in enumerate(sorted_results[:limit]):
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with score: {score}")
    else:
        # highest R-squared first, ties broken by the smaller p-value
        sorted_results = sorted(results.items(), key=lambda x: (-x[1]['R_squared'], x[1]['p_value']))

        for i, (proxy, metrics) in enumerate(sorted_results[:limit]):
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")

    return best_proxies

In [None]:
# return a new df that is a copy of df, with: rescale all columns to be
#  between 0 and 1, inclusive. Drop any non-numeric columns. Drop any
# rows that are missing at least one predictor.
def data_rescale(df, df2, predictors, target):
    """Return min-max scaled, numeric-only copies of ``df`` and ``df2``.

    Both frames lose their all-NaN columns and the rows with a missing
    ``target``. ``df`` additionally loses rows with any missing predictor.
    Each frame is scaled independently, and remaining NaNs are replaced by
    the scaled column mean. The inputs themselves are not modified.

    Returns:
        Tuple ``(scaled_df, scaled_df2)``.
    """
    # Work on copies so the caller's frames stay untouched.
    train = df.copy().dropna(axis=1, how='all')
    test = df2.copy().dropna(axis=1, how='all')
    print("SHAPE AFTER DROPPING ALL NA", train.shape)

    # Numeric column sets are fixed before any rows are removed.
    train_numeric = train.select_dtypes(include=['number']).columns
    test_numeric = test.select_dtypes(include=['number']).columns

    train = train.dropna(subset=[target])
    test = test.dropna(subset=[target])
    print("SHAPE AFTER DROPPING TARGET", train.shape)

    # Only the first frame drops rows with a missing predictor.
    train = train.dropna(subset=predictors)
    print("SHAPE AFTER DROPPING PREDS", train.shape)

    # One scaler object, refitted for each frame.
    scaler = MinMaxScaler()
    train_values = scaler.fit_transform(train[train_numeric])
    test_values = scaler.fit_transform(test[test_numeric])

    # Rebuild DataFrames with the original column names and row labels.
    scaled_train = pd.DataFrame(train_values, columns=train_numeric, index=train.index)
    scaled_test = pd.DataFrame(test_values, columns=test_numeric, index=test.index)

    # Fill what is still missing with the (scaled) column mean.
    scaled_train = scaled_train.fillna(scaled_train.mean())
    scaled_test = scaled_test.fillna(scaled_test.mean())

    return scaled_train, scaled_test

In [None]:
def generate_synthetic_proxies(target_column, target_correlation, noise_level=0.1):
    """Build one synthetic proxy correlated with ``target_column``.

    The target is standardized and mixed with independent standard-normal
    noise as ``r * target + sqrt(1 - r**2) * z``. Gaussian noise with
    standard deviation ``noise_level`` is added, and the result is
    standardized again.

    Args:
        target_column: pandas Series holding the target values.
        target_correlation: Mixing coefficient ``r``.
        noise_level: Standard deviation of the extra Gaussian noise.

    Returns:
        Dict with a single entry, ``'proxy_<r to 2 decimals>'`` mapped to
        the proxy as a numpy array.
    """
    print("HOW MANY NAN", target_column.isna().sum())

    values = np.array(target_column)
    n_rows = len(values)

    # z-score the target
    z_target = (values - np.mean(values)) / np.std(values)
    print("TARGET LEN", len(z_target))

    # independent standard-normal component
    independent = np.random.standard_normal(n_rows)

    # blend target and independent component
    blended = target_correlation * z_target + np.sqrt(1 - target_correlation**2) * independent

    # extra controlled noise on top of the blend
    noisy = blended + np.random.normal(0, noise_level, n_rows)

    # z-score the final proxy
    standardized = (noisy - np.mean(noisy)) / np.std(noisy)

    return {f'proxy_{target_correlation:.2f}': standardized}

In [None]:
def prepare_dataset(df, target, target_correlation):
    """Return a copy of ``df`` with synthetic proxies added and ``target`` removed.

    The proxies come from ``generate_synthetic_proxies`` and are added as new
    columns under the names that function returns. ``df`` itself is not
    modified.

    Args:
        df: DataFrame containing the ``target`` column.
        target: Name of the target column.
        target_correlation: Correlation passed to the proxy generator.

    Returns:
        The new DataFrame, without the ``target`` column.
    """
    df = df.copy()

    # add synthetic proxies to test set
    synthetic_proxies = generate_synthetic_proxies(df[target], target_correlation)
    for name, proxy in synthetic_proxies.items():
        df[name] = proxy

    # drop target from test set
    # (a debug assert that required a 'proxy_0.95' column used to sit here and
    # made every other target_correlation fail)
    return df.drop(columns=[target])

# Stage 1: Testing Mean Penalty Approach with Several Target Correlations

In [None]:
def run_and_visualize_monte_carlo(df1, df2, weights, num_iterations, target, target_correlations, predictors, neural_net):
    """Count how often each column is the top proxy across Monte Carlo runs.

    For every target correlation a synthetic proxy is added to the rescaled
    test set. ``proxy_finder`` is then run ``num_iterations`` times for each
    orthogonality weight, and the first-ranked proxy of every run is tallied.
    The tallies are written to ``selection_tracker.csv``.

    Args:
        df1: Raw training DataFrame (contains ``target``).
        df2: Raw test DataFrame (must also contain ``target`` so the
            synthetic proxy can be built from it).
        weights: Orthogonality weights to test.
        num_iterations: Number of ``proxy_finder`` runs per weight.
        target: Target column name.
        target_correlations: Correlations for the synthetic proxies.
        predictors: Predictor column names; also used as the orthogonal
            variables.
        neural_net: Forwarded to ``proxy_finder``.
    """
    # Rescale once, outside the loop. Rescaling inside it overwrote df1/df2,
    # so a second correlation received a frame whose target was already gone.
    train_df, scaled_test_df = data_rescale(df1, df2, predictors, target)

    selection_trackers = []
    proxy_names = []

    # Run Monte Carlo for each target correlation
    for target_correlation in target_correlations:
        # fresh test set (with its own synthetic proxy) per correlation
        test_df = prepare_dataset(scaled_test_df, target, target_correlation)

        selection_tracker = {orth_weight: {} for orth_weight in weights}

        # Run iterations for each weight
        for orth_weight in weights:
            print(f"Testing with orthogonality weight: {orth_weight}")
            print(f"Testing with target correlation: {target_correlation}")
            counts = selection_tracker[orth_weight]

            for i in range(num_iterations):
                print(f"Running iteration {i+1}/{num_iterations}")
                # proxy_finder writes a 'predicted_target' column onto its
                # df_test argument, so give every run its own copy.
                top_proxies = proxy_finder(df_train=train_df,
                                           df_test=test_df.copy(),
                                           target=target,
                                           predictors=predictors,
                                           num_proxies=50,
                                           orth_weight=orth_weight,
                                           orthogonal_vars=predictors,
                                           neural_net=neural_net
                                           )

                # Update selection tracker for top pick
                if top_proxies:
                    top_pick = top_proxies[0]
                    counts[top_pick] = counts.get(top_pick, 0) + 1

        selection_trackers.append(selection_tracker)
        proxy_names.append(f'proxy_{target_correlation:.2f}')

    # SAVE TO CSV --------------------
    data = [
        {
            'Target Correlation': proxy_name,
            'Orthogonality Weight': orth_weight,
            'Proxy': proxy,
            'Count': count
        }
        for proxy_name, selection_tracker in zip(proxy_names, selection_trackers)
        for orth_weight, proxies in selection_tracker.items()
        for proxy, count in proxies.items()
    ]

    df_selection_tracker = pd.DataFrame(data)
    df_selection_tracker.to_csv('selection_tracker.csv', index=False)
    # SAVE TO CSV --------------------


    # # Visualization
    # plt.figure(figsize=(10, 6))

    # # Plot results for each target correlation
    # for index, tracker in enumerate(selection_trackers):
    #     results = []
    #     for orth_weight, proxies in tracker.items():
    #         for proxy, frequency in proxies.items():
    #             results.append({
    #                 'orth_weight': orth_weight,
    #                 'proxy': proxy,
    #                 'frequency': (frequency / num_iterations) * 100
    #             })

    #     results_df = pd.DataFrame(results)
    #     pivot_data = results_df.pivot(index='orth_weight', columns='proxy', values='frequency')
    #     pivot_data.fillna(0, inplace=True)
    #     print(pivot_data)

    #     # Plot each proxy as a separate line
    #     name = proxy_names[index]
    #     plt.plot(pivot_data.index, pivot_data[name], marker='o', label=name, linewidth=2)

    # # Create the line plot
    # plt.xlabel('Orthogonality Weight')
    # plt.ylabel('Selection Frequency')
    # plt.title('Selection Frequency vs Orthogonality Weight')
    # plt.grid(True, linestyle='--', alpha=0.7)
    # plt.legend()
    # plt.show()

In [None]:
# Experiment configuration -- change parameters as needed.
df1 = pd.read_stata("/content/yougov_recode_03052025.dta")
df2 = pd.read_stata("/content/anes_recode_03052025.dta")

# Full sweep of orthogonality weights, kept for reference:
#weights = [0, 0.55, 0.56, 0.57, 0.58, 0.59, 0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.70, 0.71, 0.72, 0.73, 0.74, 0.75]
weights = [0.65]
target_correlations = [0.95]
num_iterations = 1

target = 'educ'  # The target variable in the training set

# predictors in both training and test set
predictors = [
    # 'educ',
    'ideo7',
    'auth_grid_1',
    'auth_grid_3',
    'auth_grid_2',
    'faminc_new',
    'pff_jb',
    'pff_dt',
    'presvote20post',
    'housevote20post',
    'senvote20post',
    'abortion',
    'immigrant_citizenship',
    'immigrant_deport',
]

run_and_visualize_monte_carlo(
    df1=df1,
    df2=df2,
    weights=weights,
    num_iterations=num_iterations,
    target=target,
    target_correlations=target_correlations,
    predictors=predictors,
    neural_net="GAIN",
)

SHAPE AFTER DROPPING ALL NA (3400, 22)
SHAPE AFTER DROPPING TARGET (2442, 22)
SHAPE AFTER DROPPING PREDS (1312, 22)
HOW MANY NAN 0
TARGET LEN 8149
Testing with orthogonality weight: 0.65
Testing with target correlation: 0.95
Running iteration 1/1


Epoch 0: 100%|██████████| 11/11 [00:00<00:00, 83.65batch/s, mse_test=nan, mse_train=0.193]
Epoch 1: 100%|██████████| 11/11 [00:00<00:00, 97.15batch/s, mse_test=nan, mse_train=0.189]
Epoch 2: 100%|██████████| 11/11 [00:00<00:00, 73.47batch/s, mse_test=nan, mse_train=0.185]
Epoch 3: 100%|██████████| 11/11 [00:00<00:00, 81.83batch/s, mse_test=nan, mse_train=0.179]
Epoch 4: 100%|██████████| 11/11 [00:00<00:00, 100.46batch/s, mse_test=nan, mse_train=0.171]
Epoch 5: 100%|██████████| 11/11 [00:00<00:00, 73.40batch/s, mse_test=nan, mse_train=0.16]
Epoch 6: 100%|██████████| 11/11 [00:00<00:00, 70.18batch/s, mse_test=nan, mse_train=0.147]
Epoch 7: 100%|██████████| 11/11 [00:00<00:00, 53.68batch/s, mse_test=nan, mse_train=0.133]
Epoch 8: 100%|██████████| 11/11 [00:00<00:00, 52.09batch/s, mse_test=nan, mse_train=0.12]
Epoch 9: 100%|██████████| 11/11 [00:00<00:00, 62.16batch/s, mse_test=nan, mse_train=0.11]
Epoch 10: 100%|██████████| 11/11 [00:00<00:00, 56.15batch/s, mse_test=nan, mse_train=0.102]


Proxy 1 for educ: immigrant_citizenship with score: 0.03605068273702876
Proxy 2 for educ: immigrant_deport with score: 0.006120182030859775
Proxy 3 for educ: V201507x with score: 0.00484217718253338
Proxy 4 for educ: V201631s with score: 0.0024622882924979048
Proxy 5 for educ: V201630s with score: 0.002435874109782282
Proxy 6 for educ: V201571 with score: 0.0021872641664657795
Proxy 7 for educ: V201537b with score: 0.002105833771636178
Proxy 8 for educ: V201251 with score: 0.001910780385066018
Proxy 9 for educ: V201631i with score: 0.0018186518970115084
Proxy 10 for educ: V201587 with score: 0.001620512489430625
Proxy 11 for educ: V201631d with score: 0.0009533535724146668
Proxy 12 for educ: V201630n with score: 0.0008477247129152763
Proxy 13 for educ: V201629a with score: 0.0008377868822021628
Proxy 14 for educ: V201146 with score: 0.0008278180898785457
Proxy 15 for educ: V201631j with score: 0.0006985138919454791
Proxy 16 for educ: V201220 with score: 0.0006238468228488058
Proxy 17 f

In [None]:
# Reload the raw survey files so the step-by-step checks in the following
# cells start from unscaled data.
df1 = pd.read_stata("/content/yougov_recode_03052025.dta")
df2 = pd.read_stata("/content/anes_recode_03052025.dta")

target = 'educ'  # The target variable in the training set

# predictors in both training and test set
predictors = [
    # 'educ',
    'ideo7',
    'auth_grid_1',
    'auth_grid_3',
    'auth_grid_2',
    'faminc_new',
    'pff_jb',
    'pff_dt',
    'presvote20post',
    'housevote20post',
    'senvote20post',
    'abortion',
    'immigrant_citizenship',
    'immigrant_deport',
]

In [None]:
# Replace both raw frames with their min-max scaled, numeric-only versions.
df1, df2 = data_rescale(df1, df2, predictors, target)

SHAPE AFTER DROPPING ALL NA (3400, 22)
SHAPE AFTER DROPPING TARGET (2442, 22)
SHAPE AFTER DROPPING PREDS (1312, 22)


In [None]:
# Keep the scaled test-set target before prepare_dataset drops it from df2;
# it is the ground truth for the correlation checks below.
target_col = df2['educ']
print(target_col)

0       0.714286
1       0.285714
2       0.142857
3       0.428571
4       1.000000
          ...   
8275    0.428571
8276    0.714286
8277    0.142857
8278    0.571429
8279    0.857143
Name: educ, Length: 8149, dtype: float64


In [None]:
# Add the synthetic 'proxy_0.95' column to df2 and remove 'educ' from it.
df2 = prepare_dataset(df2, 'educ', 0.95)

HOW MANY NAN 0
TARGET LEN 8149


In [None]:
# Inspect the synthetic proxy and keep a handle on it for the checks below.
print(df2['proxy_0.95'])
highly = df2['proxy_0.95']

0       0.765113
1      -0.362602
2      -1.311245
3      -0.429265
4       1.237096
          ...   
8275   -0.075746
8276    0.966438
8277   -1.083665
8278    0.366594
8279    1.649712
Name: proxy_0.95, Length: 8149, dtype: float64


In [None]:
# Sanity check: correlation between the true target and the synthetic proxy
# (both Series carry df2's index, so rows are paired correctly).
print(target_col.corr(highly))

0.9457903814376333


In [None]:
# Predict the target for every row of the prepared test set.
# NOTE(review): the recorded output of this cell shows a Keras-style progress
# bar, so it presumably came from an earlier implementation of
# get_predictions -- re-run to confirm the numbers.
predictions = get_predictions(df1, df2, predictors, target)

print(predictions)
print(len(predictions))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[0.14606088 0.4272429  0.22965828 ... 0.32547882 0.23860644 0.3007361 ]
8149


In [None]:
# Inspect the scaled training target.
print(df1['educ'])

2       0.4
5       1.0
6       1.0
7       0.8
11      0.4
       ... 
3265    0.2
3266    0.8
3268    0.8
3270    0.6
3274    0.2
Name: educ, Length: 1312, dtype: float64


In [None]:
# Inspect the synthetic proxy again; note that its index labels are not contiguous.
print(df2['proxy_0.95'])

0       0.765113
1      -0.362602
2      -1.311245
3      -0.429265
4       1.237096
          ...   
8275   -0.075746
8276    0.966438
8277   -1.083665
8278    0.366594
8279    1.649712
Name: proxy_0.95, Length: 8149, dtype: float64


In [None]:
# Give the predictions the same index as the proxy. Series.corr aligns on
# index labels, and df2's index has gaps after the row drops in data_rescale,
# so a default RangeIndex would pair predictions with the wrong rows.
predictionsdf = pd.Series(predictions, index=highly.index)
print(predictionsdf.corr(highly))

0.008405879763471328


In [None]:
# Same column as `highly`, under a second name; repeats the target/proxy check.
proxy = df2['proxy_0.95']
print(target_col.corr(proxy))

0.9457903814376333


In [None]:
# Same correlation as the earlier predictions/proxy cell, operands swapped.
# NOTE(review): Series.corr aligns on index labels, so this is only
# meaningful if predictionsdf carries the same index as df2 -- verify.
print(proxy.corr(predictionsdf))

0.008405879763471326
