In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sys
from gain import GAIN
from usage_example import *
import utils
import models

In [27]:
def proxy_finder_validate(item, candidates, df1, df2, predictors, orthogonal_vars):

    # validate proxies and st item
    assert item in df1.columns, f'AssertionError: item {item} not in df1.columns'

    assert predictors, f'AssertionError: missing predictors. If you would prefer to not specify predictors, do not pass in a variable.'

    for c in predictors:
        assert c in df1.columns, f'AssertionError: predictor {c} not in df1.columns'
        assert c in df2.columns, f'AssertionError: predictor {c} not in df2.columns' # we need same variable in second dataset
        assert c in df1.select_dtypes(include=['number']).columns, f'predictor {c} is not a numeric column in df1'
        assert c in df2.select_dtypes(include=['number']).columns, f'predictor {c} is not a numeric column in df2'

    for c in candidates:
        assert c in df2.columns, f'AssertionError: candidate {c} not in df2.columns'

    if (orthogonal_vars != None):
        for c in orthogonal_vars:
            assert c in df2.columns, f'AssertionError: orthogonal variable {c} not in df2.columns'


In [28]:
# return a new df that is a copy of df, with: rescale all columns to be
#  between 0 and 1, inclusive. Drop any non-numeric columns. Drop any
# rows that are missing at least one predictor.
def data_rescale(df, predictors, target, drop=True):
    df = df.copy() # preserve immutability

    # Select only the numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns

    if drop:
      # drop any rows that are missing at least one predictor
      df = df.dropna(subset=predictors)

    # print('the dataframe we\'re rescaling is size: ') # debug
    # Initialize the scaler
    scaler = MinMaxScaler()

    # Fit the scaler to the data and transform it
    scaled_values = scaler.fit_transform(df[numeric_cols])

    # Create a new DataFrame with the scaled values, maintaining the original column names
    scaled_df = pd.DataFrame(scaled_values, columns=numeric_cols, index=df.index)

    return scaled_df

In [44]:
def get_predictions(df_train, df_test, predictors, target, epochs=50, learning_rate=0.001, l2_lambda=0.001):
  # CODE IMPLEMENTATION ASSISTED BY GENERATIVE AI

  # Set parameters
  SEED = 13
  DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
  TRAIN_SIZE = 1.0  # Using all of df1 for training
  np.random.seed(SEED)

  df1 = df_train.copy()
  df2 = df_test.copy()

  # drop everything but predictors and target from df1
  target_col_df1 = df1[target]
  df1 = df1[predictors]
  df1[target] = target_col_df1

  # drop everything but predictors from df2
  df2 = df2[predictors]
  # add missing target
  df2[target] = np.nan

  combined_df = pd.concat([df1, df2])

  # Step 3: Normalize the data
  scaler = MinMaxScaler()
  combined_data_std = scaler.fit_transform(combined_df)

  # Split back into df1 (training) and df2 (prediction)
  df1_std = combined_data_std[:len(df1)]
  df2_std = combined_data_std[len(df1):]

  # Create tensors and masks
  X_train_tensor = torch.tensor(df1_std).float()
  M_train_tensor = get_mask(X_train_tensor)  # This creates mask with 0s for observed values, 1s for missing values
  train_dataset = TensorDataset(X_train_tensor, M_train_tensor)
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=False)

  X_test_tensor = torch.tensor(df2_std).float()
  M_test_tensor = get_mask(X_test_tensor)  # This will mark all values in the target column as missing
  test_dataset = TensorDataset(X_test_tensor, M_test_tensor)
  test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False)

  # Step 4: Initialize and train the GAIN model
  stopper = EarlyStopper(patience=2, min_delta=0.001)
  model = GAIN(train_loader=train_loader, seed=SEED)

  optimizer_G = torch.optim.Adam(model.G.parameters())
  optimizer_D = torch.optim.Adam(model.D.parameters())
  model.set_optimizer(optimizer=optimizer_G, generator=True)
  model.set_optimizer(optimizer=optimizer_D, generator=False)

  model.to(DEVICE)
  model.train(n_epoches=100, verbose=True, stopper=stopper)

  # Step 5: Use the trained model to predict (impute) target values for df2
  predictions = []

  for x_test_batch, m_batch in test_loader:
      x_batch_imputed = model.imputation(x=x_test_batch, m=m_batch)
      x_batch_imputed = x_batch_imputed.cpu().numpy()
      predictions.append(x_batch_imputed)

  # Combine predictions and inverse transform
  predictions_combined = np.vstack(predictions)
  predictions_original_scale = scaler.inverse_transform(predictions_combined)

  # Extract the target column predictions
  target_column_index = df1.columns.get_loc(target)
  df2_predictions = predictions_original_scale[:, target_column_index]

  return df2_predictions

In [30]:
 # orthogonalization method
# all data is preprocessed and df test has been appended target preds
def orthogonalize(candidates, df_test, orthogonal_vars):
        orth_scores = {}
        for c in candidates:
            candset = df_test[[c, 'predicted_target']].copy().dropna() # assumes candidate has mostly non-NaN entries
            candcol = candset[c]

            X = sm.add_constant(candcol)
            temp_orth_scores = []
            for orth_var in orthogonal_vars:
                orthset = df_test[[orth_var]].copy().dropna()
                common_indices = candset.index.intersection(orthset.index)
                if common_indices.empty:
                    continue
                orth_col = orthset.loc[common_indices, orth_var]
                if np.var(orth_col) == 0:
                    print("ortho:", orth_var, "candidate", c)
                    continue # zero variance leads to divide by zero error
                candcol_common = candset.loc[common_indices, c]

                X_common = sm.add_constant(candcol_common)
                model = sm.OLS(orth_col, X_common).fit()
                temp_orth_scores.append(model.rsquared)

            if temp_orth_scores:
                orth_scores[c] = sum(temp_orth_scores) / len(temp_orth_scores)
            else:
                orth_scores[c] = 0
        return orth_scores

In [31]:
def proxy_finder(df_train, df_test, target, predictors, num_proxies=1, orth_weight=0.65, candidates=None, orthogonal_vars=None, neural_net="original", drop=True):
    if candidates is None:
        candidates = list(df_test.select_dtypes(include='number').columns) #only numerical data (don't encode categories, make user do that)


    proxy_finder_validate(target, candidates, df_train, df_test, predictors, orthogonal_vars)

    #print(f"Predictors: {predictors}") #DEBUGDEBUGDEBUG------------------------------------------------------------
    #print(f"Candidates: {candidates}")

    df_train = data_rescale(df_train, predictors, target, drop)
    df_test = data_rescale(df_test, predictors, target, drop)
    # drop any rows that are missing data from target
    df_train = df_train.dropna(subset=target)

    if neural_net == "torch":
      predicted_scores = get_predictionsTorch(df_train, df_test, predictors, target)
    elif neural_net == "tiered":
      predicted_scores = get_predictionsTiered(df_train, df_test, predictors, target)
    else:
      predicted_scores = get_predictions(df_train, df_test, predictors, target)


    df_test['predicted_target'] = predicted_scores
    #print(f"Predicted scores: {predicted_scores[:10]}")  #DEBUG DEBUG------------------------------------------------------------

    results = {}

    for c in candidates:
        candset = df_test[[c, 'predicted_target']].copy().dropna()
        if candset.empty:
            continue

        pred_scores = candset['predicted_target']
        candcol = candset[c]

        X_pred = sm.add_constant(candcol)
        model_pred = sm.OLS(pred_scores, X_pred).fit()
        results[c] = {
            'R_squared': model_pred.rsquared,
            'p_value': model_pred.pvalues[1],
            'coef': model_pred.params[1]
        }
        #print(f"candidate {c}: Results: {results}")  # Debug statement------------------------------------------------------------

    best_proxies = []

    if orthogonal_vars:
        orth_scores = orthogonalize(candidates, df_test, orthogonal_vars)
        proxy_scores = {}
        for c in candidates:
            try:
                proxy_scores[c] = (c, (1 - orth_weight) * results[c]['R_squared'] - orth_weight * orth_scores[c])
            except KeyError as e:
                continue

        sorted_results = sorted(proxy_scores.values(), key=lambda x: x[1], reverse=True)

        for i in range(min(num_proxies, len(sorted_results))):
            proxy, score = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with score: {score}")
    else:
        sorted_results = sorted(results.items(), key=lambda x: (-x[1]['R_squared'], x[1]['p_value']))

        for i in range(min(num_proxies, len(sorted_results))):
            proxy, metrics = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")

    return best_proxies

In [51]:
import warnings
warnings.filterwarnings("ignore", message="All-NaN slice encountered")
warnings.filterwarnings("ignore", category=FutureWarning, message="Series.__getitem__ treating keys as positions is deprecated")


# Suppress numpy invalid operation warnings
np.seterr(invalid='ignore')

datafile_train =  "/content/yougov_recoded.dta"
datafile_test =  "/content/yougov_recoded.dta"
df_train = pd.read_stata(datafile_train)
df_test = pd.read_stata(datafile_test)

In [52]:

target = 'christian_nationalism'  # The target variable in the training set
predictors = [ # predictors in both training and test set
                   'presvote20post',
                   'housevote20post',
                   'senvote20post',
                   'pff_jb',
                   'pff_dt',
                   'pid7',
                   'election_fairnness',
                   'educ',
                   'hispanic',
                   'partisan_violence',
                   'immigrant_citizenship',
                   'immigrant_deport',
                   'auth_grid_1',
                   'auth_grid_3',
                   'auth_grid_2',
                   'faminc_new'
                   ]

orthogonal_vars = [
                  'presvote20post',
                   'housevote20post',
                   'senvote20post',
                   'pff_jb',
                   'pff_dt',
                   'pid7',
                   'election_fairnness',
                   'educ',
                   'hispanic',
                   'partisan_violence',
                   'immigrant_citizenship',
                   'immigrant_deport',
                   'auth_grid_1',
                   'auth_grid_3',
                   'auth_grid_2',
                   'faminc_new']




best_proxies = proxy_finder(df_train, df_test, target, predictors, orth_weight=0.60, orthogonal_vars=orthogonal_vars, num_proxies=8, neural_net="original", drop=False)
#print(best_proxies)
### orth weight 0.9 --> version, how many interviews, etx
### 0.85 same thing
### 0.8
### 0.75 bad
### 0.7 bad

Epoch 0: 100%|██████████| 27/27 [00:00<00:00, 72.83batch/s, mse_test=nan, mse_train=0.12]
Epoch 1: 100%|██████████| 27/27 [00:00<00:00, 75.68batch/s, mse_test=nan, mse_train=0.107]
Epoch 2: 100%|██████████| 27/27 [00:00<00:00, 84.06batch/s, mse_test=nan, mse_train=0.0951]
Epoch 3: 100%|██████████| 27/27 [00:00<00:00, 102.74batch/s, mse_test=nan, mse_train=0.0802]
Epoch 4: 100%|██████████| 27/27 [00:00<00:00, 87.67batch/s, mse_test=nan, mse_train=0.0639]
Epoch 5: 100%|██████████| 27/27 [00:00<00:00, 80.27batch/s, mse_test=nan, mse_train=0.0554]
Epoch 6: 100%|██████████| 27/27 [00:00<00:00, 88.63batch/s, mse_test=nan, mse_train=0.0533]
Epoch 7: 100%|██████████| 27/27 [00:00<00:00, 91.50batch/s, mse_test=nan, mse_train=0.0524]
Epoch 8: 100%|██████████| 27/27 [00:00<00:00, 87.58batch/s, mse_test=nan, mse_train=0.0517]
Epoch 9: 100%|██████████| 27/27 [00:00<00:00, 101.83batch/s, mse_test=nan, mse_train=0.0509]
Epoch 10: 100%|██████████| 27/27 [00:00<00:00, 97.62batch/s, mse_test=nan, mse_tr

Proxy 1 for christian_nationalism: christian_nationalism with score: 0.10095037445472552
Proxy 2 for christian_nationalism: immigrant_deport with score: 0.08747972244070154
Proxy 3 for christian_nationalism: immigrant_citizenship with score: 0.08663656722326765
Proxy 4 for christian_nationalism: auth_grid_3 with score: 0.06254779846685989
Proxy 5 for christian_nationalism: ideo7 with score: 0.05851350719253787
Proxy 6 for christian_nationalism: pff_dt with score: 0.057801202628490894
Proxy 7 for christian_nationalism: auth_grid_1 with score: 0.03181674005661009
Proxy 8 for christian_nationalism: presvote20post with score: 0.025884471213003912
