In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sys
from gain import GAIN
from usage_example import *
import utils
import models

In [3]:
def proxy_finder_validate(item, candidates, df1, df2, predictors, orthogonal_vars):
    """Check that every column name given to the proxy finder exists where it is needed.

    Parameters
    ----------
    item : label
        Target column; must be a column of ``df1``.
    candidates : iterable of labels
        Candidate proxy columns; each must be a column of ``df2``.
    df1, df2 : pandas.DataFrame
        Frames whose columns are checked.
    predictors : sequence of labels
        Must be truthy (non-empty); each entry must be a numeric column of
        both ``df1`` and ``df2``.
    orthogonal_vars : iterable of labels or None
        When not ``None``, each entry must be a column of ``df2``.

    Raises
    ------
    AssertionError
        If any of the checks above fails. Note that ``assert`` statements are
        skipped when Python runs with ``-O``.
    """
    # validate proxies and st item
    assert item in df1.columns, f'AssertionError: item {item} not in df1.columns'

    assert predictors, 'AssertionError: missing predictors. If you would prefer to not specify predictors, do not pass in a variable.'

    # The numeric column sets do not depend on the predictor, so compute them once.
    numeric_df1 = df1.select_dtypes(include=['number']).columns
    numeric_df2 = df2.select_dtypes(include=['number']).columns

    for c in predictors:
        assert c in df1.columns, f'AssertionError: predictor {c} not in df1.columns'
        assert c in df2.columns, f'AssertionError: predictor {c} not in df2.columns' # we need same variable in second dataset
        assert c in numeric_df1, f'predictor {c} is not a numeric column in df1'
        assert c in numeric_df2, f'predictor {c} is not a numeric column in df2'

    for c in candidates:
        assert c in df2.columns, f'AssertionError: candidate {c} not in df2.columns'

    # Identity test, not `!= None`: for a pandas Index / numpy array `!=` is
    # element-wise and the resulting array cannot be used as an `if` condition.
    if orthogonal_vars is not None:
        for c in orthogonal_vars:
            assert c in df2.columns, f'AssertionError: orthogonal variable {c} not in df2.columns'

In [4]:
def get_predictions(df1, df2, predictors, target, epochs=10, learning_rate=0.001, l2_lambda=0.001):
    """Train a GAIN model on ``df1`` and impute ``target`` for every row of ``df2``.

    Rows of ``df1`` that are complete in ``predictors`` and ``target`` are
    passed through ``add_missings``, min-max scaled and used to train the
    model. ``df2`` is then given an all-NaN ``target`` column and run through
    ``model.imputation``; the values filled in for that column are returned.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training frame; must contain ``predictors`` and ``target``.
    df2 : pandas.DataFrame
        Frame to predict for; must contain ``predictors``.
    predictors : list of labels
        Columns used as model inputs.
    target : label
        Column of ``df1`` to be predicted.
    epochs : int
        Passed to ``model.train`` as ``n_epoches``.
    learning_rate : float
        ``lr`` of both Adam optimizers.
    l2_lambda : float
        ``weight_decay`` of both Adam optimizers.

    Returns
    -------
    pandas.Series
        Imputed ``target`` values, one per row of ``df2``, carrying ``df2``'s
        index so that index-aligned assignment back onto ``df2`` is exact.

    Notes
    -----
    Neither input frame is modified.
    """
    # Private training frame: predictors plus target, complete rows only.
    train_frame = df1[predictors].copy()
    train_frame[target] = df1[target]
    train_frame = train_frame.dropna()

    # NOTE ----  adapted from usage_example.py ----- NOTE # https://github.com/evolext/GAIN/blob/main/usage_example.py

    data = train_frame.to_numpy()
    # add_missings comes from usage_example; presumably it blanks out a share
    # (miss_rate) of the entries with NaN -- confirm against that module.
    data_missing = add_missings(data, miss_rate=0.15)

    # Initialize the scaler
    # NOTE(review): the scaler is fit on all rows, before the train/test split.
    scaler = MinMaxScaler()
    data_missing_scaled = scaler.fit_transform(data_missing)

    # 80% train/test split (by row position, no shuffling)
    train_cutoff = int(data_missing_scaled.shape[0] * 0.80)
    X_train, X_test = data_missing_scaled[:train_cutoff], data_missing_scaled[train_cutoff:]
    X_actual = scaler.transform(data[train_cutoff:])

    X_train_tensor = torch.tensor(X_train).float()
    M_train_tensor = (~torch.isnan(X_train_tensor)).float()  # 1 = observed, 0 = missing
    X_train_tensor = torch.nan_to_num(X_train_tensor, nan=0.0)  # Replace NaNs with zeros
    train_dataset = TensorDataset(X_train_tensor, M_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

    # The held-out loader is built as in the upstream example but is not used
    # further in this function.
    X_test_tensor = torch.tensor(X_test).float()
    M_test_tensor = (~torch.isnan(X_test_tensor)).float()
    X_test_tensor = torch.nan_to_num(X_test_tensor, nan=0.0)
    X_actual_tensor = torch.tensor(X_actual).float()
    test_dataset = TensorDataset(X_test_tensor, M_test_tensor, X_actual_tensor)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

    model = GAIN(train_loader=train_loader)
    optimizer_G = torch.optim.Adam(model.G.parameters(), lr=learning_rate, weight_decay=l2_lambda)
    optimizer_D = torch.optim.Adam(model.D.parameters(), lr=learning_rate, weight_decay=l2_lambda)
    model.set_optimizer(optimizer=optimizer_G, generator=True)
    model.set_optimizer(optimizer=optimizer_D, generator=False)

    model.to('cpu')
    model.train(n_epoches=epochs, verbose=True)

    # NOTE ----  adapted from usage_example.py ----- NOTE # https://github.com/evolext/GAIN/blob/main/usage_example.py
    # NOTE --- implementation assisted by LLMs

    # TESTING AND PREDICTION
    # Explicit copy so that adding the target column never writes into (or
    # warns about) a slice of the caller's frame.
    score_frame = df2[predictors].copy()

    # add target column with NaNs
    score_frame[target] = np.nan

    # Reorder columns to match training data
    score_frame = score_frame[train_frame.columns]

    # Scale using the same scaler as training data
    data_scaled = scaler.transform(score_frame.to_numpy())

    X = torch.tensor(data_scaled).float()
    mask = (~torch.isnan(X)).float()
    X = torch.nan_to_num(X, nan=0.0)  # Replace NaNs with zeros

    # get imputations
    imputed_data_scaled = model.imputation(X, mask)
    # detach() is a no-op for tensors without grad and makes .numpy() safe otherwise.
    imputed_data = scaler.inverse_transform(imputed_data_scaled.detach().cpu().numpy())
    # Keep df2's index: a default RangeIndex would misalign the predictions
    # when the caller assigns them back to a frame indexed differently.
    imputed_df = pd.DataFrame(imputed_data, columns=score_frame.columns, index=score_frame.index)

    return imputed_df[target]

In [50]:
# Orthogonalization step.
# Assumes df2 is already preprocessed and already has the 'predicted_target' column appended.
def orthogonalize(candidates, df2, orthogonal_vars):
    """Score how strongly each candidate is related to the orthogonal variables.

    For every candidate, each orthogonal variable is regressed (OLS with an
    intercept added by ``sm.add_constant``) on the candidate, using the rows
    where the candidate, ``'predicted_target'`` and the orthogonal variable
    are all non-missing. The candidate's score is the mean R-squared over the
    orthogonal variables that could be fitted.

    Parameters
    ----------
    candidates : iterable of labels
        Columns of ``df2`` to score.
    df2 : pandas.DataFrame
        Must contain the candidates, the orthogonal variables and a
        ``'predicted_target'`` column.
    orthogonal_vars : iterable of labels
        Columns of ``df2`` the proxies should be unrelated to.

    Returns
    -------
    dict
        ``{candidate: mean R-squared}``; ``0`` when no orthogonal variable
        could be fitted for that candidate.
    """
    from tqdm import tqdm  # local import: tqdm is not imported at the top of the notebook

    # The non-missing values of an orthogonal variable do not depend on the
    # candidate, so compute them once instead of once per candidate.
    orth_observed = {orth_var: df2[orth_var].dropna() for orth_var in orthogonal_vars}

    orth_scores = {}
    for c in tqdm(candidates):
        candset = df2[[c, 'predicted_target']].dropna() # assumes candidate has mostly non-NaN entries

        temp_orth_scores = []
        for orth_var in orthogonal_vars:
            observed = orth_observed[orth_var]
            common_indices = candset.index.intersection(observed.index)
            if common_indices.empty:
                continue
            orth_col = observed.loc[common_indices]
            if np.var(orth_col) == 0:
                print("ortho:", orth_var, "candidate", c)
                continue # zero variance leads to divide by zero error
            candcol_common = candset.loc[common_indices, c]

            X_common = sm.add_constant(candcol_common)
            model = sm.OLS(orth_col, X_common).fit()
            temp_orth_scores.append(model.rsquared)

        if temp_orth_scores:
            orth_scores[c] = sum(temp_orth_scores) / len(temp_orth_scores)
        else:
            orth_scores[c] = 0
    return orth_scores

In [52]:
def proxy_finder(df1, df2, target, predictors, num_proxies=1, orth_weight=0.65, candidates=None, orthogonal_vars=None, neural_net="original", drop=True, epochs=10, learning_rate=0.001, l2_lambda=0.001):
    """Find the columns of ``df2`` that best stand in for ``target`` (known only in ``df1``).

    ``target`` is first predicted for every row of ``df2`` via
    ``get_predictions``. Each candidate is then scored by the R-squared of an
    OLS regression of the predicted target on the candidate. When
    ``orthogonal_vars`` is given, the score becomes
    ``(1 - orth_weight) * R_squared - orth_weight * orth_score`` with
    ``orth_score`` taken from ``orthogonalize``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training frame containing ``target`` and ``predictors``.
    df2 : pandas.DataFrame
        Frame containing ``predictors`` and the candidates. It is not modified.
    target : label
        Column of ``df1`` to find proxies for.
    predictors : list of labels
        Numeric columns present in both frames.
    num_proxies : int
        Maximum number of proxies to return.
    orth_weight : float
        Weight of the orthogonality penalty.
    candidates : list of labels or None
        Columns of ``df2`` to consider; defaults to all numeric columns.
    orthogonal_vars : list of labels or None
        Columns of ``df2`` the proxies should be unrelated to.
    neural_net, drop :
        Currently unused; accepted for backward compatibility.
    epochs, learning_rate, l2_lambda :
        Training settings forwarded to ``get_predictions``.

    Returns
    -------
    list
        Up to ``num_proxies`` candidate names, best first. One line per
        selected proxy is also printed.
    """
    if candidates is None:
        candidates = list(df2.select_dtypes(include='number').columns) #only numerical data (don't encode categories, make user do that)

    proxy_finder_validate(target, candidates, df1, df2, predictors, orthogonal_vars)

    pred = get_predictions(df1, df2, predictors, target, epochs=epochs, learning_rate=learning_rate, l2_lambda=l2_lambda)

    # Work on a private numeric copy so the caller's frame does not silently
    # gain a 'predicted_target' column (which would leak into later runs).
    numeric_cols = df2.select_dtypes(include=['number']).columns
    scored = df2[numeric_cols].copy()
    # Positional assignment: predictions are in df2's row order, so this is
    # correct whatever index `pred` and `df2` carry.
    scored['predicted_target'] = np.asarray(pred)

    results = {}
    for c in candidates:
        if c == 'predicted_target':
            continue
        candset = scored[[c, 'predicted_target']].dropna()
        if candset.empty:
            continue

        X_pred = sm.add_constant(candset[c])
        model_pred = sm.OLS(candset['predicted_target'], X_pred).fit()
        # A slope exists only when add_constant actually added an intercept
        # column; use .iloc for explicit positional access.
        has_slope = len(model_pred.params) > 1
        results[c] = {
            'R_squared': model_pred.rsquared,
            'p_value': model_pred.pvalues.iloc[1] if has_slope else None,
            'coef': model_pred.params.iloc[1] if has_slope else None
        }

    best_proxies = []

    # Explicit None/length test so array-like inputs are not truth-tested.
    if orthogonal_vars is not None and len(orthogonal_vars) > 0:
        # Only candidates that received a regression result can be ranked, so
        # only those need an orthogonality score.
        scorable = [c for c in candidates if c in results]
        orth_scores = orthogonalize(scorable, scored, orthogonal_vars)
        proxy_scores = {
            c: (c, (1 - orth_weight) * results[c]['R_squared'] - orth_weight * orth_scores[c])
            for c in scorable
            if c in orth_scores
        }

        sorted_results = sorted(proxy_scores.values(), key=lambda x: x[1], reverse=True)

        for i in range(min(num_proxies, len(sorted_results))):
            proxy, score = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with score: {score}")
    else:
        def _rank_key(entry):
            # Highest R-squared first, ties broken by smallest p-value; a
            # missing p-value sorts last instead of raising on comparison.
            metrics = entry[1]
            p_value = metrics['p_value']
            return (-metrics['R_squared'], float('inf') if p_value is None else p_value)

        sorted_results = sorted(results.items(), key=_rank_key)

        for i in range(min(num_proxies, len(sorted_results))):
            proxy, metrics = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")

    return best_proxies

In [17]:
import warnings

# Silence two known warnings so they do not flood the cell output.
warnings.filterwarnings("ignore", message="All-NaN slice encountered")
warnings.filterwarnings("ignore", category=FutureWarning, message="Series.__getitem__ treating keys as positions is deprecated")


# Suppress numpy invalid operation warnings
np.seterr(invalid='ignore')

# Keep the file locations under their own names so that df1/df2 only ever
# refer to DataFrames.
# NOTE(review): absolute "/content/..." paths -- adjust when running elsewhere.
YOUGOV_PATH = "/content/yougov_recoded.dta"
ANES_PATH = "/content/anes_recoded.dta"
df1 = pd.read_stata(YOUGOV_PATH)
df2 = pd.read_stata(ANES_PATH)

[0.0833333358168602, 0.0, 0.1666666716337204, 0.4583333432674408, 0.7916666865348816, 0.125, 0.0, 0.0, 0.0, 0.5416666865348816, 0.9166666865348816, 0.8333333134651184, 0.8333333134651184, 0.8333333134651184, 0.7083333134651184, 0.625, 0.5416666865348816, 1.0, 0.8333333134651184, 0.9166666865348816, 0.6666666865348816, 0.6666666865348816, 0.4166666567325592, 0.7916666865348816, 1.0, 0.3333333432674408, 1.0, 0.8333333134651184, 0.5416666865348816, 0.5, 0.3333333432674408, 0.0833333358168602, 0.6666666865348816, 0.7083333134651184, 0.5833333134651184, 0.875, 0.5833333134651184, 0.0833333358168602, 0.0, 0.9166666865348816, 0.5416666865348816, 0.5, 0.625, 0.6666666865348816, 0.2083333283662796, 0.0, 0.4583333432674408, 1.0, 0.5833333134651184, 0.0, 0.7083333134651184, 0.5416666865348816, 0.5, 0.625, 0.0, 0.3333333432674408, 0.7083333134651184, 0.4583333432674408, 0.5833333134651184, 0.2083333283662796, 1.0, 0.8333333134651184, 0.2083333283662796, 0.8333333134651184, 0.9166666865348816, 1.0,

In [51]:


# Target variable in the training set (df1).
target = 'christian_nationalism'

# Predictors that exist in both the training and the test set.
predictors = [
    'presvote20post',
    'housevote20post',
    'senvote20post',
    'pff_jb',
    'pff_dt',
    'pid7',
    'election_fairnness',
    'educ',
    'white',
    'hispanic',
    'partisan_violence',
    'immigrant_citizenship',
    'immigrant_deport',
    'auth_grid_1',
    'auth_grid_3',
    'auth_grid_2',
    'faminc_new',
]

# Variables the proxies should be orthogonal to: every predictor, in the same
# order, except 'white'.
orthogonal_vars = [name for name in predictors if name != 'white']

best_proxies = proxy_finder(
    df1,
    df2,
    target,
    predictors,
    orth_weight=0.65,
    orthogonal_vars=orthogonal_vars,
    num_proxies=5,
    neural_net="torch",
    drop=False,
)
#print(best_proxies)
# Notes from trying different orth_weight values:
#   0.9  -> version, how many interviews, etc.
#   0.85 -> same thing
#   0.8
#   0.75 -> bad
#   0.7  -> bad

Epoch 0: 100%|██████████| 43/43 [00:00<00:00, 87.11batch/s, mse_test=0.249, mse_train=0.127]
Epoch 1: 100%|██████████| 43/43 [00:00<00:00, 88.49batch/s, mse_test=0.287, mse_train=0.107]
Epoch 2: 100%|██████████| 43/43 [00:00<00:00, 67.65batch/s, mse_test=0.313, mse_train=0.085]
Epoch 3: 100%|██████████| 43/43 [00:00<00:00, 64.50batch/s, mse_test=0.331, mse_train=0.0669]
Epoch 4: 100%|██████████| 43/43 [00:00<00:00, 61.31batch/s, mse_test=0.343, mse_train=0.0606]
Epoch 5: 100%|██████████| 43/43 [00:00<00:00, 88.36batch/s, mse_test=0.345, mse_train=0.0578]
Epoch 6: 100%|██████████| 43/43 [00:00<00:00, 87.23batch/s, mse_test=0.342, mse_train=0.0539]
Epoch 7: 100%|██████████| 43/43 [00:00<00:00, 69.42batch/s, mse_test=0.338, mse_train=0.0487]
Epoch 8: 100%|██████████| 43/43 [00:00<00:00, 68.33batch/s, mse_test=0.335, mse_train=0.0445]
Epoch 9: 100%|██████████| 43/43 [00:00<00:00, 66.30batch/s, mse_test=0.336, mse_train=0.0418]


Here


100%|██████████| 1752/1752 [04:07<00:00,  7.08it/s]

here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here



