# version of proxyFinder algorithm using neural network to make predictions

In [61]:
import numpy as np
import pandas as pd
import sys
import statsmodels.api as sm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [37]:
import os 
print("PYTHONPATH:", os.environ.get('PYTHONPATH'))
print("PATH:", os.environ.get('PATH'))

PYTHONPATH: None
PATH: c:\Users\kirin\anaconda3\envs\torchenv;C:\Users\kirin\anaconda3\envs\torchenv;C:\Users\kirin\anaconda3\envs\torchenv\Library\mingw-w64\bin;C:\Users\kirin\anaconda3\envs\torchenv\Library\usr\bin;C:\Users\kirin\anaconda3\envs\torchenv\Library\bin;C:\Users\kirin\anaconda3\envs\torchenv\Scripts;C:\Users\kirin\anaconda3\envs\torchenv\bin;C:\Users\kirin\anaconda3\condabin;C:\Program Files\Git\usr\local\bin;C:\Program Files\Git\bin;C:\Program Files\Eclipse Adoptium\jdk-11.0.15.10-hotspot\bin;C:\Program Files (x86)\Common Files\Oracle\Java\javapath;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0;C:\Windows\System32\OpenSSH;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\WINDOWS\system32;C:\WINDOWS;C:\WINDOWS\System32\Wbem;C:\WINDOWS\System32\WindowsPowerShell\v1.0;C:\WINDOWS\System32\OpenSSH;C:\Program Files\Git\cmd;C:\Users\kirin\AppData\Local\Microsoft\Window

In [38]:
def proxy_finder_validate(item, candidates, df1, df2, predictors, orthogonal_vars):

    # validate proxies and st item
    assert item in df1.columns, f'AssertionError: item {item} not in df1.columns'

    assert predictors, f'AssertionError: missing predictors. If you would prefer to not specify predictors, do not pass in a variable.'
    
    for c in predictors:
        assert c in df1.columns, f'AssertionError: predictor {c} not in df1.columns'
        assert c in df2.columns, f'AssertionError: predictor {c} not in df2.columns' # we need same variable in second dataset  
        assert c in df1.select_dtypes(include=['number']).columns, f'predictor {c} is not a numeric column in df1'   
        assert c in df2.select_dtypes(include=['number']).columns, f'predictor {c} is not a numeric column in df2'    
    
    for c in candidates:
        assert c in df2.columns, f'AssertionError: candidate {c} not in df2.columns'
        
    if (orthogonal_vars != None):
        for c in orthogonal_vars:
            assert c in df2.columns, f'AssertionError: orthogonal variable {c} not in df2.columns'
                

In [39]:
# return a new df that is a copy of df, with: rescale all columns to be
#  between 0 and 1, inclusive. Drop any non-numeric columns. Drop any 
# rows that are missing at least one predictor. 
def data_rescale(df, predictors):
    df = df.copy() # preserve immutability

    # Select only the numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns

    # drop any rows that are missing at least one predictor
    df = df.dropna(subset=predictors)
    
    # Initialize the scaler
    scaler = MinMaxScaler()

    # Fit the scaler to the data and transform it
    scaled_values = scaler.fit_transform(df[numeric_cols])

    # Create a new DataFrame with the scaled values, maintaining the original column names
    scaled_df = pd.DataFrame(scaled_values, columns=numeric_cols, index=df.index)
    
    return scaled_df

In [65]:
# Neural network definition
def build_nn_model(input_dim, learning_rate=0.001, l2_lambda=0.001):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,), kernel_regularizer=l2(l2_lambda)),
        BatchNormalization(),
        Dropout(0.5),  
        Dense(32, activation='relu', kernel_regularizer=l2(l2_lambda)),
        BatchNormalization(),
        Dropout(0.5),  
        Dense(1, kernel_regularizer=l2(l2_lambda))
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')
    return model

In [66]:
# return a trained neural network to predict df[item] using df[predictors_df1]
# report error and crash if predictors don't predict item
def train_nn_model(X_train, y_train, input_dim, epochs=100, learning_rate=0.001, l2_lambda=0.001):
    model = build_nn_model(input_dim, learning_rate, l2_lambda)
    model.fit(X_train, y_train, epochs=epochs, validation_split=0.2, verbose=0)
    return model

In [21]:
 #DEBUG#### test nn train and build -- appear to be working
# Function to create synthetic data for testing
def create_synthetic_data(num_samples=1000, num_features=10):
    np.random.seed(42)
    X = np.random.rand(num_samples, num_features)
    y = np.random.rand(num_samples)
    return X, y

# Create synthetic data
X, y = create_synthetic_data()

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the neural network
input_dim = X_train.shape[1]
model = train_nn_model(X_train, y_train, input_dim, epochs=10)

# Print model summary
print(model.summary())

# Evaluate the model on the test set
loss = model.evaluate(X_test, y_test)
print(f"Test loss: {loss}")

# Predict on the test set
predictions = model.predict(X_test)
print(f"Predictions: {predictions[:10]}")

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 64)                704       
                                                                 
 dropout_10 (Dropout)        (None, 64)                0         
                                                                 
 dense_16 (Dense)            (None, 32)                2080      
                                                                 
 dropout_11 (Dropout)        (None, 32)                0         
                                                                 
 dense_17 (Dense)            (None, 1)                 33        
                                                                 
Total params: 2,817
Trainable params: 2,817
Non-trainable params: 0
_________________________________________________________________
None
Test loss: 0.10631357878446579
Predictions: [[0

In [67]:
# Function to get predictions from the neural network
def get_nn_predictions(df_train, df_test, predictors, target, epochs=100, learning_rate=0.001, l2_lambda=0.001):
    # Remove rows with NaN values in predictor columns already done in rescale
  #  df_train = df_train.dropna(subset=predictors)
   # df_test = df_test.dropna(subset=predictors)
    
    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(df_train[predictors].to_numpy(), df_train[target].to_numpy(), test_size=0.2, random_state=42)
    X_test = df_test[predictors].to_numpy()


    model = train_nn_model(X_train_train, y_train_train, len(predictors), epochs, learning_rate, l2_lambda)
    predictions = model.predict(X_test)

    # exit if correlation between predictions and item is bad
    mse = mean_squared_error(model.predict(X_train_test), y_train_test)
    print(f"Debug statement: MSE = {mse}") ####DEBUG
    if (mse > 0.03):
        print('Input Error: Predictors cannot predict item in df1', file=sys.stderr)
        print('Aborting program')
        sys.exit(-1)

    print(f"Predictions before flattening: {predictions[:10]}") #DEBUG
    print('predictions after flattening: ', predictions.flatten()[:10])#DEBUG

    return predictions.flatten()

In [27]:
# Create synthetic data#DEBUG
X, y = create_synthetic_data(num_samples=1000, num_features=10)

# Convert synthetic data to DataFrame
columns = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=columns)
df['target'] = y

# Normalize the data
df = data_rescale(df)

# Split the data into training and testing sets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Define predictors and target
predictors = [f'feature_{i}' for i in range(X.shape[1])]
target = 'target'

# Test get_nn_predictions function
predicted_scores = get_nn_predictions(df_train, df_test, predictors, target, epochs=10)

# Print results
print(f"Predicted scores: {predicted_scores[:10]}")


Predictions before flattening: [[0.3332389 ]
 [0.34358305]
 [0.4043013 ]
 [0.36761525]
 [0.41573027]
 [0.42988393]
 [0.3725268 ]
 [0.42518765]
 [0.33807346]
 [0.33562624]]
predictions after flattening:  [0.3332389  0.34358305 0.4043013  0.36761525 0.41573027 0.42988393
 0.3725268  0.42518765 0.33807346 0.33562624]
Predicted scores: [0.3332389  0.34358305 0.4043013  0.36761525 0.41573027 0.42988393
 0.3725268  0.42518765 0.33807346 0.33562624]


In [78]:
#final 3 parameters for debugging/fine tuning
def proxy_finder(df_train, df_test, target, predictors, num_proxies=1, orth_weight=0.65, candidates=None, orthogonal_vars=None, epochs=100, learning_rate=0.001, l2_lambda=0.001):
    if candidates is None:
        candidates = list(df_test.select_dtypes(include='number').columns)
    
    print(f"Predictors: {predictors}") #DEBUGDEBUGDEBUG------------------------------------------------------------
    print(f"Candidates: {candidates}")

    # Predict status threat scores in df_test
    df_train = data_rescale(df_train, predictors)
    df_test = data_rescale(df_test, predictors)
    print(df_train.head)
    print(df_test.head)
    predicted_scores = get_nn_predictions(df_train, df_test, predictors, target, epochs, learning_rate, l2_lambda)
    
    df_test['predicted_status_threat'] = predicted_scores
    print(f"Predicted scores: {predicted_scores[:10]}")  #DEBUG DEBUG------------------------------------------------------------ 

    results = {}
    
    for c in candidates:
        candset = df_test[[c, 'predicted_status_threat']].copy().dropna()
        if candset.empty:
            continue
        
        pred_scores = candset['predicted_status_threat']
        candcol = candset[c]

        X_pred = sm.add_constant(candcol)
        model_pred = sm.OLS(pred_scores, X_pred).fit()
        results[c] = {
            'R_squared': model_pred.rsquared,
            'p_value': model_pred.pvalues[1],
            'coef': model_pred.params[1]
        }
        print(f"candidate {c}: Results: {results}")  # Debug statement------------------------------------------------------------ 
  
    best_proxies = []

    if orthogonal_vars:
        orth_score = {}
        for c in candidates:
            candset = df_test[[c, 'predicted_status_threat']].copy().dropna()
            pred_scores = candset['predicted_status_threat']
            candcol = candset[c]
        
            X = sm.add_constant(candcol)
            temp_orth_scores = []
            for orth_var in orthogonal_vars:
                orthset = df_test[[orth_var]].copy().dropna()
                common_indices = candset.index.intersection(orthset.index)
                if common_indices.empty:
                    continue
                orth_col = orthset.loc[common_indices, orth_var]
                candcol_common = candset.loc[common_indices, c]

                X_common = sm.add_constant(candcol_common)
                model = sm.OLS(orth_col, X_common).fit()
                temp_orth_scores.append(model.rsquared)
            
            if temp_orth_scores:
                orth_score[c] = sum(temp_orth_scores) / len(temp_orth_scores)
            else:
                orth_score[c] = 0
        
        proxy_scores = {}
        for c in candidates:
            try:
                proxy_scores[c] = (c, (1 - orth_weight) * results[c]['R_squared'] - orth_weight * orth_score[c])
            except KeyError as e:
                continue
        
        sorted_results = sorted(proxy_scores.values(), key=lambda x: x[1], reverse=True)
        
        for i in range(min(num_proxies, len(sorted_results))):
            proxy, score = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with score: {score}")
    else: 
        sorted_results = sorted(results.items(), key=lambda x: (-x[1]['R_squared'], x[1]['p_value']))
    
        for i in range(min(num_proxies, len(sorted_results))):
            proxy, metrics = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")
    
    return best_proxies

In [77]:
# Example usage
datafile_train =  r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'
datafile_test =  r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'
df_train = pd.read_stata(datafile_train)
df_test = pd.read_stata(datafile_test)

target = 'status_threat'  # The target variable in the training set
predictors = [
                   'psc1_W1_01',
                   'christian_nationalism',
                   'authoritarianism',
                   'social_dom11',
                   'race_resent',
                   'party_ID',
                   'ideology',
                   'age501',
                   'education']  # Predictors in both training and testing sets
orthogonal_vars = ['psc1_W1_01',
                   'christian_nationalism',
                   'authoritarianism',
                   'social_dom11',
                   'race_resent',
                   'party_ID',
                   'ideology',
                   'age501',
                   'education']

best_proxies = proxy_finder(df_train, df_test, target, predictors, orthogonal_vars=orthogonal_vars, num_proxies=20)
print(best_proxies)

Predictors: ['psc1_W1_01', 'christian_nationalism', 'authoritarianism', 'social_dom11', 'race_resent', 'party_ID', 'ideology', 'age501', 'education']
Candidates: ['birthyr_W3', 'respondent_id', 'weight_W3', 'weight2_W3', 'weight3_W3', 'birthyr_W2', 'weight_W2', 'weight2_W2', 'weight3_W2', 'birthyr', 'weight', 'st_W1', 'st_W1_01', 'status_W2', 'status2_W2', 'psc1_W1', 'psc1_W1_01', 'cn1', 'cn2', 'cn3', 'cn4', 'cn5', 'cn6', 'cn_W2', 'cn1_W2', 'rr_W1', 'rr1_W1', 'rrx_W1', 'auth_W1', 'auth1_W1', 'socdom_W1', 'socdom1_W1', 'christn', 'christn1', 'ill_W2', 'illib_W2', 'christian_top', 'age501', 'education', 'party_ID', 'ideology', 'christian_nationalism', 'white_top', 'status_threat', 'socdom2_W1', 'SDO11', 'social_dom11', 'race_resent', 'authoritarianism', 'bidenfav', 'trumpfav', 'mwe_ethnic_01', 'white_infl1', 'black_discrim_1', 'hisp_discrim_1', 'white_discrim_1', 'critrace', 'critrace1', 'cap_riot', 'pros_riot', 'imm_cit', 'imm_dep', 'race_ID1', 'civilw', 'whitecon_a', 'whitecon_b', 'mag

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


Debug statement: MSE = 0.024956846155927023
Predictions before flattening: [[0.32239884]
 [0.32885075]
 [0.7600112 ]
 [0.22497535]
 [0.2820828 ]
 [0.30359486]
 [0.37473756]
 [0.6063272 ]
 [0.6848222 ]
 [0.75853944]]
predictions after flattening:  [0.32239884 0.32885075 0.7600112  0.22497535 0.2820828  0.30359486
 0.37473756 0.6063272  0.6848222  0.75853944]
Predicted scores: [0.32239884 0.32885075 0.7600112  0.22497535 0.2820828  0.30359486
 0.37473756 0.6063272  0.6848222  0.75853944]
candidate birthyr_W3: Results: {'birthyr_W3': {'R_squared': 0.05583108995094077, 'p_value': 1.8473327499254616e-17, 'coef': -0.17309739643931998}}
candidate respondent_id: Results: {'birthyr_W3': {'R_squared': 0.05583108995094077, 'p_value': 1.8473327499254616e-17, 'coef': -0.17309739643931998}, 'respondent_id': {'R_squared': 0.019730026488667507, 'p_value': 5.504378180916157e-07, 'coef': -0.08510303542625154}}
candidate weight_W3: Results: {'birthyr_W3': {'R_squared': 0.05583108995094077, 'p_value': 1.8

  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.param

candidate status_threat: Results: {'birthyr_W3': {'R_squared': 0.05583108995094077, 'p_value': 1.8473327499254616e-17, 'coef': -0.17309739643931998}, 'respondent_id': {'R_squared': 0.019730026488667507, 'p_value': 5.504378180916157e-07, 'coef': -0.08510303542625154}, 'weight_W3': {'R_squared': 0.011187526501931888, 'p_value': 0.00016800416942243122, 'coef': -0.14010481065753838}, 'weight2_W3': {'R_squared': 0.017425900424246188, 'p_value': 2.556156311940964e-06, 'coef': -0.16896505541457396}, 'birthyr_W2': {'R_squared': 0.056894884796267475, 'p_value': 1.1414682961257949e-17, 'coef': -0.17552198542022085}, 'weight_W2': {'R_squared': 1.935869215707875e-06, 'p_value': 0.9607621495701084, 'coef': 0.002345201355602511}, 'weight2_W2': {'R_squared': 0.0015835924667180201, 'p_value': 0.1591954554871975, 'coef': -0.06872745111028747}, 'birthyr': {'R_squared': 0.05699252477372296, 'p_value': 8.427264610295655e-18, 'coef': -0.17353382242341298}, 'weight': {'R_squared': 0.00453153888684632, 'p_va

  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.params[1]
  'p_value': model_pred.pvalues[1],
  'coef': model_pred.param

Proxy 1 for status_threat: rr_W1 with score: 0.1723849149357103
Proxy 2 for status_threat: rrx_W1 with score: 0.1723849149357103
Proxy 3 for status_threat: race_resent with score: 0.1723849149357103
Proxy 4 for status_threat: rr1_W1 with score: 0.17238491476555218
Proxy 5 for status_threat: ideology with score: 0.15635844007818883
Proxy 6 for status_threat: pres2024 with score: 0.15034014536004262
Proxy 7 for status_threat: st_W1_01 with score: 0.14388437533804455
Proxy 8 for status_threat: status_threat with score: 0.14388437533804455
Proxy 9 for status_threat: st_W1 with score: 0.1438843750316923
Proxy 10 for status_threat: maga_view with score: 0.13442676194701197
Proxy 11 for status_threat: status2_W2 with score: 0.13138424464788667
Proxy 12 for status_threat: status_W2 with score: 0.1313842445090026
Proxy 13 for status_threat: imm_dep1 with score: 0.12985649005767974
Proxy 14 for status_threat: pid01_W3 with score: 0.12753265432190755
Proxy 15 for status_threat: pid17_W3 with scor