# version of proxyFinder algorithm using neural network to make predictions

In [4]:
import numpy as np
import pandas as pd
import sys
import statsmodels.api as sm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [5]:
import os 
print("PYTHONPATH:", os.environ.get('PYTHONPATH'))
print("PATH:", os.environ.get('PATH'))

PYTHONPATH: None
PATH: c:\Users\kirin\anaconda3\envs\torchenv;C:\Users\kirin\anaconda3\envs\torchenv;C:\Users\kirin\anaconda3\envs\torchenv\Library\mingw-w64\bin;C:\Users\kirin\anaconda3\envs\torchenv\Library\usr\bin;C:\Users\kirin\anaconda3\envs\torchenv\Library\bin;C:\Users\kirin\anaconda3\envs\torchenv\Scripts;C:\Users\kirin\anaconda3\envs\torchenv\bin;C:\Users\kirin\anaconda3\condabin;C:\Program Files\Git\usr\local\bin;C:\Program Files\Git\bin;C:\Program Files\Eclipse Adoptium\jdk-11.0.15.10-hotspot\bin;C:\Program Files (x86)\Common Files\Oracle\Java\javapath;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0;C:\Windows\System32\OpenSSH;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\WINDOWS\system32;C:\WINDOWS;C:\WINDOWS\System32\Wbem;C:\WINDOWS\System32\WindowsPowerShell\v1.0;C:\WINDOWS\System32\OpenSSH;C:\Program Files\Git\cmd;C:\Users\kirin\AppData\Local\Microsoft\Window

In [6]:
def proxy_finder_validate(item, candidates, df1, df2, predictors, orthogonal_vars):

    # validate proxies and st item
    assert item in df1.columns, f'AssertionError: item {item} not in df1.columns'

    assert predictors, f'AssertionError: missing predictors'
    
    for c in predictors:
        assert c in df1.columns, f'AssertionError: predictor {c} not in df1.columns'
        assert c in df2.columns, f'AssertionError: predictor {c} not in df2.columns' # only because we need same variable in second dataset        
    
    for c in candidates:
        assert c in df2.columns, f'AssertionError: candidate {c} not in df2.columns'
        
    if (orthogonal_vars != None):
        for c in orthogonal_vars:
            assert c in df2.columns, f'AssertionError: orthogonal variable {c} not in df2.columns'
                

In [30]:
# return a new df that is a copy of df, with: rescale all columns to be between 0 and 1, inclusive. Drop any non-numeric columns.
def data_rescale(df):
    df = df.copy() # preserve immutability

    # Select only the numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    
    # Initialize the scaler
    scaler = MinMaxScaler()

    # Fit the scaler to the data and transform it
    scaled_values = scaler.fit_transform(df[numeric_cols])

    # Create a new DataFrame with the scaled values, maintaining the original column names
    scaled_df = pd.DataFrame(scaled_values, columns=numeric_cols, index=df.index)
    
    return scaled_df

In [8]:
# Neural network definition
def build_nn_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dropout(0.5),  # Adding dropout regularization
        Dense(32, activation='relu'),
        Dropout(0.5),  # Adding dropout regularization
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

In [9]:
# return a trained neural network to predict df[item] using df[predictors_df1]
# report error and crash if predictors don't predict item
def train_nn_model(X_train, y_train, input_dim, epochs=100):
    model = build_nn_model(input_dim)
    model.fit(X_train, y_train, epochs=epochs, validation_split=0.2, verbose=0)
    return model

In [21]:
 #DEBUG#### test nn train and build -- appear to be working
# Function to create synthetic data for testing
def create_synthetic_data(num_samples=1000, num_features=10):
    np.random.seed(42)
    X = np.random.rand(num_samples, num_features)
    y = np.random.rand(num_samples)
    return X, y

# Create synthetic data
X, y = create_synthetic_data()

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the neural network
input_dim = X_train.shape[1]
model = train_nn_model(X_train, y_train, input_dim, epochs=10)

# Print model summary
print(model.summary())

# Evaluate the model on the test set
loss = model.evaluate(X_test, y_test)
print(f"Test loss: {loss}")

# Predict on the test set
predictions = model.predict(X_test)
print(f"Predictions: {predictions[:10]}")

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 64)                704       
                                                                 
 dropout_10 (Dropout)        (None, 64)                0         
                                                                 
 dense_16 (Dense)            (None, 32)                2080      
                                                                 
 dropout_11 (Dropout)        (None, 32)                0         
                                                                 
 dense_17 (Dense)            (None, 1)                 33        
                                                                 
Total params: 2,817
Trainable params: 2,817
Non-trainable params: 0
_________________________________________________________________
None
Test loss: 0.10631357878446579
Predictions: [[0

In [34]:
# Function to get predictions from the neural network
def get_nn_predictions(df_train, df_test, predictors, target, epochs=100):
    # Remove rows with NaN values in predictor columns
    df_train = df_train.dropna(subset=predictors)
    df_test = df_test.dropna(subset=predictors)
    
    X_train = df_train[predictors].to_numpy()
    y_train = df_train[target].to_numpy()
    X_test = df_test[predictors].to_numpy()


    model = train_nn_model(X_train, y_train, len(predictors), epochs)
    predictions = model.predict(X_test)
    print(f"Predictions before flattening: {predictions[:10]}") #DEBUG
    print('predictions after flattening: ', predictions.flatten()[:10])#DEBUG

    return predictions.flatten()

In [27]:
# Create synthetic data#DEBUG
X, y = create_synthetic_data(num_samples=1000, num_features=10)

# Convert synthetic data to DataFrame
columns = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=columns)
df['target'] = y

# Normalize the data
df = data_rescale(df)

# Split the data into training and testing sets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Define predictors and target
predictors = [f'feature_{i}' for i in range(X.shape[1])]
target = 'target'

# Test get_nn_predictions function
predicted_scores = get_nn_predictions(df_train, df_test, predictors, target, epochs=10)

# Print results
print(f"Predicted scores: {predicted_scores[:10]}")


Predictions before flattening: [[0.3332389 ]
 [0.34358305]
 [0.4043013 ]
 [0.36761525]
 [0.41573027]
 [0.42988393]
 [0.3725268 ]
 [0.42518765]
 [0.33807346]
 [0.33562624]]
predictions after flattening:  [0.3332389  0.34358305 0.4043013  0.36761525 0.41573027 0.42988393
 0.3725268  0.42518765 0.33807346 0.33562624]
Predicted scores: [0.3332389  0.34358305 0.4043013  0.36761525 0.41573027 0.42988393
 0.3725268  0.42518765 0.33807346 0.33562624]


In [32]:
def proxy_finder(df_train, df_test, target, predictors, num_proxies=1, orth_weight=0.5, candidates=None, orthogonal_vars=None):
    if candidates is None:
        candidates = list(df_test.select_dtypes(include='number').columns)
    
    print(f"Predictors: {predictors}") #DEBUGDEBUGDEBUG------------------------------------------------------------
    print(f"Candidates: {candidates}")

    # Predict status threat scores in df_test
    df_train = data_rescale(df_train)
    df_test = data_rescale(df_test)
    print(df_train.head)
    print(df_test.head)
    predicted_scores = get_nn_predictions(df_train, df_test, predictors, target)
    
    df_test['predicted_status_threat'] = predicted_scores
    print(f"Predicted scores: {predicted_scores[:10]}")  #DEBUG DEBUG------------------------------------------------------------ 

    results = {}
    
    for c in candidates:
        candset = df_test[[c, 'predicted_status_threat']].copy().dropna()
        if candset.empty:
            continue
        
        pred_scores = candset['predicted_status_threat']
        candcol = candset[c]

        X_pred = sm.add_constant(candcol)
        model_pred = sm.OLS(pred_scores, X_pred).fit()
        results[c] = {
            'R_squared': model_pred.rsquared,
            'p_value': model_pred.pvalues[1],
            'coef': model_pred.params[1]
        }
        print(f"candidate {c}: Results: {results}")  # Debug statement------------------------------------------------------------ 
  
    best_proxies = []

    if orthogonal_vars:
        orth_score = {}
        for c in candidates:
            candset = df_test[[c, 'predicted_status_threat']].copy().dropna()
            pred_scores = candset['predicted_status_threat']
            candcol = candset[c]
        
            X = sm.add_constant(candcol)
            temp_orth_scores = []
            for orth_var in orthogonal_vars:
                orthset = df_test[[orth_var]].copy().dropna()
                common_indices = candset.index.intersection(orthset.index)
                if common_indices.empty:
                    continue
                orth_col = orthset.loc[common_indices, orth_var]
                candcol_common = candset.loc[common_indices, c]

                X_common = sm.add_constant(candcol_common)
                model = sm.OLS(orth_col, X_common).fit()
                temp_orth_scores.append(model.rsquared)
            
            if temp_orth_scores:
                orth_score[c] = sum(temp_orth_scores) / len(temp_orth_scores)
            else:
                orth_score[c] = 0
        
        proxy_scores = {}
        for c in candidates:
            try:
                proxy_scores[c] = (c, (1 - orth_weight) * results[c]['R_squared'] - orth_weight * orth_score[c])
            except KeyError as e:
                continue
        
        sorted_results = sorted(proxy_scores.values(), key=lambda x: x[1], reverse=True)
        
        for i in range(min(num_proxies, len(sorted_results))):
            proxy, score = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with score: {score}")
    else: 
        sorted_results = sorted(results.items(), key=lambda x: (-x[1]['R_squared'], x[1]['p_value']))
    
        for i in range(min(num_proxies, len(sorted_results))):
            proxy, metrics = sorted_results[i]
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")
    
    return best_proxies

In [35]:
# Example usage
datafile_train =  r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'
datafile_test =  r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'
df_train = pd.read_stata(datafile_train, convert_categoricals=False)
df_test = pd.read_stata(datafile_test, convert_categoricals=False)

target = 'status_threat'  # The target variable in the training set
predictors = ['psc1_W1_01',
                   'christian_nationalism',
                   'authoritarianism']  # Predictors in both training and testing sets
orthogonal_vars = ['authoritarianism', 'christian_nationalism', 'social_dom11', 'race_resent', 'party_ID', 'ideology']

best_proxies = proxy_finder(df_train, df_test, target, predictors, num_proxies=5, orthogonal_vars=orthogonal_vars)
print(best_proxies)

Predictors: ['psc1_W1_01', 'christian_nationalism', 'authoritarianism']
Candidates: ['vendor_W3', 'immigrant_W3', 'inputstate_W3', 'votereg_W3', 'inputregstate_W3', 'birthyr_W3', 'age5_W3', 'gender_W3', 'transgender_W3', 'educ_W3', 'educ4_W3', 'educ2_W3', 'race_W3', 'race4_W3', 'race3_W3', 'race2_W3', 'hispanic_W3', 'hispanic_origin_1_W3', 'hispanic_origin_2_W3', 'hispanic_origin_3_W3', 'hispanic_origin_4_W3', 'turnout20post_W3', 'presvote20post_W3', 'faminc_new_W3', 'res_region_W3', 'res_division_W3', 'reg_region_W3', 'reg_division_W3', 'partisanship_W3', 'partisanship_reps_W3', 'partisanship_dems_W3', 'partisanship_ind_W3', 'house2022_W3', 'biden_approval_W3', 'trump_approval_W3', 'expandinghouse_split_W3', 'expandinghouse_A_W3', 'expandinghouse_B_W3', 'expandinghouse_C_W3', 'expandinghouse_D_W3', 'electioninteg_split_W3', 'electioninteg_A_W3', 'electioninteg_B_W3', 'electioninteg_C_W3', 'pres2024_W3', 'jan6_exposure_W3', 'jan6_hearings_W3', 'trump_crimes_W3', 'trump_charge_W3', 'tru

ValueError: Length of values (1285) does not match length of index (3400)