# Version of the proxyFinder algorithm that uses a neural network to make predictions

In [1]:
import numpy as np
import pandas as pd
import sys
import statsmodels.api as sm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
import os

# Print the PYTHONPATH and PATH environment variables (None is shown when a
# variable is not set).
print("PYTHONPATH:", os.getenv('PYTHONPATH'))
print("PATH:", os.getenv('PATH'))

PYTHONPATH: None
PATH: c:\Users\kirin\anaconda3\envs\torchenv;C:\Users\kirin\anaconda3\envs\torchenv;C:\Users\kirin\anaconda3\envs\torchenv\Library\mingw-w64\bin;C:\Users\kirin\anaconda3\envs\torchenv\Library\usr\bin;C:\Users\kirin\anaconda3\envs\torchenv\Library\bin;C:\Users\kirin\anaconda3\envs\torchenv\Scripts;C:\Users\kirin\anaconda3\envs\torchenv\bin;C:\Users\kirin\anaconda3\condabin;C:\Program Files\Git\usr\local\bin;C:\Program Files\Git\bin;C:\Program Files\Eclipse Adoptium\jdk-11.0.15.10-hotspot\bin;C:\Program Files (x86)\Common Files\Oracle\Java\javapath;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0;C:\Windows\System32\OpenSSH;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\WINDOWS\system32;C:\WINDOWS;C:\WINDOWS\System32\Wbem;C:\WINDOWS\System32\WindowsPowerShell\v1.0;C:\WINDOWS\System32\OpenSSH;C:\Program Files\Git\cmd;C:\Users\kirin\AppData\Local\Microsoft\Window

In [3]:
def proxy_finder_validate(item, candidates, df1, df2, predictors, orthogonal_vars):
    """Check that the column names handed to the proxy finder are usable.

    Parameters
    ----------
    item : label
        Column that must exist in ``df1``.
    candidates : iterable of labels or None
        Columns that must exist in ``df2``. ``None`` means "no explicit
        candidate list" and is accepted without further checks.
    df1, df2 : pandas.DataFrame
        The two data sets being compared.
    predictors : non-empty sequence of labels
        Each predictor must exist in both frames and be a numeric column in
        both frames.
    orthogonal_vars : iterable of labels or None
        Columns that must exist in ``df2``. ``None`` skips the check.

    Raises
    ------
    AssertionError
        On the first failed check, with a message naming the offending column.
    """
    def _require(condition, message):
        # Raised explicitly instead of using `assert` so the validation is
        # not stripped when Python runs with -O.
        if not condition:
            raise AssertionError(message)

    # validate proxies and st item
    _require(item in df1.columns, f'AssertionError: item {item} not in df1.columns')

    _require(predictors, 'AssertionError: missing predictors. If you would prefer to not specify predictors, do not pass in a variable.')

    # The numeric column sets do not depend on the predictor, so compute them once.
    numeric_df1 = df1.select_dtypes(include=['number']).columns
    numeric_df2 = df2.select_dtypes(include=['number']).columns

    for c in predictors:
        _require(c in df1.columns, f'AssertionError: predictor {c} not in df1.columns')
        # we need same variable in second dataset
        _require(c in df2.columns, f'AssertionError: predictor {c} not in df2.columns')
        _require(c in numeric_df1, f'predictor {c} is not a numeric column in df1')
        _require(c in numeric_df2, f'predictor {c} is not a numeric column in df2')

    # `is not None` (rather than `!= None`) keeps array-like arguments such as
    # a pandas Index from being compared element-wise.
    if candidates is not None:
        for c in candidates:
            _require(c in df2.columns, f'AssertionError: candidate {c} not in df2.columns')

    if orthogonal_vars is not None:
        for c in orthogonal_vars:
            _require(c in df2.columns, f'AssertionError: orthogonal variable {c} not in df2.columns')
                

In [4]:
def data_rescale(df, predictors):
    """Return a min-max rescaled, numeric-only copy of ``df``.

    The input frame is not modified. Rows that are missing a value in any of
    ``predictors`` are dropped, non-numeric columns are dropped, and the
    remaining numeric columns are passed through ``MinMaxScaler``. The result
    keeps the original column names and the index of the surviving rows.
    """
    working = df.copy()  # never touch the caller's frame

    # Numeric columns are chosen on the full frame, before any rows are removed.
    numeric_columns = working.select_dtypes(include=['number']).columns

    # Keep only rows where every predictor is present.
    complete_rows = working.dropna(subset=predictors)

    # Fit and apply the scaler in one step, then re-wrap the ndarray as a frame.
    rescaled = MinMaxScaler().fit_transform(complete_rows[numeric_columns])
    return pd.DataFrame(rescaled, columns=numeric_columns, index=complete_rows.index)

In [5]:
def build_nn_model(input_dim, learning_rate=0.001, l2_lambda=0.001, hidden_units=(64, 32), dropout_rate=0.5):
    """Build and compile the feed-forward regression network.

    Architecture: for every width in ``hidden_units`` a block of
    Dense(relu, L2) -> BatchNormalization -> Dropout, followed by a single
    linear Dense output unit (also L2-regularised).

    Parameters
    ----------
    input_dim : int
        Number of input features.
    learning_rate : float
        Learning rate for the Adam optimizer.
    l2_lambda : float
        L2 penalty applied to the kernel of every Dense layer.
    hidden_units : sequence of int
        Width of each hidden Dense layer. The default reproduces the
        original 64 -> 32 network.
    dropout_rate : float
        Dropout rate used after every hidden block.

    Returns
    -------
    A compiled Keras ``Sequential`` model with mean-squared-error loss.
    """
    # Declare the input with an explicit Input object; passing `input_shape=`
    # to the first Dense layer is deprecated in Keras 3 and emits a warning.
    layers = [tf.keras.Input(shape=(input_dim,))]
    for units in hidden_units:
        layers.append(Dense(units, activation='relu', kernel_regularizer=l2(l2_lambda)))
        layers.append(BatchNormalization())
        layers.append(Dropout(dropout_rate))
    layers.append(Dense(1, kernel_regularizer=l2(l2_lambda)))

    model = Sequential(layers)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')
    return model

In [6]:
def train_nn_model(X_train, y_train, input_dim, epochs=100, learning_rate=0.001, l2_lambda=0.001):
    """Build a network with ``build_nn_model`` and fit it to the given data.

    ``X_train`` / ``y_train`` are handed straight to Keras ``fit`` with
    ``validation_split=0.2`` and ``verbose=0``. The fitted model is returned;
    this function performs no quality check on the fit itself.
    """
    network = build_nn_model(input_dim, learning_rate=learning_rate, l2_lambda=l2_lambda)

    fit_options = {'epochs': epochs, 'validation_split': 0.2, 'verbose': 0}
    network.fit(X_train, y_train, **fit_options)

    return network

In [19]:
def get_nn_predictions(df_train, df_test, predictors, target, epochs=100, learning_rate=0.001, l2_lambda=0.001):
    """Train a network on ``df_train`` and predict ``target`` for ``df_test``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain ``predictors`` and ``target``.
    df_test : pandas.DataFrame
        Must contain ``predictors``; one prediction is returned per row.
    predictors : list of column labels
        Input features, present in both frames.
    target : column label
        Column of ``df_train`` to learn.
    epochs, learning_rate, l2_lambda
        Forwarded to ``train_nn_model``.

    Returns
    -------
    numpy.ndarray
        Flattened predictions for the rows of ``df_test``.

    Raises
    ------
    SystemExit
        Via ``sys.exit(-1)`` when the mean squared error on the 20% hold-out
        split of ``df_train`` exceeds the acceptance threshold.
    """
    # Hold-out MSE above which the predictors are judged unable to predict the target.
    max_acceptable_mse = 0.03
    # Hold-out MSE at which the printed "confidence level" reaches 0%.
    zero_confidence_mse = 0.036

    # Rows with a missing target (or predictor) would feed NaN into training
    # and into the hold-out MSE, so they are removed from the training data
    # only. df_test is left untouched so predictions stay aligned with its rows.
    train_rows = df_train.dropna(subset=list(predictors) + [target])

    # split data for training and testing.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        train_rows[predictors].to_numpy(),
        train_rows[target].to_numpy(),
        test_size=0.2,
        random_state=42,
    )
    X_test = df_test[predictors].to_numpy()

    # train network and get predictions
    model = train_nn_model(X_fit, y_fit, len(predictors), epochs, learning_rate, l2_lambda)
    predictions = model.predict(X_test)

    # exit if the hold-out error shows the predictors cannot predict the item
    mse = mean_squared_error(y_holdout, model.predict(X_holdout))
    print(f'Confidence level: {int((1 - (mse / zero_confidence_mse)) * 100)}%')
    if mse > max_acceptable_mse:
        print('Input Error: Predictors cannot predict item in df1', file=sys.stderr)
        print('Aborting program')
        sys.exit(-1)

    return predictions.flatten()

In [15]:
def _mean_orthogonal_r2(df, candcol, orthogonal_vars):
    """Average R^2 of regressing each orthogonal variable on one candidate column.

    For every orthogonal variable, the rows where both it and the candidate
    are present are used to fit an OLS of the orthogonal variable on the
    candidate (plus constant). Variables with no overlapping rows are skipped;
    if nothing could be fitted the result is 0.
    """
    r2_values = []
    for orth_var in orthogonal_vars:
        orth_col = df[orth_var].dropna()
        common_indices = candcol.index.intersection(orth_col.index)
        if common_indices.empty:
            continue
        X_common = sm.add_constant(candcol.loc[common_indices])
        r2_values.append(sm.OLS(orth_col.loc[common_indices], X_common).fit().rsquared)

    if not r2_values:
        return 0
    return sum(r2_values) / len(r2_values)


def proxy_finder(df_train, df_test, target, predictors, num_proxies=1, orth_weight=0.65, candidates=None, orthogonal_vars=None, epochs=100, learning_rate=0.001, l2_lambda=0.001):
    """Rank columns of ``df_test`` as proxies for ``target`` and print the best ones.

    Both frames are rescaled with ``data_rescale``, a neural network trained
    on ``df_train`` predicts ``target`` for ``df_test``, and every candidate
    column is scored by the R^2 of an OLS regression of the predicted scores
    on that candidate. When ``orthogonal_vars`` is given, the score becomes
    ``(1 - orth_weight) * R^2 - orth_weight * mean_orthogonal_R^2``, which
    penalises candidates that also explain the orthogonal variables.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Training data (contains ``target``) and the data searched for proxies.
    target : column label
        Column of ``df_train`` to be proxied.
    predictors : list of column labels
        Columns present in both frames, used as network inputs.
    num_proxies : int
        Maximum number of proxies to report.
    orth_weight : float
        Weight of the orthogonality penalty.
    candidates : list of column labels or None
        Columns to evaluate; defaults to every numeric column of ``df_test``.
    orthogonal_vars : iterable of column labels or None
        Columns the proxies should be unrelated to.
    epochs, learning_rate, l2_lambda
        Network training settings, mainly for debugging / fine tuning.

    Returns
    -------
    list
        Up to ``num_proxies`` column labels, best first.
    """
    if candidates is None:
        candidates = list(df_test.select_dtypes(include='number').columns)
    # Materialise once so the sequence can be tested for emptiness and
    # iterated for every candidate.
    orthogonal_vars = list(orthogonal_vars) if orthogonal_vars is not None else []

    # Predict status threat scores in df_test
    df_train = data_rescale(df_train, predictors)
    df_test = data_rescale(df_test, predictors)
    predicted_scores = get_nn_predictions(df_train, df_test, predictors, target, epochs, learning_rate, l2_lambda)

    # df_test is the rescaled copy made above, so the caller's frame is not modified.
    df_test['predicted_status_threat'] = predicted_scores

    results = {}
    orth_score = {}
    for c in candidates:
        candset = df_test[[c, 'predicted_status_threat']].dropna()
        if candset.empty:
            continue

        candcol = candset[c]
        X_pred = sm.add_constant(candcol)
        model_pred = sm.OLS(candset['predicted_status_threat'], X_pred).fit()
        results[c] = {
            'R_squared': model_pred.rsquared,
            # Position 1 is the candidate's coefficient (position 0 is the
            # constant). .iloc makes the positional lookup explicit; plain
            # integer [] on a labelled Series is deprecated in pandas.
            'p_value': model_pred.pvalues.iloc[1],
            'coef': model_pred.params.iloc[1]
        }

        # Only candidates that received a fit can ever be ranked, so the
        # orthogonality penalty is computed for those alone.
        if orthogonal_vars:
            orth_score[c] = _mean_orthogonal_r2(df_test, candcol, orthogonal_vars)

    best_proxies = []

    if orthogonal_vars:
        proxy_scores = [
            (c, (1 - orth_weight) * results[c]['R_squared'] - orth_weight * orth_score[c])
            for c in results
        ]
        sorted_results = sorted(proxy_scores, key=lambda x: x[1], reverse=True)

        for i, (proxy, score) in enumerate(sorted_results[:max(num_proxies, 0)]):
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with score: {score}")
    else:
        # Highest R^2 first; ties broken by the smaller p-value.
        sorted_results = sorted(results.items(), key=lambda x: (-x[1]['R_squared'], x[1]['p_value']))

        for i, (proxy, metrics) in enumerate(sorted_results[:max(num_proxies, 0)]):
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")

    return best_proxies

In [20]:
import warnings

# Silence the "All-NaN slice encountered" runtime warning.
warnings.filterwarnings("ignore", message="All-NaN slice encountered")
# TODO: remove this filter once no code indexes a labelled Series by integer
# position; hiding the FutureWarning does not protect against the pandas change.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="Series.__getitem__ treating keys as positions is deprecated",
)

# Suppress numpy invalid operation warnings
np.seterr(invalid='ignore')

# Example usage: Clearly, the best proxy for status threat should be status threat & related items.
# The same Stata file is loaded for both the training and the test role.
datafile_train = r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'
datafile_test = r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'
df_train = pd.read_stata(datafile_train)
df_test = pd.read_stata(datafile_test)

# The target variable in the training set
target = 'status_threat'

# Predictors in both training and testing sets
predictors = [
    'psc1_W1_01',
    'christian_nationalism',
    'authoritarianism',
    'social_dom11',
    'race_resent',
    'party_ID',
    'ideology',
    'age501',
    'education',
]

# In this example the orthogonal variables are exactly the predictors.
orthogonal_vars = list(predictors)

best_proxies = proxy_finder(
    df_train,
    df_test,
    target,
    predictors,
    num_proxies=20,
    orthogonal_vars=orthogonal_vars,
)

Confidence level: 27%
Proxy 1 for status_threat: st_W1_01 with score: 0.059827577676469956
Proxy 2 for status_threat: status_threat with score: 0.059827577676469956
Proxy 3 for status_threat: st_W1 with score: 0.0598275775976887
Proxy 4 for status_threat: rr1_W1 with score: 0.054831451729434766
Proxy 5 for status_threat: rr_W1 with score: 0.054831451657055524
Proxy 6 for status_threat: rrx_W1 with score: 0.054831451657055524
Proxy 7 for status_threat: race_resent with score: 0.054831451657055524
Proxy 8 for status_threat: status_W2 with score: 0.05148620176210139
Proxy 9 for status_threat: status2_W2 with score: 0.05148620157651515
Proxy 10 for status_threat: ideology with score: 0.04395950751103625
Proxy 11 for status_threat: imm_dep1 with score: 0.04263953878398996
Proxy 12 for status_threat: maga_view with score: 0.03994999649819911
Proxy 13 for status_threat: sta_W3_01 with score: 0.03429170417259268
Proxy 14 for status_threat: stat_W3 with score: 0.03429170389072758
Proxy 15 for s