# version of proxyFinder algorithm using neural network to make predictions

In [87]:
import numpy as np
import pandas as pd
import sys
import statsmodels.api as sm
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [88]:
%matplotlib inline
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import random
import tqdm
import os
from torchviz import make_dot

In [89]:
# Neural network definition
class SimpleNN(nn.Module):
    """Small fully connected regressor: input_dim -> 64 -> 32 -> 1.

    A ReLU follows each of the two hidden layers; the single output unit is
    linear (no activation is applied to it).
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_wide, hidden_narrow = 64, 32
        # The attribute names end up in the state_dict keys, so they are kept.
        self.layer1 = nn.Linear(input_dim, hidden_wide)
        self.layer2 = nn.Linear(hidden_wide, hidden_narrow)
        self.layer3 = nn.Linear(hidden_narrow, 1)

    def forward(self, x):
        """Run ``x`` through the three layers and return the raw output."""
        hidden = self.layer1(x).relu()
        hidden = self.layer2(hidden).relu()
        return self.layer3(hidden)

In [90]:
def proxy_finder_validate(item, candidates, df1, df2, predictors, orthogonal_vars):
    """Check that every column the proxy search needs is present.

    Args:
        item: Name of the target column; must be in ``df1``.
        candidates: Iterable of candidate proxy column names that must be in
            ``df2``, or ``None`` to skip the candidate check.
        df1: DataFrame that holds the target and the predictors.
        df2: DataFrame that holds the predictors, candidates and orthogonal
            variables.
        predictors: Non-empty collection of column names required in both
            ``df1`` and ``df2``.
        orthogonal_vars: Iterable of column names that must be in ``df2``, or
            ``None`` to skip the check.

    Raises:
        AssertionError: If ``predictors`` is empty or any required column is
            missing.
    """

    def _require(condition, message):
        # Raised explicitly (instead of using ``assert``) so the validation
        # still runs when Python is started with -O.
        if not condition:
            raise AssertionError(message)

    # validate the target item
    _require(item in df1.columns, f'AssertionError: item {item} not in df1.columns')

    _require(predictors, 'AssertionError: missing predictors')

    for c in predictors:
        _require(c in df1.columns, f'AssertionError: predictor {c} not in df1.columns')
        # the same variable is needed in the second dataset to make predictions there
        _require(c in df2.columns, f'AssertionError: predictor {c} not in df2.columns')

    # ``None`` means "no explicit candidate list", so there is nothing to check.
    if candidates is not None:
        for c in candidates:
            _require(c in df2.columns, f'AssertionError: candidate {c} not in df2.columns')

    # Identity test: ``!= None`` is evaluated element-wise for array-likes.
    if orthogonal_vars is not None:
        for c in orthogonal_vars:
            _require(c in df2.columns, f'AssertionError: orthogonal variable {c} not in df2.columns')
                

In [91]:
# rescale all columns to be between 0 and 1, inclusive. Drop any non-numeric columns.
def data_rescale(df):
    """Return the numeric columns of ``df`` min-max scaled; drop the rest.

    Scaling uses ``MinMaxScaler`` with its default settings and is fitted on
    ``df`` itself. The original index and numeric column names are preserved.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame holding only the numeric columns of ``df``. If ``df``
        has no numeric columns or no rows, an empty float frame with the same
        index and numeric column labels is returned.
    """
    # Select only the numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    numeric_part = df[numeric_cols]

    # The scaler rejects input with zero rows or zero columns, so there is
    # nothing to fit in that case: hand back an empty, correctly labelled frame.
    if numeric_part.empty:
        return pd.DataFrame(index=df.index, columns=numeric_cols, dtype=float)

    # Fit the scaler to the data and transform it in one step
    scaled_values = MinMaxScaler().fit_transform(numeric_part)

    # Rebuild a DataFrame around the scaled array, keeping labels and index
    return pd.DataFrame(scaled_values, columns=numeric_cols, index=df.index)

In [92]:
# Return a neural network trained to predict y_train from X_train.
# NOTE: no goodness-of-fit check is performed here; the model is returned even if the predictors do not predict the item.
def train_nn_model(X_train, y_train, input_dim, epochs=100):
    """Fit a fresh ``SimpleNN`` on the given data and return it.

    Training is full-batch: every epoch runs one forward/backward pass over
    all of ``X_train`` with an MSE loss and an Adam optimizer (lr=0.001).

    Args:
        X_train: Feature values; converted to a float32 tensor.
        y_train: Target values; converted to a float32 column vector.
        input_dim: Number of input features of the network.
        epochs: Number of optimisation steps.

    Returns:
        The trained ``SimpleNN`` (left in training mode).
    """
    net = SimpleNN(input_dim)
    loss_fn = nn.MSELoss()
    adam = optim.Adam(net.parameters(), lr=0.001)

    features = torch.tensor(X_train, dtype=torch.float32)
    # Column vector so the targets line up with the network's (n, 1) output.
    targets = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

    net.train()
    for _ in range(epochs):
        adam.zero_grad()
        batch_loss = loss_fn(net(features), targets)
        batch_loss.backward()
        adam.step()

    return net

In [93]:
# Function to get predictions from the neural network
def get_nn_predictions(df_train, df_test, predictors, target, epochs=100):
    """Train a network on ``df_train`` and predict ``target`` for ``df_test``.

    Args:
        df_train: DataFrame containing the ``predictors`` and ``target`` columns.
        df_test: DataFrame containing the ``predictors`` columns.
        predictors: List of feature column names present in both frames.
        target: Name of the column to predict.
        epochs: Number of training epochs passed to ``train_nn_model``.

    Returns:
        A 1-D NumPy array with one prediction per row of ``df_test``, in row
        order. Rows of ``df_test`` with a missing predictor are still passed
        to the model and are expected to come back as NaN.

    Raises:
        ValueError: If no training row has all predictors and the target.
    """
    # Training is full-batch, so a single NaN anywhere in the batch makes the
    # loss NaN and, after one optimizer step, every weight NaN. Train only on
    # rows where all predictors and the target are observed.
    complete = df_train[predictors].notna().all(axis=1) & df_train[target].notna()
    if not complete.any():
        raise ValueError(
            f'no training rows with complete data for predictors {predictors} and target {target}'
        )

    X_train = df_train.loc[complete, predictors].to_numpy()
    y_train = df_train.loc[complete, target].to_numpy()
    X_test = df_test[predictors].to_numpy()

    model = train_nn_model(X_train, y_train, len(predictors), epochs)

    # Inference only: switch to eval mode and skip gradient tracking.
    model.eval()
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    with torch.no_grad():
        predictions = model(X_test_tensor).cpu().numpy()

    return predictions.flatten()

In [94]:
# Function to find the best proxies
def _mean_orthogonal_r2(df, candidate, rows, orthogonal_vars):
    """Average R^2 from regressing each orthogonal variable on ``candidate``.

    Only the index labels in ``rows`` where the orthogonal variable is also
    observed are used. Returns 0 when no orthogonal variable shares any row.
    """
    r2_values = []
    for orth_var in orthogonal_vars:
        orth_col = df[orth_var].dropna()
        common_indices = rows.intersection(orth_col.index)
        if common_indices.empty:
            continue
        X_common = sm.add_constant(df.loc[common_indices, candidate])
        r2_values.append(sm.OLS(orth_col.loc[common_indices], X_common).fit().rsquared)

    return sum(r2_values) / len(r2_values) if r2_values else 0


def proxy_finder(df_train, df_test, target, predictors, num_proxies=1, orth_weight=0.5, candidates=None, orthogonal_vars=None):
    """Rank columns of ``df_test`` as proxies for ``target`` and return the best.

    A neural network is trained on ``df_train`` to predict ``target`` from
    ``predictors``. Its predictions for ``df_test`` are stored in a
    ``'predicted_status_threat'`` column, and each candidate is scored by the
    R^2 of an OLS regression of those predictions on the candidate. With
    ``orthogonal_vars``, the score becomes
    ``(1 - orth_weight) * R^2 - orth_weight * mean orthogonal R^2``.

    NOTE(review): ``df_train`` and ``df_test`` are min-max scaled
    independently, so the same raw predictor value can map to different
    scaled values in the two frames - confirm this is intended.

    Args:
        df_train: DataFrame with ``target`` and ``predictors``.
        df_test: DataFrame with ``predictors`` and the candidate columns.
        target: Name of the column to approximate.
        predictors: Column names used as network inputs.
        num_proxies: Maximum number of proxies to return.
        orth_weight: Weight of the orthogonality penalty.
        candidates: Column names to evaluate; defaults to every numeric
            column of ``df_test``.
        orthogonal_vars: Columns the proxy should be unrelated to; when
            falsy, ranking is by R^2 (descending), then p-value.

    Returns:
        List of at most ``num_proxies`` column names, best first. Each
        selected proxy is also printed with its score.
    """
    if candidates is None:
        candidates = list(df_test.select_dtypes(include='number').columns)

    # Predict status threat scores in df_test (work on rescaled copies so the
    # caller's frames are left untouched).
    df_train = data_rescale(df_train)
    df_test = data_rescale(df_test)
    predicted_scores = get_nn_predictions(df_train, df_test, predictors, target)

    df_test['predicted_status_threat'] = predicted_scores

    results = {}
    valid_rows = {}  # candidate -> index of rows used in its regression

    for c in candidates:
        candset = df_test[[c, 'predicted_status_threat']].dropna()
        if candset.empty:
            continue

        # has_constant='add' guarantees an intercept column even when the
        # candidate is constant on these rows, so position 1 below is always
        # the candidate's own coefficient.
        X_pred = sm.add_constant(candset[c], has_constant='add')
        model_pred = sm.OLS(candset['predicted_status_threat'], X_pred).fit()
        # .iloc: the result Series are labelled by column name, and integer
        # keys on such a Series are no longer treated as positions by pandas.
        results[c] = {
            'R_squared': model_pred.rsquared,
            'p_value': model_pred.pvalues.iloc[1],
            'coef': model_pred.params.iloc[1],
        }
        valid_rows[c] = candset.index

    best_proxies = []
    limit = max(num_proxies, 0)

    if orthogonal_vars:
        # Only candidates that produced a regression result can be scored.
        proxy_scores = [
            (
                c,
                (1 - orth_weight) * metrics['R_squared']
                - orth_weight * _mean_orthogonal_r2(df_test, c, valid_rows[c], orthogonal_vars),
            )
            for c, metrics in results.items()
        ]
        sorted_results = sorted(proxy_scores, key=lambda x: x[1], reverse=True)

        for i, (proxy, score) in enumerate(sorted_results[:limit]):
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with score: {score}")
    else:
        sorted_results = sorted(results.items(), key=lambda x: (-x[1]['R_squared'], x[1]['p_value']))

        for i, (proxy, metrics) in enumerate(sorted_results[:limit]):
            best_proxies.append(proxy)
            print(f"Proxy {i+1} for {target}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")

    return best_proxies

In [95]:
# Example usage: print and return the top five proxies for 'status_threat'.
# NOTE(review): both paths are hard-coded to one user's Windows Downloads
# folder; point them at your own copy of the data before running.
datafile_train =  r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'
# Same file as the training set, so df_train and df_test hold identical data here.
datafile_test =  r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'
# NOTE(review): convert_categoricals=False presumably keeps the raw numeric
# codes so the columns stay numeric for rescaling - confirm.
df_train = pd.read_stata(datafile_train, convert_categoricals=False)
df_test = pd.read_stata(datafile_test, convert_categoricals=False)

target = 'status_threat'  # The target variable in the training set
predictors = ['psc1_W1_01',
                   'christian_nationalism',
                   'authoritarianism']  # Predictors in both training and testing sets
# Variables the chosen proxies should be unrelated to. Note that
# 'authoritarianism' and 'christian_nationalism' are also listed as predictors.
orthogonal_vars = ['authoritarianism', 'christian_nationalism', 'social_dom11', 'race_resent', 'party_ID', 'ideology']

# candidates is left at its default, so every numeric column of df_test is evaluated.
best_proxies = proxy_finder(df_train, df_test, target, predictors, num_proxies=5, orthogonal_vars=orthogonal_vars)
print(best_proxies)

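# Output of the cell above:
# RuntimeError: Numpy is not available
# NOTE(review): PyTorch raises this when its NumPy bridge cannot be used, which usually
# means the installed PyTorch build is incompatible with the installed NumPy version.
# Aligning the two versions should resolve it - confirm in the target environment.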