In [126]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Print the current working directory
print("Current Working Directory:", current_directory)


Current Working Directory: C:\Users\kirin\Untitled Folder


Idea: Find a proxy for status threat to use in datasets that have no such measure but have common demographic variables such as sdo, christian nationalism, authoritarianism, ....  
Steps:
1. Use regression to predict status threat (perhaps single status threat item) based on other measures such that each measure has a corresponding measure in the new dataset.
2. Regress predicted status threat on 'christian_nationalism',
                   'authoritarianism',
                   'social_dom11',
                   'race_resent',
                   'party_ID',
                   'ideology'
3. use residuals as ST\perp


Then, iterate through each candidate proxy. Select proxy to minimize objective function: MSE?  

MSE = \sum_{i \in (study participants)} (ST\perp _{i} - candidate_{i})^2

 

In [15]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [16]:
# Set display options to show only 5 colmns/rows
pd.set_option('display.max_columns', 5)
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_colwidth', 5)
pd.set_option('display.width', 5)

In [17]:
# Set display options to show all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)


In [18]:
def proxy_finder_validate(item, candidates, df1, df2):
   
   # assert (df1 != None)
   # assert (df2 != None)

    # validate proxies and st item
    assert item in df1.columns

    for c in candidates:
        assert c in df2.columns

In [19]:
# rescale all columns to be between 0 and 1, inclusive. Drop any non-numeric columns.
def data_rescale(df):
   
    # Select only the numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    
    # Initialize the scaler
    scaler = MinMaxScaler()

    # Fit the scaler to the data and transform it
    scaled_values = scaler.fit_transform(df[numeric_cols])

    # Create a new DataFrame with the scaled values, maintaining the original column names
    scaled_df = pd.DataFrame(scaled_values, columns=numeric_cols, index=df.index)
    
    return scaled_df

In [20]:
# return a linear regression model to predict df[item] using df[predictors_df1]
# report error and crash if predictors don't predict item
def get_model(predictors_df1, item, test_pct, df):

    # create linear model such that item_predicted = B0 + B1X1 + ... + BnXn
    prepped_data = df[predictors_df1 + [item]].dropna(axis=0)
    X = prepped_data[predictors_df1].to_numpy()

    y = prepped_data[item].to_numpy()


    #split into train and test data

    test_size = (int) (X.shape[0] * test_pct)
    train_size = (int) (X.shape[0] * (1-test_pct))

    x_train = X[:-train_size]
    x_test = X[-test_size:]

    # Split the targets into training/testing sets
    y_train = y[:-train_size]
    y_test = y[-test_size:]


    # run linear regression
    regr = linear_model.LinearRegression()
    regr.fit(x_train, y_train)
    
    if (mean_squared_error(regr.predict(x_test), y_test) > 0.05):
        print('predictors cannot predict item in df1')
        assert(False)
    
    return regr

In [21]:
# returns predicted item in df2
def predict_item(df2, predictors_df2, model):
  
    # item_predicted = B0 + B1X1 + ... + BnXn
    X = df2[predictors_df2] 
    
    return np.dot(X, model.coef_) + model.intercept_

In [22]:
# return the best-fitting proxy out of candidates variables for predicted value of item in df1 using predictors_df1 variables
# to find approximation for item in df2 using predictors_df2 variables. Approximation in df2 is purged of influence of 
# orthogonal_vars, if specified. If candidates not specified, consider all columns with numerical data to be candidates.
def proxy_finder(df1, df2, item, predictors_df1, predictors_df2, num_proxies=1, candidates=None, orthogonal_vars=None):
    #test size for linear regression training
    test_size = 0.2
    
    if (candidates == None):
        candidates = list(df2.select_dtypes(include='number').columns)
    
    # validate parameters and construct df2 prediction for item
    proxy_finder_validate(item, candidates, df1, df2)
    df1 = data_rescale(df1) # ensure each df is scaled between 0,1
    df2 = data_rescale(df2)
    regr = get_model(predictors_df1, item, test_size, df1)
    item_pred = predict_item(df2, predictors_df2, regr)
    
    df2['item_pred'] = item_pred

    # perform regression analysis for each candidate proxy
    results = {}
    
    for c in candidates:
        
        # drop rows from item_pred and df2[c]
        candset = df2[[c, 'item_pred']].copy()
        
        candset = candset.replace([np.inf, -np.inf], np.nan).dropna()
        item_pred_drop = candset['item_pred']
        candcol = candset[c]
        
        X = sm.add_constant(candcol)
                            
        model = sm.OLS(item_pred_drop, X).fit()
        results[c] = {
            'R_squared': model.rsquared,
            'p_value': model.pvalues[1],  
            'coef': model.params[1]
        } 
  
    # Select the proxy with the highest R-squared and significant p-value
    # Sort the results by R_squared (descending) and p_value (ascending)
    sorted_results = sorted(results.items(), key=lambda x: (-x[1]['R_squared'], x[1]['p_value']))
    
    best_proxies = []
    
    # add & print the top number_proxies
    for i in range(min(num_proxies, len(sorted_results))):
        proxy, metrics = sorted_results[i]
        best_proxies.append(proxy)
        print(f"Proxy {i+1} for {item}: {proxy} with R_squared: {metrics['R_squared']} and p_value: {metrics['p_value']}")
    
    return best_proxies

### TOY RUN CASE

In [23]:
# use this case to make sure the algorithm works

# specific item we'd like to make a proxy for
item = 'status_threat' 

# specific variables we use to predict the item in first dataframe
predictors_df1 = [
                   'psc1_W1_01',
                   'party_ID',
                   'age501',
                   'education'] #CN AUTH RR SDO PID IDE + EDU AGE GEND

# specific variables we use to predict the item in second dataframe. 
# These should correspond to the itemsin predictors_df1.
predictors_df2 = [
                   'psc1_W1_01',
                   'party_ID',
                   'age501',
                   'education'
                   ] 

# potential proxies
candidates = ['christian_top',
'age501',
'education',
'ideology',
'christian_nationalism',
'white_top',
'status_threat',
'SDO11',
'social_dom11',
'race_resent',
'authoritarianism',
'trumpfav'
              ] 

# .dta file with item measure
datafile_item = r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'

# .dta file we want to find a proxy in
datafile_proxy = r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'


df1 = pd.read_stata(datafile_item)
df2 = pd.read_stata(datafile_proxy, convert_categoricals=False)

# find and print suggested proxy
best_proxy = proxy_finder(df1, df2, item, predictors_df1, predictors_df2, candidates)

IndexError: index 1 is out of bounds for axis 0 with size 1

This is a nice sanity check. If we look for a proxy for status threat in the original status threat dataset, the best fitting proxy is status threat itself. 

### Status threat, GSS data

In [71]:
# specific item we'd like to make a proxy for
item = 'status_threat' 

# specific variables we use to predict the item in first dataframe
predictors_df1 = [
                   'christian_nationalism',
                   'authoritarianism',
                   'social_dom11',
                   'race_resent',
                   'party_ID',
                   'ideology',
                   'age501',
                   'education'] 

# specific variables we use to predict the item in second dataframe. 
# These should correspond to the itemsin predictors_df1.
predictors_df2 = [
                  
                   ] 

# variables we'd like to remove the influence of on predicted item
orthogonal_vars = ['christian_nationalism', 
                   'authoritarianism', 
                   'social_dom11', 
                   'race_resent', 
                   'party_ID', 
                   'ideology']

# potential proxies
candidates = ['spocc10',
              'sppres10', 
              'sppres80',
              'spind10',
              'prestg10',
              'occ10',
              'wrkstat',
              'divorce',
              'paocc10',
              'papres10',
              'maocc10',
              'mapres10',
              'paind10',
              'maind10',
              ] 

# .dta file with item measure
datafile_item = r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'

# .dta file we want to find a proxy in
datafile_proxy = r'C:\Users\kirin\Downloads\GSS2022.dta'


df1 = pd.read_stata(datafile_st)
df2 = pd.read_stata(datafile_proxy, convert_categoricals=False)

# find and print suggested proxy
proxy_finder(df1, df2, item, predictors_df1, predictors_df2, candidates)

ValueError: shapes (4149,0) and (8,) not aligned: 0 (dim 1) != 8 (dim 0)

In [59]:
# Print all column names to verify
print("Column names in the dataset:")
column_names = df2.columns.tolist()
print("\n".join(column_names))

Column names in the dataset:
year
id
wrkstat
hrs1
hrs2
evwork
wrkslf
occ10
prestg10
indus10
marital
martype
divorce
widowed
spwrksta
sphrs1
sphrs2
spevwork
cowrksta
coevwork
cohrs1
cohrs2
spwrkslf
sppres80
spocc10
sppres10
spind10
coocc10
coind10
pawrkslf
paocc10
papres10
paind10
mawrkslf
maocc10
mapres10
maind10
sibs
childs
age
agekdbrn
educ
paeduc
maeduc
speduc
coeduc
codeg
degree
padeg
madeg
spdeg
major1
major2
dipged
sex
race
res16
reg16
mobile16
family16
famdif16
mawrkgrw
incom16
born
parborn
granborn
hompop
babies
preteen
teens
adults
unrelat
earnrs
income
rincome
income16
rincom16
region
xnorcsiz
srcbelt
size
partyid
vote16
pres16
if16who
polviews
natspac
natenvir
natheal
natcity
natcrime
natdrug
nateduc
natrace
natarms
nataid
natfare
natroad
natsoc
natmass
natpark
natchld
natsci
natenrgy
natspacy
natenviy
nathealy
natcityy
natcrimy
natdrugy
nateducy
natracey
natarmsy
nataidy
natfarey
eqwlth
tax
spkath
colath
libath
spkrac
colrac
librac
spkcom
colcom
libcom
spkmslm
colmslm
libms

### ANES 2020 data

In [None]:
# in this example, we'll have to make approximations for predictors that are not explicitly measured in the second (ANES) 
#dataset. 
# .dta file we want to find a proxy in
filepath_proxy = r'C:\Users\kirin\Downloads\anes2020\anes_timeseries_2020_stata_20220210.dta'
df2 = pd.read_stata(filepath_proxy, convert_categoricals=False)

df2['psc1_W1_01'] = df2[['V202311', 'V202312', 'V202304']].mean(axis=1)

df2['christian_nationalism'] = df2['V202169']

df2['authoritarianism'] = df2[['V202163', 'V202302', 'V202158', 'V202170', 'V202159']].mean(axis=1)

df2['social_dom11'] = df2[['column1', 'column2', 'column3']].mean(axis=1)

df2['race_resent'] = df2[['column1', 'column2', 'column3']].mean(axis=1)

df2['race_resent'] = df2[['column1', 'column2', 'column3']].mean(axis=1)

df2['race_resent'] = df2[['column1', 'column2', 'column3']].mean(axis=1)





In [26]:
# specific item we'd like to make a proxy for
item = 'status_threat' 

# specific variables we use to predict the item in first dataframe
predictors_df1 = [
                   'psc1_W1_01',
                   'christian_nationalism',
                   'authoritarianism',
                   'social_dom11',
                   'race_resent',
                   'party_ID',
                   'ideology',
                   'age501',
                   'education'] 


# specific variables we use to predict the item in second dataframe. 
# These should correspond to the itemsin predictors_df1.
predictors_df2 = [
                  'V202312', #psc item
                  'V202169', # rate christians
                  'V201507x' #age
                   ] 

# variables we'd like to remove the influence of on predicted item
#orthogonal_vars = []

# potential proxies
#candidates = [
 #             ] 

# .dta file with item measure
filepath_item = r'C:\Users\kirin\Downloads\W1_W2_W3_Merged_saved.dta'




df1 = pd.read_stata(filepath_item)


# find and print suggested proxy
proxy_finder(df1, df2, item, predictors_df1, predictors_df2, num_proxies=5)

Proxy 1 for status_threat: V202312 with R_squared: 0.9783593534925631 and p_value: 0.0
Proxy 2 for status_threat: V202311 with R_squared: 0.7836887583576582 and p_value: 0.0
Proxy 3 for status_threat: V202326 with R_squared: 0.7622119691187061 and p_value: 0.0
Proxy 4 for status_threat: V202337 with R_squared: 0.7621547750669253 and p_value: 0.0
Proxy 5 for status_threat: V202339 with R_squared: 0.7593616347111407 and p_value: 0.0


['V202312', 'V202311', 'V202326', 'V202337', 'V202339']

In [None]:
# optional check for predictive power of other measures

check_results = {}

for var in orthogonal_vars:
    model_check = sm.OLS(df2[best_proxy], sm.add_constant(df2[var])).fit()
    check_results[var] = {
        'R_squared': model_check.rsquared,
        'p_value': model_check.pvalues[1]  # p-value for the orthogonal variable
    }

print("Predictive power of other measures on the selected proxy:")
print(pd.DataFrame(check_results).transpose())
