In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns 
# `set_theme`'s first positional parameter is `context` (paper/notebook/talk/poster),
# so the grid style has to be passed by keyword.
sns.set_theme(style="whitegrid")

### stats analysis imports 
import statsmodels.formula.api as smf

# Fixed random seed for the notebook.
# NOTE(review): no cell visible in this section reads SEED — confirm it is used downstream.
SEED = 125

In [None]:
def print_df_info(df, name):
    """Print a short summary of a table: its label, shape and columns.

    Args:
        df: Object exposing ``shape`` and ``columns`` (a pandas DataFrame
            wherever this notebook calls it).
        name: Label printed on the first line of the summary.

    Returns:
        None. The summary is written to stdout, followed by one blank line.
    """
    summary_lines = (
        f"{name}",
        f"shape: {df.shape}",
        f"columns: {df.columns}",
        "",  # trailing blank line separates consecutive summaries
    )
    print(*summary_lines, sep="\n")

# 1. Effect of Persuasion Techniques on Persuasiveness

In [None]:
# Load the two input tables; in both files the first CSV column becomes the index.
# persuasive_strategies_df is joined later on its `response` column,
# treatment_df on its `treatment_message` column.
persuasive_strategies_df = pd.read_csv(
    "llm_responses_labeled.csv",
    index_col=0,
)
treatment_df = pd.read_csv(
    "final_data_processed.csv",
    index_col=0,
)

In [None]:
# Quick look at the treatment table: shape/columns, number of distinct
# treatment message ids, and the first rows.
print_df_info(df=treatment_df, name="treatment_df")
print(
    "Number of unique treatment messages: ",
    treatment_df.loc[:, 'treatment_message_id'].nunique(),
)
treatment_df.head()

In [None]:
# Attach the persuasive-strategy labels to every row of treatment_df by matching
# `treatment_message` against persuasive_strategies_df['response'].
#   - how='left' keeps every treatment row; rows without a match get NaN labels.
#   - validate='many_to_one' makes pandas raise MergeError if a `response` value
#     occurs more than once in persuasive_strategies_df. Without it, duplicated
#     responses would silently duplicate treatment rows, breaking the expectation
#     that merged_df has exactly as many rows as treatment_df.
merged_df = treatment_df.merge(
    persuasive_strategies_df,
    left_on='treatment_message',
    right_on='response',
    how='left',
    validate='many_to_one',
)

# Print out the `treatment_message` values that did not find a match.
# A NaN in the `central` label column is used as the marker for "no labels attached".
unmatched_subset = merged_df[merged_df['central'].isna()]
print("Unmatched entries:", unmatched_subset['treatment_message'].unique())

# It appears that all of the unmatched entries are due to missing treatment messages
# (respondents who expressed their position without receiving a treatment, from
# either a human or an AI). Those rows received no persuasive strategies, so their
# strategy columns are set to 0.
roles_with_notreatment = merged_df['treatment_message'].isna()
new_cols = ['attractiveness',
       'authority', 'central', 'forewarning', 'gainframe', 'lossframe',
       'partisan', 'partisanship', 'peripheral', 'refutation', 'religion',
       'repetition']
merged_df.loc[roles_with_notreatment, new_cols] = 0
# Keep a separate 0/1 indicator so "no treatment" stays distinguishable from
# "treated, but none of the strategies present".
merged_df['no_treatment'] = roles_with_notreatment.astype("int64")

print_df_info(merged_df, "merged_df")
merged_df.head()

In [None]:
# Split merged_df into the outcome (y = dv_response_mean) and the remaining
# columns (X), after removing the listed columns from X.
X = merged_df.drop(
    [
        'issue',
        'issue_full',
        'issue_stance_full',
        'treatment_message_id',
        'treatment_message',
        'condition',
        'model',
        'dv_response_mean',
        'response',
    ],
    axis=1,
)
y = merged_df.loc[:, 'dv_response_mean']
print("Number of features: ", len(X.columns))
print("Number of samples: ", len(X))
print("Feature columns: ", X.columns)

In [None]:
# Histogram of the outcome y, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(y)
ax.set_title("Distribution of response variable")
plt.show()

## 1.1 Naive regression on all covariates

In [None]:
# Naive OLS of dv_response_mean on respondent covariates, message length and
# every persuasive-strategy indicator (each entered as a categorical term).
import statsmodels.formula.api as smf

# Strategy indicators are selected positionally: every merged_df column from
# "attractiveness" to the end. The merge key (`response`) and the outcome are
# filtered out so that a different column order can never turn the free-text
# message or the dependent variable into a regressor.
cols = list(merged_df.columns)
start_idx = cols.index("attractiveness")
not_predictors = {"response", "dv_response_mean"}
cat_vars = [c for c in cols[start_idx:] if c not in not_predictors]

# Build the right-hand side from a list of terms instead of a backslash-continued
# string literal, which embedded the source indentation into the formula.
covariate_terms = [
    "age",
    "C(education)",
    "C(gender)",
    "C(party_affiliation)",
    "C(ideo_affiliation)",
    "political_knowledge",
    "treatment_message_word_count",
]
strategy_terms = [f"C({c})" for c in cat_vars]
formula = "dv_response_mean ~ " + " + ".join(covariate_terms + strategy_terms)

model = smf.ols(
    formula,
    data=merged_df
).fit(cov_type="HC3")   # robust SEs
print(model.summary())


## 1.2 Variance Inflation Factor (VIF) to detect collinear features! 

# 2. Aggregate Analysis