In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import textwrap
import pyreadr # for reading R data files, which some of the data are in.

# Seed handed to NumPy's global RNG before the random sample of responses is drawn.
SEED = 125

In [None]:
def print_df_info(df, name):
    """Print a compact summary of a table: its label, shape and columns.

    Parameters
    ----------
    df : object exposing ``shape`` and ``columns`` (e.g. a pandas DataFrame)
    name : label printed on the first line of the summary

    The summary is followed by one blank line so consecutive summaries
    are visually separated.
    """
    summary_lines = [
        f"{name}",
        f"shape: {df.shape}",
        f"columns: {df.columns}",
        "",  # yields the trailing blank line
    ]
    print("\n".join(summary_lines))

# 1. Load Datasets

In [None]:
# Input folders, all relative to the notebook's working directory.
_DATA_ROOT = 'main_study/data/'
ANALYSIS_DIR = 'main_study/code/analysis/'
ANNOTATION_DIR = f"{_DATA_ROOT}annotation_materials/"
COMPLETIONS_DIR = f"{_DATA_ROOT}completions/"
# Processed outputs live underneath the analysis folder.
PROCESSED_DIR = f"{ANALYSIS_DIR}output/processed_data/"

In [None]:
#### annotations
completed_annotations_final = pd.read_csv(f"{ANNOTATION_DIR}completed_annotations_final.csv")
gpt_legibility = pd.read_csv(f"{ANNOTATION_DIR}gpt-legibility_on-topic_valence_scores.csv")
sample_for_annotation = pd.read_csv(f"{ANNOTATION_DIR}sample_for_annotation.csv")

#### completions
falcon_40b = pd.read_csv(f"{COMPLETIONS_DIR}falcon-40b_responses.csv")
all_responses_combined = pd.read_csv(f"{COMPLETIONS_DIR}all_responses_combined.csv")
all_responses = pd.read_csv(f"{COMPLETIONS_DIR}all_responses.csv")

In [None]:
## processed data 
def import_rdata(file_path):
    """Read an R data file and return the single object stored in it.

    ``pyreadr.read_r`` returns a dict-like mapping of R object name to
    DataFrame. For ``.rds`` files the only entry is stored under the key
    ``None``; ``.RData`` files use the R object names as keys.

    Parameters
    ----------
    file_path : path of the ``.rds`` / ``.RData`` file to read

    Returns
    -------
    The object stored under ``None`` when present (the ``.rds`` case),
    otherwise the only object in the file.

    Raises
    ------
    KeyError
        If the file holds no ``None`` entry and does not contain exactly
        one object, so there is no unambiguous object to return.
    """
    result = pyreadr.read_r(file_path)
    if None in result:
        return result[None]
    if len(result) == 1:
        # Single named object (e.g. an .RData file saved with one data frame).
        return next(iter(result.values()))
    raise KeyError(
        f"{file_path!r} does not contain exactly one object; "
        f"found keys: {list(result.keys())}"
    )

# Load the three processed R data files from PROCESSED_DIR.
prepared_data_df = import_rdata(f"{PROCESSED_DIR}prepared_data.rds")
df_estimates_df = import_rdata(f"{PROCESSED_DIR}df_estimates.rds")
ate_df = import_rdata(f"{PROCESSED_DIR}raw_model_ATEs.rds")

In [None]:
final_data_with_metrics_df = pd.read_csv(f"{ANALYSIS_DIR}final_data_with_metrics.csv")
prompts_df = pd.read_csv(f"{ANALYSIS_DIR}prompts.csv")
# The second and third rows of the file are just extra headers that are not needed,
# so they are skipped while reading.
raw_data_final = pd.read_csv(
    f"{ANALYSIS_DIR}raw_data_final.csv",
    skiprows=[1, 2],
)

# 2. Text Completions

### Completed annotations dataset

In [None]:
# Preview the first rows of `completed_annotations_final`.
completed_annotations_final.head()

In [None]:
# Print the first five response texts, each followed by a separator line.
for res in completed_annotations_final["response"].head(5):
    print(res, "--------------------------------", sep="\n")

In [None]:
# Print the label, shape and columns of `completed_annotations_final`.
print_df_info(completed_annotations_final, "completed_annotations_final")

### GPT legibility dataset

In [None]:
# Preview the first rows of `gpt_legibility`.
gpt_legibility.head()

In [None]:
# Print the label, shape and columns of `gpt_legibility`.
print_df_info(gpt_legibility, "gpt_legibility")

What do the columns `treatment_partisanship` and `issue_stance_valence` refer to?

### What are the `issues` that are involved here?

In [None]:
# Show every distinct issue stance statement together with how many there are.
# The count is taken from the same column that is displayed (it was previously
# taken from 'issue_short' and labelled as "issue areas"), so the list and the
# number always describe the same thing.
issue_stances = gpt_legibility['issue_stance_full']
print(issue_stances.unique(), "\nTotal number of issue stances: ", issue_stances.nunique())

In [None]:
# List the distinct values of the 'issue_short' column.
print(gpt_legibility['issue_short'].unique())

In [None]:
# Show every distinct issue area together with how many there are.
# The label now says "issue areas" because that is the column being counted.
issue_areas = gpt_legibility['issue_area']
print(issue_areas.unique(), "\nTotal number of issue areas: ", issue_areas.nunique())

### What does subsetting the dataframe to a certain issue topic and partisanship look like?

In [None]:
# Pick one issue statement and one partisanship coding, then print the first
# few responses that match both.
political_stance = 'conservative-coded'
issue = 'The U.S. should make it a requirement that people work in order to receive Medicaid'
count = 5

same_issue = gpt_legibility['issue_stance_full'].eq(issue)
same_stance = gpt_legibility['treatment_partisanship'].eq(political_stance)
subset = gpt_legibility.loc[same_issue & same_stance]

for res in subset['response'].head(count):
    print(res, "--------------------------------------------------------------------------------", sep="\n")

Well, this first response is a bit nonsensical, but the rest of the messages seem interesting.

### Sample_for_annotation dataset

In [None]:
# Preview the first rows of `sample_for_annotation`.
sample_for_annotation.head()

In [None]:
# Preview the last rows of `sample_for_annotation`.
sample_for_annotation.tail()

In [None]:
# Print the label, shape and columns of `sample_for_annotation`.
print_df_info(sample_for_annotation, "sample_for_annotation")

In [None]:
# Largest value stored in the 'Unnamed: 0' column.
# I wondered whether this is related to the 720 responses they mentioned, but the
# number is slightly off.
# Update: 720 of these are AI-generated and 10 are human.
print(sample_for_annotation['Unnamed: 0'].max())

### Completions datasets (yes, there are multiple -- one per model and two aggregate ones)

Why does the `all_responses_combined` dataset have more rows? 

In [None]:
# Summarise each completions table in turn.
for frame, label in (
    (all_responses, "all_responses"),
    (all_responses_combined, "all_responses_combined"),
    (falcon_40b, "falcon_40b"),
):
    print_df_info(frame, label)

In [None]:
# List the distinct values of the 'model' column in the combined table.
print(all_responses_combined['model'].unique()) 

In [None]:
# Keep only the rows whose 'model' value is 'human', then display them.
is_human = all_responses_combined['model'].eq('human')
human_responses = all_responses_combined.loc[is_human]
human_responses

In [None]:
# Narrow the human rows down to the issue, stance, partisanship and text columns.
human_responses[["issue_short", "issue_stance_valence", "treatment_partisanship", "response"]]

In [None]:
def show_responses(df, line_length=200, show_model=False):
    for i, res in enumerate(df['response']):
        print(f"Response {i+1}")
        print(f"Issue: {df['issue_stance_full'].values[i]}")
        print(f"Partisanship: {df['treatment_partisanship'].values[i]}")
        print(f"Issue Stance Valence: {df['issue_stance_valence'].values[i]}")
        if show_model:
            print(f"Model: {df['model'].values[i]}")
        print("-" * max(5, int(line_length / 10)))
        res = textwrap.fill(res, width=line_length)
        print(res)
        print("-" * line_length)

# Print every human-written response with the default wrap width and without the model line.
show_responses(human_responses)

In [None]:
# Draw a reproducible sample of non-human responses and print them.
# `random_state=SEED` pins the draw itself, so it no longer depends on whatever
# state NumPy's global RNG happens to be in. The global seed call is kept for
# any other code that relies on NumPy's global RNG.
np.random.seed(SEED)
samples = 10
ai_samples = all_responses_combined[all_responses_combined['model'] != 'human'].sample(samples, random_state=SEED)
show_responses(ai_samples, show_model=True)

# 3. Processed Data

In [None]:
# Print the label, shape and columns of `prepared_data_df`.
print_df_info(prepared_data_df, "prepared_data_df")

In [None]:
# Print the label, shape and columns of `ate_df`.
print_df_info(ate_df, "ate_df")

In [None]:
# Display the table loaded from raw_model_ATEs.rds.
ate_df

# 4. Persuasion and Demographic Attributes

In [None]:
# Print the label, shape and columns of `final_data_with_metrics_df`.
print_df_info(final_data_with_metrics_df, "final_data_with_metrics_df")

In [None]:
# for col in final_data_with_metrics_df.columns: 
#     print(col)

In [None]:
# Print the label, shape and columns of `raw_data_final`.
print_df_info(raw_data_final, "raw_data_final")

In [None]:
# Columns to keep from `raw_data_final`: respondent attributes first, then the
# experiment-related fields.
# NOTE(review): "political knowledge3" contains a space, unlike its two siblings —
# confirm this matches the column name in raw_data_final.csv.
demographic_cols = [
    "age_1",
    "education",
    "gender",
    "party_affiliation",
    "ideo_affiliation",
    "political_knowledge1",
    "political_knowledge2",
    "political knowledge3",
]
exp_cols = [
    "condition_assignment",
    "issue",
    "condition",
    "bin_size",
    "model",
    "variant",
]

# Save the reduced table, then preview it.
temp_df = raw_data_final.loc[:, demographic_cols + exp_cols]
temp_df.to_csv("raw_data_processed.csv")
temp_df.head()

In [None]:
# Column names follow the pattern "<issue>-<k>_1" for k = 1..4, listed issue by
# issue in the order below.
_assignment_issues = [
    'medicaid', 'veterans', 'pensions', 'foreign_aid', 'confinement',
    'suicide', 'border', 'felon_voting', 'affirmative_action',
    'electoral_college',
]
assignments = [
    f"{issue_key}-{k}_1"
    for issue_key in _assignment_issues
    for k in range(1, 5)
]
# Select the columns named in `assignments` from the raw data and preview them.
assignments_df = raw_data_final[assignments]
assignments_df.head()

In [None]:
# Replace missing cells with 0, then add a per-row total.
# The total is computed over the `assignments` columns only, so re-running this
# cell does not fold a previously computed 'sum_effects' column into the new sum.
assignments_df = assignments_df.fillna(0)
assignments_df['sum_effects'] = assignments_df[assignments].sum(axis=1)
assignments_df['sum_effects'].head()

What is the difference between `party_affiliation` and `ideo_affiliation`? And where is the measure of `persuasiveness`?

In [None]:
# 30 is the number of messages generated per model, so presumably each model is
# prompted 30 times using these prompts.
print_df_info(prompts_df, "prompts_df")

In [None]:
# First five distinct prompt texts. It's just a repeat of the 3 template messages
# mentioned in the "message generation" subsection of the paper (pg. 9).
prompts_df['prompt_full_text'].unique()[:5]

In [None]:
# List every column name of the metrics table, one per line.
for column_name in list(final_data_with_metrics_df.columns):
    print(column_name)

In [None]:
# Columns to keep from `final_data_with_metrics_df`: respondent attributes,
# experiment fields, and the message word count.
demographic_cols = [
    "age",
    "education",
    "gender",
    "party_affiliation",
    "ideo_affiliation",
    "political_knowledge",
]
exp_cols = [
    "issue",
    "issue_full",
    "treatment_message_id",
    "treatment_message",
    "condition",
    "model",
    "dv_response_mean",
]
mediator_cols = ["treatment_message_word_count"]

# Save the reduced table, then preview it.
temp_df = final_data_with_metrics_df.loc[:, demographic_cols + exp_cols + mediator_cols]
temp_df.to_csv("final_data_processed.csv")
temp_df.head()

# 5. Saving Useful Datasets

In [None]:
# Display the table loaded from all_responses.csv.
all_responses

In [None]:
# Export only model-generated messages. `all_responses_combined` also holds rows
# whose 'model' is 'human'; since the exported columns carry no model label,
# those rows would be indistinguishable from LLM output in llm_responses.csv.
llm_only = all_responses_combined[all_responses_combined['model'] != 'human']
llm_only[['issue_stance_full', 'response']].to_csv("llm_responses.csv")