In [16]:
import pandas as pd
import pyreadstat
# Survey wave identifier; it names the .sav file read here and is interpolated
# into the paths built by the later cells.
data_name = "ATP_W82"

# Raw load: df_raw keeps the file's original variable names as column names.
raw_sav_path = f"{data_name}.sav"
df_raw, meta = pyreadstat.read_sav(raw_sav_path)

## Map Variable Names

In [17]:
# Second, independent read of the same file: this copy is renamed and
# relabelled below, while df_raw is not modified by this cell.
labelled_sav_path = f"{data_name}.sav"
df, meta = pyreadstat.read_sav(labelled_sav_path)

def replace_with_labels(series, value_labels):
    """Translate coded values in ``series`` into their labels.

    Args:
        series: pandas Series holding the coded values.
        value_labels: mapping from coded value to label.

    Returns:
        A Series in which every value found in ``value_labels`` is replaced
        by its label; values without a label (and missing values) are kept
        as they were in ``series``.
    """
    labelled = series.map(value_labels)
    # map() leaves NaN wherever no label exists; restore the original there.
    return labelled.fillna(series)

# Apply value labels while the columns still carry their original variable
# names. Resolving variables through a label -> name reverse mapping is
# unreliable: when two variables share a label the mapping keeps only one of
# them, and df[label] then selects several columns at once.
for original_column_name, value_labels in meta.variable_value_labels.items():
    if original_column_name in df.columns:
        df[original_column_name] = replace_with_labels(df[original_column_name], value_labels)

# Map original column names to their descriptive labels. A variable with an
# empty / missing label keeps its original name rather than being renamed to
# None or "".
column_label_dict = {
    name: (label if label else name)
    for name, label in zip(meta.column_names, meta.column_labels)
}

# Reverse mapping from labels back to original names. Retained because other
# cells may reference it; it is ambiguous when labels repeat, so positional
# lookups against df_raw.columns are the safer way to pair the two frames.
reverse_column_dict = {label: name for name, label in column_label_dict.items()}

# Rename columns using the labels. rename() does not reorder columns, so
# position i in df still corresponds to position i in df_raw.
df = df.rename(columns=column_label_dict)

In [18]:
import json
import pandas as pd
import os

# Load the context file (maps each topic to a dict that carries 'question_id')
with open(f'/user/al4263/Simulate/Simulations/Pew_Research/{data_name}/context.json', 'r') as f:
    context_data = json.load(f)

# For every question: the row indices of respondents with a non-missing answer
non_nan_indices = {}

for topic_data in context_data.values():
    question_id = topic_data['question_id']

    # Position of the question in the raw frame (raises KeyError if absent)
    column_index = df_raw.columns.get_loc(question_id)

    # df shares df_raw's column order, so select by position. Going back
    # through the label (df[label]) would return a whole DataFrame whenever
    # two variables share the same label.
    extracted_column = df.iloc[:, column_index]

    # Keep the index labels of the rows that hold an answer
    non_nan_indices[question_id] = extracted_column.index[extracted_column.notna()].tolist()

# Save the non-NaN indices to a JSON file
output_dir = f"/user/al4263/Simulate/Simulations/Pew_Research/{data_name}"
output_file = os.path.join(output_dir, "non_nan_indices.json")

with open(output_file, 'w') as f:
    json.dump(non_nan_indices, f, indent=2)

print(f"Non-NaN indices saved to {output_file}")

# Print a sample of the results
for question_id, indices in list(non_nan_indices.items())[:3]:  # Show first 3 items
    print(f"\nQuestion ID: {question_id}")
    print(f"Number of non-NaN responses: {len(indices)}")
    print(f"Sample indices: {indices[:5]}...")  # Show first 5 indices

Non-NaN indices saved to /user/al4263/Simulate/Simulations/Pew_Research/ATP_W82/non_nan_indices.json

Question ID: GAP21Q3_W82
Number of non-NaN responses: 2596
Sample indices: [0, 1, 2, 3, 4]...

Question ID: GAP21Q5_b_W82
Number of non-NaN responses: 2596
Sample indices: [0, 1, 2, 3, 4]...

Question ID: GAP21Q5_a_W82
Number of non-NaN responses: 2596
Sample indices: [0, 1, 2, 3, 4]...


## Save Human Data

In [10]:
import json
import os
from datetime import datetime

def json_serial(obj):
    """Fallback encoder for ``json.dump(..., default=json_serial)``.

    Args:
        obj: a value the json module could not encode by itself.

    Returns:
        The ISO-8601 string of ``obj`` when it is a ``datetime`` or a
        ``pd.Timestamp``.

    Raises:
        TypeError: for every other type.
    """
    if not isinstance(obj, (datetime, pd.Timestamp)):
        raise TypeError(f"Type {type(obj)} not serializable")
    return obj.isoformat()

# Create the directory if it doesn't exist
output_dir = f"/user/al4263/Simulate/Simulations/Pew_Research/{data_name}/human_results"
os.makedirs(output_dir, exist_ok=True)

# Column from which the export starts, per survey wave. The start column was
# previously hard-coded to "SATIS_W116", which does not match data_name when
# another wave is loaded.
# NOTE(review): "GAP21Q1_W82" is taken from this cell's original comments —
# confirm it is the intended start column for ATP_W82.
start_column_by_wave = {
    "ATP_W82": "GAP21Q1_W82",
    "ATP_W116": "SATIS_W116",
}
if data_name not in start_column_by_wave:
    raise KeyError(f"No start column configured for wave {data_name!r}")
start_column = start_column_by_wave[data_name]
if start_column not in df_raw.columns:
    raise KeyError(f"Start column {start_column!r} not found in {data_name}.sav")
start_index = df_raw.columns.get_loc(start_column)

# Initialize a dictionary to store all results
all_results = {}

# Walk the columns from the start column to the end. df and df_raw share the
# same column order, so both are addressed by position; selecting df by label
# would return several columns whenever two variables share a label.
for position in range(start_index, len(df_raw.columns)):
    column_raw = df_raw.columns[position]
    df_column_name = df.columns[position]

    # Count how often each answer occurs (missing values are not counted)
    counts = df.iloc[:, position].value_counts().to_dict()

    # JSON object keys must be strings
    counts = {str(k): v for k, v in counts.items()}

    # Store the results under the raw variable name
    all_results[column_raw] = {
        "id": column_raw,
        "question": df_column_name,
        "counts": counts
    }

# Save the results to a JSON file
output_file = os.path.join(output_dir, "human_results.json")
with open(output_file, 'w') as f:
    json.dump(all_results, f, indent=2, default=json_serial)

print(f"Results saved to {output_file}")

Results saved to /user/al4263/Simulate/Simulations/Pew_Research/ATP_W116/human_results/human_results.json


In [21]:
def json_serial(obj):
    """Convert date-like objects to ISO-8601 text for ``json.dump``.

    This redefines (and therefore shadows) the identically named helper from
    the "Save Human Data" cell; the contract is the same.

    Args:
        obj: a value the json module could not encode by itself.

    Returns:
        ``obj.isoformat()`` for ``datetime`` / ``pd.Timestamp`` values.

    Raises:
        TypeError: when ``obj`` is of any other type.
    """
    is_date_like = isinstance(obj, (datetime, pd.Timestamp))
    if is_date_like:
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

# Create the directory if it doesn't exist
output_dir = f"/user/al4263/Simulate/Simulations/Pew_Research/{data_name}/human_results"
os.makedirs(output_dir, exist_ok=True)

# Load the context file
with open(f'/user/al4263/Simulate/Simulations/Pew_Research/{data_name}/context.json', 'r') as f:
    context_data = json.load(f)

# Extract question IDs from context (list kept for any later use; the set
# gives constant-time membership tests in the loop below)
context_question_ids = [topic_data['question_id'] for topic_data in context_data.values()]
context_question_id_set = set(context_question_ids)

# Initialize a dictionary to store selected results
selected_results = {}

# Iterate through columns in df_raw, keeping only the questions named in the context
for position, column_raw in enumerate(df_raw.columns):
    if column_raw not in context_question_id_set:
        continue

    # df shares df_raw's column order, so pair the two frames by position.
    # Selecting df by label would return several columns at once whenever two
    # variables share a label.
    df_column_name = df.columns[position]

    # Answer frequencies (missing values are not counted), most frequent first
    counts_sorted = df.iloc[:, position].value_counts().sort_values(ascending=False)

    # Percentages are derived from the already-sorted counts so that both
    # dictionaries list the answers in the same order, even when rounding
    # produces ties.
    total_responses = counts_sorted.sum()
    percentages_sorted = (counts_sorted / total_responses * 100).round(2)

    # Store the results under the raw variable name
    selected_results[column_raw] = {
        "id": column_raw,
        "question": df_column_name,
        "results": {
            "count": counts_sorted.to_dict(),
            "percentage": percentages_sorted.to_dict()
        }
    }

# Save the results to a JSON file
output_file = os.path.join(output_dir, "selected_human_results.json")
with open(output_file, 'w') as f:
    json.dump(selected_results, f, indent=2, default=json_serial)

print(f"Selected results saved to {output_file}")
print(f"Number of selected questions: {len(selected_results)}")

Selected results saved to /user/al4263/Simulate/Simulations/Pew_Research/ATP_W82/human_results/selected_human_results.json
Number of selected questions: 21


## Create Persona

In [14]:
import json
import os

# Personas are written to <base_dir>/persona_meta, one JSON file per respondent.
# NOTE(review): unlike the paths in the other cells, this root has no
# "Simulations" segment — confirm that this is intended.
base_dir = f"/user/al4263/Simulate/Pew_Research/{data_name}"
persona_dir = os.path.join(base_dir, "persona_meta")
os.makedirs(persona_dir, exist_ok=True)

# The persona attributes are the contiguous run of columns between these two
# labels (both ends included).
start_col = df.columns.get_loc("Metropolitan area indicator")
end_col = df.columns.get_loc("Income tier 3-way")
selected_columns = df.columns[start_col:end_col+1]

# Number of personas written so far; also used to number the output files
persona_count = 0

for index, row in df.iterrows():
    # Every value is stringified so that it is always JSON-serializable
    persona = {col: str(row[col]) for col in selected_columns}

    # One file per respondent, numbered in row order starting at 0
    filename = os.path.join(persona_dir, f"persona_{persona_count}.json")
    with open(filename, 'w') as f:
        json.dump({"persona": persona}, f, indent=2)

    persona_count += 1

    # Report progress after every 100th persona
    if not persona_count % 100:
        print(f"Created {persona_count} personas...")

print(f"Created a total of {persona_count} personas in {persona_dir}")

Created 100 personas...
Created 200 personas...
Created 300 personas...
Created 400 personas...
Created 500 personas...
Created 600 personas...
Created 700 personas...
Created 800 personas...
Created 900 personas...
Created 1000 personas...
Created 1100 personas...
Created 1200 personas...
Created 1300 personas...
Created 1400 personas...
Created 1500 personas...
Created 1600 personas...
Created 1700 personas...
Created 1800 personas...
Created 1900 personas...
Created 2000 personas...
Created 2100 personas...
Created 2200 personas...
Created 2300 personas...
Created 2400 personas...
Created 2500 personas...
Created 2600 personas...
Created 2700 personas...
Created 2800 personas...
Created 2900 personas...
Created 3000 personas...
Created 3100 personas...
Created 3200 personas...
Created 3300 personas...
Created 3400 personas...
Created 3500 personas...
Created 3600 personas...
Created 3700 personas...
Created 3800 personas...
Created 3900 personas...
Created 4000 personas...
Created 4