In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed NumPy's global RNG once, up front: every later cell draws from np.random,
# so a fixed seed makes "Restart Kernel -> Run All" regenerate identical mock data.
SEED = 42
np.random.seed(SEED)

# list of countries in the study (alphabetical)
countries = [
    "Algeria", "Armenia", "Australia", "Austria",
    "Belgium", "Brazil", "Bulgaria",
    "Canada", "Chile", "China", "Czechia",
    "Denmark", "Ecuador", "Finland", "France",
    "Gambia", "Germany", "Ghana", "Greece",
    "India", "Ireland", "Israel", "Italy",
    "Japan", "Kenya", "Latvia",
    "Mexico", "Morocco",
    "Netherlands", "New Zealand", "Nigeria",
    "North Macedonia", "Norway",
    "Peru", "Philippines", "Poland", "Portugal",
    "Romania", "Russia",
    "Saudi Arabia", "Serbia", "Singapore",
    "Slovakia", "Slovenia", "Spain",
    "Sri Lanka", "Sudan", "Sweden", "Switzerland",
    "Taiwan", "Tanzania", "Thailand", "Turkey",
    "Uganda", "Ukraine", "United Arab Emirates",
    "United Kingdom", "United States", "Uruguay",
    "Venezuela", "Vietnam"
]

# 3-letter ISO country codes corresponding to the countries list
# (same order: entry i here is the code of countries[i])
country_codes = [
    "DZA", "ARM", "AUS", "AUT",
    "BEL", "BRA", "BGR",
    "CAN", "CHL", "CHN", "CZE",
    "DNK", "ECU", "FIN", "FRA",
    "GMB", "DEU", "GHA", "GRC",
    "IND", "IRL", "ISR", "ITA",
    "JPN", "KEN", "LVA",
    "MEX", "MAR",
    "NLD", "NZL", "NGA",
    "MKD", "NOR",
    "PER", "PHL", "POL", "PRT",
    "ROU", "RUS",
    "SAU", "SRB", "SGP",
    "SVK", "SVN", "ESP",
    "LKA", "SDN", "SWE", "CHE",
    "TWN", "TZA", "THA", "TUR",
    "UGA", "UKR", "ARE",
    "GBR", "USA", "URY",
    "VEN", "VNM"
]

# zip() silently stops at the shorter input, so fail loudly if the two
# hand-maintained lists ever drift out of sync.
assert len(countries) == len(country_codes), (
    f"{len(countries)} countries but {len(country_codes)} country codes"
)

# Create a mapping dictionary (country name -> ISO code)
country_to_code = dict(zip(countries, country_codes))

# list of outcome variables in the study
outcomes = ["belief_cc", "policy_support", "share_social_media", "wept"]

# list of intervention conditions in the study ("control" is the no-intervention arm)
interventions = [
    "control",
    "psychological_distance",
    "letter_future_gen",
    "effective_collective_action",
    "future_self_continuity",
    "system_justification",
    "scientific_consensus",
    "binding_moral_foundations",
    "dynamic_social_norms",
    "pluralistic_ignorance",
    "negative_emotions",
    "working_together_normative_appeal"
]

N_COUNTRIES = len(countries)
N_INTERVENTIONS = len(interventions)

In [3]:
# One row per (country, intervention) cell: each country is repeated once per
# intervention, the intervention list is tiled once per country, and every cell
# gets a mock sample size drawn from N(100, 7) truncated to an integer.
n_cells = N_COUNTRIES * N_INTERVENTIONS
sample_sizes = pd.DataFrame(
    {
        "country": np.repeat(countries, N_INTERVENTIONS),
        "intervention": interventions * N_COUNTRIES,
        "sample_size": np.random.normal(100, 7, size = n_cells).astype(int),
    }
)

# Respondent totals: across all cells, and across the control cells only.
TOTAL_SAMPLE_SIZE = sample_sizes["sample_size"].sum()
is_control = sample_sizes["intervention"] == "control"
CONTROL_SAMPLE_SIZE = sample_sizes.loc[is_control, "sample_size"].sum()

In [4]:
# Respondent-level mock data: one row per simulated participant. Declaring the
# columns up front fixes the column order of the exported file; the outcome
# columns stay empty until they are simulated later.
df = pd.DataFrame(
    columns = [
        "age", "gender", "education", "income", "perc_ses", "sp_ideology", "econ_ideology", "country", "country_code", "intervention"
    ] + outcomes
)

# Demographics are drawn independently of country and intervention.
# age: uniform integers in [18, 70) (randint's upper bound is exclusive)
df["age"] = np.random.randint(18, 70, size = TOTAL_SAMPLE_SIZE)
df["gender"] = np.random.choice(
    ["male", "female", "nonbinary or other"],
    size = TOTAL_SAMPLE_SIZE,
    p = [0.48, 0.48, 0.04]
)
df["education"] = np.random.choice(
    ["0 to 6", "7 to 12", "13 to 16", "17 or more"],
    size = TOTAL_SAMPLE_SIZE,
    p = [0.17, 0.46, 0.34, 0.03]
)
df["income"] = np.random.choice(
    ["less than 10K", "10K to 15K", "15K to 25K", "25K to 50K", "50K to 100K", "100K to 150K", "150K to 200K", "more than 200K"],
    size = TOTAL_SAMPLE_SIZE,
    p = [0.08, 0.16, 0.22, 0.24, 0.17, 0.08, 0.04, 0.01]
)
df["perc_ses"] = np.random.choice(
    ["0-10%", "10-20%", "20-30%", "30-40%", "40-50%", "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"],
    size = TOTAL_SAMPLE_SIZE,
    p = [0.04, 0.11, 0.13, 0.21, 0.16, 0.12, 0.09, 0.06, 0.05, 0.03]
)
# Ideology scores: N(50, 15) truncated to integers, then clipped to [0, 100].
df["sp_ideology"] = np.clip(np.random.normal(50, 15, size = TOTAL_SAMPLE_SIZE).astype(int), 0, 100)
df["econ_ideology"] = np.clip(np.random.normal(50, 15, size = TOTAL_SAMPLE_SIZE).astype(int), 0, 100)

# Rows per country. groupby() returns its groups in sorted label order, which only
# coincides with `countries` while that list stays alphabetical; reindexing by
# `countries` aligns each count with its own country by label instead of by luck.
rows_per_country = (
    sample_sizes.groupby("country")["sample_size"].sum().reindex(countries)
)
df["country"] = np.repeat(countries, rows_per_country.to_numpy())
df["country_code"] = df["country"].map(country_to_code)

# Label every respondent with an intervention. Rows are laid out country by
# country (in `countries` order), so each (country, intervention) cell owns the
# next `n` consecutive rows, where `n` is that cell's sample size.
intervention_col = df.columns.get_loc("intervention")
index_counter = 0
for country in countries:
    for intervention in interventions:
        cell_mask = (
            (sample_sizes["country"] == country) &
            (sample_sizes["intervention"] == intervention)
        )
        n = sample_sizes.loc[cell_mask, "sample_size"].iloc[0]
        # Write through ONE indexer. The previous chained form
        # `df.iloc[a:b].loc[:, col] = value` assigns into a temporary object, which
        # is not guaranteed to reach `df` (and never does under Copy-on-Write).
        df.iloc[index_counter : index_counter + n, intervention_col] = intervention
        index_counter += n

# Every row must have been covered exactly once.
assert index_counter == len(df), "cell sample sizes do not add up to the number of rows"
assert df["intervention"].notna().all(), "some respondents were left without an intervention"


In [5]:
# Country-level random offsets: for each country, one draw per outcome from N(0, 0.1).
random_country_effects = {}
for country in countries:
    random_country_effects[country] = np.random.normal(0, 0.1, size = len(outcomes))

# Intervention-level random effects: for each intervention, one draw per outcome
# from N(0.1, 0.1).
random_intervention_effects = {}
for intervention in interventions:
    random_intervention_effects[intervention] = np.random.normal(0.1, 0.1, size = len(outcomes))

# Category-level adjustments, stored as level -> (direction, mean shift, noise sd).
# A level that is not listed leaves the score unchanged and draws no noise.
_GENDER_SHIFTS = {
    "male": (-1, 0.07, 0.03),
    "nonbinary or other": (+1, 0.05, 0.01),
}
_EDUCATION_SHIFTS = {
    "17 or more": (+1, 0.1, 0.03),
    "13 to 16": (+1, 0.05, 0.01),
    "0 to 6": (-1, 0.05, 0.01),
}
_INCOME_SHIFTS = {
    "more than 200K": (-1, 0.03, 0.01),
    "15K to 25K": (-1, 0.03, 0.01),
    "less than 10K": (-1, 0.05, 0.01),
    "10K to 15K": (-1, 0.05, 0.01),
    "50K to 100K": (+1, 0.05, 0.01),
    "100K to 150K": (+1, 0.05, 0.01),
}
_PERC_SES_SHIFTS = {
    "90-100%": (-1, 0.03, 0.01),
    "0-10%": (-1, 0.05, 0.01),
    "10-20%": (-1, 0.05, 0.01),
    "20-30%": (-1, 0.02, 0.01),
    "30-40%": (-1, 0.02, 0.01),
    "70-80%": (+1, 0.05, 0.01),
    "80-90%": (+1, 0.05, 0.01),
}
# Applied in this order so the sequence of random draws is fixed.
_CATEGORICAL_SHIFTS = (
    ("gender", _GENDER_SHIFTS),
    ("education", _EDUCATION_SHIFTS),
    ("income", _INCOME_SHIFTS),
    ("perc_ses", _PERC_SES_SHIFTS),
)


def generate_outcome(row, outcome_name):
    """Simulate one raw (unscaled) outcome score for a single respondent.

    Starts from 1.0, subtracts noisy penalties that grow with age and with both
    ideology scores, adds the respondent's country offset for this outcome,
    applies noisy category shifts for gender, education, income and perceived
    SES, and finally adds the intervention effect (skipped for "control") plus
    N(0, 0.1) residual noise.

    Reads the module-level ``outcomes``, ``random_country_effects`` and
    ``random_intervention_effects``.

    Parameters
    ----------
    row : mapping-like (a DataFrame row when used with ``df.apply(..., axis=1)``)
        Must provide "age", "sp_ideology", "econ_ideology", "country", "gender",
        "education", "income", "perc_ses" and "intervention".
    outcome_name : str
        An entry of ``outcomes``; its position selects which component of the
        country / intervention effect vectors is used.

    Returns
    -------
    float
        The simulated raw score.
    """
    # Normalise the continuous predictors (age from the 18-70 window, ideology from 0-100).
    age_frac = (row["age"] - 18) / (70 - 18)
    social_frac = row["sp_ideology"] / 100
    economic_frac = row["econ_ideology"] / 100

    score = 1.0
    for weight, fraction, noise_sd in (
        (0.1, age_frac, 0.03),
        (0.1, social_frac, 0.03),
        (0.05, economic_frac, 0.015),
    ):
        score -= weight * fraction + np.random.normal(0, noise_sd)

    outcome_idx = outcomes.index(outcome_name)
    score += random_country_effects[row["country"]][outcome_idx]

    for column, shifts in _CATEGORICAL_SHIFTS:
        spec = shifts.get(row[column])
        if spec is None:
            continue
        direction, mean_shift, noise_sd = spec
        delta = mean_shift + np.random.normal(0, noise_sd)
        if direction < 0:
            score -= delta
        else:
            score += delta

    # Control respondents receive no intervention effect, only residual noise.
    if row["intervention"] != "control":
        score = score + random_intervention_effects[row["intervention"]][outcome_idx]
    return score + np.random.normal(0, 0.1)

# Simulate each outcome column row by row: generate_outcome is called once per
# respondent, with the outcome name forwarded as its second argument.
for outcome in outcomes:
    df[outcome] = df.apply(generate_outcome, axis = 1, args = (outcome,))


In [6]:
def minmax_scale(series):
    """Linearly rescale ``series`` so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)``, on the same index. If every value is
        identical (zero range) all entries come back as 0.0 rather than the NaN
        a 0/0 division would produce, so a later ``.astype(int)`` keeps working.
    """
    min_val = series.min()
    value_range = series.max() - min_val
    if value_range == 0:
        # Constant input: there is no spread to scale, so pin everything to the low end.
        return (series - min_val).astype(float)
    return (series - min_val) / value_range

def binarize(series, threshold = 0.5):
    """Convert ``series`` to 0/1 flags: 1 where the value is at or above ``threshold``, else 0."""
    meets_threshold = threshold <= series
    return meets_threshold.astype(int)

# Map the raw simulated scores onto each outcome's final scale.
# belief_cc / policy_support: min-max scaled, stretched to 10-90, truncated to integers.
for score_col in ("belief_cc", "policy_support"):
    df[score_col] = df[score_col].pipe(minmax_scale).mul(80).add(10).astype(int)

# share_social_media: 0/1 flag, 1 when the min-max scaled score is at least 0.5.
df["share_social_media"] = binarize(df["share_social_media"].pipe(minmax_scale))

# wept: min-max scaled, stretched to 0-8, truncated to integers.
df["wept"] = df["wept"].pipe(minmax_scale).mul(8).astype(int)

In [7]:
# Write the respondent-level mock data to disk, leaving the row index out of the file.
df.to_csv(path_or_buf = "mock_survey_data.csv", index = False)

In [9]:
# Country-level mock data: one row per country with its ISO code and two
# standard-normal "risk factor" covariates, written to its own CSV.
df_country = pd.DataFrame({"country": countries}).assign(
    country_code = lambda frame: frame["country"].map(country_to_code),
    risk_factor_1 = np.random.normal(0, 1, size = N_COUNTRIES),
    risk_factor_2 = np.random.normal(0, 1, size = N_COUNTRIES),
)
df_country.to_csv("mock_country_data.csv", index = False)