In [1]:
import numpy as np
from scipy import stats

In [7]:
def simulate_power_optimized(n, k, p_null=0.5, p_alt=0.58, alpha=0.05, n_sim=1000, seed=None):
    """
    Estimate, by vectorised simulation, the power of a two-sided one-sample t-test.

    Every simulated participant makes ``k`` independent binary choices, each
    equal to 1 ("Option A") with probability ``p_alt``. Within each simulated
    study the per-participant proportions of 1s are tested against ``p_null``
    using a t-statistic with ``n - 1`` degrees of freedom.

    Parameters
    ----------
    n : int
        Number of participants per simulated study (must be >= 2).
    k : int
        Number of choice pairs answered by each participant.
    p_null : float
        Proportion assumed under the null hypothesis.
    p_alt : float
        True probability of choosing Option A used to generate the data.
    alpha : float
        Significance threshold applied to the two-tailed p-values.
    n_sim : int
        Number of simulated studies.
    seed : int or None
        When given, draws come from an isolated ``numpy.random.default_rng(seed)``
        so the estimate is reproducible. When ``None`` (default) the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    numpy.float64
        Fraction of simulated studies whose p-value is below ``alpha``.

    Raises
    ------
    ValueError
        If ``n < 2``; the sample standard deviation and the t distribution
        with ``n - 1`` degrees of freedom are undefined in that case.
    """
    if n < 2:
        raise ValueError("n must be at least 2 for a one-sample t-test (df = n - 1)")

    # seed=None keeps the legacy behaviour (global RNG, so np.random.seed()
    # still applies); an explicit seed gets its own independent Generator.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Simulate all choices at once: n_sim simulations x n participants x k pairs
    choices = sampler.binomial(1, p_alt, size=(n_sim, n, k))

    # Proportion of Option A choices per participant for each simulation
    prop_A = choices.mean(axis=2)  # Shape: (n_sim, n)

    # Mean proportion and its standard error for each simulation
    mean_prop_A = prop_A.mean(axis=1)  # Shape: (n_sim,)
    se_prop_A = prop_A.std(axis=1, ddof=1) / np.sqrt(n)

    # A simulation in which every participant has the same proportion has a
    # zero standard error: the statistic becomes +/-inf (p-value 0, counted as
    # a rejection) or nan (never counted as a rejection). Only the warnings
    # are silenced here; the resulting values are left as they are.
    with np.errstate(divide="ignore", invalid="ignore"):
        t_stats = (mean_prop_A - p_null) / se_prop_A

    # Two-tailed p-values; the survival function avoids the precision loss of
    # computing 1 - cdf for large |t|.
    p_values = 2 * stats.t.sf(np.abs(t_stats), df=n - 1)

    # Power is the proportion of p-values below alpha
    power = np.mean(p_values < alpha)
    return power

In [8]:
# Estimated power with 100 participants answering 10 pairs each (other settings at their defaults).
simulate_power_optimized(n=100, k=10)

np.float64(0.998)

# Survey design

In [55]:
from pathlib import Path
import random

import duckdb

In [23]:
# In-memory DuckDB connection, used below to query parquet/CSV files directly.
con = duckdb.connect(database=":memory:")

In [25]:
# Read every column of the archive parquet file into a pandas DataFrame.
archive_query = "SELECT * FROM 'data/nyt_archive_all.parquet' "
df = con.execute(archive_query).fetchdf()

In [30]:
# Article counts for the 20 most frequent news desks (value_counts sorts descending).
top_desks = df["news_desk"].value_counts().head(20)

In [31]:
# Share of all archive rows that belong to one of the top desks.
top_desks.sum() / len(df)

np.float64(0.887894403760962)

In [35]:
# Keep only the rows whose desk is one of the top desks.
in_desks = df.loc[df["news_desk"].isin(top_desks.index)]

In [44]:
# Draw 3 rows per news desk with a fixed seed. include_groups=False (pandas >= 2.2)
# stops apply() from operating on the grouping column, which is deprecated and
# triggers a warning. The same rows are drawn, because sampling depends only on
# each group's length; 'news_desk' stays available as the first index level of
# the result rather than as a column.
sample = in_desks.groupby('news_desk').apply(
    lambda x: x.sample(n=3, random_state=42),
    include_groups=False,
)

  sample = in_desks.groupby('news_desk').apply(lambda x: x.sample(n=3, random_state=42))


In [49]:
# Plain Python list of the sampled headlines.
headline_sample = sample["headline"].to_list()

In [61]:
# Collect every CSV file directly under data/emails/ as Path objects.
emails = [*Path("data/emails/").glob("*.csv")]

In [65]:
# Pick 10 newsletter files reproducibly: glob order is filesystem-dependent, so
# sort first, and use a private RNG seeded with the same value (42) as the
# headline sample instead of the unseeded global `random` state.
email_rng = random.Random(42)
headings = []

for email in email_rng.sample(sorted(emails), 10):
    # Build a markdown block ("# headline", blank line, "## sub-hed") in SQL and
    # keep the value from the first row of the file. The '\n\n' is expanded by
    # Python, so the SQL string literal contains two real newlines.
    heading = con.execute(f"SELECT '# ' || newsletter_headline || '\n\n' || '## ' || newsletter_sub_hed AS heading FROM '{email}' ").fetchdf().iloc[0, 0]
    headings.append(heading)

In [68]:
# Shuffle in place with a seeded private RNG so a fresh Restart & Run All always
# produces the same headline order (the unseeded global shuffle differed per run).
random.Random(42).shuffle(headline_sample)

In [76]:
# First survey screen: the Q1 prompt followed by one sampled headline per line.
headline_prompt = """Q1.\n\nThe following is a list of New York Times headlines published between July and August 2024. Please select the headlines that you would be interested in reading.\n\n"""
headline_screen = headline_prompt + "\n".join(headline_sample)

In [77]:
# One comparison question per newsletter heading, numbered from Q2 onward
# (Q1 is the headline-selection screen).
comparison_texts = [
    f"Q{i+2}.\n\nWhich of these headlines would you be more likely to click on?\n\nOption 1: {heading}\n\nOption 2: [Personalized headline, randomly ordered]"
    for i, heading in enumerate(headings)
]

# Separate consecutive questions with a blank line.
comparison_screen = "\n\n".join(comparison_texts)

In [79]:
# Write both survey screens to disk. The encoding is pinned to UTF-8 because
# headlines may contain non-ASCII characters and the platform default encoding
# is not guaranteed to be able to represent them.
with open("./data/survey.txt", "w", encoding="utf-8") as f:
    f.write(headline_screen + "\n\n" + comparison_screen)