In [None]:
import os

# Directory that the figures produced by this notebook are saved into.
plot_dir = "outputs/imgs/three_level/"

# exist_ok=True already makes this a no-op when the directory exists, so a separate
# os.path.exists() pre-check is redundant (and racy if something creates it in between).
os.makedirs(plot_dir, exist_ok=True)

In [None]:
"""
Create a dataframe showing how many prompts and responses each user had per day, grouping nearby prompts so
that we have the expected (roughly) 12 prompts per day.

"""

import numpy as np
import pandas as pd

from analysis_utils import clean


def collapse_meal_info(meal_df: pd.DataFrame, delta: pd.Timedelta) -> pd.DataFrame:
    """
    Collapse runs of nearby entries that are taken to be responses to the same prompt.

    Within each participant (rows grouped by "p_id"), an entry is dropped when it is less than `delta`
    after the entry immediately before it *and* has the same "responded?" value as that entry. The
    comparison is always made against the previous row, whether or not that row was itself kept, so a
    chain of closely spaced entries collapses onto its first entry.

    :param meal_df: a dataframe with "p_id" and "responded?" columns whose index (e.g. a DatetimeIndex)
                    is monotonically increasing within each participant
    :param delta: the maximum time difference between two entries for them to be considered the same prompt
    :returns: the retained rows of `meal_df`, concatenated in sorted "p_id" order; an empty frame with
              the same columns as `meal_df` if there is nothing to group
    :raises AssertionError: if a participant's index is not monotonically increasing

    """
    # Collect the per-participant pieces and concatenate once at the end, rather than growing a
    # DataFrame inside the loop (quadratic, and seeded with an empty frame that pandas warns about).
    kept_groups = []

    for _, group in meal_df.groupby("p_id"):
        assert group.index.is_monotonic_increasing

        # Time since the previous entry; the first entry has no predecessor (NaT), which compares False
        close_to_previous = (pd.Series(group.index).diff() < delta).to_numpy()[1:]

        responded = group["responded?"].to_numpy()
        same_as_previous = responded[1:] == responded[:-1]

        # The first entry of each participant is always kept
        keep = np.ones(len(group), dtype=bool)
        keep[1:] = ~(close_to_previous & same_as_previous)

        kept_groups.append(group[keep])

    if not kept_groups:
        # pd.concat raises on an empty list; hand back an empty frame with the input's schema
        return meal_df.iloc[:0].copy()

    return pd.concat(kept_groups)


# Read the smartwatch entries.
# Take an explicit copy so the column edits below act on our own frame rather than on a column
# selection of whatever `clean` returned (avoids pandas' SettingWithCopyWarning).
meal_info = clean.cleaned_smartwatch(keep_catchups=False, keep_day0=False)[
    ["p_id", "delta", "meal_type"]
].copy()

# Turn meal type into a binary variable for whether it was a response or non-response
meal_info["responded?"] = (
    meal_info["meal_type"].isin({"Meal", "Drink", "Snack", "No food/drink"}).astype(int)
)
meal_info = meal_info.drop(columns=["meal_type"])

# Collapse nearby entries of the same type
# Using a 27 minute window because that gives us roughly the expected number of prompts per day
# Using a larger time window doesn't change anything; using a smaller time window leaves us with some
# days with much more than 12 prompts
meal_info = collapse_meal_info(meal_info, pd.Timedelta(minutes=27))

# Read the survey data, rename the columns we use and shift sex down by one.
# NOTE(review): presumably sex is coded 1/2 in the survey so that this gives 0/1 - confirm.
demographic_df = (
    clean.cleaned_survey()[
        [
            "residents_id",
            "respondent_sex",
            "respondent_ethnicity",
            "age_dob",
            "phyactq1",  # In the last 7 days, how many days did you attend school?
            "smart1_7to9",  # Did your child participate in the smartwatch study?
            "smart1_10to17",  # Did you participate in the smartwatch study?
        ]
    ]
    .rename(
        columns={
            "residents_id": "p_id",
            "age_dob": "age",
            "phyactq1": "schooldays",
            "respondent_sex": "sex",
            "respondent_ethnicity": "ethnicity",
        }
    )
    .assign(sex=lambda df: df["sex"] - 1)
)

# Merge them into one; the left join keeps every smartwatch entry even without a survey match
joined_df = meal_info.reset_index().merge(demographic_df, on="p_id", how="left")

# Only keep participants who wore the smartwatch
keep = (joined_df["smart1_7to9"] == 1) | (joined_df["smart1_10to17"] == 1)
print(f"Keeping {len(joined_df.loc[keep, 'p_id'].unique())} participants")

# Build the filtered frame without in-place edits on a .loc selection.
# Age group is 1 for age > 12, otherwise 0 (a missing age compares False, so it also gives 0).
joined_df = (
    joined_df.loc[keep]
    .drop(columns=["smart1_7to9", "smart1_10to17"])
    .assign(age_group=lambda df: (df["age"] > 12).astype(int))
)

joined_df.head()

In [None]:
""" Find the nearest hour to each entry """
# Keep the un-rounded offsets in "old_delta". Only capture them the first time this cell runs, and
# always round from them, so that re-running the cell cannot overwrite the originals with
# already-rounded values.
if "old_delta" not in joined_df.columns:
    joined_df["old_delta"] = joined_df["delta"]
joined_df["delta"] = joined_df["old_delta"].dt.round("h")
joined_df.head()

In [None]:
""" See if there are duplicate entries that have been rounded to the same hour """

from IPython.display import display

# Count, and show, every participant/hour slot that holds more than one entry after rounding
count = 0
for _, slot_entries in joined_df.groupby(["p_id", "delta"]):
    if len(slot_entries) <= 1:
        continue
    count += 1
    display(slot_entries)
print(count)

In [None]:
"""
Drop non-responses that are within 5 minutes of a response from the same participant.

Only non-responses in a participant/hour slot holding more than one entry are considered. The matching
response may be any response from that participant - it is not required to be in the same hour slot.

"""

FIVE_MIN_SECS = 5 * 60

drop_indices = []

for (p_id, _), group in joined_df.groupby(["p_id", "delta"]):
    # Only slots with multiple entries for the same person at the same time are of interest
    if len(group) <= 1:
        continue

    non_responses = group[group["responded?"] == 0]
    if non_responses.empty:
        continue

    # All of this participant's response times. This does not depend on the row being checked, so it
    # is looked up once per slot rather than once per row.
    # NOTE(review): this includes responses that were rounded into a different hour slot - confirm
    # that is intended.
    response_times = joined_df.loc[
        (joined_df["p_id"] == p_id) & (joined_df["responded?"] == 1), "old_delta"
    ]

    for idx, row in non_responses.iterrows():
        diff = abs((response_times - row["old_delta"]).dt.total_seconds())

        # If there is a response within 5 minutes, mark this non-response for removal
        if (diff < FIVE_MIN_SECS).any():
            drop_indices.append(idx)

# Drop the rows
print(f"Dropping {drop_indices}")
joined_df = joined_df.drop(drop_indices)

In [None]:
# Show, and count, the participant/hour slots that still hold more than one entry
count = 0
for _, slot_entries in joined_df.groupby(["p_id", "delta"]):
    if len(slot_entries) <= 1:
        continue
    display(slot_entries)
    count += 1
print(count)

In [None]:
"""
Plot a matrix of participants' prompt times

"""


def entry_matrix(
    input_data: pd.DataFrame,
    *,
    only_responses: bool,
    expected_participants: "int | None" = 82,
) -> np.ndarray:
    """
    Get a matrix showing how many responses/prompts each participant had in each hour.

    Rows are participants in sorted "p_id" order. Columns are the hourly steps from the smallest to the
    largest "delta" in `input_data`, inclusive.

    :param input_data: dataframe with "p_id", "delta" (timedelta) and "responded?" columns
    :param only_responses: if True, only count rows whose "responded?" value is 1; otherwise count
                           every row
    :param expected_participants: sanity check on the number of distinct "p_id" values; pass None to
                                  skip the check
    :returns: integer array of shape (number of participants, number of hourly steps)
    :raises AssertionError: if `expected_participants` is given and does not match the data

    """
    participant_ids = input_data["p_id"]
    n_participants = len(participant_ids.unique())
    if expected_participants is not None:
        assert (
            n_participants == expected_participants
        ), f"expected {expected_participants} participants, found {n_participants}"

    if input_data.empty:
        # No entries means no time range to build columns from
        return np.zeros((n_participants, 0), dtype=int)

    # Get the times that we expect to have prompts - i.e. every hour
    times = pd.timedelta_range(
        start=input_data["delta"].min(), end=input_data["delta"].max(), freq="h"
    )

    # One row per participant, one column per hourly step
    retval = np.zeros((n_participants, len(times)), dtype=int)

    # Row of each entry: position of its p_id among the sorted unique ids (-1 for a missing p_id)
    row_idx, _ = pd.factorize(participant_ids, sort=True)
    # Column of each entry: position of its time within the hourly grid
    col_idx = times.searchsorted(input_data["delta"].to_numpy())

    selected = row_idx >= 0
    if only_responses:
        selected &= (input_data["responded?"] == 1).to_numpy()

    # np.add.at accumulates repeated (row, column) pairs, unlike plain fancy-index assignment
    np.add.at(retval, (row_idx[selected], col_idx[selected]), 1)

    return retval


# Per-participant, per-hour counts: first counting every entry, then only the entries with a response
prompt_matrix = entry_matrix(joined_df, only_responses=False)
response_matrix = entry_matrix(joined_df, only_responses=True)

In [None]:
# For each matrix, show how often each distinct cell value occurs (value -> number of cells)
for matrix in (prompt_matrix, response_matrix):
    values, counts = np.unique(matrix, return_counts=True)
    display(dict(zip(values, counts)))

In [None]:
"""
Plot them
"""

import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm, ListedColormap

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Discrete colour scale: one bin per integer count 0..max, with bin edges halfway between integers
boundaries = np.arange(prompt_matrix.max() + 2) - 0.5
# There is one colour per bin, i.e. one fewer than the number of edges. With an extra colour
# BoundaryNorm stretches the colour indices over the whole map and skips a colour.
# NOTE: tab10 only has 10 colours, so more than 10 distinct counts cannot be drawn with this map.
n_bins = len(boundaries) - 1
colors = plt.cm.tab10.colors[:n_bins]
cmap = ListedColormap(colors)
norm = BoundaryNorm(boundaries, cmap.N, clip=True)
imshow_kw = {
    "aspect": "auto",
    "cmap": cmap,
    "norm": norm,
    "filternorm": False,
}
axes[0].set_title(f"Prompts\n{dict(zip(*np.unique(prompt_matrix, return_counts=True)))}")
prompt_image = axes[0].imshow(prompt_matrix, **imshow_kw)

axes[1].set_title(f"Responses\n{dict(zip(*np.unique(response_matrix, return_counts=True)))}")
axes[1].imshow(response_matrix, **imshow_kw)

axes[0].set_ylabel("Participant")
for axis in axes:
    axis.set_yticks([])

# Build the colorbar from the image already drawn instead of drawing the prompt matrix a second time.
# Both panels share the same cmap/norm, so this one colorbar describes both.
cbar_ax = fig.colorbar(prompt_image, ax=axes[1])
cbar_ax.set_ticks(boundaries[:-1] + 0.5)

# Set X ticks: the matrix columns are the hourly steps between the first and last rounded time
times = pd.timedelta_range(
    start=joined_df["delta"].min(), end=joined_df["delta"].max(), freq="h"
)

# Label the 12:00 column of each day; this is the same for both panels, so compute it once
(tick_indices,) = np.where(times.components.hours.isin({12}))
labels = [
    f"Day {time.components.days}\n12:00"
    for time in times[tick_indices]
]
for axis in axes:
    axis.set_xticks(tick_indices, labels=labels)


fig.savefig(f"{plot_dir}/prompt_matrix.png")

In [None]:
"""
Generate a CSV file with the data

"""

# Split each rounded offset into its day and hour-of-day parts (components computed once)
delta_components = joined_df["delta"].dt.components
joined_df["hour"] = delta_components.hours
joined_df["day"] = delta_components.days

csv_cols = [
    "p_id",
    "day",
    "hour",
    "responded?",
    "sex",
    "ethnicity",
    "schooldays",
    "age_group",
]
# rename() returns a new frame, so summary_df is independent of joined_df (no in-place edit of a
# column selection)
summary_df = joined_df[csv_cols].rename(columns={"responded?": "response"})

# Make sure the output directory exists; to_csv does not create missing parent directories
csv_path = "outputs/data/three_level_data.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
summary_df.to_csv(csv_path, index=False)
summary_df

In [None]:
"""
Find some summary stats for the data

"""
from scipy.special import logit
from sklearn.utils import resample

def logit_mean_response(df, group_col):
    """
    Logit of the mean of the "response" column within each group.

    :param df: dataframe with a "response" column and a `group_col` column
    :param group_col: name of the column to group by
    :returns: the logit-transformed group means, indexed by the values of `group_col`

    """
    mean_response = df["response"].groupby(df[group_col]).mean()
    return logit(mean_response)

# Function to bootstrap logit of mean response rate
def bootstrap_logit_mean_response(df, group_col, n_bootstraps=1000, random_state=None):
    """
    Percentile bootstrap interval for the logit of the mean response in each group.

    The rows of `df` are resampled with `sklearn.utils.resample` `n_bootstraps` times and
    `logit_mean_response` is evaluated on each resample.

    NOTE(review): rows are resampled individually; if rows from the same participant are correlated
    the resulting intervals may be too narrow - confirm this is acceptable.

    :param df: dataframe with a "response" column and a `group_col` column
    :param group_col: name of the column to group by
    :param n_bootstraps: number of bootstrap resamples
    :param random_state: None (use NumPy's global random state, as before), an int seed, or a
                         `np.random.RandomState`; pass a seed for reproducible intervals
    :returns: dataframe indexed by the quantiles 0.025 and 0.975, with one column per group

    """
    # Use one generator for all iterations: handing the same integer seed to every resample() call
    # would produce the same resample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    bootstrapped_means = [
        logit_mean_response(resample(df, random_state=rng), group_col)
        for _ in range(n_bootstraps)
    ]
    return pd.DataFrame(bootstrapped_means).quantile([0.025, 0.975])

# Point estimates: logit of the mean response rate per day and per hour of day
logit_day = logit_mean_response(summary_df, 'day')
logit_hour = logit_mean_response(summary_df, 'hour')

# Seed NumPy's global random state so that the bootstrap intervals are the same on every run;
# sklearn's resample() draws from it when no random_state is passed.
BOOTSTRAP_SEED = 0
np.random.seed(BOOTSTRAP_SEED)

# Bootstrap to estimate uncertainty
ci_day = bootstrap_logit_mean_response(summary_df, 'day')
ci_hour = bootstrap_logit_mean_response(summary_df, 'hour')

display(logit_day)
display(logit_hour)

In [None]:
# Plot for day: point estimates with asymmetric error bars reaching to the bootstrap quantiles
fig, ax = plt.subplots()

day_err_below = logit_day - ci_day.loc[0.025]
day_err_above = ci_day.loc[0.975] - logit_day
ax.errorbar(logit_day.index, logit_day, yerr=[day_err_below, day_err_above], fmt="o")

ax.set(
    xlabel="Day",
    ylabel="Logit of Mean Response Rate",
    title="Logit of Mean Response Rate by Day",
)
fig.tight_layout()

fig.savefig(f"{plot_dir}/logit_day.png")

In [None]:
# Plot for hour: point estimates with asymmetric error bars reaching to the bootstrap quantiles
fig, ax = plt.subplots()

hour_err_below = logit_hour - ci_hour.loc[0.025]
hour_err_above = ci_hour.loc[0.975] - logit_hour
ax.errorbar(logit_hour.index, logit_hour, yerr=[hour_err_below, hour_err_above], fmt="o")

ax.set(
    xlabel="Hour",
    ylabel="Logit of Mean Response Rate",
    title="Logit of Mean Response Rate by Hour",
)
fig.tight_layout()

fig.savefig(f"{plot_dir}/logit_hour.png")

In [None]:
import seaborn as sns

# Mean response for every (day, hour) combination, as a long table with one row per combination
response_rate = summary_df.groupby(["day", "hour"], as_index=False)["response"].mean()

# Reshape to a day x hour grid for the heatmap
pivot_table = response_rate.pivot(index="day", columns="hour", values="response")

fig, ax = plt.subplots(figsize=(12, 6))

heatmap_kw = {"cmap": "inferno", "cbar_kws": {"label": "Response rate"}}
sns.heatmap(pivot_table, ax=ax, **heatmap_kw)

fig.tight_layout()

fig.savefig(f"{plot_dir}/heatmap.png")

In [None]:
# Coefficient estimate and its error for sex.
# NOTE(review): presumably copied by hand from a model fit outside this notebook, on the log-odds
# scale with sex_err a standard error - confirm the source.
sex_estimate = 0.538021
sex_err = 0.266238

# Exponentiate the estimate and the ends of the estimate +/- 1.96 * error interval
ci_lower = np.exp(sex_estimate - 1.96 * sex_err)
ci_upper = np.exp(sex_estimate + 1.96 * sex_err)

display(f"Sex odds ratio: {np.exp(sex_estimate)}")
display(f"95%CI: [{ci_lower}, {ci_upper}]")