In [None]:
"""
Create a dataframe showing how many prompts and responses each user had per day, grouping nearby prompts so
that we have the expected (roughly) 12 prompts per day.

"""

import numpy as np
import pandas as pd

from analysis_utils import clean


def collapse_meal_info(meal_df: pd.DataFrame, delta: pd.Timedelta) -> pd.DataFrame:
    """
    Collapse meal entries that are close enough to each other to be responses to the same prompt.

    Must have a "responded?" column that is 1 if the prompt was responded to, and 0 otherwise.
    Within each participant ("p_id"), an entry is dropped when it is less than delta after the entry
    immediately before it and has the same "responded?" value as that entry. The comparison is always
    with the immediately preceding entry (not with the last entry that was kept), so a chain of closely
    spaced entries of the same type collapses down to its first entry.

    :param meal_df: a dataframe with a DatetimeIndex that includes "p_id" and "responded?" columns.
                    The index must be increasing within each participant.
    :param delta: the maximum time difference between two entries for them to be considered the same prompt

    :returns: the rows of meal_df that were kept, one participant after another in groupby order.
              If there are no groups, an empty dataframe with the same columns as meal_df.
    :raises AssertionError: if a participant's index is not monotonically increasing

    """
    # Collect the surviving rows per participant and concatenate once at the end; growing a dataframe
    # with pd.concat inside the loop re-copies everything accumulated so far on every iteration
    kept_groups = []

    for _, group in meal_df.groupby("p_id"):
        assert group.index.is_monotonic_increasing, "entries must be in time order within each p_id"

        times = group.index
        responded = group["responded?"].to_numpy()

        # Compare every entry (from the second onwards) with the one directly before it
        too_close = np.asarray((times[1:] - times[:-1]) < delta)
        same_response = responded[1:] == responded[:-1]

        # The first entry of each participant is always kept
        keep = np.ones(len(group), dtype=bool)
        keep[1:] = ~(too_close & same_response)

        kept_groups.append(group[keep])

    if not kept_groups:
        # pd.concat raises on an empty list; hand back an empty frame that still has the input's columns
        return meal_df.iloc[:0].copy()

    return pd.concat(kept_groups)


# Read the smartwatch entries
# Take an explicit copy so that the column edits below act on our own frame rather than on a selection
# of the cleaned data (which is what triggers pandas' SettingWithCopyWarning)
meal_info = clean.cleaned_smartwatch(keep_catchups=False, keep_day0=False)[
    ["p_id", "delta", "meal_type"]
].copy()

# Turn meal type into a binary variable for whether it was a response or non-response:
# 1 if the meal type is one of the listed response types, 0 for anything else
meal_info["responded?"] = (
    meal_info["meal_type"].isin({"Meal", "Drink", "Snack", "No food/drink"}).astype(int)
)
meal_info = meal_info.drop(columns=["meal_type"])

# Collapse nearby entries of the same type
# Using a 27 minute window because that gives us roughly the expected number of prompts per day
# Using a larger time window doesn't change anything; using a smaller time window leaves us with some
# days with much more than 12 prompts
meal_info = collapse_meal_info(meal_info, pd.Timedelta(minutes=27))

# Read the survey data, keeping only the columns we need and giving them shorter names
# rename() returns a new frame, so the edit to "sex" below is applied to our own data rather than to a
# selection of the cleaned survey (which is what triggers pandas' SettingWithCopyWarning)
demographic_df = clean.cleaned_survey()[
    [
        "residents_id",
        "respondent_sex",
        "respondent_ethnicity",
        "age_dob",
        "phyactq1",  # In the last 7 days, how many days did you attend school?
        "smart1_7to9",  # Did your child participate in the smartwatch study?
        "smart1_10to17",  # Did you participate in the smartwatch study?
    ]
].rename(
    columns={
        "residents_id": "p_id",
        "age_dob": "age",
        "phyactq1": "schooldays",
        "respondent_sex": "sex",
        "respondent_ethnicity": "ethnicity",
    }
)

# Convert sex into 0 or 1 by shifting the survey coding down by one
# NOTE(review): presumably the survey codes sex as 1/2 - confirm against the survey codebook
demographic_df["sex"] = demographic_df["sex"] - 1

# Merge them into one
# A left join keeps every smartwatch entry, even if there is no matching survey row for that p_id
joined_df = meal_info.reset_index().merge(demographic_df, on="p_id", how="left")

# Only keep participants who wore the smartwatch, i.e. either of the participation questions was
# answered with 1
keep = (joined_df["smart1_7to9"] == 1) | (joined_df["smart1_10to17"] == 1)
print(f"Keeping {len(joined_df.loc[keep, 'p_id'].unique())} participants")

# drop() returns a new frame, so the column added below is not written to a filtered selection
# (which is what triggers pandas' SettingWithCopyWarning)
joined_df = joined_df.loc[keep].drop(columns=["smart1_7to9", "smart1_10to17"])

# Turn the age into age groups: 1 if the age is greater than 12, 0 otherwise
joined_df["age_group"] = (joined_df["age"] > 12).astype(int)

joined_df.head()

In [None]:
""" Find the nearest hour to each entry """
# Round each entry to the nearest hour
# "h" is the hourly frequency alias; the upper-case "H" spelling is deprecated from pandas 2.2 onwards
joined_df["delta"] = joined_df["delta"].dt.round("h")
joined_df.head()

# TODO: If there's both a response and a non-response in the same hour, keep the response

In [None]:
""" See if there are duplicates """
from IPython.display import display

# Show every (p_id, delta) combination that has more than one row
for (p_id, time), group in joined_df.groupby(["p_id", "delta"]):
    if len(group) <= 1:
        continue
    display(group)

In [None]:
"""
Plot a matrix of participants' prompt times

"""
# Get the times that we expect to have prompts - i.e. every hour
# Check they're what I expect - start at day 1 midnight, every hour from then
# Make a matrix with the right number of columns (7 x 24), and rows (82)
# Iterate over p_ids, checking if they have a prompt in each hour
# Check if we're only accepting responses or all prompts
# +1 if there's an entry
# Matshow it on an axis