Meal Stats
====
Statistics on the number of entries of each type each participant made per day

In [None]:
import os

%load_ext autoreload
%autoreload 2

# Directory that every figure produced by this notebook is written to
img_dir = "outputs/imgs/meal_stats/"

# exist_ok=True creates the directory (and parents) only when needed, without
# the race between a separate existence check and the creation call
os.makedirs(img_dir, exist_ok=True)

In [None]:
"""
Read and clean the meal entry data

"""

from analysis_utils import clean

# Load the smartwatch entries through the project's cleaning helper.
# NOTE(review): presumably keep_catchups=True retains the catch-up entries and
# keep_day0=False drops entries made on study day 0 - confirm against
# analysis_utils.clean.cleaned_smartwatch
meal_info = clean.cleaned_smartwatch(keep_catchups=True, keep_day0=False)
# Preview the first five rows
meal_info.head(5)

In [None]:
# True for every entry that is anything other than "No response"
responded = meal_info["meal_type"] != "No response"

# Get the average response rate over all entries.
# Printed explicitly: a bare expression in the middle of a cell is never rendered.
overall_response_rate = responded.mean()
print(f"Overall response rate: {overall_response_rate:.3f}")

# Get the response rate for each p_id
mean_response_rates = responded.groupby(meal_info["p_id"]).mean()
mean_response_rates.plot(kind="hist", bins=25, title="Response Rate Distribution")
mean_response_rates.describe()

In [None]:
""" Pick out the rows and columns we want to keep """

import pandas as pd

cols = ["meal_type", "p_id", "delta", "catchup_flag", "any_in_ramadan"]

# Keep only the columns we need and drop the catch-up start/end marker rows.
# The explicit .copy() gives an independent frame, so the in-place edit below
# cannot act on (or warn about) a view of the originally loaded data.
is_catchup_marker = meal_info["meal_type"].isin({"Catch-up start", "Catch-up end"})
meal_info = meal_info.loc[~is_catchup_marker, cols].copy()

# Take 1 day off the catchups, since they apply to the day before the entry
# NOTE: this cell overwrites meal_info, so re-running it on its own shifts the
# catch-ups by another day - re-run from the loading cell instead
meal_info.loc[meal_info["catchup_flag"], "delta"] -= pd.Timedelta(days=1)

# Remove the ones that are now in day 0
meal_info = meal_info[meal_info["delta"].dt.days != 0].copy()

# Not rendered by Jupyter (only the last expression of a cell is displayed)
meal_info[meal_info["catchup_flag"]].head()
# Number of entries on each study day
meal_info["delta"].dt.days.value_counts()

In [None]:
# Check we have the expected values; on failure the message is the set that was found
assert set(meal_info["meal_type"]) == {
    "Meal",
    "Drink",
    "Snack",
    "No food/drink",
    "No response",
    "No catch-up",
}, set(meal_info["meal_type"])

# Add the study day to the dataframe and drop the "delta" column it was derived
# from. assign() builds a new frame instead of writing a column into a frame
# that was produced by slicing, which avoids chained-assignment warnings.
meal_info = meal_info.assign(day=meal_info["delta"].dt.days).drop(columns=["delta"])
meal_info.head()

In [None]:
"""
Find how many entries there were per day overall

"""


def group(df: pd.DataFrame) -> pd.DataFrame:
    """
    From a dataframe containing the columns "p_id" and "day", return a dataframe
    containing the number of entries per day per participant.

    Every participant that appears in `df` gets exactly one row for each of the
    study days 1-7; days on which they have no rows in `df` get a count of 0.
    Participants with no rows at all in `df` do not appear in the result.

    :param df: dataframe with (at least) the columns "p_id" and "day"

    :returns: dataframe with the columns "p_id", "day" and "count", sorted by
              participant then day, with a fresh 0..n-1 index
    :raises AssertionError: if `df` contains a day outside 1-7

    """
    study_days = range(1, 8)

    # Number of rows for each (participant, day) pair that actually occurs
    counts = df.groupby(["p_id", "day"]).size()

    # Error if there are any days outside the study period
    unexpected_days = sorted(set(counts.index.get_level_values("day")) - set(study_days))
    assert not unexpected_days, f"Unexpected study days (expected 1-7): {unexpected_days}"

    # If there are no entries for a day, add a row with 0 entries. Reindexing
    # against the full participant x day grid does this in one step, instead of
    # concatenating one row at a time (quadratic, and leaves duplicate index labels)
    full_grid = pd.MultiIndex.from_product(
        [counts.index.get_level_values("p_id").unique(), study_days],
        names=["p_id", "day"],
    )

    return counts.reindex(full_grid, fill_value=0).reset_index(name="count")


# Preview the per-participant, per-day entry counts for all entry types together
group(meal_info).head()

In [None]:
from typing import Union

import matplotlib.pyplot as plt


def boxplot(
    df: pd.DataFrame, axis: Union[plt.Axes, None] = None
) -> Union[None, tuple[plt.Figure, plt.Axes]]:
    """
    Make a boxplot of the "count" column, with one box per "day"

    :param df: dataframe with the columns "p_id", "day" and "count"
    :param axis: optional axis to plot on. If not provided, creates a new figure

    :returns: (figure, axis) if a new figure was created, otherwise None

    """
    if axis is None:
        new_fig_created = True
        fig, axis = plt.subplots()
    else:
        new_fig_created = False

    df.boxplot(column="count", by="day", ax=axis)

    # Clear the axes title and label the axes
    axis.set_title("")
    axis.set_xlabel("Day")
    axis.set_ylabel("Number of entries")

    # Start the y axis at 0, keeping the automatically chosen upper limit
    axis.set_ylim(0, axis.get_ylim()[1])

    if new_fig_created:
        return fig, axis


# boxplot() returns a (figure, axis) pair when it creates its own figure, so
# unpack it rather than binding the whole tuple to a name called `fig`
fig, axis = boxplot(group(meal_info))

In [None]:
"""
Plot this as boxplots

"""

meal_types = meal_info["meal_type"].unique()

# One panel per entry type. The panel count is derived from the data: with a
# hard-coded number of panels, zip() below would silently drop any entry type
# beyond that number. squeeze=False keeps `axes` an array even for one panel.
fig, axes = plt.subplots(
    1, len(meal_types), figsize=(4 * len(meal_types), 4), sharey=True, squeeze=False
)
axes = axes.ravel()

boxplots_dir = f"{img_dir}/boxplots"
os.makedirs(boxplots_dir, exist_ok=True)

for axis, meal_type in zip(axes, meal_types):
    # Per-participant, per-day counts of this entry type only
    plot_data = group(meal_info[meal_info["meal_type"] == meal_type])
    boxplot(plot_data, axis=axis)

    axis.set_title(meal_type)

    # Save this axis as a separate file
    fig2, individual_axis = plt.subplots()
    boxplot(plot_data, axis=individual_axis)

    # Pad the y range by one entry on each side
    individual_axis.set_ylim(
        individual_axis.get_ylim()[0] - 1, individual_axis.get_ylim()[1] + 1
    )
    fig2.suptitle(meal_type)
    # "/" cannot appear in a file name, so e.g. "No food/drink" is rewritten
    fig2.savefig(f'{boxplots_dir}/{meal_type.replace("/", "_or_")}.png')

# Only the left-most panel keeps its y label
for axis in axes[1:]:
    axis.set_ylabel("")

# The panels share a y axis, so padding the first panel's limits pads them all
axes[0].set_ylim(axes[0].get_ylim()[0] - 1, axes[0].get_ylim()[1] + 1)
fig.suptitle("")
fig.savefig(f"{boxplots_dir}/meal_entries_per_day.png", bbox_inches="tight")

In [None]:
"""
Print a table of the number of each type of entry per day

"""

from IPython.display import display


def summarise(df: pd.DataFrame, meal_type: str = None) -> pd.DataFrame:
    """
    Summarise, for each study day, the distribution across participants of the
    number of entries made that day.

    The per-participant, per-day counts are obtained with `group(df)`; for each
    day the median, inter-quartile range and the 25% / 75% quantiles of those
    counts are reported.

    :param df: dataframe of entries with the columns "p_id" and "day"
    :param meal_type: label used for the top level of the result's columns;
                      "All" is used if not provided. It does not filter `df`.

    :returns: dataframe indexed by day, with two-level columns
              (label, "median" / "IQR" / "25%" / "75%")

    """
    label = meal_type if meal_type is not None else "All"
    result = (
        group(df)
        .groupby("day")
        .agg(
            {
                "count": [
                    "median",
                    lambda x: x.quantile(0.75) - x.quantile(0.25),
                    lambda x: x.quantile(0.25),
                    lambda x: x.quantile(0.75),
                ]
            }
        )
    )

    # Label the columns by position: they come out in the order the aggregations
    # are listed above. (Renaming through MultiIndex.set_levels instead would
    # depend on the internal ordering of the index levels.)
    result.columns = pd.MultiIndex.from_product(
        [[label], ["median", "IQR", "25%", "75%"]]
    )

    return result


# Never truncate the columns when rendering a dataframe
pd.set_option("display.max_columns", None)
display(summarise(meal_info))

# One summary per entry type, placed side by side
dfs = [
    summarise(meal_info[meal_info["meal_type"] == meal_type], meal_type)
    for meal_type in meal_info["meal_type"].unique()
]
all_summary = pd.concat(dfs, axis=1)
all_summary

In [None]:
""" Do these summaries with only the non-catchup entries"""
# Entries that were not flagged as catch-ups
is_catchup = meal_info["catchup_flag"]
non_catchups = meal_info[~is_catchup]
display(summarise(non_catchups))

# One summary per entry type, placed side by side
entry_types = non_catchups["meal_type"].unique()
dfs = [
    summarise(non_catchups[non_catchups["meal_type"] == entry_type], entry_type)
    for entry_type in entry_types
]
no_catchup_summary = pd.concat(dfs, axis=1)
display(no_catchup_summary)

In [None]:
""" Do the non-ramadan ones (without catchups) """

# Non-catch-up entries whose "any_in_ramadan" flag is not set
in_ramadan = non_catchups["any_in_ramadan"]
non_ramadan = non_catchups[~in_ramadan]

# Report how many participants remain compared to the non-catch-up data
print(
    f"{len(non_ramadan['p_id'].unique())} non-ramadan; {len(non_catchups['p_id'].unique())} total"
)
display(summarise(non_ramadan))

# One summary per entry type, placed side by side
dfs = list(
    summarise(non_ramadan[non_ramadan["meal_type"] == kind], kind)
    for kind in non_ramadan["meal_type"].unique()
)
no_ramadan_summary = pd.concat(dfs, axis=1)
no_ramadan_summary

In [None]:
# Show the three summary tables one after another for comparison:
# all entries, without catch-ups, and without catch-ups or Ramadan
display(all_summary, no_catchup_summary, no_ramadan_summary)

In [None]:
""" Active users per day """

import numpy as np

# Entry types that count as the participant providing food/drink information
food_drink_types = {"Meal", "Drink", "Snack", "No food/drink"}

# n_entries[i, d - 1] = number of food/drink entries by the i-th participant
# (in groupby order) on study day d
n_entries = np.zeros((len(meal_info["p_id"].unique()), 7), dtype=int)

# The loop variables are deliberately not called `group`: that would overwrite
# the group() function and break earlier cells if they are re-run
for i, (p_id, participant_entries) in enumerate(meal_info.groupby("p_id")):
    for day, day_entries in participant_entries.groupby("day"):
        # Index by the study day itself, not by the position of the day among
        # the days present, so a day with no rows cannot shift later days left
        assert 1 <= day <= 7, f"Unexpected study day {day} for participant {p_id}"
        n_entries[i, int(day) - 1] = (
            day_entries["meal_type"].isin(food_drink_types).sum()
        )
# Not rendered by Jupyter (only the last expression of a cell is displayed)
n_entries

# Find how many days each participant was active for
# A participant is considered inactive if they have 0 entries on this and all subsequent days
# e.g. [13, 13, 13, 13, 12, 0, 0] is active for 5 days
def active_days(entries: np.ndarray) -> int:
    """
    Return the number of days a participant was active for, i.e. the (1-based)
    position of the last day with at least one entry.

    :param entries: 1d array of non-negative entry counts, one per day

    :returns: the last active day, or 0 if there are no entries on any day

    """
    # Walking backwards from the last day, True from the first day with entries
    seen_entry = np.cumsum(entries[::-1]) > 0

    # argmax of an all-False array is 0, which would otherwise report a
    # participant with no entries at all as active on every day
    if not seen_entry.any():
        return 0

    return len(entries) - int(np.argmax(seen_entry))


# Number of active days for every participant (one row of n_entries each)
n_days_active = np.array([active_days(row) for row in n_entries])

# Bin edges at -0.5, 0.5, ..., 7.5, so each of the values 0-7 gets its own bin
bins = np.arange(9) - 0.5
days_active_count, bin_edges = np.histogram(n_days_active, bins=bins)

# Plot without day 0: bars centred on 1-7, using the counts from the second bin on
bar_centres = bins[2:] - 0.5
fig, axis = plt.subplots()
axis.bar(bar_centres, days_active_count[1:], width=0.8)

axis.set_xlabel("Last day any food and drink provided with μEMA")
axis.set_ylabel("Number of Participants")
fig.tight_layout()
fig.savefig(f"{img_dir}/n_days_active_hist.png")