Catchup Stats
====
Statistics on the number of catch-ups of each type that each participant made per day.

In [1]:
import os

%load_ext autoreload
%autoreload 2

# Directory that this notebook writes its figures and CSV export into
img_dir = "outputs/imgs/catchup_stats/"

# exist_ok makes this a no-op when the directory is already there, and avoids
# the race between checking for the directory and creating it
os.makedirs(img_dir, exist_ok=True)

In [None]:
"""
Read and clean the meal entry data

"""

from analysis_utils import clean

catchup_df = clean.cleaned_smartwatch(keep_catchups=True, keep_day0=False)

# Keep only the rows flagged as catch-ups, plus the explicit "No catch-up" responses
flagged_as_catchup = catchup_df["catchup_flag"]
no_catchup_response = catchup_df["meal_type"] == "No catch-up"
catchup_df = catchup_df.loc[flagged_as_catchup | no_catchup_response]

# Discard the rows which only mark the end of a catch-up
not_end_marker = catchup_df["meal_type"] != "Catch-up end"
catchup_df = catchup_df.loc[not_end_marker]

# Report how many rows are left and preview the first few
print(catchup_df.shape[0])
catchup_df.head()

In [None]:
# Check we have the expected values
observed_meal_types = set(catchup_df["meal_type"])
assert observed_meal_types == {"Meal", "Drink", "Snack", "No catch-up"}, observed_meal_types

# Add the study day, and the remaining part of "delta" formatted as HH:MM:SS, to the dataframe.
# catchup_df is the result of boolean filtering, so assigning columns in place can
# raise a SettingWithCopyWarning; assign() returns a new dataframe instead.
catchup_df = catchup_df.assign(
    day=catchup_df["delta"].dt.days,
    time=catchup_df["delta"].apply(
        lambda x: f"{x.components.hours:02}:{x.components.minutes:02}:{x.components.seconds:02}"
    ),
)

# Export the full table (all columns) before trimming it down for the analysis below
catchup_df.to_csv(f"{img_dir}/catchups.csv", index=False)

# Drop columns we won't need
catchup_df = catchup_df.drop(
    columns=[
        "x_id",
        "portion_size",
        "utensil",
        "location",
        "week_day",
        "ramadanstart",
        "ramadanend",
        "delta",
        "catchup_category",
        "catchup_flag",
        "entry_in_ramadan",
        "early_stop",
        "first_in_ramadan",
        "last_in_ramadan",
        "all_in_ramadan",
        "any_in_ramadan",
    ]
)
catchup_df.head()

In [None]:
"""
Find how many entries there were per day overall

"""

import pandas as pd


def group(df: pd.DataFrame) -> pd.DataFrame:
    """
    From a dataframe containing the columns "p_id" and "day", return a dataframe
    containing the number of entries per day per participant.

    Every participant present in `df` gets exactly one row for each of the days
    1 to 7; days on which they have no entries get a count of 0.

    :param df: dataframe with (at least) the columns "p_id" and "day"

    :returns: dataframe with the columns "p_id", "day" and "count", ordered by
              participant then day, with a unique 0..n-1 index
    :raises AssertionError: if any entry has a day outside 1 to 7

    """
    expected_days = range(1, 8)
    counts = df.groupby(["p_id", "day"]).size()

    # Error if any participant has entries on a day we don't expect. Raised explicitly
    # (rather than with `assert`) so the check still runs under `python -O`.
    observed = counts.index.to_frame(index=False)
    unexpected = observed[~observed["day"].isin(expected_days)]
    if not unexpected.empty:
        p_id = unexpected["p_id"].iloc[0]
        bad_days = set(unexpected.loc[unexpected["p_id"] == p_id, "day"])
        raise AssertionError(f"Unexpected days for participant {p_id}: {bad_days}")

    # If there are no entries for a day, add a row with 0 entries. Reindexing onto the
    # full participant x day grid does this in one step, instead of concatenating a
    # one-row dataframe per missing day (which is slow and repeats index labels).
    participants = counts.index.get_level_values("p_id").unique()
    full_grid = pd.MultiIndex.from_product(
        [participants, expected_days], names=["p_id", "day"]
    )

    return counts.reindex(full_grid, fill_value=0).reset_index(name="count")


# Preview the first few rows of the per-participant, per-day entry counts
group(catchup_df).head()

In [None]:
from typing import Union

import matplotlib.pyplot as plt


def boxplot(
    df: pd.DataFrame, axis: Union[plt.Axes, None] = None
) -> Union[None, tuple[plt.Figure, plt.Axes]]:
    """
    Make a boxplot of the "count" column, with one box per "day"

    :param df: dataframe with the columns "p_id", "day" and "count"
    :param axis: optional axis to plot on. If not provided, creates a new figure

    :returns: the figure and the axis, if a new figure was created; otherwise None

    """
    new_fig_created = axis is None
    if new_fig_created:
        fig, axis = plt.subplots()

    df.boxplot(column="count", by="day", ax=axis)

    # Replace the automatic title and labels with our own
    axis.set_title("")
    axis.set_xlabel("Day")
    axis.set_ylabel("Number of entries")

    # Counts can't be negative, so anchor the y axis at 0 and keep the automatic upper limit
    axis.set_ylim(0, axis.get_ylim()[1])

    if new_fig_created:
        # Grouping with `by` also puts a "Boxplot grouped by ..." title on the figure;
        # clear it on the figure we own. Callers passing their own axis manage
        # their figure's title themselves.
        fig.suptitle("")
        return fig, axis

    return None


# boxplot() returns a (figure, axis) pair when it creates the figure itself,
# so unpack both rather than binding the whole tuple to `fig`
fig, axis = boxplot(group(catchup_df))

In [None]:
"""
Plot this as boxplots

"""

# One panel per meal type, side by side, all on the same y scale
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(16, 4), sharey=True)

meal_types = catchup_df["meal_type"].unique()
for axis, meal_type in zip(axes, meal_types):
    entries_of_type = catchup_df[catchup_df["meal_type"] == meal_type]
    boxplot(group(entries_of_type), axis=axis)
    axis.set_title(meal_type)

# Only the left-most panel keeps its y label, since the axis is shared
for axis in axes[1:]:
    axis.set_ylabel("")

# Remove the figure-level title before saving
fig.suptitle("")
fig.savefig(f"{img_dir}/meal_entries_per_day.png", bbox_inches="tight")

In [None]:
"""
Print a table of the number of each type of entry per day

"""

from IPython.display import display


def summarise(df: pd.DataFrame, meal_type: str = None) -> pd.DataFrame:
    """
    From a dataframe containing the columns "p_id" and "day", return a dataframe
    of summary statistics, per day, of the number of entries each participant made.

    The entries are first counted per participant per day with `group`; the mean,
    standard deviation, median and interquartile range of those counts are then
    taken over the participants for each day.

    :param df: dataframe with the columns "p_id" and "day"
    :param meal_type: label to use for the top level of the column index; "All" is
                      used if not provided. This does not filter `df` - pass in a
                      dataframe that has already been restricted to this meal type

    :returns: dataframe indexed by day, with two-level columns (label, statistic)
              where the statistics are "mean", "std", "median" and "iqr"

    """

    def iqr(counts: pd.Series) -> float:
        # A named function, so the column is labelled "iqr" rather than "<lambda>"
        return counts.quantile(0.75) - counts.quantile(0.25)

    label = meal_type if meal_type is not None else "All"
    result = group(df).groupby("day").agg({"count": ["mean", "std", "median", iqr]})

    # The top column level is just "count"; replace it with the more informative label
    result.columns = result.columns.set_levels([label], level=0)

    return result


# Summary over every entry first, then one table per meal type
display(summarise(catchup_df))

meal_types = catchup_df["meal_type"].unique()
for meal_type in meal_types:
    of_this_type = catchup_df["meal_type"] == meal_type
    print()
    display(summarise(catchup_df[of_this_type], meal_type))

In [None]:
# Repeat the per-meal-type tables, leaving out one problematic participant
keep_participant = catchup_df["p_id"] != 20050

for meal_type in catchup_df["meal_type"].unique():
    matches_type = catchup_df["meal_type"] == meal_type
    subset = catchup_df[matches_type & keep_participant]
    print()
    display(summarise(subset, meal_type))