# [Optional] Analyzing Streaks and Lapses

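This notebook computes streaks and lapses in users' logging behavior. A streak is a run of consecutive days on which a user logged calories; a lapse is a run of consecutive days with no logged calories.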

This notebook is optional: run it only if `../data/lapses.csv` or `../data/lapse_details.csv` is missing.

A full run is slow: the recorded run at the end of this notebook took about 556 seconds (roughly 9 minutes).

## 1. Loading data

In [9]:
import pandas as pd

# Read the per-day calorie log. reset_index() keeps the original row position
# as an extra column, exactly as the in-place call did.
diary_days = pd.read_csv("../data/daily_calories.csv").reset_index()

# Normalise the column types that the later cells rely on.
diary_days = diary_days.assign(
    username=diary_days["username"].astype(str),
    date=pd.to_datetime(diary_days["date"]),
    calories=pd.to_numeric(diary_days["calories"]),
)

## 2. Defining a function to find streaks and lapses

In [10]:
import pandas as pd
import numpy as np
import json

def find_lapses(df):
    """Split a day-by-day diary into logging streaks and lapses.

    A row counts as logged when its ``calories`` value is not null. A streak
    is a maximal run of consecutive logged rows; a lapse is a maximal run of
    consecutive rows whose ``calories`` is null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column of datetime-like values and a
        ``calories`` column. Rows are processed in the order given, so they
        should already be sorted by date.

    Returns
    -------
    tuple
        ``(streaks, lapses)``. Each element is a list of runs, and each run
        is a list of ``"YYYY-MM-DD"`` strings. Empty runs are never emitted.
    """
    streaks = []
    lapses = []
    streak = []
    lapse = []

    # Walk the two columns directly; this avoids building a Series per row
    # the way ``iterrows`` does.
    for day, calories in zip(df["date"], df["calories"]):
        # Use the value's own strftime so the function does not depend on a
        # ``datetime`` alias imported in some other cell.
        label = day.strftime("%Y-%m-%d")
        if pd.isnull(calories):
            # A missing day closes any streak that was in progress.
            if streak:
                streaks.append(streak)
                streak = []
            lapse.append(label)
        else:
            # A logged day closes any lapse that was in progress.
            if lapse:
                lapses.append(lapse)
                lapse = []
            streak.append(label)

    # Flush whichever run is still open at the end. Previously a trailing
    # lapse was dropped and an empty streak was appended in its place.
    if streak:
        streaks.append(streak)
    if lapse:
        lapses.append(lapse)

    return (streaks, lapses)

## 3. Generating continuous date range

In [11]:
from datetime import datetime as dt, timedelta
import pandas as pd

# Calendar bounds: the earliest and latest dates present in the diary data.
start, end = diary_days["date"].min(), diary_days["date"].max()

# One entry per calendar day from start to end, both inclusive.
date_list = [
    start + timedelta(days=offset)
    for offset in range((end - start).days + 1)
]

# Single-column frame used as the "every day" side of the per-user merge.
all_dates = pd.DataFrame({"date": date_list})
all_dates["date"] = pd.to_datetime(all_dates["date"])

## 4. Finding streaks and lapses for all users

In [12]:
import pandas as pd
import time

lapse_counts = []   # one record per user: number of calendar days with no diary row
lapse_records = []  # one record per user: JSON-encoded streak and lapse runs
t0 = time.time()

# groupby hands over each user's rows in a single pass instead of re-filtering
# the whole frame once per user. sort=False keeps users in order of first
# appearance, the same order username.unique() would give.
for u, udates in diary_days.groupby("username", sort=False):
    # Restrict the calendar to this user's first..last recorded date.
    in_user_span = (all_dates.date >= udates.date.min()) & (all_dates.date <= udates.date.max())
    # Outer merge: calendar days with no diary row come back with null
    # username and calories.
    df = pd.merge(all_dates[in_user_span], udates, how="outer", on="date")
    lapse_counts.append({"username": u, "num_lapsed_days": int(df.username.isnull().sum())})
    user_streaks, user_lapses = find_lapses(df)
    lapse_records.append({
        "username": u,
        "streaks": json.dumps(user_streaks),
        "lapses": json.dumps(user_lapses),
    })

# Passing the columns explicitly keeps the frames well-formed (and set_index
# working) even when there are no users at all.
lapses = pd.DataFrame(lapse_counts, columns=["username", "num_lapsed_days"]).set_index("username")
lapses.to_csv("../data/lapses.csv")
lapse_details = pd.DataFrame(lapse_records, columns=["username", "streaks", "lapses"]).set_index("username")
lapse_details.to_csv("../data/lapse_details.csv")
t1 = time.time()
# Parenthesised form prints the same text on Python 2 and Python 3.
print("Done in %s" % (t1 - t0))

Done in 555.974705935
