In [1]:
from IPython.core.display import HTML
HTML(r"""
<style>
    * {
        font-family: monospace;
        font-size: 12px;
        line-height: normal;
    }
</style>
""")

In [2]:
import pandas as pd

# Problem

Given a dataframe including columns below:
- `activity_date`
- `user_id`
- `device_id`
- `total_activity`

Requirements:

Aggregated user data needs to be produced output below

- `user_id`: Column `user_id` from source data.
- `device_id`: Column `device_id` from source data which has the greatest total activity from the `total_activity` column.
- `longest_active_streak`: The longest period of consecutive days this user has been active. For example, if the user had activity record on 2011-08-10, 2011-08-11, 2011-08-14, 2011-08-15 and 2011-08-16, the value of `longest_active_streak` would be 3.

# Solution

In [35]:
pd.set_option('display.max_colwidth', None)

In [39]:
df = pd.read_csv(
    filepath_or_buffer="./activity_tracking.csv",
    header=0,
    parse_dates=["active_date",]
)
df.head(n=10)

Unnamed: 0,active_date,user_id,device_id,total_activity
0,2023-01-10,6191bd7a6a1c730006499bcf,54f088efc7a9b9461467c8eb,91
1,2023-01-10,6266a9f214e79700091a68fe,54f088efc7a9b9461467c8eb,49
2,2023-01-10,603fc562dc5db80006d82a0e,58a6f999c6a4078d51afe031,111
3,2023-01-10,63b7047df028d80007bff7c2,58a6f999c6a4078d51afe031,280
4,2023-01-10,5b1f8ff4a6ec410844c6a0a8,58a6f999c6a4078d51afe031,30
5,2023-01-10,633ac0b777d9b900078f2412,58a6f999c6a4078d51afe031,37
6,2023-01-10,58aef797b292bc1041514355,58aefa3dd48122292e498cae,92
7,2023-01-10,5d67874dbed6700008dec82b,58c80ec0fe1b03791a893efe,90
8,2023-01-10,60e2b320da2454000607d42c,58c80ec0fe1b03791a893efe,116
9,2023-01-10,58db9ca037616fa3242c7cdc,58db9d0ce8f63a0412f444b8,215


In [40]:
df_max_device_id = df.groupby(by=["user_id", "device_id"], as_index=False)["total_activity"].sum() \
    .sort_values("total_activity", ascending=False) \
        .drop_duplicates(["user_id"])
df_max_device_id.head(n=10)

Unnamed: 0,user_id,device_id,total_activity
1461,5cacf5d3ff232d000844839d,63b723312491360007d2cb34,9387
3886,6108ff70f19734000615f061,63b201f442683c00072e89cc,9123
4552,61da4682f577d40008a4a318,5f9211f45dd05e0006d5f72c,6759
7183,63844e41dd24b00006ba55b7,5fb228a23e8523000697849b,5263
5146,624a55cd01eb0b0008df80a8,5a0c5c1b1bb64621508bd785,3926
4857,62133fa254f7a10008d524c4,63bbce636fcf8e0007337692,3351
4330,619dc427c647060006c82fa5,61aee1abb2967000077684f1,2609
1855,5da3d9316f7624000891d8bb,5da3c9989f14fd0008617398,2599
3014,5fca85856f2c3e0006561aad,5fca85f36f2c3e0006561ac2,2517
7135,637c70db6f6ec300079281f6,626f5a506fc6a10007e2e35b,2384


In [41]:
df_longest_active_streak = df.drop_duplicates(subset=["user_id", "active_date"]) \
    .groupby(by=["user_id"], as_index=False) \
        ["active_date"].apply(func=list)
df_longest_active_streak["active_date"] = df_longest_active_streak["active_date"].apply(sorted, **{"reverse": False})
df_longest_active_streak.head(n=10)

Unnamed: 0,user_id,active_date
0,54f09b89c7a9b9461467c8f6,"[2023-01-01 00:00:00, 2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-07 00:00:00, 2023-01-08 00:00:00, 2023-01-09 00:00:00, 2023-01-10 00:00:00]"
1,55139002ea40cb737b8c819b,[2023-01-06 00:00:00]
2,55a67aaeb9424db226b8d3c9,"[2023-01-01 00:00:00, 2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-07 00:00:00, 2023-01-08 00:00:00]"
3,55bf2a636b19c4d20c518416,[2023-01-10 00:00:00]
4,55dd3420da7045ae3263d35a,"[2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-07 00:00:00, 2023-01-09 00:00:00, 2023-01-10 00:00:00]"
5,55e59ec2b68cdc1411955142,"[2023-01-01 00:00:00, 2023-01-02 00:00:00, 2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-07 00:00:00, 2023-01-09 00:00:00, 2023-01-10 00:00:00]"
6,55e6eaffa856461404a62dd8,"[2023-01-06 00:00:00, 2023-01-09 00:00:00]"
7,55e6eaffa856461404a62ddf,"[2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-09 00:00:00, 2023-01-10 00:00:00]"
8,55e6eaffa856461404a62de0,"[2023-01-01 00:00:00, 2023-01-02 00:00:00, 2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-07 00:00:00, 2023-01-09 00:00:00, 2023-01-10 00:00:00]"
9,55e7c05db7efbbfa7c7f2510,"[2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-09 00:00:00, 2023-01-10 00:00:00]"


In [49]:
def get_longest_active_streak(arr):
    max_count = 0
    temp_count = 0
    for i in range(len(arr) - 1):
        if (arr[i + 1] - arr[i]).days > 1:
            temp_count = 0
            continue
        temp_count+=1
        max_count = max(max_count, temp_count)
    return max_count + 1 if max_count > 0 else 0

In [50]:
df_longest_active_streak["longest_active_streak"] = df_longest_active_streak["active_date"].apply(get_longest_active_streak)
df_longest_active_streak.head()

Unnamed: 0,user_id,active_date,longest_active_streak
0,54f09b89c7a9b9461467c8f6,"[2023-01-01 00:00:00, 2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-07 00:00:00, 2023-01-08 00:00:00, 2023-01-09 00:00:00, 2023-01-10 00:00:00]",8
1,55139002ea40cb737b8c819b,[2023-01-06 00:00:00],0
2,55a67aaeb9424db226b8d3c9,"[2023-01-01 00:00:00, 2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-07 00:00:00, 2023-01-08 00:00:00]",6
3,55bf2a636b19c4d20c518416,[2023-01-10 00:00:00],0
4,55dd3420da7045ae3263d35a,"[2023-01-03 00:00:00, 2023-01-04 00:00:00, 2023-01-05 00:00:00, 2023-01-06 00:00:00, 2023-01-07 00:00:00, 2023-01-09 00:00:00, 2023-01-10 00:00:00]",5


In [62]:
df_longest_active_streak.merge(df_max_device_id, on=["user_id"])[["user_id", "device_id", "longest_active_streak"]].head(n=10)

Unnamed: 0,user_id,device_id,longest_active_streak
0,54f09b89c7a9b9461467c8f6,55627c31a23173f936c13322,8
1,55139002ea40cb737b8c819b,55bf22576b19c4d20c518322,0
2,55a67aaeb9424db226b8d3c9,55627c31a23173f936c13322,6
3,55bf2a636b19c4d20c518416,55bf22576b19c4d20c518322,0
4,55dd3420da7045ae3263d35a,5fc9c3d80983dc000610d4e8,5
5,55e59ec2b68cdc1411955142,54f088efc7a9b9461467c8eb,7
6,55e6eaffa856461404a62dd8,56c5539ad755d1fe41605dee,0
7,55e6eaffa856461404a62ddf,56c5539ad755d1fe41605dee,4
8,55e6eaffa856461404a62de0,56c5539ad755d1fe41605dee,7
9,55e7c05db7efbbfa7c7f2510,55627c31a23173f936c13322,2
