In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw event log from disk into a DataFrame.
sf_events = pd.read_csv(filepath_or_buffer='sf_events.csv')

# Display the loaded table.
sf_events

Unnamed: 0,record_date,account_id,user_id
0,1/1/2021,A1,U1
1,1/1/2021,A1,U2
2,1/6/2021,A1,U3
3,1/2/2021,A1,U1
4,12/24/2020,A1,U2
5,12/8/2020,A1,U1
6,12/9/2020,A1,U1
7,1/10/2021,A2,U4
8,1/11/2021,A2,U4
9,1/12/2021,A2,U4


Find all the users who were active for 3 consecutive days or more.

In [3]:
# Inspect the column dtypes and non-null counts before transforming anything.
sf_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   record_date  23 non-null     object
 1   account_id   23 non-null     object
 2   user_id      23 non-null     object
dtypes: object(3)
memory usage: 680.0+ bytes


In [4]:
# Parse the record_date strings into datetime64 values so that
# date arithmetic is possible in the following steps.
sf_events['record_date'] = sf_events['record_date'].pipe(pd.to_datetime)

# Confirm that the column dtype changed.
sf_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   record_date  23 non-null     datetime64[ns]
 1   account_id   23 non-null     object        
 2   user_id      23 non-null     object        
dtypes: datetime64[ns](1), object(2)
memory usage: 680.0+ bytes


In [5]:
# Store each record_date as its proleptic Gregorian ordinal (an integer day
# count), so that two consecutive calendar days differ by exactly 1.
sf_events['date_no'] = sf_events['record_date'].map(lambda ts: ts.toordinal())

# Display the table with the new column.
sf_events

Unnamed: 0,record_date,account_id,user_id,date_no
0,2021-01-01,A1,U1,737791
1,2021-01-01,A1,U2,737791
2,2021-01-06,A1,U3,737796
3,2021-01-02,A1,U1,737792
4,2020-12-24,A1,U2,737783
5,2020-12-08,A1,U1,737767
6,2020-12-09,A1,U1,737768
7,2021-01-10,A2,U4,737800
8,2021-01-11,A2,U4,737801
9,2021-01-12,A2,U4,737802


In [6]:
# Order the events by user, and chronologically (by day number) within each user.
sf_events = sf_events.sort_values(by=['user_id', 'date_no'])

sf_events

Unnamed: 0,record_date,account_id,user_id,date_no
5,2020-12-08,A1,U1,737767
6,2020-12-09,A1,U1,737768
0,2021-01-01,A1,U1,737791
3,2021-01-02,A1,U1,737792
18,2021-02-07,A1,U1,737828
4,2020-12-24,A1,U2,737783
1,2021-01-01,A1,U2,737791
19,2021-02-10,A1,U2,737831
2,2021-01-06,A1,U3,737796
11,2020-12-17,A2,U4,737776


In [7]:
# Gap, in days, between each row and the same user's previous row.
# The first row of every user has no predecessor, so its gap is NaN.
# A gap of 1 means the user was also active on the day before; two such gaps
# in a row therefore indicate three consecutive active days.
sf_events['date_diff'] = (
    sf_events['date_no'] - sf_events.groupby('user_id')['date_no'].shift(1)
)

# Display the table with the new column.
sf_events

Unnamed: 0,record_date,account_id,user_id,date_no,date_diff
5,2020-12-08,A1,U1,737767,
6,2020-12-09,A1,U1,737768,1.0
0,2021-01-01,A1,U1,737791,23.0
3,2021-01-02,A1,U1,737792,1.0
18,2021-02-07,A1,U1,737828,36.0
4,2020-12-24,A1,U2,737783,
1,2021-01-01,A1,U2,737791,8.0
19,2021-02-10,A1,U2,737831,40.0
2,2021-01-06,A1,U3,737796,
11,2020-12-17,A2,U4,737776,


In [8]:
# Replace the missing gaps (the first row of each user) with 0.
sf_events['date_diff'] = sf_events['date_diff'].mask(sf_events['date_diff'].isna(), 0)

# Display the table after filling.
sf_events

Unnamed: 0,record_date,account_id,user_id,date_no,date_diff
5,2020-12-08,A1,U1,737767,0.0
6,2020-12-09,A1,U1,737768,1.0
0,2021-01-01,A1,U1,737791,23.0
3,2021-01-02,A1,U1,737792,1.0
18,2021-02-07,A1,U1,737828,36.0
4,2020-12-24,A1,U2,737783,0.0
1,2021-01-01,A1,U2,737791,8.0
19,2021-02-10,A1,U2,737831,40.0
2,2021-01-06,A1,U3,737796,0.0
11,2020-12-17,A2,U4,737776,0.0


In [9]:
# Gaps are whole days, so store date_diff as integers instead of floats.
sf_events = sf_events.astype({'date_diff': 'int'})

# Display the table after the cast.
sf_events

Unnamed: 0,record_date,account_id,user_id,date_no,date_diff
5,2020-12-08,A1,U1,737767,0
6,2020-12-09,A1,U1,737768,1
0,2021-01-01,A1,U1,737791,23
3,2021-01-02,A1,U1,737792,1
18,2021-02-07,A1,U1,737828,36
4,2020-12-24,A1,U2,737783,0
1,2021-01-01,A1,U2,737791,8
19,2021-02-10,A1,U2,737831,40
2,2021-01-06,A1,U3,737796,0
11,2020-12-17,A2,U4,737776,0


In [10]:
# Collect every user who was active on at least 3 consecutive days.
def find_consecutive_users(events, min_days=3):
    """Return the user_ids that were active on at least ``min_days`` consecutive days.

    Parameters
    ----------
    events : pandas.DataFrame
        Must contain a ``user_id`` column and an integer ``date_no`` column in
        which consecutive calendar days differ by exactly 1.
    min_days : int, default 3
        Minimum streak length, counted in distinct days, for a user to qualify.

    Returns
    -------
    list
        Unique qualifying user_ids, ordered by user_id.
    """
    # Keep one row per (user, day): several events on the same day must not
    # be mistaken for a break in the streak.
    active_days = (
        events[['user_id', 'date_no']]
        .drop_duplicates()
        .sort_values(['user_id', 'date_no'])
    )

    # The gap is computed per user, so nothing carries over from one user to
    # the next. A new streak starts on each user's first day (gap is NaN) and
    # whenever the gap to the user's previous active day is not exactly 1.
    gap = active_days.groupby('user_id')['date_no'].diff()
    streak_id = gap.ne(1).cumsum().rename('streak_id')

    # Length of the streak every day belongs to. streak_id increases at each
    # user's first row, so a streak can never span two users.
    streak_len = active_days.groupby(streak_id)['date_no'].transform('size')

    qualifying = active_days.loc[streak_len >= min_days, 'user_id']
    return qualifying.unique().tolist()


consecutive_user = find_consecutive_users(sf_events, min_days=3)

In [11]:
# Display the list of user_ids collected in consecutive_user.
consecutive_user

['U4']