In [1]:
import pandas as pd
from datetime import date, timedelta

In [None]:
"""
status:
    1 - active
    2 - cancelled
    3 - deactivated

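sub_start    - first day of a subscription
sub_end      - last day of a subscription (empty when no end date is recorded)

period_start - first day of the reporting period (inclusive)
period_end   - last day of the reporting period (inclusive)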

    base:
        sub_start <= period_start                         -> if status = 1 or 2
        sub_start <= period_start <= sub_end              -> if status = 3
    
    new:
        period_start <= FIRST sub_start <= period_end
    
    churned:
        period_start <= sub_end <= period_end
    
    resurrected:
        if AT LEAST ONE but NOT THE FIRST sub_start is contained in [period_start, period_end]

A user can fall into more than one category for the same period, so the output may contain several rows per user.

"""

## Function Definitions

In [7]:
def set_period(p, period_end = date(2022, 10, 31)):
    """Return the inclusive ``(period_start, period_end)`` window covering ``p`` days.

    Parameters
    ----------
    p : int
        Window length in days, counting both end points (``p = 1`` gives a
        single-day window).
    period_end : datetime.date, optional
        Last day of the window. Defaults to 2022-10-31, the reference date
        that used to be hard-coded in the body.

    Returns
    -------
    tuple of datetime.date
        ``(period_start, period_end)``.
    """
    # Both bounds are inclusive, so a p-day window starts p - 1 days before its end.
    period_start = period_end - timedelta(days = p - 1)

    return (period_start, period_end)

In [8]:
def is_base(df, period_start, period_end):
    """Find the first subscription row that belongs to the base at ``period_start``.

    A row qualifies when its ``sub_start`` is on or before ``period_start``
    and, for deactivated rows (``status == 3``) only, its ``sub_end`` is on or
    after ``period_start``. Rows with a missing ``sub_start``, and deactivated
    rows with a missing ``sub_end``, never qualify.

    Parameters
    ----------
    df : pandas.DataFrame
        One user's subscriptions with ``status``, ``sub_start`` and ``sub_end``
        columns; the date values must expose ``.date()`` (e.g. Timestamps).
    period_start : datetime.date
        First day of the reporting period.
    period_end : datetime.date
        Unused; accepted so that all tag tests share one signature.

    Returns
    -------
    tuple
        ``(True, i)`` with the 0-based position of the first qualifying row,
        or ``(False, None)`` when no row qualifies.
    """
    for i in range(len(df)):
        # Positional access: `i` counts rows, so do not rely on the index labels.
        sub_start = df['sub_start'].iloc[i]

        # Every status requires the subscription to have started by period_start.
        if pd.isnull(sub_start) or sub_start.date() > period_start:
            continue

        if df['status'].iloc[i] != 3:
            return (True, i)

        # Deactivated: must still have been running at period_start.
        # Guard NaT explicitly; ordering a date against NaT is an error.
        sub_end = df['sub_end'].iloc[i]
        if not pd.isnull(sub_end) and period_start <= sub_end.date():
            return (True, i)

    return (False, None)

In [9]:
def is_new(df, period_start, period_end):
    """Check whether the user's earliest subscription starts inside the period.

    Parameters
    ----------
    df : pandas.DataFrame
        One user's subscriptions with a datetime-like ``sub_start`` column.
    period_start, period_end : datetime.date
        Inclusive bounds of the reporting period.

    Returns
    -------
    tuple
        ``(True, i)`` where ``i`` is the 0-based position of the row holding
        the earliest ``sub_start`` (0 when rows are in chronological order),
        or ``(False, None)`` if that start lies outside the period or the
        frame has no usable start date.
    """
    starts = df['sub_start']

    # Covers both an empty frame and one whose start dates are all missing.
    if starts.isna().all():
        return (False, None)

    # Report the row that actually holds the earliest start instead of
    # assuming it is row 0, so the caller reads the matching plan.
    first_pos = int(starts.argmin())

    if period_start <= starts.iloc[first_pos].date() <= period_end:
        return (True, first_pos)

    return (False, None)

In [10]:
def is_churned(df, period_start, period_end):
    """Look for a subscription whose end date falls inside the period.

    Rows are scanned in order; rows without a ``sub_end`` are ignored.

    Returns ``(True, i)`` for the first row ``i`` with
    ``period_start <= sub_end <= period_end``, otherwise ``(False, None)``.
    """
    def ended_in_period(row):
        ended_on = df['sub_end'][row]
        # A missing end date can never fall inside the window.
        return not pd.isnull(ended_on) and period_start <= ended_on.date() <= period_end

    first_hit = next((row for row in range(len(df)) if ended_in_period(row)), None)

    return (False, None) if first_hit is None else (True, first_hit)

In [11]:
def is_resurrected(df, period_start, period_end):
    """Check whether any subscription other than the earliest starts in the period.

    The "first" subscription is the row with the smallest ``sub_start``
    (the first such row on ties), not necessarily row 0, so the result does
    not depend on row order. Rows with a missing ``sub_start`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        One user's subscriptions with a datetime-like ``sub_start`` column.
    period_start, period_end : datetime.date
        Inclusive bounds of the reporting period.

    Returns
    -------
    tuple
        ``(True, i)`` with the 0-based position of the first qualifying row,
        or ``(False, None)`` when there is none.
    """
    starts = df['sub_start']

    # Empty frame or no usable start dates: nothing can be a re-subscription.
    if starts.isna().all():
        return (False, None)

    first_pos = int(starts.argmin())

    for i in range(len(starts)):
        # The earliest subscription is an acquisition, not a resurrection.
        if i == first_pos:
            continue

        started_on = starts.iloc[i]
        if pd.isnull(started_on):
            continue

        if period_start <= started_on.date() <= period_end:
            return (True, i)

    return (False, None)

In [12]:
def generate_user_tags(df, period = 30, period_start = None, period_end = None):
    """Tag every user as Base / New / Churned / Resurrected for one period.

    Parameters
    ----------
    df : pandas.DataFrame
        Subscription rows with ``user_id``, ``status``, ``plan``, ``sub_start``
        and ``sub_end`` columns; the date columns must already be datetimes.
    period : int, optional
        Window length in days. Used only to derive a bound that was not given.
    period_start, period_end : datetime.date, optional
        Inclusive bounds of the period. With neither given, the window comes
        from ``set_period(period)``; with one given, the other is placed
        ``period - 1`` days away; with both given, ``period`` is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per (user, matching tag) with columns ``user_id``,
        ``current status`` (status of the user's most recently started
        subscription), ``plan`` (plan of the row that triggered the tag) and
        ``tag``, ordered by ``user_id``.
    """
    if period_start is None and period_end is None:
        period_start, period_end = set_period(period)
    elif period_start is None:
        period_start = period_end - timedelta(days = period - 1)
    elif period_end is None:
        period_end = period_start + timedelta(days = period - 1)

    # Tags are evaluated independently; a user may receive several of them.
    tag_tests = (
        ("Base", is_base),
        ("New", is_new),
        ("Churned", is_churned),
        ("Resurrected", is_resurrected),
    )

    r = []

    # groupby visits each user's rows once instead of re-masking the whole
    # frame for every user.
    for user_id, subscriptions in df.groupby('user_id'):

        # Chronological order makes row 0 the first subscription and the last
        # row the current one; the stable sort keeps file order on ties.
        user = (
            subscriptions
            .sort_values(by = 'sub_start', kind = 'mergesort')
            .reset_index(drop = True)
        )
        current_status = user['status'].iloc[-1]

        for tag, tag_test in tag_tests:
            matched, index = tag_test(user, period_start, period_end)
            if matched:
                r.append([ user_id, current_status, user['plan'][index], tag ])

    result = pd.DataFrame(data = r, columns = ["user_id", "current status", "plan", "tag"])
    # A stable sort keeps each user's tags in evaluation order.
    result = result.sort_values(by = "user_id", kind = "mergesort", ignore_index = True)

    return result
    

## Data Exploration

In [2]:
# Load the raw subscription export from the working directory.
data = pd.read_csv("Waterfall_subscriptions.csv")

In [3]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,user_id,subscription_id,plan,status,sub_start,sub_end
0,31,8368,Annual,1,2022-03-31,
1,182,8246,Monthly,3,2022-03-26,2022-06-08
2,182,9415,Monthly,3,2022-07-13,2022-08-13
3,221,4629,Monthly,3,2021-06-10,2021-07-10
4,235,6429,Annual,1,2021-11-28,


In [4]:
# Check how the date column was read: it is a plain str, so it needs parsing.
type(data['sub_start'][0])

str

In [5]:
# Parse both date columns; missing sub_end values become NaT.
# `infer_datetime_format` is deprecated in pandas 2.0, where the parser already
# infers the format by default, so the argument is dropped.
data["sub_start"] = pd.to_datetime(data["sub_start"])
data["sub_end"] = pd.to_datetime(data["sub_end"])

In [6]:
# Re-inspect after parsing: missing sub_end values now show as NaT.
data.head()

Unnamed: 0,user_id,subscription_id,plan,status,sub_start,sub_end
0,31,8368,Annual,1,2022-03-31,NaT
1,182,8246,Monthly,3,2022-03-26,2022-06-08
2,182,9415,Monthly,3,2022-07-13,2022-08-13
3,221,4629,Monthly,3,2021-06-10,2021-07-10
4,235,6429,Annual,1,2021-11-28,NaT


## Test

In [13]:
# Smoke test using the default arguments (period = 30, bounds derived by set_period).
test = generate_user_tags(data)

In [14]:
# Preview the first tagged rows.
test.head()

Unnamed: 0,user_id,current status,plan,tag
0,31,1,Annual,Base
1,235,1,Annual,Base
2,591,1,Annual,Base
3,616,1,Annual,Base
4,618,1,Annual,Base


## Data Export

In [15]:
# Tag every user for the trailing 30-, 90- and 365-day windows.
result_last_30, result_last_90, result_last_365 = (
    generate_user_tags(data, period = days) for days in (30, 90, 365)
)

In [16]:
# Write one CSV per window into the working directory, without the row index.
result_last_30.to_csv('Waterfall_subs_last_30.csv', index = False)
result_last_90.to_csv('Waterfall_subs_last_90.csv', index = False)
result_last_365.to_csv('Waterfall_subs_last_365.csv', index = False)