In [51]:
import pandas as pd
import numpy as np
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import random

class ModelSubscription:
    def __init__(self, transaction_df, subscription_fee, conversion_ratio_monthly, days_until_inactive=90, churn_probability=0.1, start_date=None, end_date=None):
        self.transaction_df = transaction_df
        self.subscription_fee = subscription_fee
        self.conversion_ratio_monthly = conversion_ratio_monthly
        self.days_until_inactive = days_until_inactive
        self.churn_probability = churn_probability
        self.start_date = pd.to_datetime(start_date if start_date else transaction_df['Period'].min())
        self.end_date = pd.to_datetime(end_date if end_date else transaction_df['Period'].max())
        self.transformed_df = None
        self.patient_pool = {}

    def initialize_patient_pool(self):
        # Initialize the patient pool with non-converted patients
        self.transaction_df['Period'] = pd.to_datetime(self.transaction_df['Period'])
        for customer_id, group in self.transaction_df.groupby('Customer ID'):
            first_transaction = group['Period'].min()
            self.patient_pool[customer_id] = {
                'status': 'non-converted',
                'last_seen': first_transaction
            }

    def update_patient_pool(self, current_month):
        # Update the patient pool based on inactivity and new patients
        active_customers = self.transaction_df[self.transaction_df['Period'].dt.to_period('M') == current_month]['Customer ID'].unique()
        new_patients = []

        for customer_id in active_customers:
            if customer_id not in self.patient_pool:
                # Treat reappearing or new patients as new
                new_patients.append(customer_id)
                self.patient_pool[customer_id] = {
                    'status': 'non-converted',
                    'last_seen': current_month.start_time
                }
            else:
                # Update last_seen for existing patients
                self.patient_pool[customer_id]['last_seen'] = current_month.start_time

        # Mark patients inactive if they have been unseen for days_until_inactive
        for customer_id, details in self.patient_pool.items():
            if details['status'] == 'non-converted':
                if (current_month.start_time - details['last_seen']).days > self.days_until_inactive:
                    details['status'] = 'inactive'

        return new_patients

    def apply_monthly_conversion(self, current_month, new_patients):
        # Calculate the eligible pool for conversion
        eligible_non_converted = [cid for cid, details in self.patient_pool.items() if details['status'] == 'non-converted']
        eligible_pool = eligible_non_converted + new_patients

        num_to_convert = int(len(eligible_pool) * self.conversion_ratio_monthly)
        converting_patients = random.sample(eligible_pool, min(num_to_convert, len(eligible_pool)))

        for customer_id in converting_patients:
            self.patient_pool[customer_id]['status'] = 'converted'

        return converting_patients

    def apply_churn(self, current_month):
        # Simulate churn for converted patients
        if current_month == self.start_date.to_period('M'):
            return []  # No churn in the first month

        converted_patients = [cid for cid, details in self.patient_pool.items() if details['status'] == 'converted']
        churned_patients = [cid for cid in converted_patients if random.random() < self.churn_probability]

        for customer_id in churned_patients:
            self.patient_pool[customer_id]['status'] = 'non-converted'
            # Revert their transactions from current month onward
            customer_transactions = self.transformed_df[(self.transformed_df['Customer ID'] == customer_id) & 
                                                        (self.transformed_df['Period'] >= current_month.start_time)]
            original_transactions = self.transaction_df[self.transaction_df['Customer ID'] == customer_id]

            for index, transaction in customer_transactions.iterrows():
                period = transaction['Period']
                original_transaction = original_transactions[original_transactions['Period'] == period]
                if not original_transaction.empty:
                    # Transform back to the original transaction
                    self.transformed_df.loc[index, 'Revenue'] = original_transaction['Revenue'].values[0]
                    self.transformed_df.loc[index, 'Expense'] = original_transaction['Expense'].values[0]
                    self.transformed_df.loc[index, 'Treatment'] = original_transaction['Treatment'].values[0]

        return churned_patients

    def transform_to_subscription_scheme(self):
        self.initialize_patient_pool()
        self.transformed_df = self.transaction_df.copy()
        all_periods = pd.period_range(self.start_date, self.end_date, freq='M')

        for current_month in all_periods:
            new_patients = self.update_patient_pool(current_month)
            converting_patients = self.apply_monthly_conversion(current_month, new_patients)
            churned_patients = self.apply_churn(current_month)

            # Transform transactions for converting patients
            for customer_id in converting_patients:
                transactions = self.transformed_df[self.transformed_df['Customer ID'] == customer_id]
                self.transformed_df.loc[transactions.index, 'Revenue'] = 0

                # Add subscription rows
                subscription_rows = []
                subscription_date = current_month.start_time
                while subscription_date <= self.end_date:
                    subscription_rows.append({
                        'Period': subscription_date,
                        'Treatment': None,
                        'Revenue': self.subscription_fee,
                        'Expense': 0,
                        'Customer ID': customer_id
                    })
                    subscription_date += relativedelta(months=1)

                # Validate rows and convert to DataFrame
                subscription_rows_df = pd.DataFrame(subscription_rows)
                subscription_rows_df['Revenue'].fillna(self.subscription_fee, inplace=True)

                # Append subscription rows directly into the DataFrame
                self.transformed_df = pd.concat([
                    self.transformed_df, subscription_rows_df
                ], ignore_index=True)

        # Add a 'Remark' column for validation
        self.transformed_df['Remark'] = np.nan
        self.transformed_df.loc[(self.transformed_df['Treatment'].isna()) & (self.transformed_df['Expense'] == 0), 'Remark'] = 'Subscription Fee Payment'

        # Drop rows where Treatment is None and Revenue is 0 (duplicates)
        self.transformed_df = self.transformed_df[~((self.transformed_df['Treatment'].isna()) & (self.transformed_df['Revenue'] == 0))]

        self.transformed_df.sort_values(['Period', 'Customer ID'], inplace=True)
        return self.transformed_df

    def aggregate_by_period(self):
        if self.transformed_df is None:
            raise ValueError("The DataFrame has not been transformed yet. Please call transform_to_subscription_scheme first.")

        # Aggregate basic Revenue and Expense
        aggregated_df = self.transformed_df.groupby(self.transformed_df['Period'].dt.to_period("M")).agg({
            'Revenue': 'sum',
            'Expense': 'sum'
        }).reset_index()

        # Validate Number of Converted and Non-Converted Patients
        def count_converted(period):
            return len([cid for cid, details in self.patient_pool.items()
                        if details['status'] == 'converted' and details['last_seen'] <= period.start_time])

        def count_non_converted(period):
            return len([cid for cid, details in self.patient_pool.items()
                        if details['status'] in ['non-converted', 'inactive'] and details['last_seen'] <= period.start_time])

        # Add corrected columns
        aggregated_df['Number of Converted Patients'] = aggregated_df['Period'].apply(count_converted)
        aggregated_df['Number of Non-Converted Patients'] = aggregated_df['Period'].apply(count_non_converted)

        # Calculate Total Unique Patients Up to Date as the sum of the above two columns
        aggregated_df['Total Unique Patients Up to Date'] = (
            aggregated_df['Number of Converted Patients'] + aggregated_df['Number of Non-Converted Patients']
        )

        aggregated_df['Number of New Patients'] = aggregated_df['Period'].apply(
            lambda period: len([cid for cid, details in self.patient_pool.items()
                                if details['last_seen'] == period.start_time])
        )

        return aggregated_df


In [52]:
transaction_data = pd.read_csv('forecast_df_treatment.csv')

In [53]:
transaction_data['Period'] = pd.to_datetime(transaction_data['Period'])
transaction_data['Month'] = transaction_data['Period'].dt.month
transaction_data['Year'] = transaction_data['Period'].dt.year

group_transaction = transaction_data.groupby(['Year', 'Month']).agg({
    'Customer ID': 'nunique',
    'Revenue': 'sum',
    'Expense': 'sum'
}).reset_index()

group_transaction


    

Unnamed: 0,Year,Month,Customer ID,Revenue,Expense
0,2025,1,106,62168.0,10532.63
1,2025,2,84,62355.0,9265.883333
2,2025,3,95,65857.0,10333.5
3,2025,4,170,53460.0,12283.613333
4,2025,5,143,61069.0,12371.64
5,2025,6,174,68862.0,14010.036667
6,2025,7,127,66652.0,12478.633333
7,2025,8,85,61552.0,8375.853333
8,2025,9,140,71062.0,13193.283333
9,2025,10,140,69310.0,12494.953333


In [54]:
transaction_data['Customer ID'].nunique()

683

In [55]:
transaction_df = pd.DataFrame(transaction_data)
subscription_fee = 100
conversion_ratio_monthly = 1
days_until_inactive = 90
churn_probability = 0

model = ModelSubscription(transaction_df, subscription_fee, conversion_ratio_monthly, days_until_inactive, churn_probability)
transformed_df = model.transform_to_subscription_scheme()


aggregated_df = model.aggregate_by_period()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  subscription_rows_df['Revenue'].fillna(self.subscription_fee, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  subscription_rows_df['Revenue'].fillna(self.subscription_fee, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work beca

In [56]:
transformed_df['Period'] = pd.to_datetime(transformed_df['Period'])
transformed_df['Month'] = transformed_df['Period'].dt.month
transformed_df['Year'] = transformed_df['Period'].dt.year

group_transaction_transformed = transformed_df.groupby(['Year', 'Month']).agg({
    'Customer ID': 'nunique',
    'Revenue': 'sum',
    'Expense': 'sum'
}).reset_index()

group_transaction_transformed

    

Unnamed: 0,Year,Month,Customer ID,Revenue,Expense
0,2025,1,683,68300.0,10532.63
1,2025,2,683,68300.0,9265.883333
2,2025,3,683,68300.0,10333.5
3,2025,4,683,68300.0,12283.613333
4,2025,5,683,68300.0,12371.64
5,2025,6,683,68300.0,14010.036667
6,2025,7,683,68300.0,12478.633333
7,2025,8,683,68300.0,8375.853333
8,2025,9,683,68300.0,13193.283333
9,2025,10,683,68300.0,12494.953333


In [44]:
aggregated_df

Unnamed: 0,Period,Revenue,Expense
0,2025-01,68300.0,10532.63
1,2025-02,68300.0,9265.883333
2,2025-03,68300.0,10333.5
3,2025-04,68300.0,12283.613333
4,2025-05,68300.0,12371.64
5,2025-06,68300.0,14010.036667
6,2025-07,68300.0,12478.633333
7,2025-08,68300.0,8375.853333
8,2025-09,68300.0,13193.283333
9,2025-10,68300.0,12494.953333


In [18]:
transformed_df.head(30)

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID,Remark
2510,2025-01-01,,50.0,0.0,Patient 104,Subscription Fee Payment
1562,2025-01-01,,50.0,0.0,Patient 118,Subscription Fee Payment
3098,2025-01-01,,50.0,0.0,Patient 120,Subscription Fee Payment
1598,2025-01-01,,50.0,0.0,Patient 122,Subscription Fee Payment
1634,2025-01-01,,50.0,0.0,Patient 138,Subscription Fee Payment
3026,2025-01-01,,50.0,0.0,Patient 161,Subscription Fee Payment
2882,2025-01-01,,50.0,0.0,Patient 165,Subscription Fee Payment
3014,2025-01-01,,50.0,0.0,Patient 170,Subscription Fee Payment
3086,2025-01-01,,50.0,0.0,Patient 172,Subscription Fee Payment
3146,2025-01-01,,50.0,0.0,Patient 175,Subscription Fee Payment


In [19]:
transformed_df[transformed_df['Customer ID'] == 'Patient 187']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID,Remark
4061,2025-02-01,,50.0,0.0,Patient 187,Subscription Fee Payment
114,2025-02-02,625.0,0.0,49.73,Patient 187,
4062,2025-03-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4063,2025-04-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4064,2025-05-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4065,2025-06-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4066,2025-07-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4067,2025-08-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4068,2025-09-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4069,2025-10-01,,50.0,0.0,Patient 187,Subscription Fee Payment


In [20]:
aggregated_df

Unnamed: 0,Period,Revenue,Expense,Number of Converted Patients,Number of Non-Converted Patients,Number of New Patients,Total Unique Patients Up to Date
0,2025-01,16636.0,10532.63,4,11,15,15
1,2025-02,25142.0,9265.883333,8,25,18,33
2,2025-03,33820.0,10333.5,14,35,16,49
3,2025-04,31646.0,12296.273333,27,67,45,94
4,2025-05,37456.0,12383.74,46,92,44,138
5,2025-06,59929.0,14097.576667,77,132,71,209
6,2025-07,30118.0,12478.633333,107,173,71,280
7,2025-08,44550.0,8460.863333,128,198,46,326
8,2025-09,36295.0,13082.683333,172,254,100,426
9,2025-10,35991.0,12776.173333,227,313,114,540


In [21]:
transaction_df['Revenue'].sum() - transformed_df['Revenue'].sum()

291800.0

In [10]:
converting_patients['Customer ID'].unique()

array(['Patient 796', 'Patient 655', 'Patient 239', 'Patient 710',
       'Patient 797', 'Patient 212', 'Patient 538', 'Patient 281',
       'Patient 456', 'Patient 283', 'Patient 269', 'Patient 398',
       'Patient 656', 'Patient 575', 'Patient 284', 'Patient 309',
       'Patient 180', 'Patient 402', 'Patient 232', 'Patient 590',
       'Patient 275', 'Patient 202', 'Patient 421', 'Patient 341',
       'Patient 318', 'Patient 695', 'Patient 40', 'Patient 537',
       'Patient 625', 'Patient 462', 'Patient 429', 'Patient 765',
       'Patient 216', 'Patient 731', 'Patient 192', 'Patient 395',
       'Patient 44', 'Patient 184', 'Patient 430', 'Patient 488',
       'Patient 616', 'Patient 240', 'Patient 740', 'Patient 491',
       'Patient 467', 'Patient 279', 'Patient 330', 'Patient 716',
       'Patient 32', 'Patient 356', 'Patient 121', 'Patient 556',
       'Patient 82', 'Patient 58', 'Patient 583', 'Patient 618',
       'Patient 426', 'Patient 123', 'Patient 303', 'Patient 397',


In [14]:
non_converting_patients['Customer ID'].unique()

array(['Patient 259', 'Patient 75', 'Patient 515', 'Patient 293',
       'Patient 215', 'Patient 709', 'Patient 550', 'Patient 686',
       'Patient 18', 'Patient 179', 'Patient 22', 'Patient 183',
       'Patient 678', 'Patient 763', 'Patient 304', 'Patient 773',
       'Patient 624', 'Patient 260', 'Patient 248', 'Patient 688',
       'Patient 241', 'Patient 345', 'Patient 362', 'Patient 391',
       'Patient 47', 'Patient 91', 'Patient 732', 'Patient 735',
       'Patient 55', 'Patient 349', 'Patient 554', 'Patient 766',
       'Patient 764', 'Patient 518', 'Patient 326', 'Patient 290',
       'Patient 42', 'Patient 408', 'Patient 787', 'Patient 460',
       'Patient 681', 'Patient 325', 'Patient 219', 'Patient 5',
       'Patient 770', 'Patient 144', 'Patient 693', 'Patient 189',
       'Patient 410', 'Patient 645', 'Patient 405', 'Patient 495',
       'Patient 228', 'Patient 412', 'Patient 177', 'Patient 699',
       'Patient 380', 'Patient 133', 'Patient 535', 'Patient 173',
    

In [20]:
transformed_df[transformed_df['Customer ID'] == 'Patient 699']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID
299,2025-02-17,615,382.0,97.99,Patient 699
403,2025-03-03,251,112.0,28.1,Patient 699
1281,2025-05-15,90,0.0,62.5,Patient 699
1570,2025-06-03,829,165.0,76.62,Patient 699
2074,2025-07-03,119,197.0,53.48,Patient 699


In [21]:
transaction_df[transaction_df['Customer ID'] == 'Patient 699']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID
162,2025-02-17,615,382.0,97.99,Patient 699
203,2025-03-03,251,112.0,28.1,Patient 699
564,2025-05-15,90,0.0,62.5,Patient 699
672,2025-06-03,829,165.0,76.62,Patient 699
865,2025-07-03,119,197.0,53.48,Patient 699
