In [51]:
import pandas as pd
import numpy as np
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import random

class ModelSubscription:
    def __init__(self, transaction_df, subscription_fee, conversion_ratio_monthly, days_until_inactive=90, churn_probability=0.1, start_date=None, end_date=None):
        self.transaction_df = transaction_df
        self.subscription_fee = subscription_fee
        self.conversion_ratio_monthly = conversion_ratio_monthly
        self.days_until_inactive = days_until_inactive
        self.churn_probability = churn_probability
        self.start_date = pd.to_datetime(start_date if start_date else transaction_df['Period'].min())
        self.end_date = pd.to_datetime(end_date if end_date else transaction_df['Period'].max())
        self.transformed_df = None
        self.patient_pool = {}

    def initialize_patient_pool(self):
        # Initialize the patient pool with non-converted patients
        self.transaction_df['Period'] = pd.to_datetime(self.transaction_df['Period'])
        for customer_id, group in self.transaction_df.groupby('Customer ID'):
            first_transaction = group['Period'].min()
            self.patient_pool[customer_id] = {
                'status': 'non-converted',
                'last_seen': first_transaction
            }

    def update_patient_pool(self, current_month):
        # Update the patient pool based on inactivity and new patients
        active_customers = self.transaction_df[self.transaction_df['Period'].dt.to_period('M') == current_month]['Customer ID'].unique()
        new_patients = []

        for customer_id in active_customers:
            if customer_id not in self.patient_pool:
                # Treat reappearing or new patients as new
                new_patients.append(customer_id)
                self.patient_pool[customer_id] = {
                    'status': 'non-converted',
                    'last_seen': current_month.start_time
                }
            else:
                # Update last_seen for existing patients
                self.patient_pool[customer_id]['last_seen'] = current_month.start_time

        # Mark patients inactive if they have been unseen for days_until_inactive
        for customer_id, details in self.patient_pool.items():
            if details['status'] == 'non-converted':
                if (current_month.start_time - details['last_seen']).days > self.days_until_inactive:
                    details['status'] = 'inactive'

        return new_patients

    def apply_monthly_conversion(self, current_month, new_patients):
        # Calculate the eligible pool for conversion
        eligible_non_converted = [cid for cid, details in self.patient_pool.items() if details['status'] == 'non-converted']
        eligible_pool = eligible_non_converted + new_patients

        num_to_convert = int(len(eligible_pool) * self.conversion_ratio_monthly)
        converting_patients = random.sample(eligible_pool, min(num_to_convert, len(eligible_pool)))

        for customer_id in converting_patients:
            self.patient_pool[customer_id]['status'] = 'converted'

        return converting_patients

    def apply_churn(self, current_month):
        # Simulate churn for converted patients
        if current_month == self.start_date.to_period('M'):
            return []  # No churn in the first month

        converted_patients = [cid for cid, details in self.patient_pool.items() if details['status'] == 'converted']
        churned_patients = [cid for cid in converted_patients if random.random() < self.churn_probability]

        for customer_id in churned_patients:
            self.patient_pool[customer_id]['status'] = 'non-converted'
            # Revert their transactions from current month onward
            customer_transactions = self.transformed_df[(self.transformed_df['Customer ID'] == customer_id) & 
                                                        (self.transformed_df['Period'] >= current_month.start_time)]
            original_transactions = self.transaction_df[self.transaction_df['Customer ID'] == customer_id]

            for index, transaction in customer_transactions.iterrows():
                period = transaction['Period']
                original_transaction = original_transactions[original_transactions['Period'] == period]
                if not original_transaction.empty:
                    # Transform back to the original transaction
                    self.transformed_df.loc[index, 'Revenue'] = original_transaction['Revenue'].values[0]
                    self.transformed_df.loc[index, 'Expense'] = original_transaction['Expense'].values[0]
                    self.transformed_df.loc[index, 'Treatment'] = original_transaction['Treatment'].values[0]

        return churned_patients

    def transform_to_subscription_scheme(self):
        self.initialize_patient_pool()
        self.transformed_df = self.transaction_df.copy()
        all_periods = pd.period_range(self.start_date, self.end_date, freq='M')

        for current_month in all_periods:
            new_patients = self.update_patient_pool(current_month)
            converting_patients = self.apply_monthly_conversion(current_month, new_patients)
            churned_patients = self.apply_churn(current_month)

            # Transform transactions for converting patients
            for customer_id in converting_patients:
                transactions = self.transformed_df[self.transformed_df['Customer ID'] == customer_id]
                self.transformed_df.loc[transactions.index, 'Revenue'] = 0

                # Add subscription rows
                subscription_rows = []
                subscription_date = current_month.start_time
                while subscription_date <= self.end_date:
                    subscription_rows.append({
                        'Period': subscription_date,
                        'Treatment': None,
                        'Revenue': self.subscription_fee,
                        'Expense': 0,
                        'Customer ID': customer_id
                    })
                    subscription_date += relativedelta(months=1)

                # Validate rows and convert to DataFrame
                subscription_rows_df = pd.DataFrame(subscription_rows)
                subscription_rows_df['Revenue'].fillna(self.subscription_fee, inplace=True)

                # Append subscription rows directly into the DataFrame
                self.transformed_df = pd.concat([
                    self.transformed_df, subscription_rows_df
                ], ignore_index=True)

        # Add a 'Remark' column for validation
        self.transformed_df['Remark'] = np.nan
        self.transformed_df.loc[(self.transformed_df['Treatment'].isna()) & (self.transformed_df['Expense'] == 0), 'Remark'] = 'Subscription Fee Payment'

        # Drop rows where Treatment is None and Revenue is 0 (duplicates)
        self.transformed_df = self.transformed_df[~((self.transformed_df['Treatment'].isna()) & (self.transformed_df['Revenue'] == 0))]

        self.transformed_df.sort_values(['Period', 'Customer ID'], inplace=True)
        return self.transformed_df

    def aggregate_by_period(self):
        if self.transformed_df is None:
            raise ValueError("The DataFrame has not been transformed yet. Please call transform_to_subscription_scheme first.")

        # Aggregate basic Revenue and Expense
        aggregated_df = self.transformed_df.groupby(self.transformed_df['Period'].dt.to_period("M")).agg({
            'Revenue': 'sum',
            'Expense': 'sum'
        }).reset_index()

        # Validate Number of Converted and Non-Converted Patients
        def count_converted(period):
            return len([cid for cid, details in self.patient_pool.items()
                        if details['status'] == 'converted' and details['last_seen'] <= period.start_time])

        def count_non_converted(period):
            return len([cid for cid, details in self.patient_pool.items()
                        if details['status'] in ['non-converted', 'inactive'] and details['last_seen'] <= period.start_time])

        # Add corrected columns
        aggregated_df['Number of Converted Patients'] = aggregated_df['Period'].apply(count_converted)
        aggregated_df['Number of Non-Converted Patients'] = aggregated_df['Period'].apply(count_non_converted)

        # Calculate Total Unique Patients Up to Date as the sum of the above two columns
        aggregated_df['Total Unique Patients Up to Date'] = (
            aggregated_df['Number of Converted Patients'] + aggregated_df['Number of Non-Converted Patients']
        )

        aggregated_df['Number of New Patients'] = aggregated_df['Period'].apply(
            lambda period: len([cid for cid, details in self.patient_pool.items()
                                if details['last_seen'] == period.start_time])
        )

        return aggregated_df


In [52]:
transaction_data = pd.read_csv('forecast_df_treatment.csv')

In [53]:
transaction_data['Period'] = pd.to_datetime(transaction_data['Period'])
transaction_data['Month'] = transaction_data['Period'].dt.month
transaction_data['Year'] = transaction_data['Period'].dt.year

group_transaction = transaction_data.groupby(['Year', 'Month']).agg({
    'Customer ID': 'nunique',
    'Revenue': 'sum',
    'Expense': 'sum'
}).reset_index()

group_transaction


    

Unnamed: 0,Year,Month,Customer ID,Revenue,Expense
0,2025,1,106,62168.0,10532.63
1,2025,2,84,62355.0,9265.883333
2,2025,3,95,65857.0,10333.5
3,2025,4,170,53460.0,12283.613333
4,2025,5,143,61069.0,12371.64
5,2025,6,174,68862.0,14010.036667
6,2025,7,127,66652.0,12478.633333
7,2025,8,85,61552.0,8375.853333
8,2025,9,140,71062.0,13193.283333
9,2025,10,140,69310.0,12494.953333


In [54]:
transaction_data['Customer ID'].nunique()

683

In [55]:
transaction_df = pd.DataFrame(transaction_data)
subscription_fee = 100
conversion_ratio_monthly = 1
days_until_inactive = 90
churn_probability = 0

model = ModelSubscription(transaction_df, subscription_fee, conversion_ratio_monthly, days_until_inactive, churn_probability)
transformed_df = model.transform_to_subscription_scheme()


aggregated_df = model.aggregate_by_period()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  subscription_rows_df['Revenue'].fillna(self.subscription_fee, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  subscription_rows_df['Revenue'].fillna(self.subscription_fee, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work beca

In [56]:
transformed_df['Period'] = pd.to_datetime(transformed_df['Period'])
transformed_df['Month'] = transformed_df['Period'].dt.month
transformed_df['Year'] = transformed_df['Period'].dt.year

group_transaction_transformed = transformed_df.groupby(['Year', 'Month']).agg({
    'Customer ID': 'nunique',
    'Revenue': 'sum',
    'Expense': 'sum'
}).reset_index()

group_transaction_transformed

    

Unnamed: 0,Year,Month,Customer ID,Revenue,Expense
0,2025,1,683,68300.0,10532.63
1,2025,2,683,68300.0,9265.883333
2,2025,3,683,68300.0,10333.5
3,2025,4,683,68300.0,12283.613333
4,2025,5,683,68300.0,12371.64
5,2025,6,683,68300.0,14010.036667
6,2025,7,683,68300.0,12478.633333
7,2025,8,683,68300.0,8375.853333
8,2025,9,683,68300.0,13193.283333
9,2025,10,683,68300.0,12494.953333


In [44]:
aggregated_df

Unnamed: 0,Period,Revenue,Expense
0,2025-01,68300.0,10532.63
1,2025-02,68300.0,9265.883333
2,2025-03,68300.0,10333.5
3,2025-04,68300.0,12283.613333
4,2025-05,68300.0,12371.64
5,2025-06,68300.0,14010.036667
6,2025-07,68300.0,12478.633333
7,2025-08,68300.0,8375.853333
8,2025-09,68300.0,13193.283333
9,2025-10,68300.0,12494.953333


In [18]:
transformed_df.head(30)

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID,Remark
2510,2025-01-01,,50.0,0.0,Patient 104,Subscription Fee Payment
1562,2025-01-01,,50.0,0.0,Patient 118,Subscription Fee Payment
3098,2025-01-01,,50.0,0.0,Patient 120,Subscription Fee Payment
1598,2025-01-01,,50.0,0.0,Patient 122,Subscription Fee Payment
1634,2025-01-01,,50.0,0.0,Patient 138,Subscription Fee Payment
3026,2025-01-01,,50.0,0.0,Patient 161,Subscription Fee Payment
2882,2025-01-01,,50.0,0.0,Patient 165,Subscription Fee Payment
3014,2025-01-01,,50.0,0.0,Patient 170,Subscription Fee Payment
3086,2025-01-01,,50.0,0.0,Patient 172,Subscription Fee Payment
3146,2025-01-01,,50.0,0.0,Patient 175,Subscription Fee Payment


In [19]:
transformed_df[transformed_df['Customer ID'] == 'Patient 187']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID,Remark
4061,2025-02-01,,50.0,0.0,Patient 187,Subscription Fee Payment
114,2025-02-02,625.0,0.0,49.73,Patient 187,
4062,2025-03-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4063,2025-04-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4064,2025-05-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4065,2025-06-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4066,2025-07-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4067,2025-08-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4068,2025-09-01,,50.0,0.0,Patient 187,Subscription Fee Payment
4069,2025-10-01,,50.0,0.0,Patient 187,Subscription Fee Payment


In [20]:
aggregated_df

Unnamed: 0,Period,Revenue,Expense,Number of Converted Patients,Number of Non-Converted Patients,Number of New Patients,Total Unique Patients Up to Date
0,2025-01,16636.0,10532.63,4,11,15,15
1,2025-02,25142.0,9265.883333,8,25,18,33
2,2025-03,33820.0,10333.5,14,35,16,49
3,2025-04,31646.0,12296.273333,27,67,45,94
4,2025-05,37456.0,12383.74,46,92,44,138
5,2025-06,59929.0,14097.576667,77,132,71,209
6,2025-07,30118.0,12478.633333,107,173,71,280
7,2025-08,44550.0,8460.863333,128,198,46,326
8,2025-09,36295.0,13082.683333,172,254,100,426
9,2025-10,35991.0,12776.173333,227,313,114,540


In [21]:
transaction_df['Revenue'].sum() - transformed_df['Revenue'].sum()

291800.0

In [10]:
converting_patients['Customer ID'].unique()

array(['Patient 796', 'Patient 655', 'Patient 239', 'Patient 710',
       'Patient 797', 'Patient 212', 'Patient 538', 'Patient 281',
       'Patient 456', 'Patient 283', 'Patient 269', 'Patient 398',
       'Patient 656', 'Patient 575', 'Patient 284', 'Patient 309',
       'Patient 180', 'Patient 402', 'Patient 232', 'Patient 590',
       'Patient 275', 'Patient 202', 'Patient 421', 'Patient 341',
       'Patient 318', 'Patient 695', 'Patient 40', 'Patient 537',
       'Patient 625', 'Patient 462', 'Patient 429', 'Patient 765',
       'Patient 216', 'Patient 731', 'Patient 192', 'Patient 395',
       'Patient 44', 'Patient 184', 'Patient 430', 'Patient 488',
       'Patient 616', 'Patient 240', 'Patient 740', 'Patient 491',
       'Patient 467', 'Patient 279', 'Patient 330', 'Patient 716',
       'Patient 32', 'Patient 356', 'Patient 121', 'Patient 556',
       'Patient 82', 'Patient 58', 'Patient 583', 'Patient 618',
       'Patient 426', 'Patient 123', 'Patient 303', 'Patient 397',


In [14]:
non_converting_patients['Customer ID'].unique()

array(['Patient 259', 'Patient 75', 'Patient 515', 'Patient 293',
       'Patient 215', 'Patient 709', 'Patient 550', 'Patient 686',
       'Patient 18', 'Patient 179', 'Patient 22', 'Patient 183',
       'Patient 678', 'Patient 763', 'Patient 304', 'Patient 773',
       'Patient 624', 'Patient 260', 'Patient 248', 'Patient 688',
       'Patient 241', 'Patient 345', 'Patient 362', 'Patient 391',
       'Patient 47', 'Patient 91', 'Patient 732', 'Patient 735',
       'Patient 55', 'Patient 349', 'Patient 554', 'Patient 766',
       'Patient 764', 'Patient 518', 'Patient 326', 'Patient 290',
       'Patient 42', 'Patient 408', 'Patient 787', 'Patient 460',
       'Patient 681', 'Patient 325', 'Patient 219', 'Patient 5',
       'Patient 770', 'Patient 144', 'Patient 693', 'Patient 189',
       'Patient 410', 'Patient 645', 'Patient 405', 'Patient 495',
       'Patient 228', 'Patient 412', 'Patient 177', 'Patient 699',
       'Patient 380', 'Patient 133', 'Patient 535', 'Patient 173',
    

In [20]:
transformed_df[transformed_df['Customer ID'] == 'Patient 699']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID
299,2025-02-17,615,382.0,97.99,Patient 699
403,2025-03-03,251,112.0,28.1,Patient 699
1281,2025-05-15,90,0.0,62.5,Patient 699
1570,2025-06-03,829,165.0,76.62,Patient 699
2074,2025-07-03,119,197.0,53.48,Patient 699


In [21]:
transaction_df[transaction_df['Customer ID'] == 'Patient 699']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID
162,2025-02-17,615,382.0,97.99,Patient 699
203,2025-03-03,251,112.0,28.1,Patient 699
564,2025-05-15,90,0.0,62.5,Patient 699
672,2025-06-03,829,165.0,76.62,Patient 699
865,2025-07-03,119,197.0,53.48,Patient 699


### Another Attempt


In [19]:
import pandas as pd
import numpy as np
from datetime import timedelta

class ModelSubscriptionScheme:
    def __init__(self, conversion_rate_monthly, monthly_subscription_fee, days_until_inactive):
        self.conversion_rate_monthly = conversion_rate_monthly
        self.monthly_subscription_fee = monthly_subscription_fee
        self.days_until_inactive = days_until_inactive

        self.converting_patient = set()
        self.non_converting_patient = set()
        self.new_converting_patient = set()
        self.patient_last_transaction = {}  # Track last transaction date for each patient

    def transform(self, df):
        df["Period"] = pd.to_datetime(df["Period"])
        df["Month-Year"] = df["Period"].dt.to_period("M")
        months = df["Month-Year"].unique()

        transformed_rows = []

        for month in months:
            month_data = df[df["Month-Year"] == month]
            unique_patients = set(month_data["Customer ID"].unique())

            # Update last transaction dates
            for patient_id in unique_patients:
                self.patient_last_transaction[patient_id] = month_data[month_data["Customer ID"] == patient_id]["Period"].max()

            # Identify newly seen patients
            new_patients = unique_patients - self.converting_patient - self.non_converting_patient

            # Add new patients to the non-converting pool
            self.non_converting_patient.update(new_patients)

            # Banish inactive patients
            self._banish_inactive_patients(month_data["Period"].max())

            # Determine new converting patients
            pool_size = len(self.non_converting_patient)
            num_to_convert = int(self.conversion_rate_monthly * pool_size)
            self.new_converting_patient = set(
                np.random.choice(list(self.non_converting_patient), num_to_convert, replace=False)
            )

            # Update pools
            self.non_converting_patient -= self.new_converting_patient
            self.converting_patient.update(self.new_converting_patient)

            # Process converting patients
            self._process_converting_patients(month_data, transformed_rows)

            # Process new converting patients
            self._process_new_converting_patients(month_data, transformed_rows)

        # Create final DataFrame
        transformed_df = pd.DataFrame(transformed_rows, columns=df.columns)
        return pd.concat([df, transformed_df]).sort_values("Period").reset_index(drop=True)

    def _process_converting_patients(self, month_data, transformed_rows):
        for patient_id in self.converting_patient:
            patient_transactions = month_data[month_data["Customer ID"] == patient_id]
            subscription_row = self._generate_subscription_row(
                patient_id, self._get_subscription_date(patient_transactions)
            )
            transformed_rows.append(subscription_row)

            for _, row in patient_transactions.iterrows():
                row["Revenue"] = 0
                transformed_rows.append(row)

    def _process_new_converting_patients(self, month_data, transformed_rows):
        for patient_id in self.new_converting_patient:
            patient_transactions = month_data[month_data["Customer ID"] == patient_id]
            subscription_row = self._generate_subscription_row(
                patient_id, patient_transactions["Period"].min()
            )
            transformed_rows.append(subscription_row)

            for _, row in patient_transactions.iterrows():
                row["Revenue"] = 0
                transformed_rows.append(row)

    def _banish_inactive_patients(self, current_month_end):
        inactive_patients = set()
        for patient_id in self.non_converting_patient:
            # Find the last transaction date for the patient
            last_transaction_date = self.patient_last_transaction.get(
                patient_id, current_month_end - timedelta(days=self.days_until_inactive)
            )
            if (current_month_end - last_transaction_date).days > self.days_until_inactive:
                inactive_patients.add(patient_id)

        self.non_converting_patient -= inactive_patients

    def _generate_subscription_row(self, patient_id, date):
        return {
            "Period": date,
            "Treatment": None,
            "Revenue": self.monthly_subscription_fee,
            "Expense": 0,
            "Customer ID": patient_id,
        }

    def _get_subscription_date(self, patient_transactions):
        if not patient_transactions.empty:
            return patient_transactions[patient_transactions["Treatment"].isna()]["Period"].min()
        return None  # Fallback to first day logic handled elsewhere


In [20]:
# Example Usage
df = pd.read_csv("forecast_df_treatment.csv")
model = ModelSubscriptionScheme(conversion_rate_monthly=0.2, monthly_subscription_fee=100, days_until_inactive=90)
transformed_df = model.transform(df)
transformed_df

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID,Month-Year
0,2025-01-01,331,179.0,40.64,Patient 428,2025-01
1,2025-01-01,531,87.0,23.59,Patient 526,2025-01
2,2025-01-01,656,129.0,39.08,Patient 571,2025-01
3,2025-01-01,437,93.0,41.75,Patient 259,2025-01
4,2025-01-01,118,56.0,7.30,Patient 75,2025-01
...,...,...,...,...,...,...
5726,NaT,,100.0,0.00,Patient 148,NaT
5727,NaT,,100.0,0.00,Patient 474,NaT
5728,NaT,,100.0,0.00,Patient 465,NaT
5729,NaT,,100.0,0.00,Patient 619,NaT


In [14]:
transformed_df[transformed_df['Customer ID'] == 'Patient 102']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID,Month-Year
0,2024-12-01 19:43:38.081623,,100.0,0.0,Patient 102,NaT
22,2024-12-01 19:43:38.240077,,100.0,0.0,Patient 102,NaT
132,2024-12-01 19:43:38.466998,,100.0,0.0,Patient 102,NaT
255,2024-12-01 19:43:38.845539,,100.0,0.0,Patient 102,NaT
414,2024-12-01 19:43:39.229777,,100.0,0.0,Patient 102,NaT
639,2024-12-01 19:43:39.647612,,100.0,0.0,Patient 102,NaT
932,2024-12-01 19:43:40.069339,,100.0,0.0,Patient 102,NaT
1382,2024-12-01 19:43:40.582739,,100.0,0.0,Patient 102,NaT
1738,2024-12-01 19:43:41.131160,,100.0,0.0,Patient 102,NaT
2134,2024-12-01 19:43:41.650202,,100.0,0.0,Patient 102,NaT


In [8]:
df

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID,Month-Year
0,2025-01-01,331,179.0,40.640000,Patient 428,2025-01
1,2025-01-01,531,87.0,23.590000,Patient 526,2025-01
2,2025-01-01,656,129.0,39.080000,Patient 571,2025-01
3,2025-01-01,437,93.0,41.750000,Patient 259,2025-01
4,2025-01-01,118,56.0,7.300000,Patient 75,2025-01
...,...,...,...,...,...,...
1533,2025-12-30,633,336.0,43.550000,Patient 705,2025-12
1534,2025-12-31,739,150.0,59.416667,Patient 586,2025-12
1535,2025-12-31,365,132.0,144.060000,Patient 653,2025-12
1536,2025-12-31,388,212.0,79.280000,Patient 467,2025-12


### Another Attempts (Claude)

In [149]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

class ModelSubscriptionScheme:
    def __init__(self, conversion_rate_monthly: float, monthly_subscription_fee: float, days_until_inactive: int):
        """
        Initialize the subscription model transformation class.
        
        Args:
            conversion_rate_monthly (float): Monthly conversion rate (0-1)
            monthly_subscription_fee (float): Subscription fee amount
            days_until_inactive (int): Days until a non-converting customer is considered inactive
        """
        self.conversion_rate_monthly = conversion_rate_monthly
        self.monthly_subscription_fee = monthly_subscription_fee
        self.days_until_inactive = days_until_inactive
        
        # Initialize patient pools
        self.converting_patient = set()
        self.new_converting_patient = set()
        self.non_converting_patient = set()
        
        # Track last transaction dates for non-converting patients
        self.last_transaction_dates = {}
        
        # Track subscription dates for converting patients
        self.subscription_dates = {}

    def _is_end_of_month_date(self, date: pd.Timestamp) -> bool:
        """Check if the date is in the last 4 days of the month."""
        next_month = date + pd.offsets.MonthEnd(0) + pd.Timedelta(days=1)
        days_until_month_end = (next_month - date).days
        return days_until_month_end <= 4

    def _get_next_subscription_date(self, current_date: pd.Timestamp) -> pd.Timestamp:
        """
        Get the next subscription date, handling end-of-month cases.
        For dates in the last 4 days of the month, use the last day of next month.
        Otherwise, use the same day in the next month.
        """
        if self._is_end_of_month_date(current_date):
            # If it's end of month, use last day of next month
            next_month_start = current_date + pd.offsets.MonthBegin(1)
            return next_month_start + pd.offsets.MonthEnd(1)
        else:
            # Otherwise, try to use the same day next month
            year = current_date.year
            month = current_date.month + 1
            day = current_date.day
            
            if month > 12:
                year += 1
                month = 1
                
            return pd.Timestamp(year=year, month=month, day=day)

    def _generate_subscription_rows(self, customer_id: str, start_date: pd.Timestamp, end_date: pd.Timestamp) -> list:
        """
        Generate subscription payment rows for a customer between start_date and end_date.
        """
        subscription_rows = []
        current_date = start_date
        
        while current_date <= end_date:
            subscription_row = pd.Series({
                'Period': current_date,
                'Treatment': 'subscription',
                'Revenue': self.monthly_subscription_fee,
                'Expense': 0,
                'Customer ID': customer_id
            })
            subscription_rows.append(pd.DataFrame([subscription_row]))
            current_date = self._get_next_subscription_date(current_date)
        
        return subscription_rows

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the transaction dataframe to implement subscription model.
        """
        # Convert Period to datetime and sort
        df['Period'] = pd.to_datetime(df['Period'])
        df = df.sort_values('Period')
        
        # Get overall date range
        start_date = df['Period'].min()
        end_date = df['Period'].max()
        
        # Initialize transformed dataframe list
        transformed_dfs = []
        
        # Process first month
        first_month = df['Period'].dt.strftime('%Y-%m').iloc[0]
        first_month_df = df[df['Period'].dt.strftime('%Y-%m') == first_month]
        
        # Get unique patients for first month
        first_month_patients = set(first_month_df['Customer ID'].unique())
        
        # Add all patients to non_converting initially
        self.non_converting_patient.update(first_month_patients)
        
        # Select initial converts
        num_initial_converts = int(len(first_month_patients) * self.conversion_rate_monthly)
        if num_initial_converts > 0:
            initial_converts = set(np.random.choice(
                list(first_month_patients),
                size=num_initial_converts,
                replace=False
            ))
            
            # Move to new_converting_patient pool
            self.new_converting_patient.update(initial_converts)
            self.non_converting_patient.difference_update(initial_converts)
            # print(f"Initial Converts: {len(initial_converts)}")
            # print(f"Non Converting: {len(self.non_converting_patient)}")
        
        # Process all months
        for month_year, month_df in df.groupby(df['Period'].dt.strftime('%Y-%m')):
            current_month_start = pd.to_datetime(month_year + '-01')
            current_month_end = current_month_start + pd.offsets.MonthEnd(1)
            month_transformed = []
            
            # Get unique patients for the month
            month_patients = set(month_df['Customer ID'].unique())
            
            # ZAKI
            self.non_converting_patient.update(month_patients)
            self.non_converting_patient.difference_update(self.converting_patient)
            # ZAKI DONE
            
            # Select new converts from non_converting pool
            potential_converts = list(self.non_converting_patient)
            num_converts = int(len(potential_converts) * self.conversion_rate_monthly)
            
            
            
            if potential_converts and num_converts > 0:
                new_converts = set(np.random.choice(
                    potential_converts,
                    size=min(num_converts, len(potential_converts)),
                    replace=False
                ))
                self.new_converting_patient.update(new_converts)
                self.non_converting_patient.difference_update(new_converts)

            
            # Process each customer's transactions
            for customer_id in month_patients:
                customer_df = month_df[month_df['Customer ID'] == customer_id].copy()
                
                if customer_id in self.converting_patient:
                    # Existing subscriber - keep their treatment transactions with zero revenue
                    customer_df['Revenue'] = 0
                    month_transformed.append(customer_df)
                    
                elif customer_id in self.new_converting_patient:
                    # New subscriber
                    first_transaction_date = customer_df['Period'].min()
                    
                    # Store subscription date
                    self.subscription_dates[customer_id] = first_transaction_date
                    
                    # Set treatment revenues to 0
                    customer_df['Revenue'] = 0
                    month_transformed.append(customer_df)
                    
                    # Generate subscription payments from first transaction to end date
                    subscription_rows = self._generate_subscription_rows(
                        customer_id,
                        first_transaction_date,
                        end_date
                    )
                    month_transformed.extend(subscription_rows)
                else:
                    # Non-converting patient - keep original transactions
                    month_transformed.append(customer_df)
                    print(f"{customer_id} is non-converting")
            
            # print(f"Period {month_year}: Number of Non Converting - {len(self.non_converting_patient)}")
            # print(f"Period {month_year}: New Converts - {len(self.new_converting_patient)}")
            # print(f"Period {month_year}: Converting - {len(self.converting_patient)}")
            
            
            # Move new converts to converting pool for next month
            self.converting_patient.update(self.new_converting_patient)
            self.new_converting_patient.clear()
            

            

            

            
            # Combine all transformations for the month
            if month_transformed:
                month_transformed_df = pd.concat(month_transformed, ignore_index=True)
                transformed_dfs.append(month_transformed_df)
        
        # Combine all months
        result_df = pd.concat(transformed_dfs, ignore_index=True)
        return result_df.sort_values(['Period', 'Customer ID'])

    def get_metrics(self) -> dict:
        """Get current metrics about the transformation."""
        return {
            'total_subscribers': len(self.converting_patient) + len(self.new_converting_patient),
            'active_non_subscribers': len(self.non_converting_patient),
            'new_converts_this_month': len(self.new_converting_patient),
            'established_subscribers': len(self.converting_patient)
        }

##### New Session with Claude

In [110]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

class ModelSubscriptionScheme:
    def __init__(self, conversion_rate_monthly: float, monthly_subscription_fee: float, 
                 days_until_inactive: int, churn_probability: float):
        """
        Initialize the subscription model transformation class.
        
        Args:
            conversion_rate_monthly (float): Monthly conversion rate (0-1)
            monthly_subscription_fee (float): Subscription fee amount
            days_until_inactive (int): Days until a non-converting customer is considered inactive
            churn_probability (float): Monthly probability of a converting customer churning (0-1)
        """
        self.conversion_rate_monthly = conversion_rate_monthly
        self.monthly_subscription_fee = monthly_subscription_fee
        self.days_until_inactive = days_until_inactive
        self.churn_probability = churn_probability
        
        # Initialize patient pools
        self.converting_patient = set()
        self.new_converting_patient = set()
        self.non_converting_patient = set()
        
        # Track last transaction dates for non-converting patients
        self.last_transaction_dates = {}
        
        # Track subscription dates for converting patients
        self.subscription_dates = {}
        
        # Store original dataframe for revenue restoration
        self.original_df = None

    def _is_end_of_month_date(self, date: pd.Timestamp) -> bool:
        """Check if the date is in the last 4 days of the month."""
        next_month = date + pd.offsets.MonthEnd(0) + pd.Timedelta(days=1)
        days_until_month_end = (next_month - date).days
        return days_until_month_end <= 4

    def _get_next_subscription_date(self, current_date: pd.Timestamp) -> pd.Timestamp:
        """
        Get the next subscription date, handling end-of-month cases.
        For dates in the last 4 days of the month, use the last day of next month.
        Otherwise, use the same day in the next month.
        """
        if self._is_end_of_month_date(current_date):
            # If it's end of month, use last day of next month
            next_month_start = current_date + pd.offsets.MonthBegin(1)
            return next_month_start + pd.offsets.MonthEnd(1)
        else:
            # Otherwise, try to use the same day next month
            year = current_date.year
            month = current_date.month + 1
            day = current_date.day
            
            if month > 12:
                year += 1
                month = 1
                
            return pd.Timestamp(year=year, month=month, day=day)

    def _generate_subscription_rows(self, customer_id: str, current_month_start: pd.Timestamp) -> list:
        """
        Generate subscription payment row for a customer for the current month only.
        
        Args:
            customer_id (str): Customer ID
            current_month_start (pd.Timestamp): Start of the current month
            
        Returns:
            list: List containing single subscription row DataFrame
        """
        subscription_row = pd.Series({
            'Period': current_month_start,
            'Treatment': 'subscription',
            'Revenue': self.monthly_subscription_fee,
            'Expense': 0,
            'Customer ID': customer_id
        })
        return [pd.DataFrame([subscription_row])]

    def _process_churns(self, current_month_start: pd.Timestamp, month_df: pd.DataFrame) -> tuple[set, pd.DataFrame]:
        """
        Process potential churns for the current month.
        
        Returns:
            tuple: (churned_customers, updated_month_df)
        """
        # Determine which converting customers churn this month
        potential_churners = self.converting_patient.copy()
        num_churners = np.random.binomial(n=len(potential_churners), p=self.churn_probability)
        
        churned_customers = set()
        if num_churners > 0 and potential_churners:
            churned_customers = set(np.random.choice(
                list(potential_churners),
                size=num_churners,
                replace=False
            ))
            
            # Remove churned customers from converting pool
            self.converting_patient.difference_update(churned_customers)
            
            # For churned customers with transactions this month:
            # 1. Restore their original revenue
            # 2. Remove their subscription payment
            if not month_df.empty:
                month_transformed = []
                current_month_str = current_month_start.strftime('%Y-%m')
                
                for _, group in month_df.groupby('Customer ID'):
                    
                    customer_id = group['Customer ID'].iloc[0]
                    
                    if customer_id in churned_customers:
                        # Get original revenue for treatment transactions
                        customer_orig = self.original_df[
                            (self.original_df['Customer ID'] == customer_id) & 
                            (pd.to_datetime(self.original_df['Period']).dt.strftime('%Y-%m') == current_month_str)
                        ]
                        
                        if not customer_orig.empty:
                            # Keep only treatment transactions with original revenue
                            treatment_rows = group[group['Treatment'] != 'subscription'].copy()
                            treatment_rows['Revenue'] = customer_orig['Revenue'].values
                            month_transformed.append(treatment_rows)
                            
                            print(f"Restoring Revenue for Customer {customer_id} in {current_month_str}")
                            
                    else:
                        month_transformed.append(group)
                
                if month_transformed:
                    month_df = pd.concat(month_transformed, ignore_index=True)
                else:
                    month_df = pd.DataFrame(columns=month_df.columns)
        
        return churned_customers, month_df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the transaction dataframe to implement subscription model with churn.
        """
        # Store original dataframe for revenue restoration
        self.original_df = df.copy()
        
        # Convert Period to datetime and sort
        df['Period'] = pd.to_datetime(df['Period'])
        self.original_df['Period'] = pd.to_datetime(self.original_df['Period'])
        df = df.sort_values('Period')
        
        # Get overall date range
        start_date = df['Period'].min()
        end_date = df['Period'].max()
        
        # Initialize transformed dataframe list
        transformed_dfs = []
        
        # Process first month
        first_month = df['Period'].dt.strftime('%Y-%m').iloc[0]
        first_month_df = df[df['Period'].dt.strftime('%Y-%m') == first_month]
        
        # Get unique patients for first month
        first_month_patients = set(first_month_df['Customer ID'].unique())
        
        # Add all patients to non_converting initially
        self.non_converting_patient.update(first_month_patients)
        
        # Select initial converts
        num_initial_converts = int(len(first_month_patients) * self.conversion_rate_monthly)
        if num_initial_converts > 0:
            initial_converts = set(np.random.choice(
                list(first_month_patients),
                size=num_initial_converts,
                replace=False
            ))
            
            # Move to new_converting_patient pool
            self.new_converting_patient.update(initial_converts)
            self.non_converting_patient.difference_update(initial_converts)
        
        # Track active subscribers for each month
        active_subscribers = set()
        
        # Process all months
        for month_year, month_df in df.groupby(df['Period'].dt.strftime('%Y-%m')):
            current_month_start = pd.to_datetime(month_year + '-01')
            current_month_end = current_month_start + pd.offsets.MonthEnd(1)
            
            # Get unique patients with transactions this month
            month_patients = set(month_df['Customer ID'].unique())
            
            # Update active subscribers from previous month
            active_subscribers = (active_subscribers | self.converting_patient | self.new_converting_patient)
            
            # Process churns for this month
            churned_customers, month_df = self._process_churns(current_month_start, month_df)
            
            # Remove churned customers from active subscribers
            active_subscribers = active_subscribers - churned_customers
            
            month_transformed = []
            
            # Reset non_converting pool to only include customers with transactions this month
            # who aren't active subscribers
            self.non_converting_patient = month_patients - active_subscribers
            
            # Add churned customers with transactions back to non_converting pool
            churned_with_transactions = churned_customers.intersection(month_patients)
            if churned_with_transactions:
                self.non_converting_patient.update(churned_with_transactions)
            
            # Select new converts only from customers with transactions this month
            potential_converts = list(self.non_converting_patient)
            num_converts = int(len(potential_converts) * self.conversion_rate_monthly)
            
            if potential_converts and num_converts > 0:
                new_converts = set(np.random.choice(
                    potential_converts,
                    size=min(num_converts, len(potential_converts)),
                    replace=False
                ))
                self.new_converting_patient.update(new_converts)
                self.non_converting_patient.difference_update(new_converts)
            
            # Process each customer's transactions
            for customer_id in month_patients:
                customer_df = month_df[month_df['Customer ID'] == customer_id].copy()
                
                if customer_id in active_subscribers:
                    # Existing subscriber
                    # Set treatment revenues to 0
                    customer_df['Revenue'] = 0
                    month_transformed.append(customer_df)
                    
                    # Generate subscription payment for current month
                    subscription_rows = self._generate_subscription_rows(
                        customer_id,
                        current_month_start
                    )
                    month_transformed.extend(subscription_rows)
                    
                elif customer_id in self.new_converting_patient:
                    # New subscriber
                    first_transaction_date = customer_df[customer_df['Customer ID'] == customer_id]['Period'].min()
                    
                    # Store subscription date
                    self.subscription_dates[customer_id] = first_transaction_date
                    
                    # Set treatment revenues to 0
                    customer_df['Revenue'] = 0
                    month_transformed.append(customer_df)
                    
                    # Generate subscription payment for current month
                    subscription_rows = self._generate_subscription_rows(
                        customer_id,
                        current_month_start
                    )
                    month_transformed.extend(subscription_rows)
                else:
                    # Non-converting patient - keep original transactions
                    month_transformed.append(customer_df)
            
            # Generate subscription rows for active subscribers without transactions this month
            subscribers_without_transactions = active_subscribers - month_patients
            for customer_id in subscribers_without_transactions:
                subscription_rows = self._generate_subscription_rows(
                    customer_id,
                    current_month_start
                )
                month_transformed.extend(subscription_rows)
            
            # Move new converts to converting pool for next month
            self.converting_patient.update(self.new_converting_patient)
            self.new_converting_patient.clear()
            
            # Combine all transformations for the month
            if month_transformed:
                month_transformed_df = pd.concat(month_transformed, ignore_index=True)
                transformed_dfs.append(month_transformed_df)
        
        # Combine all months
        result_df = pd.concat(transformed_dfs, ignore_index=True)
        return result_df.sort_values(['Period', 'Customer ID'])
    
    
    def get_metrics(self) -> dict:
        """Get current metrics about the transformation."""
        return {
            'total_subscribers': len(self.converting_patient) + len(self.new_converting_patient),
            'active_non_subscribers': len(self.non_converting_patient),
            'new_converts_this_month': len(self.new_converting_patient),
            'established_subscribers': len(self.converting_patient)
        }

In [111]:
import pandas as pd
import numpy as np

# Using your sample data
df = pd.read_csv("forecast_df_treatment.csv")

# Initialize model with parameters
model = ModelSubscriptionScheme(
    conversion_rate_monthly=0.3,
    monthly_subscription_fee=100,
    days_until_inactive=90,
    churn_probability=0.05  # 5% monthly churn rate
)

# Transform the data
transformed_df = model.transform(df)

# Check results
# print("\nChecking subscription rows:")
# subscription_rows = transformed_df[transformed_df['Treatment'] == 'subscription']
# print(f"Number of subscription rows: {len(subscription_rows)}")
# if len(subscription_rows) > 0:
#     print("\nSample subscription rows:")
#     print(subscription_rows.to_string())

print("\nModel metrics:")
print(model.get_metrics())

Restoring Revenue for Customer Patient 686 in 2025-04
Restoring Revenue for Customer Patient 98 in 2025-04
Restoring Revenue for Customer Patient 348 in 2025-06
Restoring Revenue for Customer Patient 764 in 2025-06
Restoring Revenue for Customer Patient 138 in 2025-07
Restoring Revenue for Customer Patient 236 in 2025-07
Restoring Revenue for Customer Patient 66 in 2025-10
Restoring Revenue for Customer Patient 460 in 2025-11

Model metrics:
{'total_subscribers': 267, 'active_non_subscribers': 41, 'new_converts_this_month': 0, 'established_subscribers': 267}


In [113]:
transformed_df[transformed_df['Customer ID'] == 'Patient 98']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID
403,2025-03-01,subscription,100.0,0.0,Patient 98
401,2025-03-21,012,0.0,12.45,Patient 98
402,2025-03-25,421,0.0,22.09,Patient 98
682,2025-04-14,664,1042.0,128.38,Patient 98
1982,2025-08-01,subscription,100.0,0.0,Patient 98
1981,2025-08-14,514,0.0,90.02,Patient 98
2544,2025-09-01,subscription,100.0,0.0,Patient 98
2753,2025-10-01,subscription,100.0,0.0,Patient 98
2752,2025-10-24,656,0.0,39.08,Patient 98
3260,2025-11-01,subscription,100.0,0.0,Patient 98


In [11]:
transformed_df[transformed_df['Treatment'] == 'subscription']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID
126,2025-01-01,subscription,100.0,0.0,Patient 428
524,2025-01-01,subscription,100.0,0.0,Patient 526
538,2025-01-01,subscription,100.0,0.0,Patient 571
2,2025-01-02,subscription,100.0,0.0,Patient 204
333,2025-01-02,subscription,100.0,0.0,Patient 38
...,...,...,...,...,...
2744,2025-12-31,subscription,100.0,0.0,Patient 750
659,2025-12-31,subscription,100.0,0.0,Patient 764
1779,2025-12-31,subscription,100.0,0.0,Patient 82
454,2025-12-31,subscription,100.0,0.0,Patient 91


In [23]:
transformed_df[transformed_df['Customer ID'] == 'Patient 76']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID
3155,2025-09-08,375,0.0,172.35,Patient 76
3156,2025-09-08,subscription,100.0,0.0,Patient 76
3157,2025-10-08,subscription,100.0,0.0,Patient 76
3391,2025-10-25,928,0.0,140.63,Patient 76
3158,2025-11-08,subscription,100.0,0.0,Patient 76
3159,2025-12-08,subscription,100.0,0.0,Patient 76


In [153]:
df['Revenue'].sum() - transformed_df['Revenue'].sum()

-80616.0

In [112]:
# Analysis and printing results
print("\n=== Original Data Statistics ===")
print(f"Total Transactions: {len(df)}")
print(f"Unique Patients: {df['Customer ID'].nunique()}")
print(f"Total Revenue: ${df['Revenue'].sum():,.2f}")
print(f"Total Expense: ${df['Expense'].sum():,.2f}")

print("\n=== Transformed Data Statistics ===")
print(f"Total Transactions: {len(transformed_df)}")
print(f"Unique Patients: {transformed_df['Customer ID'].nunique()}")
print(f"Total Revenue: ${transformed_df['Revenue'].sum():,.2f}")
print(f"Total Expense: ${transformed_df['Expense'].sum():,.2f}")

print("\n=== Subscription Details ===")
subscription_rows = transformed_df[transformed_df['Treatment'] == 'subscription']
print(f"Number of Subscription Payments: {len(subscription_rows)}")
print(f"Subscription Revenue: ${subscription_rows['Revenue'].sum():,.2f}")

print("\n=== Patient Pool Metrics ===")
metrics = subscription_model.get_metrics()
print(f"Total Subscribers: {metrics['total_subscribers']}")
print(f"Active Non-Subscribers: {metrics['active_non_subscribers']}")
print(f"New Converts This Month: {metrics['new_converts_this_month']}")
print(f"Established Subscribers: {metrics['established_subscribers']}")


=== Original Data Statistics ===
Total Transactions: 1538
Unique Patients: 683
Total Revenue: $763,434.00
Total Expense: $132,029.02

=== Transformed Data Statistics ===
Total Transactions: 3491
Unique Patients: 683
Total Revenue: $612,884.00
Total Expense: $132,029.02

=== Subscription Details ===
Number of Subscription Payments: 1953
Subscription Revenue: $292,950.00

=== Patient Pool Metrics ===
Total Subscribers: 103
Active Non-Subscribers: 3
New Converts This Month: 0
Established Subscribers: 103


In [154]:
transformed_df[transformed_df['Treatment'] == 'subscription']['Customer ID'].nunique()

683

In [123]:
df['Customer ID'].nunique()

683

In [124]:
transformed_df[transformed_df['Customer ID'] == 'Patient 222']

Unnamed: 0,Period,Treatment,Revenue,Expense,Customer ID
2694,2025-07-08,322,199.0,34.99,Patient 222
3378,2025-11-02,523,0.0,60.7,Patient 222


In [125]:
subscriber = transformed_df[transformed_df['Treatment'] == 'subscription']

aggregated_df_subscriber = subscriber.groupby(subscriber['Period'].dt.to_period("M")).agg({
    'Revenue': 'sum',
    'Expense': 'sum',
    'Customer ID': 'nunique'
}).reset_index()

aggregated_df_subscriber

Unnamed: 0,Period,Revenue,Expense,Customer ID
0,2025-01,15900.0,0.0,106
1,2025-02,15900.0,0.0,106
2,2025-03,16950.0,0.0,113
3,2025-04,20400.0,0.0,136
4,2025-05,23250.0,0.0,155
5,2025-06,25350.0,0.0,169
6,2025-07,27150.0,0.0,181
7,2025-08,27900.0,0.0,186
8,2025-09,28500.0,0.0,190
9,2025-10,30150.0,0.0,201


In [103]:
subscriber['Customer ID'].nunique()

206

In [80]:
aggregated_df_original = df.groupby(df['Period'].dt.to_period("M")).agg({
    'Revenue': 'sum',
    'Expense': 'sum',
    'Customer ID': 'nunique'
}).reset_index()

aggregated_df_original

Unnamed: 0,Period,Revenue,Expense,Customer ID
0,2025-01,62168.0,10532.63,106
1,2025-02,62355.0,9265.883333,84
2,2025-03,65857.0,10333.5,95
3,2025-04,53460.0,12283.613333,170
4,2025-05,61069.0,12371.64,143
5,2025-06,68862.0,14010.036667,174
6,2025-07,66652.0,12478.633333,127
7,2025-08,61552.0,8375.853333,85
8,2025-09,71062.0,13193.283333,140
9,2025-10,69310.0,12494.953333,140
