In [1]:
import pandas as pd
data = pd.read_excel("data.xlsx")

In [2]:
control_df=data[data['Intervention'] == 'USUAL'].copy()

In [3]:
control_df['COHORT #'].value_counts()

COHORT #
3     2431
6     2207
8     1638
9     1015
2      719
10     270
7      229
13     220
4      165
1      163
Name: count, dtype: int64

FV purchase

In [4]:
# Filter for FV_dummy == 1
fv_sales = control_df[control_df['FV_DUMMY1'] == 1].copy()

# Optional: ensure 'days_since_start' and 'week_bin' are already computed
# fv_sales['days_since_start'] = (fv_sales['FULL_DATE'] - fv_sales['Start Date']).dt.days
# fv_sales['week_bin'] = (fv_sales['days_since_start'] // 14) * 2

# Group and sum SALES
fv_purchase = fv_sales.groupby(['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID', 
                            'Start Date', 'End Date', 'COHORT #', 'period_start'],dropna=False) \
                  .agg(FV_sales=('SALES', 'sum')) \
                  .reset_index()

# Reorder columns (optional)
fv_purchase = fv_purchase[['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID',
                   'COHORT #', 'Start Date', 'End Date', 'period_start', 'FV_sales']]
fv_purchase

Unnamed: 0,PID,week_bin,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start,FV_sales
0,303,-26,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-10-25,22.33
1,303,-22,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-11-22,39.06
2,303,-18,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-12-20,32.01
3,303,-12,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-01-31,16.87
4,303,-6,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-03-13,35.47
...,...,...,...,...,...,...,...,...,...,...
268,340,-10,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-01-29,5.98
269,340,-8,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-12,4.33
270,340,-6,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-26,1.73
271,340,0,In-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-04-09,0.82


In [5]:
control_df["PID"].nunique()

19

In [6]:
fv_sales["PID"].nunique()

17

In [7]:
fv_purchase["PID"].nunique()

17

non_FV (dietitian approved)

In [8]:
control_df["DIETITIAN_PICK"].value_counts()

DIETITIAN_PICK
N    5106
Y    2004
Name: count, dtype: int64

In [9]:
# Filter for FV_dummy == 1
fv_sales = control_df[(control_df['FV_DUMMY1'] != 1) & (control_df['DIETITIAN_PICK'] == 'Y')].copy()


# Optional: ensure 'days_since_start' and 'week_bin' are already computed
# fv_sales['days_since_start'] = (fv_sales['FULL_DATE'] - fv_sales['Start Date']).dt.days
# fv_sales['week_bin'] = (fv_sales['days_since_start'] // 14) * 2

# Group and sum SALES
nfv_purchase = fv_sales.groupby(['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID', 
                            'Start Date', 'End Date', 'COHORT #', 'period_start'],dropna=False) \
                  .agg(NFV_sales=('SALES', 'sum')) \
                  .reset_index()

# Reorder columns (optional)
nfv_purchase = nfv_purchase[['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID',
                   'COHORT #', 'Start Date', 'End Date', 'period_start', 'NFV_sales']]
nfv_purchase


Unnamed: 0,PID,week_bin,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start,NFV_sales
0,303,-26,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-10-25,15.69
1,303,-22,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-11-22,1.64
2,303,-18,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-12-20,6.78
3,303,-12,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-01-31,5.64
4,303,-6,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-03-13,7.96
...,...,...,...,...,...,...,...,...,...,...
217,340,-12,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-01-15,9.98
218,340,-10,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-01-29,5.00
219,340,-8,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-12,14.50
220,340,-6,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-26,10.08


In [10]:
# Sets of PIDs
fv_pids = set(fv_purchase['PID'])
nfv_pids = set(nfv_purchase['PID'])

# Unique to fv_purchase
fv_only_pids = fv_pids - nfv_pids

# Unique to nfv_purchase
nfv_only_pids = nfv_pids - fv_pids

# Show results
print("✅ Unique PIDs in fv_purchase only:", fv_only_pids)
print("✅ Unique PIDs in nfv_purchase only:", nfv_only_pids)


✅ Unique PIDs in fv_purchase only: {309}
✅ Unique PIDs in nfv_purchase only: {324, 335}


In [11]:
merged_purchase = pd.merge(
    fv_purchase,
    nfv_purchase,
    on=['PID', 'week_bin'],
    how='outer',
    suffixes=('_fv', '_nfv')
)

# Fill missing FV/NFV sales with 0
merged_purchase['FV_sales'] = merged_purchase['FV_sales'].fillna(0)
merged_purchase['NFV_sales'] = merged_purchase['NFV_sales'].fillna(0)
merged_purchase

Unnamed: 0,PID,week_bin,phase_fv,Intervention_fv,CUSTOMER_ID_fv,COHORT #_fv,Start Date_fv,End Date_fv,period_start_fv,FV_sales,phase_nfv,Intervention_nfv,CUSTOMER_ID_nfv,COHORT #_nfv,Start Date_nfv,End Date_nfv,period_start_nfv,NFV_sales
0,303,-26,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25,22.33,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25,15.69
1,303,-22,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22,39.06,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22,1.64
2,303,-18,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20,32.01,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20,6.78
3,303,-12,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31,16.87,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31,5.64
4,303,-6,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13,35.47,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13,7.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,340,-8,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-12,4.33,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-12,14.50
306,340,-6,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26,1.73,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26,10.08
307,340,-4,,,,,NaT,NaT,NaT,0.00,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12,3.91
308,340,0,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09,0.82,,,,,NaT,NaT,NaT,0.00


In [12]:
# Identify all matching *_fv and *_nfv column pairs
fv_cols = [col for col in merged_purchase.columns if col.endswith('_fv')]
nfv_cols = [col for col in merged_purchase.columns if col.endswith('_nfv')]

# Combine matching pairs
for fv_col in fv_cols:
    base_col = fv_col[:-3]  # remove '_fv'
    nfv_col = base_col + '_nfv'
    if nfv_col in merged_purchase.columns:
        # Combine values: take fv_col unless it's NaN, then take nfv_col
        merged_purchase[base_col] = merged_purchase[fv_col].combine_first(merged_purchase[nfv_col])
        # Drop the original pair
        merged_purchase.drop(columns=[fv_col, nfv_col], inplace=True)
merged_purchase

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start
0,303,-26,22.33,15.69,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25
1,303,-22,39.06,1.64,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22
2,303,-18,32.01,6.78,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20
3,303,-12,16.87,5.64,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31
4,303,-6,35.47,7.96,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13
...,...,...,...,...,...,...,...,...,...,...,...
305,340,-8,4.33,14.50,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-12
306,340,-6,1.73,10.08,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26
307,340,-4,0.00,3.91,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12
308,340,0,0.82,0.00,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09


In [13]:
merged_purchase['PID'].nunique()

19

non_FV (dietitian not approved)

In [14]:
# Filter for FV_dummy == 1
fv_sales = control_df[(control_df['FV_DUMMY1'] != 1) & (control_df['DIETITIAN_PICK'] != 'Y')].copy()


# Optional: ensure 'days_since_start' and 'week_bin' are already computed
# fv_sales['days_since_start'] = (fv_sales['FULL_DATE'] - fv_sales['Start Date']).dt.days
# fv_sales['week_bin'] = (fv_sales['days_since_start'] // 14) * 2

# Group and sum SALES
other_purchase = fv_sales.groupby(['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID', 
                            'Start Date', 'End Date', 'COHORT #', 'period_start'],dropna=False) \
                  .agg(other_sales=('SALES', 'sum')) \
                  .reset_index()

# Reorder columns (optional)
other_purchase = other_purchase[['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID',
                   'COHORT #', 'Start Date', 'End Date', 'period_start', 'other_sales']]
other_purchase

Unnamed: 0,PID,week_bin,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start,other_sales
0,303,-26,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-10-25,93.95
1,303,-22,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-11-22,66.25
2,303,-18,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-12-20,90.42
3,303,-12,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-01-31,37.79
4,303,-6,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-03-13,66.93
...,...,...,...,...,...,...,...,...,...,...
371,340,-6,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-26,98.51
372,340,-4,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-12,47.44
373,340,-2,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-26,12.94
374,340,0,In-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-04-09,24.27


In [15]:
# Sets of PIDs
fv_pids = set(merged_purchase['PID'])
other_pids = set(other_purchase['PID'])

# Unique to fv_purchase
fv_only_pids = fv_pids - other_pids

# Unique to nfv_purchase
other_only_pids = other_pids - fv_pids

# Show results
print("✅ Unique PIDs in fv_purchase only:", fv_only_pids)
print("✅ Unique PIDs in other_purchase only:", other_only_pids)


✅ Unique PIDs in fv_purchase only: set()
✅ Unique PIDs in other_purchase only: set()


In [16]:
merged_purchase1 = pd.merge(
    merged_purchase,
    other_purchase,
    on=['PID', 'week_bin'],
    how='outer',
    suffixes=('_fv', '_nfv')
)

# Fill missing FV/NFV sales with 0
merged_purchase1['FV_sales'] = merged_purchase1['FV_sales'].fillna(0)
merged_purchase1['NFV_sales'] = merged_purchase1['NFV_sales'].fillna(0)
merged_purchase1['other_sales'] = merged_purchase1['other_sales'].fillna(0)

merged_purchase1

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,phase_fv,Intervention_fv,CUSTOMER_ID_fv,COHORT #_fv,Start Date_fv,End Date_fv,period_start_fv,phase_nfv,Intervention_nfv,CUSTOMER_ID_nfv,COHORT #_nfv,Start Date_nfv,End Date_nfv,period_start_nfv,other_sales
0,303,-26,22.33,15.69,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25,93.95
1,303,-22,39.06,1.64,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22,66.25
2,303,-18,32.01,6.78,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20,90.42
3,303,-12,16.87,5.64,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31,37.79
4,303,-6,35.47,7.96,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13,66.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,340,-6,1.73,10.08,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26,98.51
377,340,-4,0.00,3.91,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12,47.44
378,340,-2,0.00,0.00,,,,,NaT,NaT,NaT,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-26,12.94
379,340,0,0.82,0.00,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09,24.27


In [17]:
# Identify all matching *_fv and *_nfv column pairs
fv_cols = [col for col in merged_purchase1.columns if col.endswith('_fv')]
nfv_cols = [col for col in merged_purchase1.columns if col.endswith('_nfv')]

# Combine matching pairs
for fv_col in fv_cols:
    base_col = fv_col[:-3]  # remove '_fv'
    nfv_col = base_col + '_nfv'
    if nfv_col in merged_purchase1.columns:
        # Combine values: take fv_col unless it's NaN, then take nfv_col
        merged_purchase1[base_col] = merged_purchase1[fv_col].combine_first(merged_purchase1[nfv_col])
        # Drop the original pair
        merged_purchase1.drop(columns=[fv_col, nfv_col], inplace=True)
merged_purchase1

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,other_sales,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start
0,303,-26,22.33,15.69,93.95,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25
1,303,-22,39.06,1.64,66.25,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22
2,303,-18,32.01,6.78,90.42,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20
3,303,-12,16.87,5.64,37.79,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31
4,303,-6,35.47,7.96,66.93,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13
...,...,...,...,...,...,...,...,...,...,...,...,...
376,340,-6,1.73,10.08,98.51,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26
377,340,-4,0.00,3.91,47.44,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12
378,340,-2,0.00,0.00,12.94,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-26
379,340,0,0.82,0.00,24.27,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09


FV quantity

In [18]:
# Filter for FV_dummy == 1
fv_sales = control_df[control_df['FV_DUMMY1'] == 1].copy()


# Optional: ensure 'days_since_start' and 'week_bin' are already computed
# fv_sales['days_since_start'] = (fv_sales['FULL_DATE'] - fv_sales['Start Date']).dt.days
# fv_sales['week_bin'] = (fv_sales['days_since_start'] // 14) * 2

# Group and sum SALES
portion = fv_sales.groupby(['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID', 
                            'Start Date', 'End Date', 'COHORT #', 'period_start'],dropna=False) \
                  .agg(portion=('portion', 'sum')) \
                  .reset_index()

# Reorder columns (optional)
portion = portion[['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID',
                   'COHORT #', 'Start Date', 'End Date', 'period_start', 'portion']]
portion

Unnamed: 0,PID,week_bin,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start,portion
0,303,-26,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-10-25,55.829369
1,303,-22,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-11-22,83.196176
2,303,-18,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-12-20,63.466133
3,303,-12,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-01-31,25.020117
4,303,-6,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-03-13,47.635708
...,...,...,...,...,...,...,...,...,...,...
268,340,-10,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-01-29,9.556667
269,340,-8,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-12,12.133532
270,340,-6,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-26,5.428981
271,340,0,In-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-04-09,3.137331


In [19]:
# Sets of PIDs
fv_pids = set(merged_purchase1['PID'])
portion_pids = set(portion['PID'])

# Unique to fv_purchase
fv_only_pids = fv_pids - portion_pids

# Unique to nfv_purchase
portion_only_pids = portion_pids - fv_pids

# Show results
print("✅ Unique PIDs in fv_purchase only:", fv_only_pids)
print("✅ Unique PIDs in portion_purchase only:", portion_only_pids)


✅ Unique PIDs in fv_purchase only: {324, 335}
✅ Unique PIDs in portion_purchase only: set()


In [20]:
merged_purchase2 = pd.merge(
    merged_purchase1,
    portion,
    on=['PID', 'week_bin'],
    how='outer',
    suffixes=('_fv', '_nfv')
)

# Fill missing FV/NFV sales with 0
merged_purchase2['FV_sales'] = merged_purchase2['FV_sales'].fillna(0)
merged_purchase2['NFV_sales'] = merged_purchase2['NFV_sales'].fillna(0)
merged_purchase2['other_sales'] = merged_purchase2['other_sales'].fillna(0)
merged_purchase2['portion'] = merged_purchase2['portion'].fillna(0)
merged_purchase2

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,other_sales,phase_fv,Intervention_fv,CUSTOMER_ID_fv,COHORT #_fv,Start Date_fv,End Date_fv,period_start_fv,phase_nfv,Intervention_nfv,CUSTOMER_ID_nfv,COHORT #_nfv,Start Date_nfv,End Date_nfv,period_start_nfv,portion
0,303,-26,22.33,15.69,93.95,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25,55.829369
1,303,-22,39.06,1.64,66.25,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22,83.196176
2,303,-18,32.01,6.78,90.42,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20,63.466133
3,303,-12,16.87,5.64,37.79,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31,25.020117
4,303,-6,35.47,7.96,66.93,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13,47.635708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,340,-6,1.73,10.08,98.51,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26,5.428981
377,340,-4,0.00,3.91,47.44,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12,,,,,NaT,NaT,NaT,0.000000
378,340,-2,0.00,0.00,12.94,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-26,,,,,NaT,NaT,NaT,0.000000
379,340,0,0.82,0.00,24.27,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09,3.137331


In [21]:
# Identify all matching *_fv and *_nfv column pairs
fv_cols = [col for col in merged_purchase2.columns if col.endswith('_fv')]
nfv_cols = [col for col in merged_purchase2.columns if col.endswith('_nfv')]

# Combine matching pairs
for fv_col in fv_cols:
    base_col = fv_col[:-3]  # remove '_fv'
    nfv_col = base_col + '_nfv'
    if nfv_col in merged_purchase2.columns:
        # Combine values: take fv_col unless it's NaN, then take nfv_col
        merged_purchase2[base_col] = merged_purchase2[fv_col].combine_first(merged_purchase2[nfv_col])
        # Drop the original pair
        merged_purchase2.drop(columns=[fv_col, nfv_col], inplace=True)
merged_purchase2

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,other_sales,portion,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start
0,303,-26,22.33,15.69,93.95,55.829369,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25
1,303,-22,39.06,1.64,66.25,83.196176,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22
2,303,-18,32.01,6.78,90.42,63.466133,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20
3,303,-12,16.87,5.64,37.79,25.020117,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31
4,303,-6,35.47,7.96,66.93,47.635708,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,340,-6,1.73,10.08,98.51,5.428981,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26
377,340,-4,0.00,3.91,47.44,0.000000,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12
378,340,-2,0.00,0.00,12.94,0.000000,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-26
379,340,0,0.82,0.00,24.27,3.137331,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09


transaction

In [22]:
fv_sales = control_df.copy()


# Optional: ensure 'days_since_start' and 'week_bin' are already computed
# fv_sales['days_since_start'] = (fv_sales['FULL_DATE'] - fv_sales['Start Date']).dt.days
# fv_sales['week_bin'] = (fv_sales['days_since_start'] // 14) * 2

# Group and count unique transactions
transaction = fv_sales.groupby(
    ['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID',
     'Start Date', 'End Date', 'COHORT #', 'period_start'],
    dropna=False
).agg(transaction=('TRANSACTION_HEADER_KEY', 'nunique')).reset_index()

# Reorder columns (optional)
transaction = transaction[[
    'PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID',
    'COHORT #', 'Start Date', 'End Date', 'period_start', 'transaction'
]]

transaction


Unnamed: 0,PID,week_bin,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start,transaction
0,303,-26,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-10-25,1
1,303,-22,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-11-22,1
2,303,-18,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-12-20,2
3,303,-12,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-01-31,1
4,303,-6,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-03-13,1
...,...,...,...,...,...,...,...,...,...,...
376,340,-6,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-26,14
377,340,-4,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-12,2
378,340,-2,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-26,1
379,340,0,In-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-04-09,1


In [23]:
# Sets of PIDs
fv_pids = set(merged_purchase2['PID'])
portion_pids = set(transaction['PID'])

# Unique to fv_purchase
fv_only_pids = fv_pids - portion_pids

# Unique to nfv_purchase
portion_only_pids = portion_pids - fv_pids

# Show results
print("✅ Unique PIDs in fv_purchase only:", fv_only_pids)
print("✅ Unique PIDs in portion_purchase only:", portion_only_pids)


✅ Unique PIDs in fv_purchase only: set()
✅ Unique PIDs in portion_purchase only: set()


In [24]:
merged_purchase3 = pd.merge(
    merged_purchase2,
    transaction,
    on=['PID', 'week_bin'],
    how='outer',
    suffixes=('_fv', '_nfv')
)

# Fill missing FV/NFV sales with 0
merged_purchase3['FV_sales'] = merged_purchase3['FV_sales'].fillna(0)
merged_purchase3['NFV_sales'] = merged_purchase3['NFV_sales'].fillna(0)
merged_purchase3['other_sales'] = merged_purchase3['other_sales'].fillna(0)
merged_purchase3['portion'] = merged_purchase3['portion'].fillna(0)
merged_purchase3['transaction'] = merged_purchase3['transaction'].fillna(0)
merged_purchase3

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,other_sales,portion,phase_fv,Intervention_fv,CUSTOMER_ID_fv,COHORT #_fv,...,End Date_fv,period_start_fv,phase_nfv,Intervention_nfv,CUSTOMER_ID_nfv,COHORT #_nfv,Start Date_nfv,End Date_nfv,period_start_nfv,transaction
0,303,-26,22.33,15.69,93.95,55.829369,Pre-Trial,USUAL,417278971.0,1.0,...,2024-10-09,2023-10-25,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-10-25,1
1,303,-22,39.06,1.64,66.25,83.196176,Pre-Trial,USUAL,417278971.0,1.0,...,2024-10-09,2023-11-22,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-11-22,1
2,303,-18,32.01,6.78,90.42,63.466133,Pre-Trial,USUAL,417278971.0,1.0,...,2024-10-09,2023-12-20,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-12-20,2
3,303,-12,16.87,5.64,37.79,25.020117,Pre-Trial,USUAL,417278971.0,1.0,...,2024-10-09,2024-01-31,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-01-31,1
4,303,-6,35.47,7.96,66.93,47.635708,Pre-Trial,USUAL,417278971.0,1.0,...,2024-10-09,2024-03-13,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-03-13,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,340,-6,1.73,10.08,98.51,5.428981,Pre-Trial,USUAL,887413321.0,13.0,...,2025-09-24,2025-02-26,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-26,14
377,340,-4,0.00,3.91,47.44,0.000000,Pre-Trial,USUAL,887413321.0,13.0,...,2025-09-24,2025-03-12,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-12,2
378,340,-2,0.00,0.00,12.94,0.000000,Pre-Trial,USUAL,887413321.0,13.0,...,2025-09-24,2025-03-26,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-26,1
379,340,0,0.82,0.00,24.27,3.137331,In-Trial,USUAL,887413321.0,13.0,...,2025-09-24,2025-04-09,In-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-04-09,1


In [25]:
# Identify all matching *_fv and *_nfv column pairs
fv_cols = [col for col in merged_purchase3.columns if col.endswith('_fv')]
nfv_cols = [col for col in merged_purchase3.columns if col.endswith('_nfv')]

# Combine matching pairs
for fv_col in fv_cols:
    base_col = fv_col[:-3]  # remove '_fv'
    nfv_col = base_col + '_nfv'
    if nfv_col in merged_purchase3.columns:
        # Combine values: take fv_col unless it's NaN, then take nfv_col
        merged_purchase3[base_col] = merged_purchase3[fv_col].combine_first(merged_purchase3[nfv_col])
        # Drop the original pair
        merged_purchase3.drop(columns=[fv_col, nfv_col], inplace=True)
merged_purchase3

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,other_sales,portion,transaction,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start
0,303,-26,22.33,15.69,93.95,55.829369,1,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25
1,303,-22,39.06,1.64,66.25,83.196176,1,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22
2,303,-18,32.01,6.78,90.42,63.466133,2,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20
3,303,-12,16.87,5.64,37.79,25.020117,1,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31
4,303,-6,35.47,7.96,66.93,47.635708,1,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,340,-6,1.73,10.08,98.51,5.428981,14,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26
377,340,-4,0.00,3.91,47.44,0.000000,2,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12
378,340,-2,0.00,0.00,12.94,0.000000,1,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-26
379,340,0,0.82,0.00,24.27,3.137331,1,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09


total purchase

In [26]:
# Filter for FV_dummy == 1
sales = control_df.copy()

# Optional: ensure 'days_since_start' and 'week_bin' are already computed
# fv_sales['days_since_start'] = (fv_sales['FULL_DATE'] - fv_sales['Start Date']).dt.days
# fv_sales['week_bin'] = (fv_sales['days_since_start'] // 14) * 2

# Group and sum SALES
total_purchase = sales.groupby(['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID', 
                            'Start Date', 'End Date', 'COHORT #', 'period_start'],dropna=False) \
                  .agg(total_sales=('SALES', 'sum')) \
                  .reset_index()

# Reorder columns (optional)
total_purchase = total_purchase[['PID', 'week_bin', 'phase', 'Intervention', 'CUSTOMER_ID',
                   'COHORT #', 'Start Date', 'End Date', 'period_start', 'total_sales']]
total_purchase

Unnamed: 0,PID,week_bin,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start,total_sales
0,303,-26,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-10-25,131.97
1,303,-22,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-11-22,106.95
2,303,-18,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-12-20,129.21
3,303,-12,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-01-31,60.30
4,303,-6,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-03-13,110.36
...,...,...,...,...,...,...,...,...,...,...
376,340,-6,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-26,110.32
377,340,-4,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-12,51.35
378,340,-2,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-26,12.94
379,340,0,In-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-04-09,25.09


In [27]:
merged_purchase4 = pd.merge(
    merged_purchase3,
    total_purchase,
    on=['PID', 'week_bin'],
    how='outer',
    suffixes=('_fv', '_nfv')
)

# Fill missing FV/NFV sales with 0
merged_purchase4['FV_sales'] = merged_purchase4['FV_sales'].fillna(0)
merged_purchase4['NFV_sales'] = merged_purchase4['NFV_sales'].fillna(0)
merged_purchase4['other_sales'] = merged_purchase4['other_sales'].fillna(0)
merged_purchase4['portion'] = merged_purchase4['portion'].fillna(0)
merged_purchase4['transaction'] = merged_purchase4['transaction'].fillna(0)
merged_purchase4['total_sales'] = merged_purchase4['total_sales'].fillna(0)
merged_purchase4

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,other_sales,portion,transaction,phase_fv,Intervention_fv,CUSTOMER_ID_fv,...,End Date_fv,period_start_fv,phase_nfv,Intervention_nfv,CUSTOMER_ID_nfv,COHORT #_nfv,Start Date_nfv,End Date_nfv,period_start_nfv,total_sales
0,303,-26,22.33,15.69,93.95,55.829369,1,Pre-Trial,USUAL,417278971.0,...,2024-10-09,2023-10-25,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-10-25,131.97
1,303,-22,39.06,1.64,66.25,83.196176,1,Pre-Trial,USUAL,417278971.0,...,2024-10-09,2023-11-22,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-11-22,106.95
2,303,-18,32.01,6.78,90.42,63.466133,2,Pre-Trial,USUAL,417278971.0,...,2024-10-09,2023-12-20,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2023-12-20,129.21
3,303,-12,16.87,5.64,37.79,25.020117,1,Pre-Trial,USUAL,417278971.0,...,2024-10-09,2024-01-31,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-01-31,60.30
4,303,-6,35.47,7.96,66.93,47.635708,1,Pre-Trial,USUAL,417278971.0,...,2024-10-09,2024-03-13,Pre-Trial,USUAL,417278971,1,2024-04-24,2024-10-09,2024-03-13,110.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,340,-6,1.73,10.08,98.51,5.428981,14,Pre-Trial,USUAL,887413321.0,...,2025-09-24,2025-02-26,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-02-26,110.32
377,340,-4,0.00,3.91,47.44,0.000000,2,Pre-Trial,USUAL,887413321.0,...,2025-09-24,2025-03-12,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-12,51.35
378,340,-2,0.00,0.00,12.94,0.000000,1,Pre-Trial,USUAL,887413321.0,...,2025-09-24,2025-03-26,Pre-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-03-26,12.94
379,340,0,0.82,0.00,24.27,3.137331,1,In-Trial,USUAL,887413321.0,...,2025-09-24,2025-04-09,In-Trial,USUAL,887413321,13,2025-04-09,2025-09-24,2025-04-09,25.09


In [28]:
# Identify all matching *_fv and *_nfv column pairs
fv_cols = [col for col in merged_purchase4.columns if col.endswith('_fv')]
nfv_cols = [col for col in merged_purchase4.columns if col.endswith('_nfv')]

# Combine matching pairs
for fv_col in fv_cols:
    base_col = fv_col[:-3]  # remove '_fv'
    nfv_col = base_col + '_nfv'
    if nfv_col in merged_purchase4.columns:
        # Combine values: take fv_col unless it's NaN, then take nfv_col
        merged_purchase4[base_col] = merged_purchase4[fv_col].combine_first(merged_purchase4[nfv_col])
        # Drop the original pair
        merged_purchase4.drop(columns=[fv_col, nfv_col], inplace=True)
merged_purchase4

Unnamed: 0,PID,week_bin,FV_sales,NFV_sales,other_sales,portion,transaction,total_sales,phase,Intervention,CUSTOMER_ID,COHORT #,Start Date,End Date,period_start
0,303,-26,22.33,15.69,93.95,55.829369,1,131.97,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-10-25
1,303,-22,39.06,1.64,66.25,83.196176,1,106.95,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-11-22
2,303,-18,32.01,6.78,90.42,63.466133,2,129.21,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2023-12-20
3,303,-12,16.87,5.64,37.79,25.020117,1,60.30,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-01-31
4,303,-6,35.47,7.96,66.93,47.635708,1,110.36,Pre-Trial,USUAL,417278971.0,1.0,2024-04-24,2024-10-09,2024-03-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,340,-6,1.73,10.08,98.51,5.428981,14,110.32,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-02-26
377,340,-4,0.00,3.91,47.44,0.000000,2,51.35,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-12
378,340,-2,0.00,0.00,12.94,0.000000,1,12.94,Pre-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-03-26
379,340,0,0.82,0.00,24.27,3.137331,1,25.09,In-Trial,USUAL,887413321.0,13.0,2025-04-09,2025-09-24,2025-04-09


In [29]:
merged_purchase4.columns

Index(['PID', 'week_bin', 'FV_sales', 'NFV_sales', 'other_sales', 'portion',
       'transaction', 'total_sales', 'phase', 'Intervention', 'CUSTOMER_ID',
       'COHORT #', 'Start Date', 'End Date', 'period_start'],
      dtype='object')

In [30]:
merged_purchase4['FV_out_of_pocket']=merged_purchase4['FV_sales']

In [31]:
merged_purchase4['DP_out_of_pocket ']=merged_purchase4['FV_sales']+merged_purchase4['NFV_sales']

In [32]:
merged_purchase4.to_excel("control_matrix.xlsx", index=False)

In [33]:
merged_purchase4["PID"].unique()

array([303, 306, 308, 309, 312, 314, 319, 320, 321, 323, 324, 327, 329,
       330, 331, 333, 335, 336, 340])