# In [None]:
import pandas as pd

# Read the cleaned transaction log and parse the invoice timestamps into datetimes
df = pd.read_csv('data/clean/cleaned_online_retail.csv')
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Line revenue = quantity x unit price; keep only rows with strictly positive revenue
df = df.assign(Revenue=df['Quantity'] * df['Price'])
df = df.loc[df['Revenue'].gt(0)]

print("Transactions:", df.shape)

from lifetimes.utils import summary_data_from_transaction_data

# Close the observation window one day after the last recorded invoice
observation_end = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# Per-customer summary table built from the transaction log, measured in weekly
# periods (freq='W'); the monetary value is taken from the 'Revenue' column
clv_summary = summary_data_from_transaction_data(
    df,
    'Customer ID',
    'InvoiceDate',
    monetary_value_col='Revenue',
    observation_period_end=observation_end,
    freq='W',
)

print(clv_summary.head())
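# NOTE: customers with no repeat purchase get frequency == 0 and monetary_value == 0 here;
# they are removed by the monetary_value > 0 filter below, before the Gamma–Gamma fit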

from lifetimes import BetaGeoFitter

# Fit the BG/NBD repeat-purchase model on the per-customer summary
bgf = BetaGeoFitter(penalizer_coef=5)
rfm_inputs = (
    clv_summary['frequency'],
    clv_summary['recency'],
    clv_summary['T'],
)
bgf.fit(*rfm_inputs, tol=1e-6, maxiter=200)
print("BG/NBD parameters:", bgf.params_)

# Expected number of repeat transactions per customer over the next 4 periods
# (periods are weeks, because the summary was built with freq='W')
horizon_weeks = 4
clv_summary['pred_purchases_4w'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    horizon_weeks, *rfm_inputs
)

preview_cols = ['frequency', 'recency', 'T', 'pred_purchases_4w']
print(clv_summary[preview_cols].head())

# Remove any customers whose average monetary_value is ≤ 0.
# .copy() makes the filtered frame an independent DataFrame, so the column
# assignments performed on it afterwards write to it directly instead of
# triggering pandas' SettingWithCopyWarning.
clv_summary = clv_summary[clv_summary['monetary_value'] > 0].copy()
print("Customers remaining after filter:", clv_summary.shape[0])


from lifetimes import GammaGammaFitter

# Fit the Gamma–Gamma spend model on the customers kept by the monetary_value filter
ggf = GammaGammaFitter(penalizer_coef=0.01)
repeat_counts = clv_summary['frequency']
avg_spend = clv_summary['monetary_value']
ggf.fit(repeat_counts, avg_spend)
print("Gamma–Gamma parameters:", ggf.params_)

# Model-based expected average profit per transaction, one value per customer
expected_profit = ggf.conditional_expected_average_profit(
    frequency=repeat_counts,
    monetary_value=avg_spend,
)
clv_summary['exp_avg_profit'] = expected_profit

print(clv_summary[['monetary_value', 'exp_avg_profit']].head())
# Calculate CLV for the 4-week horizon.
#
# GammaGammaFitter.customer_lifetime_value() cannot express this horizon: its
# `time` argument is a number of MONTHS and `freq` only declares the unit that T
# is measured in, so time=4 with freq='W' yields a 4-month CLV, not a 4-week one
# (and it would disagree with 'pred_purchases_4w'). Compute the 4-week value
# directly from the columns derived earlier, using the same discounted-cash-flow
# form: expected purchases x expected average profit, discounted at the monthly
# rate for the fraction of a month that the horizon covers.
clv_horizon_weeks = 4          # same horizon as the 'pred_purchases_4w' column
monthly_discount_rate = 0.01   # discount rate per month
weeks_per_month = 4.345        # weeks-per-month factor (the one lifetimes applies for freq='W')

discount_factor = (1 + monthly_discount_rate) ** (clv_horizon_weeks / weeks_per_month)
clv_summary['clv_4w'] = (
    clv_summary['pred_purchases_4w'] * clv_summary['exp_avg_profit'] / discount_factor
)
print(clv_summary[['pred_purchases_4w','exp_avg_profit','clv_4w']].head(15))
# Save the CLV summary to a CSV file (to_csv also writes the index as the first column).
# The path is held in one variable so the confirmation message always reports
# the location that was actually written.
clv_output_path = 'data/clean/bg_nbd_clv_4w.csv'
clv_summary.to_csv(clv_output_path)
print(f"Saved BG/NBD + Gamma–Gamma CLV to {clv_output_path}")

import pandas as pd

# Read back the saved CLV table
clv_df = pd.read_csv('data/clean/bg_nbd_clv_4w.csv')
clv_values = clv_df['clv_4w']

# Median, upper-quartile and 90th-percentile CLV, labelled for display
percentile_labels = {0.5: '50th', 0.75: '75th', 0.9: '90th'}
percentiles = clv_values.quantile(list(percentile_labels)).rename(percentile_labels)
print("CLV 4-week percentiles:")
print(percentiles)

# CLV value at the 99th percentile: customers at or above it form the top 1%
cutoff_99 = clv_values.quantile(0.99)

# Select those customers, highest CLV first
is_top = clv_values.ge(cutoff_99)
top_1pct = clv_df.loc[is_top].sort_values(by='clv_4w', ascending=False)

print(f"Top 1% cutoff = {cutoff_99:.2f}, number of top customers = {len(top_1pct)}")
print(top_1pct.head(10))   # show the very top 10

# Load the clustered RFM table; its index is the first CSV column (index_col=0)
# NOTE(review): presumably that first column is the customer id — confirm
rfm_clustered = pd.read_csv('data/rfm/rfm_clustered.csv', index_col=0, encoding='latin1')

# clv_df was read without index_col, so its index is a positional RangeIndex and
# the customer id lives in the 'Customer ID' column. Joining index-to-index as-is
# would match row numbers against the RFM table's index and attach the wrong
# cluster (or none) to each customer, so key the CLV table by customer first.
if 'Customer ID' in clv_df.columns:
    clv_by_customer = clv_df.set_index('Customer ID')
else:
    clv_by_customer = clv_df  # already indexed by customer

# Attach the cluster label to each customer's CLV row (customers present in both tables)
merged = clv_by_customer.merge(
    rfm_clustered[['Cluster']],
    left_index=True, right_index=True,
    how='inner'
)

# Mean / median CLV and customer count per cluster
cluster_clv = merged.groupby('Cluster')['clv_4w'].agg(
    avg_clv_4w='mean',
    median_clv_4w='median',
    num_customers='count',
)

print("CLV by cluster:")
print(cluster_clv)