We tackle this problem with two different approaches: a statistical one and a machine-learning one.
The work can be further split in two: one part using the lifetimes models and one combined with a regression approach.

I - The modeling & evaluation process is going to be the following:

 1. Fit and evaluate the BG/NBD model for frequency prediction
 2. Fit and evaluate the Gamma-Gamma model for monetary value prediction
 3. Combine the two models into a CLV model and compare it to a baseline
 4. Refit the model on the entire dataset
 
 
 The statistical and traditional ML models are based on a calibration period (the feature period) and a holdout period (the target period).
 Each customer is described by two functions: the probability of buying at time t, and the probability of churning at time t.

In [None]:
!pip install lifetimes
!pip install scikit-learn
!pip install keras


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#  Dataframes and arrays processing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime 

# Make the notebook reproducible: seed both NumPy's global RNG and Python's
# built-in `random` module with the same fixed value.
np.random.seed(42)
import random
random.seed(42)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and deprecation
# warnings, so real problems may go unnoticed.
import warnings 
warnings.filterwarnings('ignore')
import lifetimes


# Statistical LTV (lifetime value)
import lifetimes
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import calibration_and_holdout_data, summary_data_from_transaction_data


# Plotting 
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### 1st approach: in this notebook we use a statistical model built with Lifetimes (a Python library)
Statistical approach: BG/NBD and Gamma-Gamma models
Calibration and holdout: feature and target periods => unsupervised learning

In [None]:
# Bigger, sharper default figures for every plot in the notebook.
plt.rcParams.update({
    'figure.figsize': (7, 4.5),
    'figure.dpi': 140,
})

# Seaborn look: "ticks" style, then the "poster" context at half font scale
# with a grid line width of 5.
sns.set(style="ticks")
sns.set_context("poster", font_scale=.5, rc={"grid.linewidth": 5})

In [None]:
# Read the datasets piece by piece, starting with the orders table.
# Later cells use its customer_id, order_id and order_purchase_timestamp
# columns to build the transaction log.
df1 = pd.read_csv('../input/brazilian-ecommerce/olist_orders_dataset.csv')
df1

In [None]:
# Customers table; later cells use it to map the per-order customer_id to the
# customer_unique_id column.
df2 = pd.read_csv('../input/brazilian-ecommerce/olist_customers_dataset.csv')
df2


In [None]:
# Order payments table; later cells use its order_id and payment_value columns.
df3 = pd.read_csv('../input/brazilian-ecommerce/olist_order_payments_dataset.csv')
df3

In [None]:
# --- Orders, keyed by the per-order customer_id ------------------------------
# Duplicates are judged on (order_id, order_purchase_timestamp), exactly as
# before, but on a fresh frame instead of an in-place drop on a slice of df1.
cols = ['customer_id', 'order_id', 'order_purchase_timestamp']
orders = (
    df1[cols]
    .drop_duplicates(subset=['order_id', 'order_purchase_timestamp'])
    .set_index('customer_id')
)

# --- Customers: maps customer_id -> customer_unique_id -----------------------
cols = ['customer_id', 'customer_unique_id']
customers = df2[cols].set_index('customer_id')

# --- Payments, keyed by order_id ---------------------------------------------
# De-duplicate BEFORE moving order_id into the index. drop_duplicates ignores
# the index, so doing it afterwards compared payment_value alone and silently
# dropped unrelated orders that happened to pay the same amount.
cols = ['order_id', 'payment_value']
payment = df3[cols].drop_duplicates().set_index('order_id')

# --- Event log (elog): one row per order with the customer and the day -------
# Inner-join orders and customers on their shared customer_id index.
elog = pd.concat([orders, customers], axis=1, join='inner').reset_index()

# Build the final two-column log as a new frame (no assignment into a slice).
# The purchase timestamp is truncated to midnight so that only the calendar
# day of the order remains.
elog = pd.DataFrame({
    'CUSTOMER_ID': elog['customer_unique_id'],
    'ORDER_DATE': pd.to_datetime(elog['order_purchase_timestamp']).dt.normalize(),
})

elog.info()
display(elog.sample(5))

In [None]:
# Summary statistics of the order dates (shows the date range of the log)
elog['ORDER_DATE'].describe()

#  Creating RFM (Recency, Frequency, Monetary value) based on transaction log
# Splitting calibration and holdout period

In [None]:

%%time
calibration_period_ends = '2018-06-30'

from lifetimes.utils import calibration_and_holdout_data

summary_cal_holdout = calibration_and_holdout_data(elog, 
                                                   customer_id_col = 'CUSTOMER_ID', 
                                                   datetime_col = 'ORDER_DATE', 
                                                   freq = 'D', #days
                                        calibration_period_end = '2017-08-27',
                                        observation_period_end ='2018-09-28' )

In [None]:
# Feature set: first rows of the calibration/holdout summary built above
summary_cal_holdout.head()

### Training the MBG/NBD model (`ModifiedBetaGeoFitter`)
The number of transactions follows a Poisson process with transaction rate lambda.

In [None]:

%%time

from lifetimes import ModifiedBetaGeoFitter

mbgnbd = ModifiedBetaGeoFitter(penalizer_coef=0.01)
mbgnbd.fit(summary_cal_holdout['frequency_cal'], 
        summary_cal_holdout['recency_cal'], 
        summary_cal_holdout['T_cal'],
       verbose=True)

In [None]:
# Print the fitted model's string representation.
print(mbgnbd)

In [None]:
# Predicting for each customer
t = 90  # days to predict in the future

# Calibration-period inputs shared by both predictions below.
rfm_cal = (
    summary_cal_holdout['frequency_cal'],
    summary_cal_holdout['recency_cal'],
    summary_cal_holdout['T_cal'],
)

# Expected number of purchases in the next `t` days, given each customer's
# calibration-period history.
summary_cal_holdout['predicted_purchases'] = (
    mbgnbd.conditional_expected_number_of_purchases_up_to_time(t, *rfm_cal)
)

# Probability that each customer is still alive at the end of calibration.
# The value is only rounded. It was previously divided by the column maximum,
# which rescaled every value so the column no longer held probabilities.
summary_cal_holdout['p_alive'] = np.round(
    mbgnbd.conditional_probability_alive(*rfm_cal), 2
)

In [None]:


# Show two randomly sampled customers, transposed so each column is a customer.
display(summary_cal_holdout.sample(2).T)



## Best to Worst customers 

In [None]:
# Ranking customers from best to worst (most likely to purchase first).
# NOTE: this overwrites the 'predicted_purchases' column with the prediction
# for a horizon of t = 1 day.
t = 1
summary_cal_holdout['predicted_purchases'] = mbgnbd.conditional_expected_number_of_purchases_up_to_time(
    t,
    summary_cal_holdout['frequency_cal'],
    summary_cal_holdout['recency_cal'],
    summary_cal_holdout['T_cal'],
)
# Sort descending so the output really reads best -> worst; the previous
# ascending sort + tail(5) listed the top five customers in reverse order.
summary_cal_holdout.sort_values(by='predicted_purchases', ascending=False).head(5)


In [None]:
# Frequency/recency matrix plot for the fitted MBG/NBD model

from lifetimes.plotting import plot_frequency_recency_matrix
plot_frequency_recency_matrix(mbgnbd)

In [None]:
# Probability of still being alive, plotted as a matrix for the fitted model
from lifetimes.plotting import plot_probability_alive_matrix
plot_probability_alive_matrix(mbgnbd)


## Model evaluation and assessment 

In [None]:
%%time
from lifetimes.plotting import plot_period_transactions
# Period-transactions plot of the fitted model, limited to frequencies up to 7.
ax = plot_period_transactions(mbgnbd, max_frequency=7)
# Log-scale the y axis.
ax.set_yscale('log')
sns.despine();

In [None]:
%%time 

from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases

# Compare calibration-period purchases with holdout-period purchases using the
# fitted model and the calibration/holdout summary.
plot_calibration_purchases_vs_holdout_purchases(mbgnbd, summary_cal_holdout)
sns.despine();

# Predicting an individual customer's future purchases (based on their historical purchases)

In [None]:
# Customer Probability History

from lifetimes.plotting import plot_history_alive
from lifetimes.utils import calculate_alive_path
from datetime import date
# NOTE(review): none of these pylab names is needed by this cell any more
# (the annotation below uses ax.text); the import is kept unchanged in case
# cells outside this view rely on them.
from pylab import figure, text, scatter, show

# Pick one customer, by position, from the calibration/holdout summary.
individual = summary_cal_holdout.iloc[4942]

# NOTE(review): `id` shadows the Python builtin; the name is kept so that any
# later cell that reads it keeps working.
id = individual.name

today = date.today()
# date.replace(year=...) raises ValueError when today is 29 February and the
# target year is not a leap year. DateOffset falls back to the last valid day
# of the month instead.
two_year_ago = (pd.Timestamp(today) - pd.DateOffset(years=2)).date()
one_year_from_now = (pd.Timestamp(today) + pd.DateOffset(years=1)).date()

# Every purchase of the selected customer.
sp_trans = elog.loc[elog['CUSTOMER_ID'] == id]

# Days between the customer's first purchase and today.
t = (today - sp_trans.ORDER_DATE.min().date()).days
# Take the last value of the alive path computed up to today.
p_alive_today = pd.DataFrame(calculate_alive_path(mbgnbd, sp_trans, 'ORDER_DATE', t, freq='D'))[0].tail(1).values
p_alive_today = np.round(p_alive_today[0], 2)
print('Probability that customer is alive today is', p_alive_today)

# Extend the horizon to one year from now for the history plot.
t = (one_year_from_now - sp_trans.ORDER_DATE.min().date()).days
ax = plot_history_alive(mbgnbd, t, sp_trans, 'ORDER_DATE', start_date=two_year_ago) #, start_date='2016-01-01'
# Reference lines: a vertical line at today and a horizontal line at 0.8.
ax.vlines(x=today, ymin=0, ymax=1.05, colors='#4C4C4C')
ax.hlines(y=0.8, xmin=two_year_ago, xmax=one_year_from_now, colors='#4C4C4C')

ax.set_xlim(two_year_ago, one_year_from_now) # sp_trans.ORDER_DATE.min()
ax.set_ylim(0, 1.05)

plt.xticks(rotation=-90)
# Annotate the plot with today's probability, positioned in axes coordinates.
ax.text(0.75, 0.1, p_alive_today, ha='center', va='center', transform=ax.transAxes)

sns.despine()


In [None]:
# Predicted Transactions with Time
# The following cells expect the order-date column to be called 'date'.
elog = elog.rename(columns={'ORDER_DATE': 'date'})


In [None]:
%%time
# Get expected and actual repeated cumulative transactions.

from lifetimes.utils import expected_cumulative_transactions

t = (elog.date.max() - elog.date.min()).days
df = expected_cumulative_transactions(mbgnbd, elog, 'date', 'CUSTOMER_ID', t)

In [None]:


# Last rows of the frame returned by expected_cumulative_transactions.
df.tail()



In [None]:
%%time
# Calibration period = 2016-09-04 to 2017-09-30
from datetime import datetime

cal = datetime.strptime('2018-06-30', '%Y-%m-%d')

from lifetimes.plotting import plot_cumulative_transactions
t = (elog.date.max() - elog.date.min()).days
t_cal = (cal - elog.date.min()).days
plot_cumulative_transactions(mbgnbd, elog, 'date', 'CUSTOMER_ID', t, t_cal, freq='D')
sns.despine()

In [None]:


%%time 

from lifetimes.plotting import plot_incremental_transactions
# Uses the `t` and `t_cal` values computed in the cumulative-transactions cell;
# that cell must have been run first.
plot_incremental_transactions(mbgnbd, elog, 'date', 'CUSTOMER_ID', t, t_cal, freq='D')
sns.despine()



In [None]:
# First rows of the summary, now including the prediction columns added above.
summary_cal_holdout.head()

In [None]:
# #  Reading the datasets 
# df2 = pd.read_csv('../input/brazilian-ecommerce/olist_customers_dataset.csv')
# df6 = pd.read_csv('../input/brazilian-ecommerce/olist_geolocation_dataset.csv')
# df4 = pd.read_csv('../input/brazilian-ecommerce/olist_order_items_dataset.csv')
# df3 = pd.read_csv('../input/brazilian-ecommerce/olist_order_payments_dataset.csv')
# df5 = pd.read_csv('../input/brazilian-ecommerce/olist_order_reviews_dataset.csv')
# df1 = pd.read_csv('../input/brazilian-ecommerce/olist_orders_dataset.csv')
# df7 = pd.read_csv('../input/brazilian-ecommerce/olist_products_dataset.csv')
# df8 = pd.read_csv('../input/brazilian-ecommerce/olist_sellers_dataset.csv')
# df9 = pd.read_csv('../input/brazilian-ecommerce/product_category_name_translation.csv')


## Reading the data from transactions
### Note: the calibration period is used to fit the model, and testing is done on the holdout period (features and target)

In [None]:
# Lifetimes helpers to load a sample transaction log and summarise it
from lifetimes.datasets import load_transaction_data
from lifetimes.utils import summary_data_from_transaction_data

transaction_data = load_transaction_data()
# Preview only; printing the whole frame floods the output.
display(transaction_data.head())

# No observation_period_end is passed, so the observation window ends at the
# last transaction of this sample log. The Olist end date ('2018-09-28') used
# before belongs to a different dataset; applied here it lengthened every
# customer's T.
summary = summary_data_from_transaction_data(transaction_data, 'id', 'date')
display(summary.head())

# Re-fit the MBG/NBD model on the Olist calibration features.
# NOTE(review): `summary` from the sample log is not used for fitting.
mbgnbd.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
print(mbgnbd)



## Predictions for an individual customer, based on their purchase history

In [None]:
# Expected number of purchases of a single customer over the next `t` days
t = 30
# Customer at position 20 of the calibration/holdout summary.
individual = summary_cal_holdout.iloc[20]
mbgnbd.predict(
    t,
    frequency=individual['frequency_cal'],
    recency=individual['recency_cal'],
    T=individual['T_cal'],
)




In [None]:
# transaction data features 
# frame = {recency , frequency , monetary 'CUSTOMER_ID', 'order_date', 'cumulative_transactions'}
# recency = (features_data.groupby('CUSTOMER_ID')['date'])





### Gamma-Gamma: estimating customer lifetime value with the Gamma-Gamma model

In [None]:
# Load the sample summary with monetary value that ships with lifetimes.
from lifetimes.datasets import load_cdnow_summary_data_with_monetary_value
summary_with_money_value = load_cdnow_summary_data_with_monetary_value()
# A bare `.head()` in the middle of a cell is discarded; display it explicitly.
display(summary_with_money_value.head())

# Keep only customers with at least one repeat purchase (frequency > 0).
returning_customers_summary = summary_with_money_value[summary_with_money_value['frequency'] > 0]
display(returning_customers_summary.head())


In [None]:
# The Gamma-Gamma submodel assumes that a customer's monetary value does not
# depend on purchase frequency; inspect the pairwise correlations to check it.

returning_customers_summary[['monetary_value','frequency','recency','T']].corr()

 ## Fit the Gamma-Gamma submodel and predict the conditional expected average lifetime value of our customers.

In [None]:
# Gamma-Gamma submodel, which assumes frequency and monetary value are
# independent. Fitted without a penalty on returning customers only.
from lifetimes import GammaGammaFitter

ggf = GammaGammaFitter(penalizer_coef=0)
ggf.fit(
    frequency=returning_customers_summary['frequency'],
    monetary_value=returning_customers_summary['monetary_value'],
)
print(ggf)
