In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import datetime as dt

# 1. Load Data

In [2]:
df = pd.read_csv('Online Retail.csv')

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']).dt.date
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01,3.39,17850.0,United Kingdom


In [None]:
df = df[pd.notnull(df['CustomerID'])]
df = df[(df['Quantity']>0)]
df['Sales'] = df['Quantity'] * df['UnitPrice']
cols_of_interest = ['CustomerID', 'InvoiceDate', 'Sales']
df = df[cols_of_interest]

In [None]:
df.tail()

In [None]:
df['CustomerID'].nunique()

In [None]:
df.shape

Frequency:  the number of repeat purchases the customer has made<br>
<br>T: the duration between a customer’s first purchase and the end of the period under study<br>
<br>Recency: the duration between a customer’s first purchase and their latest purchase. (Thus if they have made only 1 purchase, the recency is 0.)

# Data Explore

In [None]:
from lifetimes.plotting import *
from lifetimes.utils import *

In [None]:
data = summary_data_from_transaction_data(df, 'CustomerID', 'InvoiceDate', monetary_value_col='Sales', observation_period_end='2011-12-9')
data.head()

In [None]:
data['frequency'].plot(kind='hist', bins=50)

In [None]:
data['frequency'].describe()

In [None]:
sum(data['frequency'] == 0)/float(len(data))

Among all customers in our data, more than 35% of them only made purchase once (no repeat).

# Frequency/Recency Analysis Using the BG/NBD Model

In [None]:
from lifetimes import BetaGeoFitter

In [None]:
from lifetimes import BetaGeoFitter
bgf = BetaGeoFitter(penalizer_coef=0.033)
bgf.fit(data['frequency'], data['recency'], data['T'])
print(bgf)

In [None]:
from lifetimes.plotting import plot_frequency_recency_matrix
fig = plt.figure(figsize=(12,8))
plot_frequency_recency_matrix(bgf) #Visualizing our frequency/recency matrix

If a customer has made 120 purchases, and his latest purchase was when he was approximately 350 days old (i.e. Recency: the duration between her first transaction and her latest transaction is 350 days), then he is our best customer (bottom-right).<br>

<br>Customers who have purchased a lot and purchased recently will likely be the best customers in the future. We will never have enough of them.<br>
<br>Customers who have purchased a lot but not recently (top-right corner), have probably gone.<br>

<br>There are also another type of customers that around (40, 300) that represents the customer who buys infrequently, and we have not seen him recently, so he might buy again

In [None]:
from lifetimes.plotting import plot_probability_alive_matrix
fig = plt.figure(figsize=(12,8))
plot_probability_alive_matrix(bgf)

Customers who have purchased recently are almost surely “alive”.<br>
<br>Customers who have purchased a lot but not recently, are likely to have dropped out. And the more they bought in the past, the more likely they have dropped out. They are represented in the upper-right.<br>

In [None]:
t = 1
data['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, data['frequency'], data['recency'], data['T'])
data.sort_values(by='predicted_purchases').tail(5)
#We are ranking customers from “highest expected purchases in the next period” to lowest. 

Listed above are our top 5 customers that the model expects them to make purchases in the next day. The predicted_purchases column represents their expected number of purchases

# Assessing model fit

In [None]:
from lifetimes.plotting import plot_period_transactions
plot_period_transactions(bgf)

the small difference between Actual and Model, thus our model is good

In [None]:
'''
We now partition the dataset into a calibration period dataset and a holdout dataset. 
This is important as we want to test how our model performs on data not yet seen 
(just like cross-validation in machine learning practice).
'''
from lifetimes.utils import calibration_and_holdout_data
summary_cal_holdout = calibration_and_holdout_data(df, 'CustomerID', 'InvoiceDate',
                                        calibration_period_end='2011-06-08',
                                        observation_period_end='2011-12-9' )   
print(summary_cal_holdout.head())

In [None]:
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
bgf.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout)

In this plot, we separate the data into both a in-sample (calibration) and validation (holdout) period. The sample <br>period consists the beginning to 2011–06–08; the validation period spans from 2011–06–09 to 2011–12–09.<br> 
<br>The plot groups all customers in the calibration period by their number of repeat purchases (x-axis) and then <br>averages over their repeat purchases in the holdout period (y-axis).<br>
<br>The green and blue line presents the model prediction and actual result of the y-axis respectively.<br>
<br>As we can see, our model is able to very accurately predict the customer base’s behaviour out of the sample, the model under-estimates at 4 purchases and after 5 purchases.

# Customer transactions predictions

In [None]:
# Based on customer history, we can now predict what an individual’s future purchases might look like
t = 10
individual = data.loc[12347]
bgf.predict(t, individual['frequency'], individual['recency'], individual['T'])

Our model predicts that customer 12347’s future transaction is 0.161 in 10 days.

# Estimating customer lifetime value using gamma-gamma model of monetary value

In [None]:
'''
We are now taking into account the economic value of each transaction
to predict likely spend per transaction in the future at the customer level
'''

returning_customers_summary = data[data['frequency']>0]
print(returning_customers_summary.head())
print(len(returning_customers_summary))

We are only estimating the customers who had at least one repeat purchase with us. Therefore, we are estimating for 2,790 customers.

In [None]:
from lifetimes import GammaGammaFitter
ggf = GammaGammaFitter(penalizer_coef = 0)
ggf.fit(returning_customers_summary['frequency'],
        returning_customers_summary['monetary_value'])
print(ggf)

In [None]:
#After applying Gamma-Gamma model, now we can estimate average transaction value for each customer.
print(ggf.conditional_expected_average_profit(
        data['frequency'],
        data['monetary_value']
    ).head(10))