# Recency, Frequency, Monetary Transaction Segmentation

In [1]:
# https://www.putler.com/rfm-analysis/#visuals
# github: https://www.bing.com/videos/search?q=youtube+github+in+15+minutes&view=detail&mid=72C8A6D7A4E82717835072C8A6D7A4E827178350&FORM=VIRE
# Best are High spending, Recent, Frequent - reward their loyalty 
# Bought recently, and more than once - Membership, loyalty program, sell other products (upsell add-ons, cross-sell other services)
# Bought recently but not often - help with onboarding
# About to sleep Not recent - reconnect with them, discount, personalised, new products 
# Lapsed customers - personalised comms
# Not customers - brand campaign
# https://www.kaggle.com/regivm/rfm-analysis-tutorial

In [2]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
from pycaret.classification import *

In [3]:
df = pd.read_csv('D:\\marke\\data\\retail\\Retail_Data_Transactions.csv', parse_dates=['trans_date'])

In [4]:
df.head(3)

Unnamed: 0,customer_id,trans_date,tran_amount
0,CS5295,2013-02-11,35
1,CS4768,2015-03-15,39
2,CS2122,2013-02-26,52


In [5]:
len(df)

125000

In [7]:
import sys
sys.getsizeof(df) # in bytes

9875144

In [8]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125000 entries, 0 to 124999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   customer_id  125000 non-null  object        
 1   trans_date   125000 non-null  datetime64[ns]
 2   tran_amount  125000 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 9.4 MB


In [6]:
print(df['trans_date'].min(), df['trans_date'].max())

2011-05-16 00:00:00 2015-03-16 00:00:00


In [7]:
sd = dt.datetime(2015,3,16)
df['hist']=sd - df['trans_date']
df['hist'].astype('timedelta64[D]')
df['hist']=df['hist'] / np.timedelta64(1, 'D')
df.head()

Unnamed: 0,customer_id,trans_date,tran_amount,hist
0,CS5295,2013-02-11,35,763.0
1,CS4768,2015-03-15,39,1.0
2,CS2122,2013-02-26,52,748.0
3,CS1217,2011-11-16,99,1216.0
4,CS1850,2013-11-20,78,481.0


In [8]:
df=df[df['hist'] < 730]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65023 entries, 1 to 124999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   customer_id  65023 non-null  object        
 1   trans_date   65023 non-null  datetime64[ns]
 2   tran_amount  65023 non-null  int64         
 3   hist         65023 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 2.5+ MB


In [9]:
rfmTable = df.groupby('customer_id').agg({'hist': lambda x:x.min(), # Recency
                                        'customer_id': lambda x: len(x),               # Frequency
                                        'tran_amount': lambda x: x.sum()})          # Monetary Value

rfmTable.rename(columns={'hist': 'recency', 
                         'customer_id': 'frequency', 
                         'tran_amount': 'monetary_value'}, inplace=True)

In [10]:
df[df['customer_id']=='CS1112']

Unnamed: 0,customer_id,trans_date,tran_amount,hist
5650,CS1112,2015-01-14,39,61.0
17810,CS1112,2014-07-16,90,243.0
26626,CS1112,2014-04-29,63,321.0
31500,CS1112,2014-12-04,59,102.0
42084,CS1112,2013-07-01,36,623.0
74842,CS1112,2013-11-13,71,488.0


In [11]:
quartiles = rfmTable.quantile(q=[0.25,0.50,0.75])
print(quartiles, type(quartiles))

      recency  frequency  monetary_value
0.25     22.0        7.0           390.0
0.50     53.0        9.0           607.0
0.75    111.0       12.0           816.0 <class 'pandas.core.frame.DataFrame'>


In [12]:
quartiles=quartiles.to_dict()
quartiles

{'recency': {0.25: 22.0, 0.5: 53.0, 0.75: 111.0},
 'frequency': {0.25: 7.0, 0.5: 9.0, 0.75: 12.0},
 'monetary_value': {0.25: 390.0, 0.5: 607.0, 0.75: 816.0}}

In [13]:
## for Recency 

def RClass(x,p,d):
    if x <= d[p][0.25]:
        return 1
    elif x <= d[p][0.50]:
        return 2
    elif x <= d[p][0.75]: 
        return 3
    else:
        return 4
    
## for Frequency and Monetary value 

def FMClass(x,p,d):
    if x <= d[p][0.25]:
        return 4
    elif x <= d[p][0.50]:
        return 3
    elif x <= d[p][0.75]: 
        return 2
    else:
        return 1    
    

In [14]:
rfmSeg = rfmTable
rfmSeg['R_Quartile'] = rfmSeg['recency'].apply(RClass, args=('recency',quartiles,))
rfmSeg['F_Quartile'] = rfmSeg['frequency'].apply(FMClass, args=('frequency',quartiles,))
rfmSeg['M_Quartile'] = rfmSeg['monetary_value'].apply(FMClass, args=('monetary_value',quartiles,))

In [15]:
rfmSeg['RFMClass'] = rfmSeg.R_Quartile.map(str) \
                            + rfmSeg.F_Quartile.map(str) \
                            + rfmSeg.M_Quartile.map(str)

In [16]:
rfmSeg.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,R_Quartile,F_Quartile,M_Quartile,RFMClass
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CS1112,61.0,6,358,3,4,4,344
CS1113,35.0,11,775,2,2,2,222
CS1114,32.0,11,804,2,2,2,222
CS1115,11.0,11,831,1,2,1,121
CS1116,203.0,5,333,4,4,4,444


In [17]:
rfmSeg.sort_values(by=['RFMClass', 'monetary_value'], ascending=[True, False])

Unnamed: 0_level_0,recency,frequency,monetary_value,R_Quartile,F_Quartile,M_Quartile,RFMClass
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CS4640,18.0,21,1610,1,1,1,111
CS5343,13.0,20,1585,1,1,1,111
CS3622,13.0,20,1549,1,1,1,111
CS3688,15.0,19,1459,1,1,1,111
CS2316,3.0,21,1446,1,1,1,111
...,...,...,...,...,...,...,...
CS7972,335.0,1,26,4,4,4,444
CS8033,483.0,1,24,4,4,4,444
CS8400,446.0,1,19,4,4,4,444
CS8760,447.0,1,19,4,4,4,444


In [18]:
rfmSeg.groupby('RFMClass').agg('monetary_value').mean()

RFMClass
111    1040.886214
112     750.666667
113     526.400000
121     876.818792
122     728.847561
123     506.773333
124     368.222222
132     665.492188
133     525.223404
134     328.179104
143     470.666667
144     270.844444
211    1040.580000
212     747.000000
213     525.833333
221     877.388489
222     722.018293
223     493.111111
224     347.285714
232     669.608333
233     520.100000
234     335.380000
242     617.750000
243     470.890625
244     261.939394
311    1038.503571
312     712.538462
313     563.857143
321     872.103448
322     731.386885
323     503.692308
324     356.666667
332     664.292308
333     526.143519
334     336.459016
342     610.500000
343     472.711538
344     249.600601
411    1008.403509
412     772.400000
413     542.000000
421     867.848101
422     726.048780
423     494.964286
424     355.142857
431     823.000000
432     662.414634
433     523.882653
434     330.442623
442     649.000000
443     468.022523
444     235.129134
Nam

In [19]:
rfmSeg['Total Score'] = rfmSeg['R_Quartile'] + rfmSeg['F_Quartile'] +rfmSeg['M_Quartile']
print(rfmSeg.head(), rfmSeg.info())

<class 'pandas.core.frame.DataFrame'>
Index: 6884 entries, CS1112 to CS9000
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   recency         6884 non-null   float64
 1   frequency       6884 non-null   int64  
 2   monetary_value  6884 non-null   int64  
 3   R_Quartile      6884 non-null   int64  
 4   F_Quartile      6884 non-null   int64  
 5   M_Quartile      6884 non-null   int64  
 6   RFMClass        6884 non-null   object 
 7   Total Score     6884 non-null   int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 484.0+ KB
             recency  frequency  monetary_value  R_Quartile  F_Quartile  \
customer_id                                                               
CS1112          61.0          6             358           3           4   
CS1113          35.0         11             775           2           2   
CS1114          32.0         11             804           2           2   
CS1

In [20]:
res = pd.read_csv('D:\\marke\\data\\retail\\Retail_Data_Response.csv')
res.sort_values('customer_id', inplace=True)

print(res.head(), res.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6884 entries, 0 to 6883
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  6884 non-null   object
 1   response     6884 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 161.3+ KB
  customer_id  response
0      CS1112         0
1      CS1113         0
2      CS1114         1
3      CS1115         1
4      CS1116         1 None


In [21]:
rfmSeg.reset_index(inplace=True)
rfmSeg.head()

Unnamed: 0,customer_id,recency,frequency,monetary_value,R_Quartile,F_Quartile,M_Quartile,RFMClass,Total Score
0,CS1112,61.0,6,358,3,4,4,344,11
1,CS1113,35.0,11,775,2,2,2,222,6
2,CS1114,32.0,11,804,2,2,2,222,6
3,CS1115,11.0,11,831,1,2,1,121,4
4,CS1116,203.0,5,333,4,4,4,444,12


In [22]:
rfmSeg.sort_values('customer_id', inplace=True)
rfm2=pd.merge(rfmSeg, res, on='customer_id')

In [23]:
rfm2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6884 entries, 0 to 6883
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   customer_id     6884 non-null   object 
 1   recency         6884 non-null   float64
 2   frequency       6884 non-null   int64  
 3   monetary_value  6884 non-null   int64  
 4   R_Quartile      6884 non-null   int64  
 5   F_Quartile      6884 non-null   int64  
 6   M_Quartile      6884 non-null   int64  
 7   RFMClass        6884 non-null   object 
 8   Total Score     6884 non-null   int64  
 9   response        6884 non-null   int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 591.6+ KB


In [24]:
# Build a Customer Propensity model to predict response from
label = rfm2.groupby('response').agg({'customer_id': lambda x: len(x)})
label.head()


Unnamed: 0_level_0,customer_id
response,Unnamed: 1_level_1
0,6237
1,647


In [25]:
training_df = rfm2[['response','recency','frequency','monetary_value']]

In [26]:
training_df.head()

Unnamed: 0,response,recency,frequency,monetary_value
0,0,61.0,6,358
1,0,35.0,11,775
2,1,32.0,11,804
3,1,11.0,11,831
4,1,203.0,5,333


In [27]:
experiment_allModels = setup(training_df, target='response', silent = True, use_gpu = True, train_size=0.8,fix_imbalance=True)

Unnamed: 0,Description,Value
0,session_id,8088
1,Target,response
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(6884, 4)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [28]:
best_model = compare_models(round=2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.85,0.76,0.23,0.22,0.22,0.14,0.14,0.86
lightgbm,Light Gradient Boosting Machine,0.85,0.76,0.25,0.23,0.24,0.16,0.16,0.09
catboost,CatBoost Classifier,0.85,0.76,0.26,0.24,0.25,0.17,0.17,4.79
rf,Random Forest Classifier,0.84,0.73,0.23,0.2,0.21,0.13,0.13,0.71
et,Extra Trees Classifier,0.84,0.73,0.23,0.2,0.21,0.13,0.13,1.06
dt,Decision Tree Classifier,0.83,0.57,0.25,0.19,0.22,0.13,0.13,0.03
gbc,Gradient Boosting Classifier,0.8,0.78,0.46,0.23,0.3,0.2,0.22,0.75
knn,K Neighbors Classifier,0.72,0.67,0.5,0.17,0.25,0.13,0.16,0.23
ada,Ada Boost Classifier,0.72,0.8,0.72,0.21,0.33,0.21,0.28,0.28
lr,Logistic Regression,0.71,0.79,0.74,0.21,0.32,0.21,0.27,0.05


In [29]:
dt = create_model('dt', fold=10)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8403,0.5775,0.2549,0.2063,0.2281,0.1401,0.1411
1,0.8276,0.5768,0.2692,0.1972,0.2276,0.1332,0.1353
2,0.8149,0.5791,0.2885,0.1875,0.2273,0.1275,0.1313
3,0.8185,0.5553,0.2308,0.1667,0.1935,0.0943,0.0959
4,0.8185,0.5545,0.2308,0.1667,0.1935,0.0943,0.0959
5,0.8421,0.5683,0.2308,0.2034,0.2162,0.1288,0.1291
6,0.8421,0.5941,0.2885,0.2308,0.2564,0.1693,0.1706
7,0.8582,0.5778,0.2353,0.2353,0.2353,0.1571,0.1571
8,0.8382,0.5763,0.2549,0.2031,0.2261,0.137,0.1381
9,0.8218,0.5409,0.1961,0.1493,0.1695,0.0717,0.0726


In [30]:
dt_model_tuned = tune_model(dt) 

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9074,0.5,0.0,0.0,0.0,0.0,0.0
1,0.9056,0.5,0.0,0.0,0.0,0.0,0.0
2,0.9056,0.5,0.0,0.0,0.0,0.0,0.0
3,0.9056,0.5,0.0,0.0,0.0,0.0,0.0
4,0.9056,0.5,0.0,0.0,0.0,0.0,0.0
5,0.9056,0.5,0.0,0.0,0.0,0.0,0.0
6,0.9056,0.5,0.0,0.0,0.0,0.0,0.0
7,0.9073,0.5,0.0,0.0,0.0,0.0,0.0
8,0.9073,0.5,0.0,0.0,0.0,0.0,0.0
9,0.9073,0.5,0.0,0.0,0.0,0.0,0.0


In [31]:
#tuned model parameters
print(dt_model_tuned)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=8, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.4, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=8088, splitter='best')


In [32]:
evaluate_model(lr_model_tuned)

NameError: name 'lr_model_tuned' is not defined

In [None]:
# Sort a dataframe

df.sort

In [None]:
# pivot table

In [None]:
# group and aggregate, count

In [None]:
# subset