In [2]:
# Cohort Analysis (Retention over User & Product Lifetime), a technique used especially often for e-commerce websites and mobile applications.

# Cohort Analysis (Retention over User & Product Lifetime)

# A cohort is a group of subjects who share a defining characteristic. We can observe how a cohort behaves across time and compare it to other cohorts. Cohorts are used in medicine, psychology, econometrics, ecology and many other areas to perform a cross-section (compare difference across subjects) at intervals through time.

# import library
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt

#For Data  Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#For Machine Learning Algorithm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


# Load the transactions from online_retail_II.csv into a DataFrame and preview the first 5 rows.
# NOTE(review): the path below is an absolute, machine-specific Windows path, so this cell
# only runs where that exact file exists — consider a configurable data directory.

retail =  pd.read_csv(r"C:\Users\jki\Downloads\online_retail_II.csv" )
retail.head(5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [3]:
# Count the missing (NaN) values in each column.
missing_values=  retail.isna().sum()
print(missing_values)

Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64


In [4]:
# Drop every row that has no Customer ID: the cohort steps below group rows by
# 'Customer ID', so such rows cannot be attributed to a customer.
retail = retail.dropna(subset=['Customer ID'])

In [5]:
# Re-count the missing values per column after dropping the rows without a Customer ID.
missing_values=  retail.isna().sum()
print(missing_values)

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64


In [6]:
# Let's check for negative values: describe() shows summary statistics (including min)
# for the numeric columns.
retail.describe()

Unnamed: 0,Quantity,Price,Customer ID
count,824364.0,824364.0,824364.0
mean,12.414574,3.6768,15324.638504
std,188.976099,70.241388,1697.46445
min,-80995.0,0.0,12346.0
25%,2.0,1.25,13975.0
50%,5.0,1.95,15255.0
75%,12.0,3.75,16797.0
max,80995.0,38970.0,18287.0


In [7]:
# Keep only rows with a strictly positive Quantity and Price (this drops zero values
# as well as negative ones).
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments in later cells do not raise pandas' SettingWithCopyWarning.

retail = retail[(retail['Quantity'] > 0) & (retail['Price'] > 0)].copy()
retail.describe()

Unnamed: 0,Quantity,Price,Customer ID
count,805549.0,805549.0,805549.0
mean,13.290522,3.206561,15331.95497
std,143.634088,29.199173,1696.737039
min,1.0,0.001,12346.0
25%,2.0,1.25,13982.0
50%,5.0,1.95,15271.0
75%,12.0,3.75,16805.0
max,80995.0,10953.5,18287.0


In [9]:
# Let's check the data types: info() lists each column's dtype and non-null count.
retail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 805549 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      805549 non-null  object 
 1   StockCode    805549 non-null  object 
 2   Description  805549 non-null  object 
 3   Quantity     805549 non-null  int64  
 4   InvoiceDate  805549 non-null  object 
 5   Price        805549 non-null  float64
 6   Customer ID  805549 non-null  float64
 7   Country      805549 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 55.3+ MB


In [12]:
# Let's convert the data types.
# 'Customer ID' was read as float64 (see the info() output above); cast it to an integer.
retail['Customer ID'] = retail['Customer ID'].astype(int)

# 'InvoiceDate' was read as object dtype; parse it into datetimes so that the year and
# month can be extracted from it below.
retail['InvoiceDate'] = pd.to_datetime(retail['InvoiceDate'])
retail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 805549 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      805549 non-null  object        
 1   StockCode    805549 non-null  object        
 2   Description  805549 non-null  object        
 3   Quantity     805549 non-null  int64         
 4   InvoiceDate  805549 non-null  datetime64[ns]
 5   Price        805549 non-null  float64       
 6   Customer ID  805549 non-null  int32         
 7   Country      805549 non-null  object        
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(4)
memory usage: 52.2+ MB


In [14]:
# Let's Make Cohort Analysis
# For cohort analysis, there are a few labels that we have to create
# Invoice period: The year and month of a single transaction/invoice (stored below as the first day of that month).
# Cohort group: The year and month of a customer’s first purchase (also stored as the first day of that month). This label is common across all invoices for a particular customer.
# Cohort period / Cohort Index: An integer representation of a customer’s stage in their “lifetime”. It is the number of months passed since the first purchase plus one, so the month of the first purchase has index 1.

def get_month(x):
    """Return a ``datetime`` for the first day of the month that ``x`` falls in.

    Only ``x.year`` and ``x.month`` are read; the day is set to 1 and the time
    components are left at their defaults (midnight).
    """
    return dt.datetime(year=x.year, month=x.month, day=1)
# InvoiceMonth: each invoice date truncated to the first day of its month. This is the
# vectorised equivalent of applying get_month to every row, without a Python-level
# function call per row.
retail['InvoiceMonth'] = retail['InvoiceDate'].dt.to_period('M').dt.to_timestamp()

# CohortMonth: the earliest InvoiceMonth of each customer; transform('min') broadcasts
# that per-customer minimum back onto every one of the customer's rows.
grouping = retail.groupby('Customer ID')['InvoiceMonth']
retail['CohortMonth'] = grouping.transform('min')
retail.tail()



Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,InvoiceMonth,CohortMonth
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680,France,2011-12-01,2011-08-01
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680,France,2011-12-01,2011-08-01
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France,2011-12-01,2011-08-01
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680,France,2011-12-01,2011-08-01
1067370,581587,POST,POSTAGE,1,2011-12-09 12:50:00,18.0,12680,France,2011-12-01,2011-08-01


In [15]:
def get_month_int (dframe,column):
    year = dframe[column].dt.year
    month = dframe[column].dt.month
    day = dframe[column].dt.day
    return year, month , day 

# Split the invoice month and the cohort (first-purchase) month into year and month parts;
# the day component returned by get_month_int is discarded.
invoice_year,invoice_month,_ = get_month_int(retail,'InvoiceMonth')
cohort_year,cohort_month,_ = get_month_int(retail,'CohortMonth')

# Differences between the invoice month and the cohort month, per row.
year_diff = invoice_year - cohort_year 
month_diff = invoice_month - cohort_month 

# Whole months between the cohort month and the invoice month, plus 1 so that the month
# of the first purchase gets CohortIndex 1.
retail['CohortIndex'] = year_diff * 12 + month_diff + 1

In [None]:
# Count monthly active customers from each cohort

