In [8]:
# import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt

# data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

from operator import attrgetter

# machine learning libraries
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


# load data
retail = pd.read_csv(r"C:\Users\jki\Downloads\online_retail_II.csv")
retail

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [9]:
retail.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [10]:
# lets drop missing values 
retail.dropna(inplace=True)

In [12]:
# lest remove dublicates
 retail.drop_duplicates(inplace=True)

IndentationError: unexpected indent (1042033969.py, line 2)

In [13]:
# have a look for negative values
retail.describe()

Unnamed: 0,Quantity,Price,Customer ID
count,824364.0,824364.0,824364.0
mean,12.414574,3.6768,15324.638504
std,188.976099,70.241388,1697.46445
min,-80995.0,0.0,12346.0
25%,2.0,1.25,13975.0
50%,5.0,1.95,15255.0
75%,12.0,3.75,16797.0
max,80995.0,38970.0,18287.0


In [17]:
# lets remove the negative values
retail = retail[(retail['Quantity'] > 0) & (retail['Price'] > 0)]

In [18]:
# have a look for negative values
retail.describe()

Unnamed: 0,Quantity,Price,Customer ID
count,805549.0,805549.0,805549.0
mean,13.290522,3.206561,15331.95497
std,143.634088,29.199173,1696.737039
min,1.0,0.001,12346.0
25%,2.0,1.25,13982.0
50%,5.0,1.95,15271.0
75%,12.0,3.75,16805.0
max,80995.0,10953.5,18287.0


In [26]:
# Data Preparation

# For cohort analysis, we need three labels. These are payment period, cohort group and cohort period/index.

# To work with the time series, we need to convert the type of related feature. The format shuld be as in the dataset.

retail['InvoiceDate'] = pd.to_datetime(retail['InvoiceDate'])

# Now, we need to create the cohort and order_month variables. The first one indicates the monthly cohort based on the first purchase date and the second one is the truncated month of the purchase date.

retail['order_month'] = retail['InvoiceDate'].dt.to_period('M')

retail['cohort'] = retail.groupby('Customer ID')['InvoiceDate'].transform('min').dt.to_period('M')


# Then, we aggregate the data per cohort and order_month and count the number of unique customers in each group.

retail_cohort = retail.groupby(['cohort', 'order_month']).agg(n_customers=('Customer ID', 'nunique')).reset_index(drop=False)

retail_cohort['period_number'] = (retail_cohort.order_month - retail_cohort.cohort).apply(attrgetter('n'))

retail_cohort.head()



Unnamed: 0,cohort,order_month,n_customers,period_number
0,2009-12,2009-12,955,0
1,2009-12,2010-01,337,1
2,2009-12,2010-02,319,2
3,2009-12,2010-03,406,3
4,2009-12,2010-04,363,4


In [22]:
retail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 805549 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      805549 non-null  object 
 1   StockCode    805549 non-null  object 
 2   Description  805549 non-null  object 
 3   Quantity     805549 non-null  int64  
 4   InvoiceDate  805549 non-null  object 
 5   Price        805549 non-null  float64
 6   Customer ID  805549 non-null  float64
 7   Country      805549 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 55.3+ MB
