# 0.0 Imports

In [124]:
import pandas  as pd
import numpy   as np
import seaborn as sns


import umap.umap_ as umap

from matplotlib import pyplot as plt
from IPython.display import      HTML

from sklearn import preprocessing as pp

from sklearn import cluster   as c
from sklearn import metrics   as m

from plotly import express as px

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer


## 0.1. Helper Functions

In [53]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25,12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container {width:100% !important;}</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()
    
jupyter_settings()    

Populating the interactive namespace from numpy and matplotlib


## 0.2. Load Dataset

In [54]:
# load data
df_raw = pd.read_csv('../data/raw/Ecommerce.csv')

df_raw.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom


# 1.0. Descrição dos dados

## 1.1. Rename Columns

In [55]:
df1 = df_raw.copy()

In [56]:
df1.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [57]:
# Rename Columns
cols_new = ['invoice_no','stock_code','description','quantity','invoice_date','unit_price','customer_id','country']

df1.columns = cols_new

df1.sample()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
523932,580524,84968A,SET OF 16 VINTAGE ROSE CUTLERY,1,2-Dec-17,12.75,13704.0,United Kingdom


In [58]:
df_raw.sample()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
113834,545998,84380,SET OF 3 BUTTERFLY COOKIE CUTTERS,12,6-Mar-17,1.25,13373.0,United Kingdom


## 1.2. Data Dimensions

In [59]:
print( 'Number of rows: {}'.format ( df1.shape[0] ) )
print( 'Number of cols: {}'.format ( df1.shape[1] ) )


Number of rows: 541909
Number of cols: 8


## 1.3. Data Types

In [60]:
df1.dtypes

invoice_no       object
stock_code       object
description      object
quantity          int64
invoice_date     object
unit_price      float64
customer_id     float64
country          object
dtype: object

## 1.4. Check NA

In [61]:
df1.isna().sum()

invoice_no           0
stock_code           0
description       1454
quantity             0
invoice_date         0
unit_price           0
customer_id     135080
country              0
dtype: int64

## 1.5. Replace NA

In [62]:
# Remove NA
df1 = df1.dropna( subset = ['description','customer_id'])
print( 'Remove data: {:.2f}'.format( 1 - (df1.shape[0]/ df_raw.shape[0]) ))


Remove data: 0.25


In [63]:
df1.shape

(406829, 8)

In [64]:
df1.isna().sum()

invoice_no      0
stock_code      0
description     0
quantity        0
invoice_date    0
unit_price      0
customer_id     0
country         0
dtype: int64

## 1.6. Change Dtypes

In [65]:
df1.dtypes

invoice_no       object
stock_code       object
description      object
quantity          int64
invoice_date     object
unit_price      float64
customer_id     float64
country          object
dtype: object

In [66]:
# Invoice Date

df1['invoice_date'] = pd.to_datetime( df1['invoice_date'], format = '%d-%b-%y')

# Customer Id

df1['customer_id'] = df1['customer_id'].astype(int)

df1.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2016-11-29,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2016-11-29,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2016-11-29,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2016-11-29,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2016-11-29,3.39,17850,United Kingdom


In [67]:
df1['invoice_date'] = pd.to_datetime( df1['invoice_date'], format = '%d-%b-%y')

# Customer Id

df1.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2016-11-29,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2016-11-29,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2016-11-29,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2016-11-29,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2016-11-29,3.39,17850,United Kingdom


In [68]:
df1.dtypes

invoice_no              object
stock_code              object
description             object
quantity                 int64
invoice_date    datetime64[ns]
unit_price             float64
customer_id              int64
country                 object
dtype: object

## 1.7. Descriptive Statistics

In [69]:
num_attributes = df1.select_dtypes( include = [ 'int64', 'float64'] )
cat_attributes = df1.select_dtypes( exclude = [ 'int64', 'float64','datetime64[ns]'])

### 1.7.1 Numerical Attributes

In [70]:
# Central tendency - mean, median
ct1 = pd.DataFrame(num_attributes.apply( np.mean )).T
ct2 = pd.DataFrame(num_attributes.apply( np.median )).T

# Dispersion - desvio padrão, mínimo, máximo, range, skew, kurtosis
d1 = pd.DataFrame( num_attributes.apply( np.std ) ).T
d2 = pd.DataFrame( num_attributes.apply( np.min ) ).T
d3 = pd.DataFrame( num_attributes.apply( np.max ) ).T
d4 = pd.DataFrame( num_attributes.apply( lambda x: x.max( ) - x.min() ) ).T
d5 = pd.DataFrame( num_attributes.apply( lambda x: x.skew( ) ) ).T
d6 = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis() ) ).T


# Concatenate

m = pd.concat( [d2, d3, d4, ct1, ct2, d1, d5, d6] ).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'mediana', 'std', 'skew', 'kurtosis']
m

Unnamed: 0,attributes,min,max,range,mean,mediana,std,skew,kurtosis
0,quantity,-80995.0,80995.0,161990.0,12.061303,5.0,248.693064,0.182663,94317.563673
1,unit_price,0.0,38970.0,38970.0,3.460481,1.95,69.315076,452.219026,246924.548709
2,customer_id,12346.0,18287.0,5941.0,15287.69057,15152.0,1713.598197,0.029835,-1.179982


### <font color= 'red'> 1.7.1.1 Numerical Attributs - Investigating </f>

- Quantity negativa ( pode ser devolução )
- Preço unitário igual a zero ( pode ser promoção )

### 1.7.2 Categorical Attributes

In [71]:
cat_attributes.head()

Unnamed: 0,invoice_no,stock_code,description,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,United Kingdom
1,536365,71053,WHITE METAL LANTERN,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,United Kingdom


###  Invoice_No

In [72]:
# Problema: Temos invoice com letras e números

# Identificação >

df_letter_invoices = df1.loc[df1['invoice_no'].apply( lambda x : bool( re.search( '[^0-9]+', x ) ) ), :]

print('Total number of invoices: {}'.format( len( df_letter_invoices )))  
print('Total number of negative quantity: {}'.format( len(df_letter_invoices[ df_letter_invoices['quantity'] < 0])))

Total number of invoices: 8905
Total number of negative quantity: 8905


### Stock Code

In [73]:
# Check stock codes only characters
df1.loc[df1['stock_code'].apply( lambda x : bool( re.search( '^[a-zA-Z]+$', x ) ) ) ,'stock_code'].unique()

# Ação:
## 1. Remove stock_code in ['POST', 'D', 'M', 'PADS', 'DOT', 'CRUK']


array(['POST', 'D', 'M', 'PADS', 'DOT', 'CRUK'], dtype=object)

### Description

In [74]:
df1.head()

#  Ação: Delete Description

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2016-11-29,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2016-11-29,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2016-11-29,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2016-11-29,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2016-11-29,3.39,17850,United Kingdom


### Country

In [75]:
df1['country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Greece', 'Singapore', 'Lebanon',
       'United Arab Emirates', 'Saudi Arabia', 'Czech Republic', 'Canada',
       'Unspecified', 'Brazil', 'USA', 'European Community', 'Bahrain',
       'Malta', 'RSA'], dtype=object)

In [76]:
df1['country'].value_counts( normalize = True)

United Kingdom          0.889509
Germany                 0.023339
France                  0.020871
EIRE                    0.018398
Spain                   0.006226
Netherlands             0.005828
Belgium                 0.005086
Switzerland             0.004614
Portugal                0.003638
Australia               0.003095
Norway                  0.002669
Italy                   0.001974
Channel Islands         0.001863
Finland                 0.001708
Cyprus                  0.001529
Sweden                  0.001136
Austria                 0.000986
Denmark                 0.000956
Japan                   0.000880
Poland                  0.000838
USA                     0.000715
Israel                  0.000615
Unspecified             0.000600
Singapore               0.000563
Iceland                 0.000447
Canada                  0.000371
Greece                  0.000359
Malta                   0.000312
United Arab Emirates    0.000167
European Community      0.000150
RSA       

In [77]:
#df1[['custumer_id','country']].drop_duplicates().groupby( 'country')

# 2.0. Filtragem de Variáveis

In [97]:
df2 = df1.copy()

In [98]:
df2.dtypes

invoice_no              object
stock_code              object
description             object
quantity                 int64
invoice_date    datetime64[ns]
unit_price             float64
customer_id              int64
country                 object
dtype: object

In [99]:
 # === Numerical attributes ====
df2 = df2.loc[df2['unit_price'] >= 0.04, :]

# === Categorical attributes ====
df2 = df2[~df2['stock_code'].isin( ['POST', 'D', 'M', 'PADS', 'DOT', 'CRUK'] )]

# description
df2 = df2.drop( columns='description', axis=1 )

# map 
df2 = df2[~df2['country'].isin( ['European Community', 'Unspecified' ] ) ]

# quantity
df2_returns = df2.loc[df1['quantity'] < 0, :]
df2_purchases = df2.loc[df1['quantity'] >= 0, :]





# 3.0. Feature Engineering

In [117]:
df3 = df2.copy()

## 3.1. Feature Creation

In [118]:
# Data Reference

df_ref = df3.drop( ['invoice_no', 'stock_code', 'quantity', 'invoice_date', 'unit_price', 'country'], axis=1 ).drop_duplicates( ignore_index=True )

In [119]:
 # Gross Revenue ( Faturamento ) quantity * price
df2_purchases.loc[:, 'gross_revenue'] = df2_purchases.loc[:, 'quantity'] * df2_purchases.loc[:, 'unit_price']

# Monetary
df_monetary = df2_purchases.loc[:, ['customer_id', 'gross_revenue']].groupby( 'customer_id' ).sum().reset_index()
df_ref = pd.merge( df_ref, df_monetary, on='customer_id', how='left' )


 # Recency - Last day purchase
df_recency = df2_purchases.loc[:, ['customer_id', 'invoice_date']].groupby( 'customer_id' ).max().reset_index()
df_recency['recency_days'] = ( df2['invoice_date'].max() - df_recency['invoice_date'] ).dt.days
df_recency = df_recency[['customer_id', 'recency_days']].copy()
df_ref = pd.merge( df_ref, df_recency, on='customer_id', how='left' )


# Frequency
df_frequ = df2_purchases[['customer_id','invoice_no']].drop_duplicates().groupby('customer_id').count().reset_index()
df_ref = pd.merge ( df_ref, df_frequ, on = 'customer_id', how = 'left')

# Average Ticket
df_avg_ticket = df2_purchases[['customer_id', 'gross_revenue']].groupby('customer_id').mean().reset_index().rename ( columns = {'gross_revenue':'avg_ticket'} )
df_ref = pd.merge( df_ref, df_avg_ticket, on='customer_id', how ='left')
df_ref.isna().sum()





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


customer_id       0
gross_revenue    27
recency_days     27
invoice_no       27
avg_ticket       27
dtype: int64

In [120]:
df_ref.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,invoice_no,avg_ticket
0,17850,5391.21,372.0,34.0,18.152222
1,13047,3232.59,56.0,9.0,18.904035
2,12583,6705.38,2.0,15.0,28.9025
3,13748,948.25,95.0,5.0,33.866071
4,15100,876.0,333.0,3.0,292.0


In [121]:
df3 = df_ref.copy()

# 4.0. Exploratory Data Analysis

In [122]:
df4 = df3.dropna()
df4.isna().sum()

customer_id      0
gross_revenue    0
recency_days     0
invoice_no       0
avg_ticket       0
dtype: int64

In [128]:
df4.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,invoice_no,avg_ticket
0,17850,5391.21,372.0,34.0,18.152222
1,13047,3232.59,56.0,9.0,18.904035
2,12583,6705.38,2.0,15.0,28.9025
3,13748,948.25,95.0,5.0,33.866071
4,15100,876.0,333.0,3.0,292.0


# 5.0. Data Preparation

In [129]:
df5 = df4.copy()

In [130]:
df5.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,invoice_no,avg_ticket
0,17850,5391.21,372.0,34.0,18.152222
1,13047,3232.59,56.0,9.0,18.904035
2,12583,6705.38,2.0,15.0,28.9025
3,13748,948.25,95.0,5.0,33.866071
4,15100,876.0,333.0,3.0,292.0


In [131]:
## Standar Scakaer

ss = pp.StandardScaler()

df5['gross_revenue'] = ss.fit_transform ( df5[['gross_revenue']])
df5['recency_days'] = ss.fit_transform ( df5[['recency_days']])
df5['invoice_no'] = ss.fit_transform ( df5[['invoice_no']])
df5['avg_ticket'] = ss.fit_transform ( df5[['avg_ticket']])


# 6.0. Feature Selection

In [40]:
df6 = df5.copy()

# 7.0. Hyperparameter Fine-tuning

In [41]:
df7 = df6.copy()

In [42]:
X = df6.drop( columns=['customer_id'] )

In [43]:
# How cluster we needs
clusters = [ 2 , 3 , 4 , 5 , 6, 7 ]

In [44]:

kmeans = KElbowVisualizer( c.KMeans(), k = clusters, timings = False)
kmeans.fit( X )
kmeans.show();

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## 7.2. Silhouette Score

In [None]:
kmeans = KElbowVisualizer( c.KMeans(), k = clusters,metric = 'silhouette' , timings = False)
kmeans.fit( X )
kmeans.show();

## 7.3 Silhouette  Analysis

In [None]:
fig, ax = plt.subplots( 3, 2, figsize = (25,18))

for k in clusters:
    km = c.KMeans( n_clusters = k,init= 'random', n_init = 10, max_iter = 100, random_state = 42)
    q, mod = divmod( k , 2)
    visualizer = SilhouetteVisualizer( km, colors = 'yellowbrick', ax = ax[q-1][mod])
    visualizer.fit( X );
    visualizer.finalize()

# 8.0. Model Training

In [None]:
df8 = df7.copy()

## 8.1. K-Means

In [None]:
# Model Definition
k = 4
kmeans = c.KMeans( init = 'random', n_clusters = k, n_init = 10, max_iter = 300, random_state = 42  )

# Model Training
kmeans.fit(X)

# Clustering
labels  = kmeans.labels_


## 8.2. Cluster Validation

In [None]:
# WSS ( Within-Cluster Sum of Square)
print('WSS value : {}'.format(kmeans.inertia_))

## SS ( Silhouette Score)
print('SS value : {}'.format(m.silhouette_score ( X, labels, metric = 'euclidean')))

# 9.0. Cluster Analysis

In [None]:
df9 = df8.copy()
df9['cluster'] = labels
df9.head()

## 9.1. Visualization Inspection

In [None]:
visualizer = SilhouetteVisualizer(kmeans, colors ='yellowbrick')
visualizer.fit( X )
visualizer.finalize()

In [None]:
#fig = px.scatter_3d( df9, x = 'recency_days', y = 'invoice_no', z = 'gross_revenue', color = 'cluster')
#fig.show()

## 9.2. 2D Plot

In [None]:
df_viz = df9.drop( columns = 'customer_id', axis = 1)
sns.pairplot(df_viz, hue = 'cluster')

## 9.3 UMAP

In [None]:
X.head()

In [None]:
reducer = umap.UMAP( n_neighbors = 20, random_state= 42)
embedding = reducer.fit_transform( X )

# embedding

df_viz['embedding_x'] = embedding[:, 0]
df_viz['embedding_y'] = embedding[:, 1]

# plot UMAP
sns.scatterplot ( x = 'embedding_x', y = 'embedding_y',
                hue= 'cluster', palette = sns.color_palette( 'hls', n_colors= len(df_viz['cluster'].unique())), data= df_viz)

## 9.3. Cluster Profile

In [None]:
# Number of customer
df_cluster = df9[['customer_id','cluster']].groupby( 'cluster' ).count().reset_index()
df_cluster['pec_customer'] = 100*(df_cluster['customer_id']/df_cluster['customer_id'].sum())

# Average gross revenue
df_avg_gross_revenue = df9[['gross_revenue', 'cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_gross_revenue, how = 'inner', on = 'cluster')

# Average recency days
df_avg_recency_days = df9[['recency_days', 'cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_recency_days, how = 'inner', on = 'cluster')

# Average invoice_no
df_avg_invoice_no = df9[['invoice_no', 'cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_invoice_no, how = 'inner', on = 'cluster')

# Average Ticket 
df_ticket = df9[['avg_ticket','cluster']].groupby( 'cluster' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_ticket, how = 'inner', on = 'cluster')


df_cluster.head()

### Cluster 01:  ( Candidato a Insider )
 
 - Número de customer: 6 (0.13% dos customers)
 - Recência em média: 7 dias
 - Compras em média: 89 compras
 - Receita em média: $ 182.181,00.
 
### Cluster 02:  
 
 - Número de customer: 31 (0.7% dos customers)
 - Recência em média: 14 dias
 - Compras em média: 53 compras
 - Receita em média: $ 40.543,00.
 
 ### Cluster 03: 
 
 - Número de customer: 4.335 (99% dos customers)
 - Recência em média: 92 dias
 - Compras em média: 05 compras
 - Receita em média: $ 1.372,57.

# 10.0. Deploy to Production

In [None]:
df10 = df9.copy()