# 1. Imports

In [1]:
import re
import sqlite3
import s3fs
import os
import pickle

import pandas       as pd
import numpy        as np
import seaborn      as sns
import umap.umap_   as umap

from matplotlib              import pyplot        as plt
from plotly                  import express       as px
from sklearn                 import preprocessing as pp
from matplotlib              import cm
from sqlalchemy              import create_engine
from dotenv                  import load_dotenv, find_dotenv

from sklearn.metrics         import silhouette_score, silhouette_samples
from sklearn.cluster         import KMeans, DBSCAN
from sklearn.preprocessing   import MinMaxScaler
from sklearn.decomposition   import PCA
from sklearn.manifold        import TSNE
from sklearn.ensemble        import RandomForestRegressor
from sklearn.mixture         import GaussianMixture
from sklearn.neighbors       import NearestNeighbors

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

  from .autonotebook import tqdm as notebook_tqdm


## 1.2. Load Dataset

In [5]:
# load env variables
load_dotenv('/home/ubuntu/Insiders-Clustering/src/.env', override=True)

# get credentials
aws_access_key_id = os.environ.get( 'aws_access_key_id' )
aws_secret_access_key = os.environ.get( 'aws_secret_access_key' )
region = os.environ.get( 'region' )

# connect to S3
path_s3 = 's3://insiders-dataset-gcs/'

fs = s3fs.S3FileSystem( anon=False, key=aws_access_key_id, secret=aws_secret_access_key );

In [3]:
# load data
df_raw = pd.read_csv( fs.open( path_s3 + 'Ecommerce.csv', 'rb' ),  encoding= 'unicode_escape' )

# drop extra column
df_raw.drop( columns='Unnamed: 8', axis=1, inplace=True)

In [4]:
df_raw.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom


# 2. Data Description

In [5]:
df2 = df_raw

## 2.1. Rename Columns

In [6]:
cols_new = ['invoice_no', 'stock_code', 'description', 'quantity', 'invoice_date', 'unit_price', 'customer_id', 'country']

# rename columns
df2.columns = cols_new
df2.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom


## 2.2. Data Dimensions

In [7]:
print( 'Number of rows: {}'.format( df2.shape[0] ) )
print( 'Number of columns: {}'.format( df2.shape[1] ) )

Number of rows: 541909
Number of columns: 8


## 2.3. Data Types

In [8]:
df2.dtypes

invoice_no       object
stock_code       object
description      object
quantity          int64
invoice_date     object
unit_price      float64
customer_id     float64
country          object
dtype: object

## 2.4. Check NA

In [9]:
df2.isna().sum()

invoice_no           0
stock_code           0
description       1454
quantity             0
invoice_date         0
unit_price           0
customer_id     135080
country              0
dtype: int64

## 2.5. Replace NA

In [10]:
df_missing = df2.loc[df2['customer_id'].isna(), :]
df_not_missing = df2.loc[~df2['customer_id'].isna(), :]

In [11]:
# max customer_id
df_not_missing['customer_id'].max()

18287.0

In [12]:
# create reference
df_backup = pd.DataFrame( df_missing['invoice_no'].drop_duplicates() )
df_backup['customer_id'] = np.arange( 19000, 19000 + len(df_backup), 1 )
df_backup.head()

Unnamed: 0,invoice_no,customer_id
622,536414,19000
1443,536544,19001
1970,536545,19002
1971,536546,19003
1972,536547,19004


In [13]:
# merge original with reference
df2 = pd.merge( df2, df_backup, on='invoice_no', how='left' )
df2.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id_x,country,customer_id_y
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom,
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom,
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom,
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom,
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom,


In [14]:
# coalesce
df2['customer_id'] = df2['customer_id_x'].combine_first( df2['customer_id_y'] )

# drop extra columns
df2.drop( columns=['customer_id_x', 'customer_id_y'], axis=1, inplace=True )
df2.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,United Kingdom,17850.0
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,United Kingdom,17850.0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,United Kingdom,17850.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,United Kingdom,17850.0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,United Kingdom,17850.0


In [15]:
df2.isna().sum()

invoice_no         0
stock_code         0
description     1454
quantity           0
invoice_date       0
unit_price         0
country            0
customer_id        0
dtype: int64

In [16]:
df2.dtypes

invoice_no       object
stock_code       object
description      object
quantity          int64
invoice_date     object
unit_price      float64
country          object
customer_id     float64
dtype: object

## 2.6. Change dtypes

In [17]:
# invoice date
df2['invoice_date'] = pd.to_datetime( df2['invoice_date'], format='%d-%b-%y' )

# customer_id
df2['customer_id'] = df2['customer_id'].astype( int )
df2.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2016-11-29,2.55,United Kingdom,17850
1,536365,71053,WHITE METAL LANTERN,6,2016-11-29,3.39,United Kingdom,17850
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2016-11-29,2.75,United Kingdom,17850
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2016-11-29,3.39,United Kingdom,17850
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2016-11-29,3.39,United Kingdom,17850


In [18]:
df2.dtypes

invoice_no              object
stock_code              object
description             object
quantity                 int64
invoice_date    datetime64[ns]
unit_price             float64
country                 object
customer_id              int32
dtype: object

## 2.7. Descriptive Statistics

In [19]:
num_attributes = df2.select_dtypes( include=['int32', 'int64', 'float64'] )
cat_attributes = df2.select_dtypes( exclude=['int32', 'int64', 'float64', 'datetime64[ns]'] )

### 2.7.1. Numerical Attributes

In [20]:
# central tendency
_mean   = pd.DataFrame( num_attributes.apply( np.mean ) ).T
_median = pd.DataFrame( num_attributes.apply( np.median ) ).T

# dispersion
_std   = pd.DataFrame( num_attributes.apply( np.std ) ).T
_min   = pd.DataFrame( num_attributes.apply( np.min ) ).T
_max   = pd.DataFrame( num_attributes.apply( np.max ) ).T
_range = pd.DataFrame( num_attributes.apply( lambda x: x.max() - x.min() ) ).T
_skew  = pd.DataFrame( num_attributes.apply( lambda x: x.skew() ) ).T
_kurtosis  = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis() ) ).T

# concatenate
summary = pd.concat( [_min, _max, _range, _mean, _median, _std, _skew, _kurtosis] ).T.reset_index()
summary.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
summary

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,quantity,-80995.0,80995.0,161990.0,9.55225,3.0,218.080957,-0.264076,119769.160031
1,unit_price,-11062.06,38970.0,50032.06,4.611114,2.08,96.759764,186.506972,59005.719097
2,customer_id,12346.0,22709.0,10363.0,16688.840453,16249.0,2911.408666,0.487449,-0.804287


* **Negative quantity** - It could mean a returning.
* **Unit price equal to zero** - Could it mean a promotion?

### 2.7.2. Categorical Attributes

In [21]:
cat_attributes.head()

Unnamed: 0,invoice_no,stock_code,description,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,United Kingdom
1,536365,71053,WHITE METAL LANTERN,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,United Kingdom


#### 2.7.2.1. Invoice_no

In [22]:
# cat_attributes['invoice_no'].astype( int ) # It throws an error
index = df2['invoice_no'].apply( lambda x: bool( re.search( '[^0-9]+', x ) ) )
df2[index].head(10)

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
141,C536379,D,Discount,-1,2016-11-29,27.5,United Kingdom,14527
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2016-11-29,4.65,United Kingdom,15311
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2016-11-29,1.65,United Kingdom,17548
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2016-11-29,0.29,United Kingdom,17548
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2016-11-29,0.29,United Kingdom,17548
238,C536391,21980,PACK OF 12 RED RETROSPOT TISSUES,-24,2016-11-29,0.29,United Kingdom,17548
239,C536391,21484,CHICK GREY HOT WATER BOTTLE,-12,2016-11-29,3.45,United Kingdom,17548
240,C536391,22557,PLASTERS IN TIN VINTAGE PAISLEY,-12,2016-11-29,1.65,United Kingdom,17548
241,C536391,22553,PLASTERS IN TIN SKULLS,-24,2016-11-29,1.65,United Kingdom,17548
939,C536506,22960,JAM MAKING SET WITH JARS,-6,2016-11-29,4.25,United Kingdom,17897


In [23]:
# unique values
len( df2.loc[index, 'invoice_no'].drop_duplicates()  )

3839

In [24]:
# checking if all invoices with a letter indicates has a negative quantity
df_letter_invoices = df2[index]
print( 'Total number of invoices with a letter: {}'.format( len( df_letter_invoices ) ) )
print( 'Total number of invoices with a letter and negative quantity: {}'.format( len( df_letter_invoices[ df_letter_invoices['quantity'] < 0 ] ) ) )

Total number of invoices with a letter: 9291
Total number of invoices with a letter and negative quantity: 9288


#### 2.7.2.2. Stock_code

In [25]:
index = df2['stock_code'].apply( lambda x: bool( re.search( '^[a-zA-Z]+.*$', x ) ) )
df2[index].head(10)

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
45,536370,POST,POSTAGE,3,2016-11-29,18.0,France,12583
141,C536379,D,Discount,-1,2016-11-29,27.5,United Kingdom,14527
386,536403,POST,POSTAGE,1,2016-11-29,15.0,Netherlands,12791
1123,536527,POST,POSTAGE,1,2016-11-29,18.0,Germany,12662
1423,536540,C2,CARRIAGE,1,2016-11-29,50.0,EIRE,14911
1814,536544,DOT,DOTCOM POSTAGE,1,2016-11-29,569.77,United Kingdom,19001
2239,536569,M,Manual,1,2016-11-29,1.25,United Kingdom,16274
2250,536569,M,Manual,1,2016-11-29,18.95,United Kingdom,16274
3041,536592,DOT,DOTCOM POSTAGE,1,2016-11-29,607.49,United Kingdom,19014
4406,536779,BANK CHARGES,Bank Charges,1,2016-11-30,15.0,United Kingdom,15823


In [26]:
# unique values
print( len( df2.loc[index, 'stock_code'].drop_duplicates() ) )
print( df2.loc[index, 'stock_code'].unique())

33
['POST' 'D' 'C2' 'DOT' 'M' 'BANK CHARGES' 'S' 'AMAZONFEE' 'DCGS0076'
 'DCGS0003' 'gift_0001_40' 'DCGS0070' 'm' 'gift_0001_50' 'gift_0001_30'
 'gift_0001_20' 'DCGS0055' 'DCGS0072' 'DCGS0074' 'DCGS0069' 'DCGS0057'
 'DCGSSBOY' 'DCGSSGIRL' 'gift_0001_10' 'PADS' 'DCGS0004' 'DCGS0073'
 'DCGS0071' 'DCGS0068' 'DCGS0067' 'DCGS0066P' 'B' 'CRUK']


#### 2.7.2.3. Country

In [27]:
df2['country'].value_counts( normalize=True )

United Kingdom          0.914320
Germany                 0.017521
France                  0.015790
EIRE                    0.015124
Spain                   0.004674
Netherlands             0.004375
Belgium                 0.003818
Switzerland             0.003694
Portugal                0.002803
Australia               0.002323
Norway                  0.002004
Italy                   0.001482
Channel Islands         0.001399
Finland                 0.001283
Cyprus                  0.001148
Sweden                  0.000853
Unspecified             0.000823
Austria                 0.000740
Denmark                 0.000718
Japan                   0.000661
Poland                  0.000629
Israel                  0.000548
USA                     0.000537
Hong Kong               0.000531
Singapore               0.000423
Iceland                 0.000336
Canada                  0.000279
Greece                  0.000269
Malta                   0.000234
United Arab Emirates    0.000125
European C

# 3. Data Filtering

In [28]:
df3 = df2

## 3.1. Numerical Attributes

In [29]:
# unit price > 0
df3 = df3.loc[ df3['unit_price'] > 0.010, : ]

# stock code == number
df3 = df3[~df3['stock_code'].isin( ['POST', 'D', 'C2', 'DOT', 'M', 'BANK, CHARGES', 'S', 'AMAZONFEE', 'DCGS0076'
, 'DCGS0003', 'gift_0001_40', 'DCGS0070', 'm', 'gift_0001_50', 'gift_0001_30'
, 'gift_0001_20', 'DCGS0055', 'DCGS0072', 'DCGS0074', 'DCGS0069', 'DCGS0057'
, 'DCGSSBOY', 'DCGSSGIRL', 'gift_0001_10', 'PADS', 'DCGS0004', 'DCGS0073'
, 'DCGS0071', 'DCGS0068', 'DCGS0067', 'DCGS0066P', 'B', 'CRUK'] )]

# quantity - negative numbears means product returns
df3_returns = df3.loc[df3['quantity'] < 0, :]
df3_purchases = df3.loc[df3['quantity'] > 0, :]

## 3.2. Categorical Attributes

In [30]:
# drop description
df3 = df3.drop( columns='description', axis=1 )

In [31]:
# map
df3 = df3[~df3['country'].isin( ['European Community', 'Unspecified'] )]

In [32]:
# bad users (based on the Univariate Analysis)
df3 = df3[~df3['customer_id'].isin( [16446] )]

# 4. Feature Engineering

In [33]:
df4 = df3.copy()
df_purchases = df3_purchases
df_returns = df3_returns

## 4.1. Feature Creation

The dataset granularity should be customers, because the clusters will be for customers, but it is a unique key composed of product (stock_code) and date (invoice_date). So it is necessary to change the granularity.

In [34]:
# data reference
df_ref = df4[['customer_id']].drop_duplicates( ignore_index=True )
df_ref.head()

Unnamed: 0,customer_id
0,17850
1,13047
2,12583
3,13748
4,15100


### 4.1.1. Gross Revenue

In [35]:
# Gross Revenue (quantity * price)
df_purchases['gross_revenue'] = df_purchases['quantity'] * df_purchases['unit_price']

# Monetary
df_monetary = df_purchases[['customer_id', 'gross_revenue']].groupby( 'customer_id' ).sum().reset_index()
df_ref = pd.merge( df_ref, df_monetary, on='customer_id',  how='left')
df_ref.head()

Unnamed: 0,customer_id,gross_revenue
0,17850,5391.21
1,13047,3232.59
2,12583,6705.38
3,13748,948.25
4,15100,876.0


### 4.1.2. Recency - Day from last purchase

In [36]:
df_recency = df_purchases[['customer_id', 'invoice_date']].groupby( 'customer_id' ).max().reset_index()
df_recency['recency_days'] = ( df_purchases['invoice_date'].max() - df_recency['invoice_date'] ).dt.days
df_recency = df_recency[['customer_id', 'recency_days']].copy()
df_ref = pd.merge( df_ref, df_recency, on='customer_id', how='left' )
df_ref.head()

Unnamed: 0,customer_id,gross_revenue,recency_days
0,17850,5391.21,372.0
1,13047,3232.59,56.0
2,12583,6705.38,2.0
3,13748,948.25,95.0
4,15100,876.0,333.0


### 4.1.5. Quantity of Products Purchased

In [37]:
df_freq = df_purchases[['customer_id', 'quantity']].groupby( 'customer_id' ).sum().rename( columns={"quantity":"qtd_products"}).reset_index()
df_ref = pd.merge( df_ref, df_freq, on='customer_id', how='left' )
df_ref.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,qtd_products
0,17850,5391.21,372.0,1733.0
1,13047,3232.59,56.0,1390.0
2,12583,6705.38,2.0,5028.0
3,13748,948.25,95.0,439.0
4,15100,876.0,333.0,80.0


### 4.1.8. Returns

In [38]:
df_ret = df_returns[['customer_id', 'quantity']].groupby( 'customer_id' ).sum().reset_index().rename( columns={"quantity":"qtd_returns"} )
df_ret['qtd_returns'] = df_ret['qtd_returns'] * -1
df_ref = pd.merge( df_ref, df_ret, how='left', on='customer_id' )
df_ref.loc[df_ref['qtd_returns'].isna(), 'qtd_returns'] = 0
df_ref.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,qtd_products,qtd_returns
0,17850,5391.21,372.0,1733.0,40.0
1,13047,3232.59,56.0,1390.0,35.0
2,12583,6705.38,2.0,5028.0,50.0
3,13748,948.25,95.0,439.0,0.0
4,15100,876.0,333.0,80.0,22.0


### 4.1.9. Frequency Purchase

In [39]:
df_aux = ( df_purchases[['customer_id', 'invoice_no', 'invoice_date']].drop_duplicates()
                                                                      .groupby( 'customer_id' )
                                                                      .agg( max_date = ('invoice_date', 'max'),
                                                                            min_date = ('invoice_date', 'min'),
                                                                            days     = ('invoice_date', lambda x: (( x.max() - x.min() ).days) + 1 ),
                                                                            buy      = ('invoice_no', 'count') ) ).reset_index()

# frequency 
df_aux['frequency'] = df_aux[['buy', 'days']].apply( lambda x: x['buy'] / x['days'] if x['days'] != 0 else 0, axis=1 )

# merge
df_ref = pd.merge( df_ref, df_aux[['customer_id', 'frequency']], on='customer_id', how='left' )
df_ref.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,qtd_products,qtd_returns,frequency
0,17850,5391.21,372.0,1733.0,40.0,17.0
1,13047,3232.59,56.0,1390.0,35.0,0.028302
2,12583,6705.38,2.0,5028.0,50.0,0.040323
3,13748,948.25,95.0,439.0,0.0,0.017921
4,15100,876.0,333.0,80.0,22.0,0.073171


## 4.2. Checking NAs

In [40]:
# checking possible NAs created by the merge
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
qtd_products     91
qtd_returns       0
frequency        91
dtype: int64

# 5. EDA

In [50]:
df5 = df_ref.dropna()

# 6. Data Preparation

In [51]:
cols_select = ['customer_id', 'gross_revenue', 'recency_days', 'qtd_products', 'frequency', 'qtd_returns']
df6 = df5[ cols_select ].drop( columns=['customer_id'] ).copy()

gross_revenue_scaler = pickle.load( fs.open( path_s3 + 'gross_revenue_scaler.pkl', 'rb' ) )
recency_days_scaler  = pickle.load( fs.open( path_s3 + 'recency_days_scaler.pkl', 'rb' ) )
qtd_productse_scaler = pickle.load( fs.open( path_s3 + 'qtd_products_scaler.pkl', 'rb' ) )
qtd_returns_scaler   = pickle.load( fs.open( path_s3 + 'qtd_returns_scaler.pkl', 'rb' ) )
frequency_scaler     = pickle.load( fs.open( path_s3 + 'frequency_scaler.pkl', 'rb' ) )

df6['gross_revenue'] = gross_revenue_scaler.fit_transform( df6[['gross_revenue']] )
df6['recency_days']  = recency_days_scaler.fit_transform( df6[['recency_days']] )
df6['qtd_products']  = qtd_productse_scaler.fit_transform( df6[['qtd_products']] )
df6['qtd_returns']   = qtd_returns_scaler.fit_transform( df6[['qtd_returns']] )
df6['frequency']     = frequency_scaler.fit_transform( df6[['frequency']] )

# 7. Feature Selection

In [52]:
df7 = df6

# 8. Hyperparameter Fine-Tunning

In [53]:
X = df7
X.head()

Unnamed: 0,gross_revenue,recency_days,qtd_products,frequency,qtd_returns
0,0.019312,0.997319,0.008799,1.0,0.000539
1,0.011579,0.150134,0.007056,0.001345,0.000472
2,0.02402,0.005362,0.025538,0.002052,0.000674
3,0.003396,0.254692,0.002225,0.000734,0.0
4,0.003137,0.892761,0.000401,0.003985,0.000296


# 9. Model Training

## 9.1. K-Means

In [54]:
# model definition
k = 7

kmeans = KMeans( init='random', n_clusters=k, max_iter=300, random_state=42 )

# model training
kmeans.fit( X )

# clustering
labels = kmeans.labels_

## 9.2. Cluster Validation

In [55]:
# WSS 
print( 'WSS value: {}'.format( kmeans.inertia_ ) )

# SS
print( 'SS value: {}'.format( silhouette_score( X, labels, metric='euclidean' ) ) )

WSS value: 19.792970466910933
SS value: 0.5064375809181464


# 10. Cluster Analysis

In [56]:
df10 = X
df10['cluster'] = labels
df10.head()

Unnamed: 0,gross_revenue,recency_days,qtd_products,frequency,qtd_returns,cluster
0,0.019312,0.997319,0.008799,1.0,0.000539,0
1,0.011579,0.150134,0.007056,0.001345,0.000472,3
2,0.02402,0.005362,0.025538,0.002052,0.000674,2
3,0.003396,0.254692,0.002225,0.000734,0.0,6
4,0.003137,0.892761,0.000401,0.003985,0.000296,0


## 10.2. Cluster Profile

In [57]:
df10_aux = df5[cols_select]
df10_aux['cluster'] = labels

# change dtypes
df10_aux['recency_days'] = df10_aux['recency_days'].astype( int )
df10_aux['qtd_products'] = df10_aux['qtd_products'].astype( int )
df10_aux['qtd_returns'] = df10_aux['qtd_returns'].astype( int )

# show
df10_aux.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df10_aux['cluster'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df10_aux['recency_days'] = df10_aux['recency_days'].astype( int )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df10_aux['qtd_products'] = df10_aux['qtd_products'].astype( int )
A value is trying to be set on a copy of a s

Unnamed: 0,customer_id,gross_revenue,recency_days,qtd_products,frequency,qtd_returns,cluster
0,17850,5391.21,372,1733,17.0,40,0
1,13047,3232.59,56,1390,0.028302,35,3
2,12583,6705.38,2,5028,0.040323,50,2
3,13748,948.25,95,439,0.017921,0,6
4,15100,876.0,333,80,0.073171,22,0


In [58]:
# number of customers
df_cluster = df10_aux[['customer_id', 'cluster']].groupby( 'cluster' ).count().reset_index()
df_cluster['perc_customer'] = 100 * df_cluster['customer_id'] / df_cluster['customer_id'].sum()

# Avg Gross revenue
df_avg_gross_rev = df10_aux[['gross_revenue', 'cluster']].groupby( 'cluster' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_gross_rev, how='inner', on='cluster' )

# Avg Recency days
df_avg_rec_days = df10_aux[['recency_days', 'cluster']].groupby( 'cluster' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_rec_days, how='inner', on='cluster' )

# Avg frequency
df_avg_invoice_no = df10_aux[['frequency', 'cluster']].groupby( 'cluster' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_invoice_no, how='inner', on='cluster' )

# Avg quantity of products
df_avg_qtd_products = df10_aux[['qtd_products', 'cluster']].groupby( 'cluster' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_qtd_products, how='inner', on='cluster' )

# Avg returns
df_returns = df10_aux[['qtd_returns', 'cluster']].groupby( 'cluster' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_returns, how='inner', on='cluster' )

# rename columns
df_cluster.rename( columns={ "customer_id":"qtd_customers", "gross_revenue":"avg_gross_revenue", "recency_days":"avg_recency_days", "frequency":"avg_frequency","qtd_products":"avg_qtd_products" ,"qtd_returns":"avg_returns"}, inplace=True )
df_cluster.sort_values( by=['avg_gross_revenue'], ascending=False )

Unnamed: 0,cluster,qtd_customers,perc_customer,avg_gross_revenue,avg_recency_days,avg_frequency,avg_qtd_products,avg_returns
1,1,11,0.193152,132552.812727,8.909091,0.167604,74822.818182,1644.181818
2,2,2036,35.750658,2674.559926,15.952849,0.247945,1510.527014,32.810904
3,3,1137,19.964881,1094.174485,61.361478,0.513579,635.21372,8.23131
0,0,569,9.99122,925.640914,340.086116,0.977187,388.226714,132.462214
6,6,588,10.324846,907.503844,119.605442,0.58945,495.520408,4.013605
5,5,716,12.572432,675.770908,186.074022,0.760309,273.953911,4.416201
4,4,638,11.202809,634.957398,260.95768,0.907916,255.611285,19.26489


* **Insiders:** Cluster 1

# 11. Deploy To Production

In [59]:
df10_aux.dtypes

customer_id        int32
gross_revenue    float64
recency_days       int32
qtd_products       int32
frequency        float64
qtd_returns        int32
cluster            int32
dtype: object

## 11.1. Insert Into SQLite

In [60]:
# database connection
host = 'insidersdb.clicbbkmgcqj.us-east-1.rds.amazonaws.com'
port = '5432'
database = 'postgres'
user =  os.environ.get( 'user_postgreSQL' )
pwd = os.environ.get( 'pwd_postgreSQL' )

endpoint = 'postgresql://{}:{}@{}/postgres'.format( user, pwd, host, port )

In [None]:
# create connection
conn = create_engine( endpoint )

# drop table
query_drop_insiders = """
    DROP TABLE IF EXISTS insiders
"""

conn.execute( query_drop_insiders )

## create table
query_create_table_insiders = """
    CREATE TABLE insiders (
        customer_id     INTEGER,
        gross_revenue   REAL,
        recency_days    INTEGER,
        qtd_products    INTEGER,   
        frequency       REAL,
        qtd_returns     INTEGER,
        cluster         INTEGER
    )

"""

conn.execute( query_create_table_insiders );

In [None]:
# insert table
df10_aux.to_sql( 'insiders', con=conn, if_exists='append', index=False );

In [None]:
# consulting database
query = """ 
    SELECT * FROM INSIDERS 
"""

df = pd.read_sql_query( query, conn )

In [None]:
df.shape