In [1]:
pip install plotly --upgrade

Collecting plotly
  Downloading plotly-5.17.0-py2.py3-none-any.whl (15.6 MB)
                                              0.0/15.6 MB ? eta -:--:--
                                              0.0/15.6 MB 1.3 MB/s eta 0:00:13
                                             0.1/15.6 MB 919.0 kB/s eta 0:00:17
                                              0.1/15.6 MB 1.1 MB/s eta 0:00:15
                                              0.2/15.6 MB 1.1 MB/s eta 0:00:15
                                              0.3/15.6 MB 1.5 MB/s eta 0:00:11
                                              0.4/15.6 MB 1.4 MB/s eta 0:00:11
     -                                        0.5/15.6 MB 1.4 MB/s eta 0:00:11
     -                                        0.5/15.6 MB 1.3 MB/s eta 0:00:12
     -                                        0.6/15.6 MB 1.5 MB/s eta 0:00:11
     -                                        0.7/15.6 MB 1.6 MB/s eta 0:00:10
     --                                       0.9/15.6 MB 1.


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import numpy as np 
import pandas as pd 
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Plotly defaults: the built-in 'plotly' theme and 'colab' as the requested renderer.
pio.templates.default = 'plotly'
pio.renderers.default = 'colab'

# Switch plotly into offline notebook mode (connected=True).
# NOTE(review): init_notebook_mode appears to reassign pio.renderers.default,
# which would override the 'colab' setting above; confirm which one is intended.
from plotly.offline import (
    download_plotlyjs,
    init_notebook_mode,
    plot,
    iplot,
)
init_notebook_mode(connected=True)

# Silence every Python warning for the rest of the session. This is a blanket
# filter, so deprecation and pandas copy warnings are hidden as well.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tab-separated campaign file; the ID column becomes the row index.
df = pd.read_csv(
    'marketing_campaign.csv',
    sep='\t',
    index_col='ID',
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0_level_0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,...,7,0,0,0,0,0,0,3,11,1
2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,...,5,0,0,0,0,0,0,3,11,0
4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,...,4,0,0,0,0,0,0,3,11,0
6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,...,6,0,0,0,0,0,0,3,11,0
5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,...,5,0,0,0,0,0,0,3,11,0


In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [5]:
# Frequency = purchases summed across the web, catalog and store channels.
purchase_channels = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
df['Frequency'] = df[purchase_channels].sum(axis=1)

In [6]:
# Monetary = amount spent summed over all six product categories.
spend_categories = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
df['Monetary'] = df[spend_categories].sum(axis=1)

In [7]:
# Keep only the three RFM columns and show their summary statistics.
rfm_columns = ['Recency', 'Frequency', 'Monetary']
df_rfm = df.loc[:, rfm_columns]
df_rfm.describe()

Unnamed: 0,Recency,Frequency,Monetary
count,2216.0,2216.0,2216.0
mean,49.012635,12.55731,607.075361
std,28.948352,7.204611,602.900476
min,0.0,0.0,5.0
25%,24.0,6.0,69.0
50%,49.0,12.0,396.5
75%,74.0,18.0,1048.0
max,99.0,32.0,2525.0


### Data Preprocessing

#### Scaling


In [8]:
# One colour per RFM variable, consumed in order by the box plots below.
colors_rfm = [
    'rgb(183, 9, 76)',
    'rgb(92, 77, 125)',
    'rgb(0, 145, 173)',
]

In [9]:
# Box plot of the raw Recency / Frequency / Monetary values. pd.melt stacks the
# three columns into long form ('variable', 'value') so each gets its own box.
# The plotly.offline import and init_notebook_mode call that used to sit here
# duplicated the setup already done in the notebook's first code cell.
fig = px.box(pd.melt(df_rfm), x='variable', y='value',
             title='<b>RFM Data Distribution Before Scaling</b>',
             color='variable', color_discrete_sequence=colors_rfm,
             boxmode='overlay', points='all')

fig.update_layout(showlegend=False,paper_bgcolor='rgb(229, 236, 246)',
                  title_font_size=22)

fig.show()

In [None]:
# Standardize the three RFM columns with StandardScaler.
std = StandardScaler()
# fit_transform returns a bare array; rebuild the frame with the original column
# names AND the original row index so scaled rows stay keyed to the rows of df_rfm.
df_rfm_scaled = pd.DataFrame(std.fit_transform(df_rfm),
                             columns=df_rfm.columns,
                             index=df_rfm.index)

In [None]:
# Same box plot as before scaling, drawn from the standardized frame.
scaled_long = pd.melt(df_rfm_scaled)

fig = px.box(
    scaled_long,
    x='variable',
    y='value',
    color='variable',
    color_discrete_sequence=colors_rfm,
    boxmode='overlay',
    points='all',
    title='<b>RFM Data Distribution After Scaling</b>',
)
fig.update_layout(
    title_font_size=22,
    paper_bgcolor='rgb(229, 236, 246)',
    showlegend=False,
)
fig.show()

### Clustering by K-Means

In [None]:
# 3-D scatter of the scaled customers, one axis per RFM variable, single colour.
fig = (
    px.scatter_3d(
        df_rfm_scaled,
        x='Recency',
        y='Frequency',
        z='Monetary',
        opacity=0.5,
        color_discrete_sequence=['rgb(5, 60, 94)'],
        title='<b>RFM Mapping</b>',
    )
    .update_traces(marker_size=5)
    .update_layout(title_font_size=22, paper_bgcolor='rgb(229, 236, 246)')
)
fig.show()

### Xác định số cụm


### Elbow Method

In [None]:
# Elbow method: fit K-Means for k = 2..9 and record the inertia of each fit.
inertias = []
K = range(2,10)
# Select the RFM columns explicitly: a later cell adds a 'Clusters' column to
# df_rfm_scaled, and re-running this cell must not treat that label as a feature.
rfm_features = df_rfm_scaled[['Recency', 'Frequency', 'Monetary']]
for k in K:
    # Fixed seed (same value as the final model) so the curve is reproducible.
    kmeans = KMeans(n_clusters=k, random_state=2022)
    kmeans.fit(rfm_features)
    inertias.append(kmeans.inertia_)

In [None]:
# Line plot of inertia against k, with the chosen elbow highlighted.
# The elbow itself is picked by eye; the marker's vertical position is read from
# `inertias`, so the highlight stays on the curve whenever the fits are re-run.
optimal_k = 4
elbow_inertia = inertias[list(K).index(optimal_k)]
# Half-height of the highlight ellipse, as a fraction of the plotted inertia range.
halo = 0.04 * (max(inertias) - min(inertias))

fig = px.line(x=K, y=inertias,
              title='<b>Số cụm tối ưu theo phương pháp Elbow</b>',
              color_discrete_sequence=['rgb(5, 60, 94)'])

fig.update_traces(line_width=4)

fig.update_layout(paper_bgcolor='rgb(229, 236, 246)',title_font_size=22,
                  xaxis_title='k', yaxis_title='Inertia')
fig.add_vline(x=optimal_k, line_width=3, line_dash='dash',
              line_color='rgb(183, 9, 76)')

fig.add_annotation(x=optimal_k + 0.1, y=elbow_inertia,
                   text=f'<i>optimal k={optimal_k}</i>', font_size=16,
                   showarrow=True, ax=60, ay=-30, arrowhead=2, arrowsize=1,
                   arrowwidth=2)

fig.add_shape(type='circle', xref='x', yref='y',
    x0=optimal_k - 0.1, y0=elbow_inertia - halo,
    x1=optimal_k + 0.1, y1=elbow_inertia + halo,
    line_color='rgb(183, 9, 76)')

fig.show()

### Silhouette

In [None]:
# Silhouette analysis: fit K-Means for k = 2..9 and score each labelling.
silhouette = []
K = range(2,10)
# Select the RFM columns explicitly: a later cell adds a 'Clusters' column to
# df_rfm_scaled, and re-running this cell must not treat that label as a feature.
rfm_features = df_rfm_scaled[['Recency', 'Frequency', 'Monetary']]
for k in K:
    # Fixed seed (same value as the final model) so the scores are reproducible.
    kmeans = KMeans(n_clusters=k, random_state=2022)
    # fit_predict returns the labels of the fitted model; use them directly.
    labels = kmeans.fit_predict(rfm_features)
    score = silhouette_score(rfm_features, labels)
    silhouette.append(score)

In [None]:
# Line plot of silhouette score against k, highlighting the best-scoring k.
# The highlighted k and its score are taken from the computed scores (highest
# silhouette) instead of fixed coordinates, so the marker always sits on the curve.
best_pos = int(np.argmax(silhouette))
best_k = list(K)[best_pos]
best_score = silhouette[best_pos]
# Half-height of the highlight ellipse, as a fraction of the plotted score range.
halo = 0.04 * (max(silhouette) - min(silhouette))

fig = px.line(x=K, y=silhouette,
              title='<b>Số cụm tối ưu theo phương pháp Silhouette Score</b>',
              color_discrete_sequence=['rgb(5, 60, 94)'])

fig.update_traces(line_width=4)

fig.update_layout(paper_bgcolor='rgb(229, 236, 246)',title_font_size=22,
                  xaxis_title='k',
                  yaxis_title='Silhouette Score')

fig.add_vline(x=best_k, line_width=3, line_dash='dash',
              line_color='rgb(183, 9, 76)')

fig.add_annotation(x=best_k + 0.1, y=best_score,
                   text=f'<i>optimal k={best_k}</i>', font_size=16,
                   showarrow=True, ax=60, ay=-30, arrowhead=2, arrowsize=1,
                   arrowwidth=2)

fig.add_shape(type='circle', xref='x', yref='y',
    x0=best_k - 0.1, y0=best_score - halo,
    x1=best_k + 0.1, y1=best_score + halo,
    line_color='rgb(183, 9, 76)')

fig.show()

### K-means

In [None]:
# Final model: K-Means with a fixed seed on the three scaled RFM columns.
n_clusters = 4
# Select the features explicitly so re-running this cell after 'Clusters' has been
# added to df_rfm_scaled does not feed the labels back in as a feature.
rfm_features = df_rfm_scaled[['Recency', 'Frequency', 'Monetary']]
kmeans = KMeans(n_clusters=n_clusters, random_state=2022)
kmeans.fit(rfm_features)

# Renumber the clusters by ascending sum of their (scaled) centroid coordinates:
# idx lists the original labels in that order, and luv is the inverse lookup
# (original label -> rank), so label 0 is the cluster with the smallest sum.
idx = np.argsort(kmeans.cluster_centers_.sum(axis=1))
luv = np.zeros_like(idx)
luv[idx] = np.arange(n_clusters)

# Relabelled cluster assignment for every row, in row order.
y_pred = luv[kmeans.labels_]

In [None]:
# Attach the relabelled cluster ids to both the scaled and the original frame.
# y_pred is a plain array, so the assignment is positional (row order).
for frame in (df_rfm_scaled, df):
    frame["Clusters"] = y_pred

In [None]:
# One colour per cluster, indexed by cluster id (0..3).
colors_cluster = [
    'rgb(183, 9, 76)',
    'rgb(137, 43, 100)',
    'rgb(69, 94, 137)',
    'rgb(0, 145, 173)',
]

In [None]:
# 3-D scatter of the scaled customers, coloured by their cluster id.
fig = (
    px.scatter_3d(
        df_rfm_scaled,
        x='Recency',
        y='Monetary',
        z='Frequency',
        color='Clusters',
        color_continuous_scale=colors_cluster,
        opacity=0.5,
        title='<b>Cluster Results</b>',
    )
    .update_traces(marker_size=5)
    .update_layout(
        title_font_size=22,
        paper_bgcolor='rgb(229, 236, 246)',
        showlegend=False,
    )
)
fig.show()

### Customer Segmentation

In [None]:
# Reshape to long form, keeping the cluster label as the identifier column.
df_rfm_long = df_rfm_scaled.melt(id_vars='Clusters')

# Grouped box plot: one group per cluster, one box per RFM variable.
fig = px.box(
    df_rfm_long,
    x='Clusters',
    y='value',
    color='variable',
    color_discrete_sequence=colors_rfm,
    boxmode='group',
    title='<b>RFM Distribution by Cluster</b>',
)
fig.update_layout(title_font_size=22, paper_bgcolor='rgb(229, 236, 246)')
fig.show()

In [None]:
# Bar chart of how many customers fall into each cluster, labelled with counts.
cluster_order = dict(Clusters=[0,1,2,3])

fig = px.histogram(
    df,
    x='Clusters',
    color='Clusters',
    category_orders=cluster_order,
    color_discrete_sequence=colors_cluster,
    text_auto=True,
    title='<b>Number of Customers in Each Cluster</b>',
)
fig.update_layout(
    bargap=0.4,
    title_font_size=22,
    paper_bgcolor='rgb(229, 236, 246)',
)
fig.show()

In [None]:
#identifying the centroid from original data
centroid = df.groupby('Clusters')[['Recency','Frequency','Monetary']].agg('mean')
centroid

Unnamed: 0_level_0,Recency,Frequency,Monetary
Clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,23.489396,7.057096,151.367047
1,73.491961,7.017685,146.393891
2,23.0,19.760776,1186.963362
3,73.170213,19.27853,1181.205029


### Marketing Strategy

### Customer Profiling

#### Income

In [None]:
#plotting income & monetary by clusters
#one customer with income of 666666 is excluded because it's obscuring the pattern
fig = px.scatter(df[df['Income']<500000], x='Monetary', y='Income',
                 color='Clusters',
                 color_continuous_scale=colors_cluster,
                 title='<b>Clusters by Income & Monetary</b>',
                 opacity=0.5)

fig.update_traces(marker_size=10)

fig.update_layout(paper_bgcolor='rgb(229, 236, 246)',title_font_size=22)

fig.show()

#### Deals

In [None]:
# Per-customer deal usage: DealsAccpt is 1 when the customer made at least one
# purchase with a deal, else 0. Built with .assign on a fresh frame so no column
# is written onto a slice of df, and the source column is addressed by name
# rather than by position. Rows are sorted by the flag.
deals = (
    df[['NumDealsPurchases', 'Clusters']]
    .assign(DealsAccpt=lambda d: d['NumDealsPurchases'].gt(0).astype(int))
    .sort_values('DealsAccpt')
)

In [None]:
# Two-colour palette for the binary (0 / 1) acceptance flags.
colors_bin = [
    'rgb(183, 9, 76)',
    'rgb(0, 145, 173)',
]

In [None]:
# Per-cluster bars normalised to percent, split by the DealsAccpt flag.
fig = px.histogram(
    deals,
    x='Clusters',
    color='DealsAccpt',
    barnorm='percent',
    category_orders=dict(Clusters=[0,1,2,3]),
    color_discrete_sequence=colors_bin,
    text_auto=True,
    title='<b>Deals Acceptance Rate</b>',
)
fig.update_layout(
    yaxis_title='Percent (%)',
    bargap=0.4,
    title_font_size=22,
    paper_bgcolor='rgb(229, 236, 246)',
)
fig.show()

In [None]:
# Deal purchases per cluster (px.histogram aggregates the y column per bar).
fig = px.histogram(
    df,
    x='Clusters',
    y='NumDealsPurchases',
    color='Clusters',
    category_orders=dict(Clusters=[0,1,2,3]),
    color_discrete_sequence=colors_cluster,
    text_auto=True,
    title='<b>Number of Deals Purchases by Clusters</b>',
)
fig.update_layout(
    bargap=0.4,
    title_font_size=22,
    paper_bgcolor='rgb(229, 236, 246)',
)
fig.show()

#### Campaigns

In [None]:
# Short display names for the campaign flag columns.
new_col = dict(AcceptedCmp1='Cmp1',AcceptedCmp2='Cmp2',
               AcceptedCmp3='Cmp3',AcceptedCmp4='Cmp4',
               AcceptedCmp5='Cmp5',Response='CmpLast')

# The five numbered campaign flags that feed the CmpAccpt indicator.
# NOTE(review): 'Response' (shown later as 'CmpLast') is carried along but does not
# count towards CmpAccpt, matching the previous positional slice of the first five
# columns; confirm that excluding the last campaign is intended.
accepted_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
                 'AcceptedCmp4', 'AcceptedCmp5']

# CmpAccpt is 1 when the row mean of those flags is above zero, else 0.
# Built as one chain on a fresh frame: no column is written onto a slice of df,
# no inplace rename, and columns are addressed by name rather than by position.
cmpgn = (
    df[accepted_cols + ['Response', 'Clusters']]
    .assign(CmpAccpt=lambda d: d[accepted_cols].mean(axis=1).gt(0).astype(int))
    .sort_values('CmpAccpt')
    .rename(columns=new_col)
)

In [None]:
# Per-cluster bars normalised to percent, split by the CmpAccpt flag.
fig = px.histogram(
    cmpgn,
    x='Clusters',
    color='CmpAccpt',
    barnorm='percent',
    category_orders=dict(Clusters=[0,1,2,3]),
    color_discrete_sequence=colors_bin,
    text_auto=True,
    title='<b>Campaigns Acceptance Rate</b>',
)
fig.update_layout(
    yaxis_title='Percent (%)',
    bargap=0.4,
    title_font_size=22,
    paper_bgcolor='rgb(229, 236, 246)',
)
fig.show()

In [None]:
# For each cluster 0..3, take its rows, keep the six campaign columns and melt
# them into long form ('variable' = campaign name, 'value' = flag).
cmp_cols = ['Cmp1', 'Cmp2', 'Cmp3', 'Cmp4', 'Cmp5', 'CmpLast']

cmpgn_long_0, cmpgn_long_1, cmpgn_long_2, cmpgn_long_3 = [
    cmpgn.loc[cmpgn['Clusters'] == cluster_id, cmp_cols].melt()
    for cluster_id in range(4)
]

In [None]:
# 2x2 grid, one panel per cluster, with y axes shared across panels.
fig = make_subplots(
    rows=2,
    cols=2,
    shared_yaxes=True,
    subplot_titles=('Cluster 0', 'Cluster 1', 'Cluster 2',
                    'Cluster 3'),
)

# Each panel sums the campaign flags per campaign name for one cluster.
# Panels fill row by row: cluster 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), 3 -> (2, 2).
cluster_frames = (cmpgn_long_0, cmpgn_long_1, cmpgn_long_2, cmpgn_long_3)
for cluster_id, long_df in enumerate(cluster_frames):
    row_offset, col_offset = divmod(cluster_id, 2)
    fig.add_trace(
        go.Histogram(
            histfunc='sum',
            x=long_df['variable'],
            y=long_df['value'],
            marker_color=colors_cluster[cluster_id],
            texttemplate='%{y}',
        ),
        row=row_offset + 1,
        col=col_offset + 1,
    )

# Pin the first-column y axis of each row (yaxis, yaxis3) to the same 0-150 range.
fig.update_layout(
    title_text='<b>Campaigns Performance on Each Cluster</b>',
    title_font_size=22,
    paper_bgcolor='rgb(229, 236, 246)',
    showlegend=False,
    yaxis_range=[0,150],
    yaxis3_range=[0,150],
)