In [None]:
pip install dash

In [None]:
pip install flask plotly

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as pgo
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score,calinski_harabasz_score, davies_bouldin_score

warnings.filterwarnings("ignore")

In [None]:
# Load the table the clustering cells work on, then show its
# (rows, columns) dimensions.
amount_csv = "amount_df.csv"
df = pd.read_csv(amount_csv)
df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Let the number of clusters be a parameter, so we can get a feel for an
# appropriate value thereof.
X_reduced = df.values


def cluster(n_clusters, random_state=42):
    """Fit K-Means on ``X_reduced`` and return the model with its labels.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to fit.
    random_state : int or None, default 42
        Seed for the centroid initialisation, so repeated runs produce the
        same clustering. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    kmeans : KMeans
        The fitted estimator.
    Z : ndarray
        Cluster index assigned to each row of ``X_reduced``.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    # fit_predict fits and labels in one pass instead of fit() then predict().
    Z = kmeans.fit_predict(X_reduced)
    return kmeans, Z
# Elbow method: fit K-Means for k = 1 .. max_clusters - 1 and plot each model's
# inertia against k.
max_clusters = 20

# Slot 0 is never filled (there is no k = 0 model); it is sliced off below.
inertias = np.zeros(max_clusters)
for i in range(1, max_clusters):
    kmeans, Z = cluster(i)
    inertias[i] = kmeans.inertia_

_x = range(1, max_clusters)
# A plain list and dicts replace the deprecated pgo.Data / pgo.XAxis /
# pgo.YAxis wrapper classes.
data1 = [
    pgo.Scatter(
        x=list(_x),
        y=inertias[1:],
    )
]
layout1 = pgo.Layout(
    title='OnlineRetailStore dataset - Elbow method',
    xaxis=dict(title='Number of clusters', range=[0, max_clusters]),
    yaxis=dict(title='Inertia'),
)
fig1 = pgo.Figure(data=data1, layout=layout1)
# The figure is bound to fig1; no name `fig` is defined above this cell.
fig1.write_html("elbow_method_curve.html")
fig1.show()

In [None]:
# models
# Features for the 3-cluster model. Dropping 'labels' keeps this cell
# re-runnable once that column has been added to df.
# NOTE(review): every remaining column is used as a feature, including
# CustomerID if it is a column of amount_df.csv; confirm that is intended.
X = df.drop(columns='labels', errors='ignore')
# Train clustering model. One seeded fit replaces the former fit() +
# fit_transform() pair, which trained the model twice and discarded the
# distance matrix it computed.
km1 = KMeans(n_clusters=3, random_state=42)
km1.fit(X)
df['labels'] = km1.labels_

In [None]:
# Score the 3-cluster model on the feature columns only: the 'labels' column
# holds the clustering output and must not be fed back in as an input.
X = df.drop(columns='labels')

# Stored under its own name. Assigning the result to `silhouette_score` would
# replace the imported function and break every later call to it.
silhouette_3 = silhouette_score(X, km1.labels_)
print("Silhouette Score: for 3 clusters", silhouette_3)

calinski_harabasz_index = calinski_harabasz_score(X, km1.labels_)
print("Calinski-Harabasz Index: for 3 clusters", calinski_harabasz_index)

# Calculate the Davies-Bouldin Index
davies_bouldin_index = davies_bouldin_score(X, km1.labels_)
print("Davies-Bouldin Index: for 3 clusteres", davies_bouldin_index)

In [None]:
# Amount extremes per cluster, printed in this order:
# max of cluster 0, then min and max of cluster 1, then min and max of cluster 2.
cluster_ids = df['labels']
for cluster_id, stat in [(0, 'max'), (1, 'min'), (1, 'max'), (2, 'min'), (2, 'max')]:
    amounts_in_cluster = df.loc[cluster_ids == cluster_id, 'Amount']
    print(getattr(amounts_in_cluster, stat)())

In [None]:
# Name the clusters by ascending mean Amount rather than by raw cluster id.
# K-Means numbers its clusters arbitrarily, so a fixed id -> band mapping can
# attach the wrong band name to a cluster after a refit.
# The trailing space in '30k-125k ' is kept: a later cell filters on this
# exact string.
band_names_3 = ['less than 30k', '30k-125k ', '125k-280k']  # lowest -> highest
labels_by_amount_3 = df.groupby('labels')['Amount'].mean().sort_values().index
assert len(labels_by_amount_3) == len(band_names_3), "expected one band name per cluster"
df['labels'] = df['labels'].map(dict(zip(labels_by_amount_3, band_names_3)))

In [None]:
# Scatter of Amount against CustomerID, coloured by cluster label; saved as a
# standalone HTML file and shown inline.
fig2 = px.scatter(
    df,
    x="CustomerID",
    y="Amount",
    color="labels",
    title='Customer segmentation',
)
fig2.write_html("CustomerSegmentatin.html")
fig2.show()

In [None]:
# Box plot of Amount, one box per cluster label.
fig3 = px.box(
    df,
    y='Amount',
    color='labels',
)
fig3.show()

In [None]:
# Check for another model: reload the same CSV into a separate frame (df1) so
# the second clustering does not touch df.
df1=pd.read_csv("amount_df.csv")

In [None]:
# Features for the 4-cluster model. Dropping 'labels' keeps this cell
# re-runnable once that column has been added to df1.
X = df1.drop(columns='labels', errors='ignore')
# One seeded fit replaces the former fit() + fit_transform() pair, which
# trained the model twice and discarded the distance matrix it computed.
km2 = KMeans(n_clusters=4, random_state=42)
km2.fit(X)
df1['labels'] = km2.labels_

In [None]:
# Resolve the silhouette function through its module so the call works even
# when the bare name `silhouette_score` has been rebound to a float by an
# earlier cell.
from sklearn import metrics as sk_metrics

# Score the 4-cluster model on the feature columns only: the 'labels' column
# holds the clustering output and must not be fed back in as an input.
X = df1.drop(columns='labels')

# Computed for km2 here; previously this line printed whatever value the name
# `silhouette_score` already held, not a score for the 4-cluster model.
silhouette_4 = sk_metrics.silhouette_score(X, km2.labels_)
print("Silhouette Score: for 4 clusters", silhouette_4)
calinski_harabasz_index = calinski_harabasz_score(X, km2.labels_)
print("Calinski-Harabasz Index: for 4 clusters", calinski_harabasz_index)
# Calculate the Davies-Bouldin Index
davies_bouldin_index = davies_bouldin_score(X, km2.labels_)
print("Davies-Bouldin Index: for 4 clusteres", davies_bouldin_index)

In [None]:
# Amount extremes per cluster of the 4-cluster model, printed in this order:
# max of 0, then min and max of 2, of 1 and of 3.
cluster_ids_4 = df1['labels']
requested = [(0, 'max'), (2, 'min'), (2, 'max'), (1, 'min'), (1, 'max'), (3, 'min'), (3, 'max')]
for cluster_id, stat in requested:
    amounts_in_cluster = df1.loc[cluster_ids_4 == cluster_id, 'Amount']
    print(getattr(amounts_in_cluster, stat)())

In [None]:
# Name the clusters by ascending mean Amount rather than by raw cluster id.
# K-Means numbers its clusters arbitrarily, so a fixed id -> band mapping can
# attach the wrong band name to a cluster after a refit.
band_names_4 = ['less than 25k', '25k-91k ', '91k-200k', '200k-280k']  # lowest -> highest
labels_by_amount_4 = df1.groupby('labels')['Amount'].mean().sort_values().index
assert len(labels_by_amount_4) == len(band_names_4), "expected one band name per cluster"
df1['labels'] = df1['labels'].map(dict(zip(labels_by_amount_4, band_names_4)))

In [None]:
# Scatter of Amount against CustomerID for the 4-cluster labelling.
px.scatter(
    df1,
    x="CustomerID",
    y="Amount",
    color="labels",
)

In [None]:
import joblib
# The 3-cluster estimator (km1) is the one that gets persisted.
model=km1
# Save the model to a file
joblib.dump(model, 'kmeans_model.joblib')


In [None]:
# Re-display the 'Customer segmentation' scatter (fig2) built earlier.
fig2.show()

In [None]:
# Rows of df whose label is the 125k-280k band.
in_high_band = df['labels'] == '125k-280k'
df.loc[in_high_band]

In [None]:
# 30k to 125k band customers: per-column count of non-null values among the
# rows labelled '30k-125k ' (the label string carries a trailing space).
df[df['labels']== '30k-125k '].count()

### Customer purchase behavior

In [None]:
# Load the monthly amounts table and discard its "Unnamed: 0" column.
# Note that `df` is rebound to this new table from here on.
df = pd.read_csv("amount_df_month.csv").drop(columns="Unnamed: 0")
df.head()

In [None]:
# Turn CustomerID into strings, casting through int first, so the ids can be
# matched against string id literals.
customer_ids_int = df['CustomerID'].astype(int)
df['CustomerID'] = customer_ids_int.astype(str)

In [None]:
# Hand-picked customer ids and the rows of df that belong to them.
top_cust = ['14646', '18102', '17450', '16446', '14911']
is_top_customer = df['CustomerID'].isin(top_cust)
topCustomers = df.loc[is_top_customer]

In [None]:
# All rows for customer 14646.
is_14646 = topCustomers['CustomerID'] == '14646'
topCustomers.loc[is_14646]

In [None]:
# Amount per Year_Month for the selected top customers, one colour per customer.
fig4 = px.scatter(
    topCustomers,
    x="Year_Month",
    y="Amount",
    color="CustomerID",
    title='Top customers',
)
fig4.write_html("Top Customers.html")
fig4.show()

### Regular customers

In [None]:
# Customer x month matrix of Amount. No aggfunc is given, so pivot_table's
# default (mean) fills each cell; customer/month pairs with no rows are NaN.
table = df.pivot_table(
    values='Amount',
    index=['CustomerID'],
    columns=["Year_Month"],
)
table

In [None]:
# Number of months (non-NaN cells of the pivot) per customer, largest first,
# followed by how many customers share each month count.
x = table.count(axis=1).sort_values(ascending=False).to_frame("Count")
x["Count"].value_counts()

In [None]:
# Regular customers: ids whose month count is at least 10
# (184 customers according to the author's note).
is_regular = x["Count"] >= 10
freq_cust = x.loc[is_regular].index
# Rows of df that belong to those regular customers.
df_freq_cust = df.loc[df["CustomerID"].isin(freq_cust)]
df_freq_cust["CustomerID"].nunique()

In [None]:
# Bar chart of Amount per Year_Month for the regular customers, one colour per
# customer.
fig5 = px.bar(
    df_freq_cust,
    x="Year_Month",
    y="Amount",
    color="CustomerID",
    title='Regular purchasing customers',
)
fig5.write_html("RegularCust.html")
fig5.show()

In [None]:
# Mean Amount per regular customer, then the ids whose mean exceeds the
# threshold.
threshold = 5000
df_freq_cust_mean_amt = df_freq_cust.groupby("CustomerID").agg({"Amount": "mean"})
exceeds_threshold = df_freq_cust_mean_amt["Amount"] > threshold
freq_cust_mean_amt_gt_threshold = df_freq_cust_mean_amt.loc[exceeds_threshold].index


In [None]:
# Shape of the id index, i.e. how many regular customers have a mean Amount
# above the threshold.
freq_cust_mean_amt_gt_threshold.shape

* Regular customers whose mean Amount exceeds 1,000: 59
* Regular customers whose mean Amount exceeds 5,000: 9

In [None]:
# Rows of the regular customers whose mean Amount is above the threshold.
is_above_threshold_customer = df_freq_cust["CustomerID"].isin(freq_cust_mean_amt_gt_threshold)
freq_df_threshold = df_freq_cust.loc[is_above_threshold_customer]
freq_df_threshold

In [None]:
# Bar chart of Amount per Year_Month for the above-threshold regular customers;
# shown inline, then saved as a standalone HTML file.
fig6 = px.bar(
    freq_df_threshold,
    x="Year_Month",
    y="Amount",
    color="CustomerID",
    title="Regular customers who purchase greater than 5k",
)
fig6.show()
fig6.write_html("reg_greater_5k.html")