# Customer Segmentation

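In this notebook, we segment customers by clustering their purchase counts per department and per aisle, using K-Means (and DBSCAN for the aisle-level data), each with and without PCA.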

# Load Data and Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the combined, cleansed dataset that every segmentation below is built from.
full_df = pd.read_csv(
    '../dataset/cleaned/combined_cleansed.csv',
)

In [None]:
# Dimensions of the loaded data as (rows, columns).
full_df.shape

# Segment By Department

In [None]:
# One row per user_id, one column per department; each cell counts how many
# rows of full_df pair that user with that department.
dept_segment = pd.crosstab(
    index=full_df['user_id'],
    columns=full_df['department'],
)

In [None]:
def find_optimum_clusters(df, k_range=range(2, 11), random_state=42):
    """Plot K-Means inertia against the number of clusters (elbow method).

    The features are standardised first and K-Means is fitted on the
    standardised values, so the curve is comparable with the clusters produced
    by ``kmeans_fit_and_visualise``, which also fits on standardised data.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Numeric feature matrix, e.g. a user x department crosstab.
    k_range : iterable of int, default ``range(2, 11)``
        Candidate numbers of clusters to evaluate.
    random_state : int or None, default 42
        Seed passed to ``KMeans`` so that re-running the cell reproduces the
        same curve. Pass ``None`` for a different initialisation on every run.

    Returns
    -------
    None
        The function only prints progress and draws the figure.
    """
    X_sc = StandardScaler().fit_transform(df)
    ks = list(k_range)

    wcss = []  # within-cluster sum of squares (KMeans.inertia_) for each k
    for k in ks:
        print('Running {} Clusters...'.format(k))
        km = KMeans(n_clusters=k, random_state=random_state)
        # Fit on the standardised matrix; fitting on the raw counts would let
        # the highest-volume columns dominate the distance computation.
        km.fit(X_sc)
        wcss.append(km.inertia_)

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(ks, wcss, 'bs-', markerfacecolor='red', markeredgecolor='red')
    ax.set_ylabel('Inertia', fontsize=20)
    ax.set_xlabel('Number of Clusters', fontsize=20)
    ax.set_title('Inertia over number of Clusters', fontsize=20)
    # One tick per evaluated k, independent of matplotlib's axis margins.
    ax.set_xticks(ks)

In [None]:
# Inertia curve for the department-level purchase counts.
find_optimum_clusters(df=dept_segment)

3, 4 and 5 clusters look like the best candidates on the inertia plot above.

In [None]:
def kmeans_fit_and_visualise(df, x, y, num_cluster, random_state=42):
    """Cluster standardised features with K-Means and scatter-plot two of them.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation; every column is used as a numeric feature.
    x, y : column labels of ``df``
        Features drawn on the horizontal and vertical axes. The plotted
        values are the standardised ones, not the raw inputs.
    num_cluster : int
        Number of clusters to fit.
    random_state : int or None, default 42
        Seed passed to ``KMeans`` so that re-running the cell reproduces the
        same assignment. Pass ``None`` for a fresh initialisation each run.

    Returns
    -------
    None
        The function only draws the figure; ``df`` is not modified.
    """
    X_sc = StandardScaler().fit_transform(df)
    km = KMeans(n_clusters=num_cluster, tol=0.0001, max_iter=1000, n_init=20,
                random_state=random_state)
    # labels_ already holds the assignment of the training rows, so a separate
    # predict() pass over the same data is not needed.
    km.fit(X_sc)

    # Keep the caller's row index so rows stay traceable to their source.
    scaled = pd.DataFrame(X_sc, columns=df.columns, index=df.index)
    scaled['cluster'] = km.labels_

    colors = ["red", "green", "blue", 'orange', 'black', 'yellow', 'brown', 'purple', 'grey', 'coral']
    # Cycle through the palette so num_cluster > len(colors) cannot raise IndexError.
    scaled['color'] = scaled['cluster'].map(lambda p: colors[p % len(colors)])

    # Draw on an explicit 7x7 axes; calling plt.figure() and then DataFrame.plot()
    # without ax= would leave an empty figure behind and ignore the size.
    fig, ax = plt.subplots(figsize=(7, 7))
    scaled.plot(kind="scatter", x=x, y=y, c=scaled['color'], ax=ax)

In [None]:
# Department counts, 3 clusters, shown on the pantry / dairy eggs plane.
kmeans_fit_and_visualise(df=dept_segment, x='pantry', y='dairy eggs', num_cluster=3)

In [None]:
# Department counts, 4 clusters, shown on the pantry / dairy eggs plane.
kmeans_fit_and_visualise(df=dept_segment, x='pantry', y='dairy eggs', num_cluster=4)

In [None]:
# Department counts, 5 clusters, shown on the pantry / dairy eggs plane.
kmeans_fit_and_visualise(df=dept_segment, x='pantry', y='dairy eggs', num_cluster=5)

## With PCA

In [None]:
# Project the department counts onto 4 principal components and wrap the
# result in a DataFrame whose columns are the component numbers 0..3.
pca = PCA(n_components=4)
dept_segment_pca = pd.DataFrame(pca.fit_transform(dept_segment))

In [None]:
# Running total of the share of variance captured by the fitted components.
var_exp = pca.explained_variance_ratio_
cum_var_exp = var_exp.cumsum()
print('Cumulative explained variance: {}'.format(cum_var_exp.round(3)))

# 4 principal components already explain 91% of the variance

In [None]:
# Inertia curve for the PCA-reduced department data.
find_optimum_clusters(df=dept_segment_pca)

3, 4 and 5 clusters again look like the best candidates.

In [None]:
# PCA-reduced department data, 3 clusters, components 0 and 3 on the axes.
kmeans_fit_and_visualise(df=dept_segment_pca, x=0, y=3, num_cluster=3)

In [None]:
# PCA-reduced department data, 4 clusters, components 0 and 3 on the axes.
kmeans_fit_and_visualise(df=dept_segment_pca, x=0, y=3, num_cluster=4)

In [None]:
# PCA-reduced department data, 5 clusters, components 0 and 3 on the axes.
kmeans_fit_and_visualise(df=dept_segment_pca, x=0, y=3, num_cluster=5)

Not run: `kmeans_fit_and_visualise(dept_segment_pca, 0, 5, 7)` — with only 4 principal components (columns 0–3) there is no component 5 to plot.

# Segment By Aisle

In [None]:
# One row per user_id, one column per aisle; each cell counts how many rows
# of full_df pair that user with that aisle.
aisle_segment = pd.crosstab(
    index=full_df['user_id'],
    columns=full_df['aisle'],
)

In [None]:
# Inertia curve for the aisle-level purchase counts.
find_optimum_clusters(df=aisle_segment)

3, 4 and 5 clusters look like the best candidates.

In [None]:
# Aisle counts, 3 clusters, shown on the fresh fruits / instant foods plane.
kmeans_fit_and_visualise(df=aisle_segment, x='fresh fruits', y='instant foods', num_cluster=3)

In [None]:
# Aisle counts, 4 clusters, shown on the fresh fruits / instant foods plane.
kmeans_fit_and_visualise(df=aisle_segment, x='fresh fruits', y='instant foods', num_cluster=4)

In [None]:
# Aisle counts, 5 clusters, shown on the fresh fruits / instant foods plane.
kmeans_fit_and_visualise(df=aisle_segment, x='fresh fruits', y='instant foods', num_cluster=5)

## With PCA

In [None]:
# Project the aisle counts onto 11 principal components (columns 0..10) and
# report how much variance the components capture cumulatively.
pca = PCA(n_components=11)
aisle_segment_pca = pd.DataFrame(pca.fit_transform(aisle_segment))
var_exp = pca.explained_variance_ratio_
cum_var_exp = var_exp.cumsum()
print('Cumulative explained variance: {}'.format(cum_var_exp.round(3)))

# 11 components can explain 80% of the variance

__Previously 7 components were used.__

In [None]:
# Inertia curve for the PCA-reduced aisle data.
find_optimum_clusters(df=aisle_segment_pca)

3, 4 and 5 clusters look best.

In [None]:
# PCA-reduced aisle data, 3 clusters, components 0 and 4 on the axes.
kmeans_fit_and_visualise(df=aisle_segment_pca, x=0, y=4, num_cluster=3)

In [None]:
# PCA-reduced aisle data, 4 clusters, components 0 and 4 on the axes.
kmeans_fit_and_visualise(df=aisle_segment_pca, x=0, y=4, num_cluster=4)

In [None]:
# PCA-reduced aisle data, 5 clusters, components 0 and 4 on the axes.
kmeans_fit_and_visualise(df=aisle_segment_pca, x=0, y=4, num_cluster=5)

In [None]:
# Peek at the first five rows of the user x aisle count table.
aisle_segment.head(n=5)

## DBSCAN

In [None]:
def dbscan_fit_and_visualise(df, x, y, filename, eps):
    """Cluster standardised features with DBSCAN, plot two of them and save the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation; every column is used as a numeric feature.
    x, y : column labels of ``df``
        Features drawn on the horizontal and vertical axes. The plotted
        values are the original (unscaled) ones.
    filename : str
        Base name of the PNG written to ``../plots/``.
    eps : float
        DBSCAN neighbourhood radius, measured in the standardised feature space.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns: ``cluster`` (DBSCAN label,
        -1 for noise) and ``color`` (the colour used in the plot).
    """
    import os  # local import: only needed to make sure the output directory exists

    df = df.copy()
    X_sc = StandardScaler().fit_transform(df)
    dbscan = DBSCAN(eps=eps)
    dbscan.fit(X_sc)
    labels = dbscan.labels_
    df['cluster'] = labels
    print(set(labels))

    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels, which happens when eps is so large that
    # everything merges, or so small that everything is noise.
    # Noise points (-1) are scored as a group of their own, as before.
    n_labels = len(set(labels))
    if 1 < n_labels < len(labels):
        print(silhouette_score(X_sc, labels))
    else:
        print('Silhouette score is undefined for {} distinct label(s)'.format(n_labels))

    colors = ["red", "green", "blue", 'orange', 'brown', 'yellow', 'black', 'purple']
    noise_color = 'lightgrey'
    # Give noise its own colour instead of letting -1 index the last palette
    # entry, and cycle the palette so any number of clusters can be drawn.
    df['color'] = df['cluster'].map(
        lambda p: noise_color if p == -1 else colors[p % len(colors)])

    # Draw on an explicit axes so no empty figure is left behind and the saved
    # figure is guaranteed to be the one plotted here.
    fig, ax = plt.subplots(figsize=(10, 8))
    df.plot(kind="scatter", x=x, y=y, c=df['color'], ax=ax)
    os.makedirs('../plots', exist_ok=True)
    fig.savefig('../plots/{}.png'.format(filename))

    return df

In [None]:
# determine the best epsilon
# Plot every point's distance to its nearest other point, in ascending order,
# on the standardised PCA scores.
sc = StandardScaler()
ccc = sc.fit_transform(aisle_segment_pca)

neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(ccc)
distances, indices = nbrs.kneighbors(ccc)
# Column 0 is each point's distance to itself; column 1 is the nearest other point.
distances = np.sort(distances, axis=0)[:, 1]
plt.plot(distances)

In [None]:
# First 50,000 rows of the PCA-reduced aisle data.
zzz = aisle_segment_pca.iloc[:50000]

In [None]:
# DBSCAN on the PCA subset, plotted on components 0 and 4.
db_scan_aisle_pca = dbscan_fit_and_visualise(
    df=zzz, x=0, y=4, filename='dbscan_with_pca', eps=3.5)

In [None]:
# Same nearest-neighbour distance curve, this time on the standardised
# aisle counts without PCA.
sc = StandardScaler()
ddd = sc.fit_transform(aisle_segment)

neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(ddd)
distances, indices = nbrs.kneighbors(ddd)
# Column 0 is each point's distance to itself; column 1 is the nearest other point.
distances = np.sort(distances, axis=0)[:, 1]
plt.plot(distances)

In [None]:
# Same first 50,000 users as the PCA run (`zzz`) so the two DBSCAN results are comparable.
db_scan_aisle_no_pca = dbscan_fit_and_visualise(aisle_segment[:50000], 'fresh vegetables', 'instant foods', 'dbscan_without_pca', 100)

## Product Segmentation

## Testing

## Test Visualisation

# Segment By Product

# Segment By Recency

# Segment By Frequency

# Segment By Monetary - No Price