<a href="https://colab.research.google.com/github/JoshRogers18/Unsupervised-Bank-Customer-Segmentation/blob/main/Unsupervised_Bank_Segmenting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load the bank marketing data set.
# You may need to re-upload the data into the files (on Colab the CSV has to be
# present in the session's file area before this cell is run).
df = pd.read_csv('marketing_data.csv')

# Only the last expression of a cell is rendered automatically, so the preview
# is passed to display() explicitly; df.info() prints its report by itself and
# df.describe() stays last so it is rendered as the cell output.
display(df.head())
df.info()
df.describe()

# This is data from a bank about its customers over the course of 6 months
# Includes transaction frequency, amount, tenure, etc.
# Goal: Use ML in order to launch a targeted marketing ad campaign tailored to specific segments
# Keywords: Marketing Segmentation
# Want to divide customers into new customers, customers who use credit cards for transactions only,
# customers who use their cards for loans, and customers who are increasing their credit limit

In [None]:
# CUST_ID: Identification of Credit Card holder
# BALANCE: Balance amount left in customer's account to make purchases
# BALANCE_FREQUENCY: How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
# PURCHASES: Amount of purchases made from account
# ONEOFF_PURCHASES: Maximum purchase amount done in one-go
# INSTALLMENTS_PURCHASES: Amount of purchase done in installment
# CASH_ADVANCE: Cash in advance given by the user
# PURCHASES_FREQUENCY: How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
# ONEOFF_PURCHASES_FREQUENCY: How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
# PURCHASES_INSTALLMENTS_FREQUENCY: How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
# CASH_ADVANCE_FREQUENCY: How frequently the cash in advance is being paid
# CASH_ADVANCE_TRX: Number of Transactions made with "Cash in Advance"
# PURCHASES_TRX: Number of purchase transactions made
# CREDIT_LIMIT: Limit of Credit Card for user
# PAYMENTS: Amount of Payment done by user
# MINIMUM_PAYMENTS: Minimum amount of payments made by user  
# PRC_FULL_PAYMENT: Percent of full payment paid by user
# TENURE: Tenure of credit card service for user

In [None]:
# See how many missing values there are in the data and impute them with the
# column average.
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='Reds')
# Shown explicitly because it is not the last expression of the cell.
display(df.isnull().sum())

# Every column is filled from its OWN mean. CREDIT_LIMIT must not borrow the
# MINIMUM_PAYMENTS mean: the two features are different quantities.
for column in ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT'):
    df[column] = df[column].fillna(df[column].mean())

# Number of fully duplicated rows (rendered as the cell output).
df.duplicated().sum()

In [None]:
# CUST_ID only identifies the card holder and is not a feature. Dropping with
# errors='ignore' and rebinding df keeps the cell re-runnable (an in-place drop
# raises KeyError the second time the cell is executed).
df = df.drop(columns='CUST_ID', errors='ignore')

# KDE demonstrates the probability density at different values in a continuous variable.
# One subplot row per remaining column, so the grid follows the data instead of
# relying on a hard-coded row count.
n_features = len(df.columns)
plt.figure(figsize=(10,50))
for position, column in enumerate(df.columns, start=1):
  plt.subplot(n_features, 1, position)
  # NOTE(review): sns.distplot and the 'bw' KDE keyword are deprecated in
  # seaborn >= 0.11; they are kept so the figure does not change on the
  # installed version. Migrate to histplot(..., kde=True) when upgrading.
  sns.distplot(df[column], kde_kws={'color':'b', 'lw':3, 'label':'KDE', 'bw':0.1}, hist_kws={'color':'g'})
  plt.title(column)

plt.tight_layout()

In [None]:
# Annotated correlation matrix of all features. np.triu supplies the mask, so
# only the cells where that mask is zero (strictly below the diagonal) are drawn.
fig, ax = plt.subplots(figsize=(20, 10))
correlations = df.corr()
sns.heatmap(correlations, annot=True, mask=np.triu(correlations), ax=ax)

In [None]:
# Fit the scaler on the feature frame, then produce the standardised matrix
# that the clustering and PCA cells work on. The fitted scaler is kept so the
# scaling can be inverted later.
scaler = StandardScaler().fit(df)
df1 = scaler.transform(df)

In [None]:
# Elbow method: fit K-Means for k = 1..9 and record the inertia of every fit.
# Looking at the chart below, we can see that the elbow lands at about 4 clusters
distortions = []
K = range(1,10)
for k in K:
    # random_state and n_init are pinned so the curve is the same on every run
    # and does not depend on the installed scikit-learn version's default
    # number of restarts.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeanModel.fit(df1)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(12,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Fit the final model with 4 clusters. Its centres are in standardised units,
# which are difficult to read directly.
# random_state is fixed because K-Means numbers its clusters arbitrarily:
# without a seed the label -> segment mapping can change between runs, so any
# written per-cluster interpretation would stop matching the output. n_init is
# pinned so the result does not depend on the scikit-learn version's default.
# NOTE(review): after seeding, confirm once that the cluster numbering still
# matches the cluster descriptions written elsewhere in this notebook.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(df1)
labels = kmeans.labels_

In [None]:

# Cluster centres in standardised units, one row per cluster.
# df.columns is passed directly: wrapping it in a list ([df.columns]) makes
# pandas build a one-level MultiIndex instead of plain column labels, which
# breaks ordinary column selection such as cluster_centers['BALANCE'].
cluster_centers = pd.DataFrame(data=kmeans.cluster_centers_, columns=df.columns)
cluster_centers

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,1.459578,0.384753,-0.234638,-0.163914,-0.253747,1.688972,-0.504848,-0.212939,-0.450201,1.745948,1.617143,-0.28358,0.839032,0.603821,0.49091,-0.406367,-0.097146
1,-0.265552,-0.368944,-0.34319,-0.2305,-0.387798,-0.182691,-0.797823,-0.389437,-0.714246,-0.1015,-0.164607,-0.474987,-0.334538,-0.26206,-0.119249,-0.258866,-0.052972
2,-0.321688,0.242574,0.109044,0.000926,0.255904,-0.366373,0.983721,0.317153,0.874138,-0.462599,-0.360303,0.296985,-0.077182,-0.138502,-0.091844,0.395041,0.057744
3,0.954485,0.462694,3.125845,2.713251,2.40647,-0.155091,1.136338,1.798653,1.065918,-0.319096,-0.170458,3.003251,1.429914,1.919096,0.477421,0.454703,0.32414


In [None]:
# Undo the standardisation so the cluster centres are expressed in the original
# feature units. The centres are read from the fitted model rather than from
# the existing cluster_centers variable, so re-running this cell cannot apply
# the inverse transform a second time. df.columns is passed directly so the
# result has plain column labels rather than a one-level MultiIndex.
cluster_centers = pd.DataFrame(
    data=scaler.inverse_transform(kmeans.cluster_centers_),
    columns=df.columns,
)
cluster_centers

# First cluster  : (most lucrative) who use credit card as a loan, customers with the highest balance and cash advance, low percentage of full payment (3%), high cash advance frequency (0.5)
# Second cluster : Customers who pay a low amount of interest charges and are careful with their money, since they have the lowest purchase frequency, installment purchases and credit limit
# Third cluster  : Cluster with lowest balance and cash advance
# Fourth cluster : highest credit limit and highest percentage of full payment, target for increase credit limit and increase spending habits since frequent purchases are happening in one-go

In [None]:
# Append the cluster label of every row as an extra 'cluster' column; the
# labels are aligned with the feature frame on the row index.
cluster_column = pd.Series(labels, name='cluster')
df_cluster = pd.concat([df, cluster_column], axis=1)
df_cluster.head()



In [None]:
# One figure per feature with one panel per cluster: comparing the per-cluster
# histograms side by side helps confirm that the clusters differ.
for feature in df.columns:
  fig, axes = plt.subplots(1, 4, figsize=(35, 5))
  for cluster_id, ax in enumerate(axes):
    members = df_cluster[df_cluster['cluster'] == cluster_id]
    members[feature].hist(bins=20, ax=ax)
    ax.set_title('{}    \nCluster {} '.format(feature, cluster_id))

  plt.show()

In [None]:
# Project the standardised features onto their first two principal components
# so the clusters can be drawn in 2-D. random_state is fixed because
# scikit-learn's automatic solver choice may pick a randomized SVD for this
# input, in which case the components could vary slightly between runs.
pca = PCA(n_components=2, random_state=42)
principal_comp = pca.fit_transform(df1)
principal_comp

In [None]:
# Put the two principal components into a DataFrame with named columns.
pca_df = pd.DataFrame({
    'pca1': principal_comp[:, 0],
    'pca2': principal_comp[:, 1],
})
pca_df.head()

In [None]:
# Combine the two principal components with the cluster labels.
# The frame is rebuilt from principal_comp instead of concatenating onto the
# existing pca_df, so re-running this cell cannot append a second 'cluster'
# column.
pca_df = pd.DataFrame(data=principal_comp, columns=['pca1', 'pca2']).assign(cluster=labels)
pca_df.head()

In [None]:
# Scatter the customers in principal-component space, coloured by cluster, to
# see all clusters together and where they separate.
cluster_palette = ['red', 'green', 'blue', 'purple']
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=pca_df, x="pca1", y="pca2", hue="cluster", palette=cluster_palette, ax=ax)
plt.show()

In [None]:
# Performed data viz, fixed missing values
# corrplot
# applied kmeans to better understand customer segmentation
# able to plot histogram distributions of all the various clusters
# Used PCA to convert our original data into a component space and be able to visualize the different clusters in that way