In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Libraries required for project

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

## About Dataset

This case study requires developing a customer segmentation in order to define a marketing strategy. The
sample dataset summarizes the usage behavior of about 9000 active credit card holders during the last 6 months. The file is at the customer level, with 18 behavioral variables.

Following is the Data Dictionary for Credit Card dataset :-

CUSTID : Identification of Credit Card holder (Categorical)


BALANCE : Balance amount left in their account to make purchases

BALANCEFREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)


PURCHASES : Amount of purchases made from account


ONEOFFPURCHASES : Maximum purchase amount done in one-go


INSTALLMENTSPURCHASES : Amount of purchase done in installment

CASHADVANCE : Cash in advance given by the user

PURCHASESFREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)

ONEOFFPURCHASESFREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)

PURCHASESINSTALLMENTSFREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)

CASHADVANCEFREQUENCY : How frequently the cash in advance is being paid

CASHADVANCETRX : Number of transactions made with "Cash in Advance"

PURCHASESTRX : Number of purchase transactions made

CREDITLIMIT : Limit of Credit Card for user

PAYMENTS : Amount of Payment done by user

MINIMUM_PAYMENTS : Minimum amount of payments made by user

PRCFULLPAYMENT : Percent of full payment paid by user

TENURE : Tenure of credit card service for user

In [3]:
# Load the credit-card usage data from the Kaggle input directory.
data_set = pd.read_csv(filepath_or_buffer='/kaggle/input/ccdata/CC GENERAL.csv')

## Understanding Problem Statement/ Requirement


In this project we have a dataset describing customers' credit card usage through various attributes. We are expected to cluster the users into distinct groups so that each group of customers can be targeted accordingly.

### Observing the dataset

In [4]:
# Preview the first five rows (five is the default for head()).
data_set.head()

In [5]:
# (rows, columns) of the loaded frame.
data_set.shape

In [6]:
# Distinct customer identifiers. `unique` is a method and must be called;
# without the parentheses the cell only displays the bound-method repr.
data_set.CUST_ID.unique()

In [7]:
# Print the number of columns, then the count of distinct values in each column.
print(len(data_set.columns))
for column_name in data_set.columns:
    distinct_count = len(pd.unique(data_set[column_name]))
    print(f'{column_name} ===== {distinct_count}')

In [8]:
# Number of distinct values in the BALANCE column.
pd.unique(data_set.BALANCE).size

In [9]:
# Count the missing values per column so they can be handled before modelling.
data_set.isna().sum()

In [10]:
data_set.info() # check the data type reported for each column

In [11]:
data_set.describe()  # summary statistics of all numeric columns, to plan the preprocessing

In [12]:
# Minimum, maximum and mean of the BALANCE column, displayed as a tuple.
data_set.BALANCE.min(),data_set.BALANCE.max(),data_set.BALANCE.mean() 

In [13]:
# Keep only the rows that have a missing value in at least one column.
df1 = data_set.loc[data_set.isna().any(axis='columns')]

In [14]:
# Display the rows that contain at least one null value.
df1

In [15]:
# Find the customer(s) whose ONEOFF_PURCHASES value is the largest in the dataset.
data_set.loc[data_set.ONEOFF_PURCHASES.eq(data_set.ONEOFF_PURCHASES.max())]

In [16]:
# Find the customer(s) whose CASH_ADVANCE value is the largest in the dataset.
data_set.loc[data_set.CASH_ADVANCE.eq(data_set.CASH_ADVANCE.max())]

## Visualisation Of dataset

In [17]:
# Visualise where the null values sit: one heatmap cell per (row, column) entry.
sns.heatmap(data=data_set.isna(), cmap='Blues', cbar=False, yticklabels=False)

### Preprocessing: replacing the null values, since it is very important to deal with them before feeding the data to an ML algorithm.

1. Dropping the rows (can cause a loss in accuracy if the number of rows with null values is considerable)
2. Replacing the missing entry with the most frequently occurring value
3. Replacing the missing entry with the mean or median of that particular column

In [18]:
# Alternative (disabled) way to impute MINIMUM_PAYMENTS with its median using .loc:
# data_set.loc[(data_set['MINIMUM_PAYMENTS'].isnull()==True),'MINIMUM_PAYMENTS']=data_set['MINIMUM_PAYMENTS'].median()

In [19]:
# Box plot of MINIMUM_PAYMENTS to inspect its spread and outliers.
# The series is passed explicitly as `x` (the meaning of the first positional
# argument changed between seaborn releases) and drawn on the axes created here.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data_set.MINIMUM_PAYMENTS, ax=ax)
ax.set_title('MINIMUM_PAYMENTS')

**There are a lot of outliers, so the null values are replaced with the median (which, unlike the mean, is not distorted by outliers).**

In [20]:
# Impute missing MINIMUM_PAYMENTS entries with the column median.
data_set['MINIMUM_PAYMENTS'] = data_set['MINIMUM_PAYMENTS'].fillna(
    value=data_set['MINIMUM_PAYMENTS'].median()
)

In [22]:
# Show the rows whose CREDIT_LIMIT is missing.
data_set[data_set['CREDIT_LIMIT'].isna()]

In [23]:
# Impute missing CREDIT_LIMIT entries with the column median.
data_set['CREDIT_LIMIT'] = data_set['CREDIT_LIMIT'].fillna(
    value=data_set['CREDIT_LIMIT'].median()
)

In [24]:
# Re-inspect the column summary after filling the missing values.
data_set.info()

In [25]:
# Confirm the per-column null counts after imputation.
data_set.isna().sum()

#### Checking for duplicate rows in the dataset

In [26]:
# Number of fully duplicated rows.
data_set.duplicated().sum()

In [27]:
# Remove the identifier column before modelling.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps the
# cell idempotent: re-running it no longer raises KeyError once CUST_ID is gone.
data_set = data_set.drop(columns='CUST_ID', errors='ignore')

In [28]:
# Display the frame after CUST_ID has been removed.
data_set

### Plotting Dist Plot
It is a combination of a Matplotlib histogram (`hist`) and seaborn's KDE plot.


KDE = Kernel Density Estimation

KDE is used to visualise the probability density of the continuous variable

In [29]:
# One distribution panel (density histogram + KDE curve) per feature.
# `sns.distplot` is deprecated, so the histogram and the KDE are drawn with
# `histplot` and `kdeplot` on the same axes. The number of panels follows the
# number of columns instead of a hard-coded 17.
n_features = len(data_set.columns)
fig, axes = plt.subplots(n_features, 1, figsize=(10, 3 * n_features), squeeze=False)
for ax, column in zip(axes[:, 0], data_set.columns):
    # bins=50 mirrors distplot's upper bound on the automatic bin count.
    sns.histplot(data_set[column], stat="density", bins=50, color="g", ax=ax)
    sns.kdeplot(data_set[column], color="b", linewidth=3, label="KDE", ax=ax)
    ax.set_title(column)

# Prevent titles and tick labels of neighbouring panels from overlapping.
fig.tight_layout()

## Correlation matrix between features

In [30]:
# Annotated heatmap of the pairwise correlations between all features.
plt.figure(figsize=(12, 10))
feature_correlations = data_set.corr()
sns.heatmap(feature_correlations, annot=True)

**Elbow method**

  Before feeding data to any ML algorithm, it is necessary to normalise (scale) the data.

In [31]:
# Standardise every feature; the fitted scaler is kept so the transformation
# can be inverted later.
scaler = StandardScaler().fit(data_set)
dataset_scaled = scaler.transform(data_set)

In [32]:
# Element dtype of the scaled array.
dataset_scaled.dtype

In [33]:
dataset_scaled  # the scaled dataset

In [None]:
# Elbow method: fit K-Means for k = 1..19 and record the inertia
# (within-cluster sum of squares) for each k, so the best k can be chosen.
# random_state makes the curve reproducible across runs; n_init is set
# explicitly so the result does not depend on the scikit-learn default.
scores = []

for i in range(1, 20):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(dataset_scaled)
    scores.append(kmeans.inertia_)
    


### Wherever the elbow forms, it represents the optimum number of clusters to use

In [35]:
# Plot inertia against the actual number of clusters. scores[0] belongs to
# k=1, so plotting the bare list would shift the x-axis by one.
plt.plot(range(1, len(scores) + 1), scores, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow method')
plt.show()


Below, just as a check, the elbow method is applied to only the first 8 columns of the dataset.

In [None]:
# Elbow method restricted to the first 8 scaled features: fit K-Means for
# k = 1..19 and record the inertia for each k.
# random_state / n_init are fixed so the curve is reproducible.
score = []

for i in range(1, 20):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(dataset_scaled[:, :8])
    score.append(kmeans.inertia_)

# score[0] belongs to k=1, so the x values are given explicitly.
plt.plot(range(1, len(score) + 1), score, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow method (first 8 features)')
plt.show()

# applying k means algorithm

In [36]:
# Final model: 7 clusters on the full scaled dataset.
# A fixed random_state keeps cluster ids and centers stable between runs, so
# the interpretation written below stays valid after Restart & Run All.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)
kmeans.fit(dataset_scaled)

# Cluster id assigned to every row of dataset_scaled.
label = kmeans.labels_

In [38]:
# Shape of the fitted cluster-center array.
kmeans.cluster_centers_.shape

This means there are 7 clusters, i.e. 7 centers, and each center has 17 columns.

making data frame of these clusters

In [47]:
# Cluster centers (in scaled units) as a DataFrame labelled with the feature names.
# The Index is passed directly: wrapping it in a list would create MultiIndex columns.
cluster_centers = pd.DataFrame(data=kmeans.cluster_centers_, columns=data_set.columns)

In [48]:
# Display the cluster centers.
cluster_centers

But since we normalised the data, the cluster centers can no longer be interpreted in real-world terms. To make them understandable, we have to convert them back to the original scale used before normalisation.

### Inverse transforming cluster centers

In [49]:
# Map the cluster centers back to the original feature units.
# The centers are read from the fitted model rather than from the previous
# `cluster_centers` value, so re-running this cell cannot apply the inverse
# transform twice. Column labels are passed as a plain Index (no MultiIndex).
cluster_centers = pd.DataFrame(
    data=scaler.inverse_transform(kmeans.cluster_centers_),
    columns=data_set.columns,
)

cluster_centers

Here domain knowledge plays an important role in generating labels.

For example:
let's say new customers have a shorter tenure period, so one class will be generated from there.

Similarly, VIP customers have a higher credit limit, so there will be one class for them.

Another class can consist of customers who have higher one-off payments.

So domain knowledge plays an important role (not necessarily a decisive one, as sometimes a little bit of input is enough).

In [52]:
# Shape of the label array and the smallest / largest cluster id it contains.
label.shape,label.min(),label.max()


Predicting labels for the dataset

In [53]:
# Assign each row to its nearest center using the already-fitted model.
# `fit_predict` would refit K-Means, which can change the centers and the
# cluster numbering relative to `label` and the centers inspected above.
y = kmeans.predict(dataset_scaled)
y

Attaching the labels to the dataset, i.e. concatenating the labels to the original DataFrame

In [54]:
# Attach the cluster id of every row as a new 'cluster' column.
# `assign` places the labels positionally, so the result does not depend on
# data_set having a default 0..n-1 index (an index-aligned concat would
# introduce NaNs otherwise).
data_set_final_data = data_set.assign(cluster=label)

data_set_final_data

## Visualisation of clusters formed

We can use histograms or PCA.

Here we will first plot histograms and then plot using PCA.


In [55]:
# For every feature, draw one histogram per cluster side by side, showing how
# the values of that feature are distributed within each cluster.
# The cluster ids are taken from the data instead of a hard-coded range(7),
# so the cell keeps working if the number of clusters is changed.
cluster_ids = sorted(data_set_final_data['cluster'].unique())

for column in data_set.columns:
    fig, axes = plt.subplots(
        1, len(cluster_ids), figsize=(5 * len(cluster_ids), 5), squeeze=False
    )
    for ax, cluster_id in zip(axes[0], cluster_ids):
        members = data_set_final_data[data_set_final_data['cluster'] == cluster_id]
        members[column].hist(bins=20, ax=ax)
        ax.set_title(f'{column} | cluster {cluster_id}')
    plt.show()

To understand the output we will apply PCA,
that is, convert this higher-dimensional data to 2 dimensions for easy visualisation.

In [57]:
# Project the scaled features onto their first two principal components.
pca = PCA(n_components = 2)
princi_comp = pca.fit_transform(dataset_scaled)
# Display the projected array.
princi_comp

In [58]:
# Wrap the two principal components in a DataFrame with descriptive column names.
component_names = ['Principal_Comp1', 'Principal_Comp2']
pca_df = pd.DataFrame(princi_comp, columns=component_names)

In [59]:
# Display the principal-component frame.
pca_df

In [60]:
# Add the cluster id of every row to the principal-component frame.
# Only the two component columns are kept before assigning, so re-running
# this cell replaces the 'cluster' column instead of appending a duplicate.
pca_df = pca_df[['Principal_Comp1', 'Principal_Comp2']].assign(cluster=label)
pca_df

In [61]:
# Scatter plot of the two principal components, one colour per cluster.
cluster_palette = ['red', 'green', 'blue', 'yellow', 'pink', 'orange', 'purple']
plt.figure(figsize=(10, 10))
ax = sns.scatterplot(
    data=pca_df,
    x="Principal_Comp1",
    y="Principal_Comp2",
    hue="cluster",
    palette=cluster_palette,
)
plt.show()