In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [32]:
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Lift the column and row limits so displayed DataFrames are never truncated.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from matplotlib import colors

In [3]:
# The file is tab-separated despite its .csv extension.
df = pd.read_csv(
    "/kaggle/input/customer-personality-analysis/marketing_campaign.csv",
    sep="\t",
)

In [4]:
# Preview the first rows of the freshly loaded data.
df.head()

In [5]:
# Report the (rows, columns) shape of the loaded data.
frame_shape = df.shape
print("Shape of the DataFrame is :", frame_shape)

In [6]:
# Summary statistics of the loaded data.
df.describe()

# Data Preprocessing

## Handling Missing Values

In [7]:
# Missing-value count per column, largest first.
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False)

The `Income` column contains some missing values.

In [8]:
# missingno is imported here, at its only point of use, rather than with the other imports.
import missingno as mn
# Draw missingno's matrix plot for df to show where values are missing.
mn.matrix(df)

In [9]:
# Impute the missing Income entries with the column median.
income_median = df['Income'].median()
df['Income'] = df['Income'].fillna(income_median)

In [10]:
# Per-column flag: True where a column still contains at least one missing value.
df.isna().any()

In [11]:
# Number of distinct values in each column; a count of 1 marks a constant column.
df.nunique()

In the cell above, "Z_CostContact" and "Z_Revenue" each hold the same value in every row, so they cannot contribute anything to model building. We can therefore drop them.

In [12]:
# Remove the two columns that carry a single constant value.
constant_columns = ["Z_CostContact", "Z_Revenue"]
df = df.drop(columns=constant_columns)
df.head()

In [13]:
# from sklearn.preprocessing import StandardScaler 
# mms = StandardScaler()
# mms.fit(df)
# normalized_data = mms.transform(df)

In [14]:
# df.info() prints its report itself and returns None, so it is called directly
# instead of being passed to print(), which would append a stray "None" to the output.
print("Summary of the DataFrame:")
df.info()

Education and Marital_Status are categorical columns, while Dt_Customer holds dates that the next cell converts to datetime.

In [15]:

# NOTE(review): the raw Dt_Customer strings appear to be day-first (dd-mm-yyyy) — confirm
# against the df.head() output. dayfirst=True keeps a value such as "04-09-2012" from
# being read as April 9 instead of September 4.
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], dayfirst=True)


In [16]:
# Age relative to a fixed reference year.
REFERENCE_YEAR = 2021
df["Age"] = REFERENCE_YEAR - df["Year_Birth"]

# Total amount spent, added up over the six product categories.
amount_columns = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]
df["Spent"] = sum(df[column] for column in amount_columns)

# Number of children (kids plus teenagers) living in the household.
df["Children"] = df["Kidhome"].add(df["Teenhome"])

# 1 when the household has at least one child, otherwise 0.
df["Is_Parent"] = np.where(df["Children"].gt(0), 1, 0)

# Collapse the education levels into two groups.
education_groups = {
    "PhD": "Post Graduate",
    "2n Cycle": "Post Graduate",
    "Graduation": "Post Graduate",
    "Master": "Post Graduate",
    "Basic": "Under Graduate",
}
df["Education"] = df["Education"].replace(education_groups)

# Collapse the marital statuses into "Relationship" and "Single".
marital_groups = {
    "Married": "Relationship",
    "Together": "Relationship",
    "Divorced": "Single",
    "Widow": "Single",
    "Alone": "Single",
    "YOLO": "Single",
    "Absurd": "Single",
}
df["Marital_Status"] = df["Marital_Status"].replace(marital_groups)

In [17]:
# Number of the five campaigns whose offer the customer accepted.
campaign_columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
df['TotalAcceptedCmp'] = sum(df[column] for column in campaign_columns)

# Total purchases summed over the web, catalog, store and deals columns.
purchase_columns = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumDealsPurchases']
df['NumTotalPurchases'] = sum(df[column] for column in purchase_columns)


In [18]:
# Correlation matrix over the numeric columns only. At this point the frame still holds
# the string columns (Education, Marital_Status) and the datetime Dt_Customer, which make
# a bare df.corr() raise on pandas >= 2.0 instead of being silently skipped.
corrmat = df.select_dtypes(include="number").corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corrmat, annot=True, center=0)

In [19]:
# Collect the names of all columns whose dtype is object.
object_cols = [name for name, dtype in df.dtypes.items() if dtype == 'object']

print("Categorical variables in the dataset:", object_cols)

In [20]:
# Replace every object column with integer codes.
# The single encoder instance is re-fitted on each column in turn.
LE = LabelEncoder()
for column in object_cols:
    df[column] = LE.fit_transform(df[column])

print("All features are now numerical")

In [21]:
# Drop the identifier, the raw date, NumWebVisitsMonth, and the columns already folded
# into the engineered totals (TotalAcceptedCmp, NumTotalPurchases, Children, Spent)
# to reduce the dimensionality of the model input.
# NOTE(review): Year_Birth is kept even though Age is derived directly from it, so the
# same information enters scaling/PCA twice — confirm this is intended.
col_del = [
    "ID", "Dt_Customer",
    "AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5",
    "NumWebVisitsMonth",
    "NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases", "NumDealsPurchases",
    "Kidhome", "Teenhome",
    "MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts", "MntSweetProducts", "MntGoldProds",
]
df = df.drop(columns=col_del)
df.head()

In [22]:
# Standardise every column, then wrap the result back into a DataFrame that carries
# the original row index and column names.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.values)
sf_df = pd.DataFrame(data=scaled_features, columns=df.columns, index=df.index)

In [23]:
# Inspect the standardised frame that was just built; the unscaled df was already
# displayed right after the column drop.
sf_df.head()

# PCA

In [25]:
# Reduce the standardised features to three principal components.
pca = PCA(n_components=3).fit(sf_df)
component_scores = pca.transform(sf_df)
PCA_ds = pd.DataFrame(component_scores, columns=["col1", "col2", "col3"])
PCA_ds.describe().T

In [26]:
# Quick examination of the elbow method to choose the number of clusters.
# The estimator is seeded (same seed as the final KMeans fit) so the elbow curve is
# reproducible from one run to the next.
print('Elbow Method to determine the number of clusters to be formed:')
Elbow_M = KElbowVisualizer(KMeans(random_state=42), k=15)
Elbow_M.fit(PCA_ds)
Elbow_M.show()

# KMeans Clustering

In [27]:
# Fitting K-Means to the dataset.
# Only the three PCA component columns are passed to the model, so re-running this cell
# does not feed the previously stored "Clusters" labels back in as a fourth feature.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(PCA_ds[["col1", "col2", "col3"]])

# Attach the labels to the PCA frame, the scaled frame and the unscaled frame.
PCA_ds["Clusters"] = y_kmeans
sf_df["Clusters"] = y_kmeans
df["Clusters"] = y_kmeans

In [33]:
# Six-colour map used to colour the points of the 3D cluster plot.
cmap = colors.ListedColormap(
    ["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"]
)

In [34]:
# 3D scatter of the three PCA columns, coloured by cluster label.
x, y, z = (PCA_ds[axis_name] for axis_name in ("col1", "col2", "col3"))

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d", label="bla")
ax.scatter(
    x, y, z,
    s=40,
    c=PCA_ds["Clusters"],
    marker="o",
    cmap=cmap,
)
ax.set_title("The Plot Of The Clusters")

plt.show()

In [36]:
# Bar chart of how many customers fall into each cluster.
pal = ["#682F2F", "#B9C0C9", "#9F8A78", "#F3AB60"]
cluster_labels = sf_df["Clusters"]
pl = sns.countplot(x=cluster_labels, palette=pal)
pl.set_title("Distribution Of The Clusters")
plt.show()

In [61]:
# Scatter of income against spending, one colour and legend entry per cluster.
# The values come from sf_df, i.e. the StandardScaler output, so both axes are in
# standardised units rather than the raw values; the axis labels state this.
pal = ["#682F2F", "#B9C0C9", "#9F8A78", "#F3AB60"]
segment_names = ['Sensible Customers', 'Economical Customers', 'Good Customers', 'Careless Customers']

# One frame per cluster label 0..3, kept under the original df1..df4 names.
df1, df2, df3, df4 = (sf_df[sf_df.Clusters == cluster_id] for cluster_id in range(4))

plt.figure(figsize=(15, 8))
# Palette entry i and segment name i belong to cluster i.
for segment, colour, name in zip((df1, df2, df3, df4), pal, segment_names):
    plt.scatter(segment['Income'], segment.Spent, s=25, c=colour, label=name)

plt.title('Clusters of customers', fontsize=20)
plt.xlabel('Income (standardized)', fontsize=15)
plt.ylabel('Expenses (standardized)', fontsize=15)
plt.legend(fontsize=15)
plt.show()

Sensible Customers: Customers who have low income and low expenses.

Economical Customers: Customers who have average income and spend only according to their needs.

Good Customers: Customers who have high income and high expenses.

Careless Customers: Customers who have low income and high expenses.