In [100]:
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [101]:
path = ("Resources/Customers.csv")
main_df = pd.read_csv(path)
main_df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6


In [102]:
main_df.describe()

Unnamed: 0,CustomerID,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,48.96,110731.8215,50.9625,4.1025,3.7685
std,577.494589,28.429747,45739.536688,27.934661,3.922204,1.970749
min,1.0,0.0,0.0,0.0,0.0,1.0
25%,500.75,25.0,74572.0,28.0,1.0,2.0
50%,1000.5,48.0,110045.0,50.0,3.0,4.0
75%,1500.25,73.0,149092.75,75.0,7.0,5.0
max,2000.0,99.0,189974.0,100.0,17.0,9.0


In [103]:
#dropping the customer id column
main_df1 = main_df.drop(columns='CustomerID')
main_df1


Unnamed: 0,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,Male,19,15000,39,Healthcare,1,4
1,Male,21,35000,81,Engineer,3,3
2,Female,20,86000,6,Engineer,1,1
3,Female,23,59000,77,Lawyer,0,2
4,Female,31,38000,40,Entertainment,2,6
...,...,...,...,...,...,...,...
1995,Female,71,184387,40,Artist,8,7
1996,Female,91,73158,32,Doctor,7,7
1997,Male,87,90961,14,Healthcare,9,2
1998,Male,77,182109,4,Executive,7,2


In [104]:
#making the age range be 25-65
main_df2 = main_df1[(main_df1['Age'] >=25) & (main_df1['Age'] <=65)]
main_df2

Unnamed: 0,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
4,Female,31,38000,40,Entertainment,2,6
6,Female,35,31000,6,Healthcare,1,3
8,Male,64,97000,3,Engineer,0,3
9,Female,30,98000,72,Artist,1,4
11,Female,35,93000,99,Healthcare,4,4
...,...,...,...,...,...,...,...
1987,Male,63,59244,80,Artist,7,1
1988,Female,54,118944,77,Artist,4,4
1989,Female,47,75293,55,Doctor,6,7
1990,Female,30,166983,69,Artist,7,3


In [105]:
#making work experience greater than 0
main_df3 = main_df2[main_df2['Work Experience']>0]
main_df3.head()


Unnamed: 0,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
4,Female,31,38000,40,Entertainment,2,6
6,Female,35,31000,6,Healthcare,1,3
9,Female,30,98000,72,Artist,1,4
11,Female,35,93000,99,Healthcare,4,4
16,Female,35,29000,35,Homemaker,9,5


In [106]:
main_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 672 entries, 4 to 1993
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Gender                  672 non-null    object
 1   Age                     672 non-null    int64 
 2   Annual Income ($)       672 non-null    int64 
 3   Spending Score (1-100)  672 non-null    int64 
 4   Profession              660 non-null    object
 5   Work Experience         672 non-null    int64 
 6   Family Size             672 non-null    int64 
dtypes: int64(5), object(2)
memory usage: 42.0+ KB


In [107]:
main_df4=main_df3.dropna()
main_df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660 entries, 4 to 1993
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Gender                  660 non-null    object
 1   Age                     660 non-null    int64 
 2   Annual Income ($)       660 non-null    int64 
 3   Spending Score (1-100)  660 non-null    int64 
 4   Profession              660 non-null    object
 5   Work Experience         660 non-null    int64 
 6   Family Size             660 non-null    int64 
dtypes: int64(5), object(2)
memory usage: 41.2+ KB


In [108]:
cleaned_df=pd.get_dummies(main_df4,dtype=int)

In [109]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660 entries, 4 to 1993
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       660 non-null    int64
 1   Annual Income ($)         660 non-null    int64
 2   Spending Score (1-100)    660 non-null    int64
 3   Work Experience           660 non-null    int64
 4   Family Size               660 non-null    int64
 5   Gender_Female             660 non-null    int64
 6   Gender_Male               660 non-null    int64
 7   Profession_Artist         660 non-null    int64
 8   Profession_Doctor         660 non-null    int64
 9   Profession_Engineer       660 non-null    int64
 10  Profession_Entertainment  660 non-null    int64
 11  Profession_Executive      660 non-null    int64
 12  Profession_Healthcare     660 non-null    int64
 13  Profession_Homemaker      660 non-null    int64
 14  Profession_Lawyer         660 non-null   

In [110]:
"""scaled_data = StandardScaler().fit_transform(cleaned_df[["Annual Income ($)", "Spending Score (1-100)", "Work Experience", "Family Size"]])
scaled_data"""

'scaled_data = StandardScaler().fit_transform(cleaned_df[["Annual Income ($)", "Spending Score (1-100)", "Work Experience", "Family Size"]])\nscaled_data'

In [111]:
"""df_scaled_data = pd.DataFrame(
    scaled_data,
    columns = ["Annual Income ($)", "Spending Score (1-100)", "Work Experience", "Family Size"]
)
cleaned_df['Annual Income ($)'] = df_scaled_data['Annual Income ($)']
cleaned_df['Spending Score (1-100)'] = df_scaled_data["Spending Score (1-100)"]
cleaned_df['Work Experience'] = df_scaled_data["Work Experience"]
cleaned_df['Family Size'] = df_scaled_data["Family Size"]
cleaned_df.head()"""

'df_scaled_data = pd.DataFrame(\n    scaled_data,\n    columns = ["Annual Income ($)", "Spending Score (1-100)", "Work Experience", "Family Size"]\n)\ncleaned_df[\'Annual Income ($)\'] = df_scaled_data[\'Annual Income ($)\']\ncleaned_df[\'Spending Score (1-100)\'] = df_scaled_data["Spending Score (1-100)"]\ncleaned_df[\'Work Experience\'] = df_scaled_data["Work Experience"]\ncleaned_df[\'Family Size\'] = df_scaled_data["Family Size"]\ncleaned_df.head()'

In [112]:
cleaned_df.head()

Unnamed: 0,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size,Gender_Female,Gender_Male,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
4,31,38000,40,2,6,1,0,0,0,0,1,0,0,0,0,0
6,35,31000,6,1,3,1,0,0,0,0,0,0,1,0,0,0
9,30,98000,72,1,4,1,0,1,0,0,0,0,0,0,0,0
11,35,93000,99,4,4,1,0,0,0,0,0,0,1,0,0,0
16,35,29000,35,9,5,1,0,0,0,0,0,0,0,1,0,0


In [113]:
cleaned_df = cleaned_df.dropna()


In [114]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660 entries, 4 to 1993
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       660 non-null    int64
 1   Annual Income ($)         660 non-null    int64
 2   Spending Score (1-100)    660 non-null    int64
 3   Work Experience           660 non-null    int64
 4   Family Size               660 non-null    int64
 5   Gender_Female             660 non-null    int64
 6   Gender_Male               660 non-null    int64
 7   Profession_Artist         660 non-null    int64
 8   Profession_Doctor         660 non-null    int64
 9   Profession_Engineer       660 non-null    int64
 10  Profession_Entertainment  660 non-null    int64
 11  Profession_Executive      660 non-null    int64
 12  Profession_Healthcare     660 non-null    int64
 13  Profession_Homemaker      660 non-null    int64
 14  Profession_Lawyer         660 non-null   

In [115]:
# Create a a list to store inertia values
inertia = []

# Create a a list to store the values of k
k = list(range(1, 11))

In [116]:
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(cleaned_df)
    inertia.append(k_model.inertia_)

In [117]:
# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}

# Create the DataFrame from the elbow data
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,1475568000000.0
1,2,396811200000.0
2,3,208095600000.0
3,4,120159900000.0
4,5,73105330000.0


In [118]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [119]:
model = KMeans(n_clusters = 3)

In [120]:
model.fit(cleaned_df)

KMeans(n_clusters=3)

In [121]:
clusters = model.predict(cleaned_df)
print(clusters)

[1 1 2 2 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 2 1 1 2 1
 1 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 1 2 1 1 1 1 1 1 2 2 2 1 1 2 1 1 1 1 2 2 1
 1 2 1 2 1 1 1 1 2 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 2 1 2
 1 2 1 2 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 2 1 1 2 1 1 1 2 1 2 0 0 0
 0 2 2 0 0 1 1 1 0 2 2 1 1 1 0 0 1 0 2 2 0 0 0 1 1 2 0 2 2 0 2 1 2 2 0 2 2
 0 1 0 0 2 0 0 1 1 0 2 0 2 0 0 1 2 0 2 0 0 0 1 2 2 1 0 0 0 2 0 1 2 2 0 2 0
 1 0 2 2 2 1 0 0 2 1 2 1 0 0 2 1 1 2 2 0 0 0 2 1 2 2 0 0 2 0 1 0 0 0 0 0 0
 0 2 0 2 1 1 2 1 0 0 2 0 2 1 2 0 1 0 0 2 0 2 1 0 2 0 0 1 1 0 2 0 2 1 2 1 2
 1 0 0 1 0 2 0 1 0 0 2 2 0 2 2 1 0 2 2 0 1 2 2 0 2 0 1 1 2 0 0 0 0 0 0 0 2
 0 0 0 2 1 0 2 2 2 0 1 0 0 2 2 0 2 1 0 2 1 1 0 0 2 2 2 1 2 0 2 0 0 2 0 0 0
 2 0 0 2 0 0 1 0 0 1 0 0 1 0 1 0 0 2 2 0 0 1 0 0 2 2 1 2 0 2 0 1 1 1 0 2 1
 0 0 1 1 2 2 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 2 0 0 2 0 0 2 2 0 2 0 0 0 2 0 2
 1 0 2 2 1 2 2 0 2 2 0 2 0 0 2 0 0 2 1 2 2 2 2 0 2 2 2 2 2 1 0 0 1 0 1 0 2
 0 0 2 2 0 2 0 2 2 0 0 2 

In [122]:
df_predictions = cleaned_df.copy()

In [123]:
df_predictions['predicted_clusters']=clusters
df_predictions.head()

Unnamed: 0,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size,Gender_Female,Gender_Male,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,predicted_clusters
4,31,38000,40,2,6,1,0,0,0,0,1,0,0,0,0,0,1
6,35,31000,6,1,3,1,0,0,0,0,0,0,1,0,0,0,1
9,30,98000,72,1,4,1,0,1,0,0,0,0,0,0,0,0,2
11,35,93000,99,4,4,1,0,0,0,0,0,0,1,0,0,0,2
16,35,29000,35,9,5,1,0,0,0,0,0,0,0,1,0,0,1


In [124]:
predictions_plot = df_predictions.hvplot.scatter(
    x="Annual Income ($)",
    y="Spending Score (1-100)",
    by= "predicted_clusters"
)
predictions_plot