In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [11]:
data = pd.read_csv('Mall_Customers.csv')

df = data.copy()
df.head()

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [12]:
df.shape

(200, 5)

In [13]:
df = df.drop(['CustomerID'], axis=1)

In [14]:
df.isna().sum()

Genre                     0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [15]:
df = df.rename(columns={'Genre': 'Gender'})
df.head()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [16]:
label = LabelEncoder()
df['Gender'] = label.fit_transform(df['Gender'])

In [17]:
df.head()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,19,15,39
1,1,21,15,81
2,0,20,16,6
3,0,23,16,77
4,0,31,17,40


In [18]:
df.describe()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0,200.0
mean,0.44,38.85,60.56,50.2
std,0.497633,13.969007,26.264721,25.823522
min,0.0,18.0,15.0,1.0
25%,0.0,28.75,41.5,34.75
50%,0.0,36.0,61.5,50.0
75%,1.0,49.0,78.0,73.0
max,1.0,70.0,137.0,99.0


In [None]:
wcss = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=10)
    kmeans.fit_predict(df)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(8, 6))
plt.plot(range(1,11), wcss, marker='o', markerfacecolor='red')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
wcss = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=100)
    kmeans.fit_predict(df)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(8, 6))
plt.plot(range(1,11), wcss, marker='o', markerfacecolor='red')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
kmeans = KMeans(n_clusters=4, random_state=100)
label1 = kmeans.fit_predict(df)
label1
df['Clusters'] = label1

In [138]:
print(f"Silhouette score: {silhouette_score(df, label1)}")

Silhouette score: 0.39257058330217853


In [139]:
df

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100),Clusters
0,1,19,15,39,1
1,1,21,15,81,1
2,0,20,16,6,0
3,0,23,16,77,1
4,0,31,17,40,1
...,...,...,...,...,...
195,0,35,120,79,3
196,0,45,126,28,2
197,1,32,126,74,3
198,1,32,137,18,2


In [140]:
df.describe()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100),Clusters
count,200.0,200.0,200.0,200.0,200.0
mean,0.44,38.85,60.56,50.2,1.245
std,0.497633,13.969007,26.264721,25.823522,1.131981
min,0.0,18.0,15.0,1.0,0.0
25%,0.0,28.75,41.5,34.75,0.0
50%,0.0,36.0,61.5,50.0,1.0
75%,1.0,49.0,78.0,73.0,2.0
max,1.0,70.0,137.0,99.0,3.0


In [141]:
df.groupby(['Clusters'], as_index=False)[['Age', 'Annual Income (k$)','Spending Score (1-100)']].mean()

Unnamed: 0,Clusters,Age,Annual Income (k$),Spending Score (1-100)
0,0,52.057971,46.42029,39.884058
1,1,25.056604,40.735849,62.622642
2,2,40.394737,87.0,18.631579
3,3,32.875,86.1,81.525


In [142]:
X = df.loc[:, ['Age', 'Annual Income (k$)','Spending Score (1-100)']]
Y = df.iloc[:, -1]

In [143]:
scaler = StandardScaler()
new_df = scaler.fit_transform(X)

new_df

array([[-1.42456879, -1.73899919, -0.43480148],
       [-1.28103541, -1.73899919,  1.19570407],
       [-1.3528021 , -1.70082976, -1.71591298],
       [-1.13750203, -1.70082976,  1.04041783],
       [-0.56336851, -1.66266033, -0.39597992],
       [-1.20926872, -1.66266033,  1.00159627],
       [-0.27630176, -1.62449091, -1.71591298],
       [-1.13750203, -1.62449091,  1.70038436],
       [ 1.80493225, -1.58632148, -1.83237767],
       [-0.6351352 , -1.58632148,  0.84631002],
       [ 2.02023231, -1.58632148, -1.4053405 ],
       [-0.27630176, -1.58632148,  1.89449216],
       [ 1.37433211, -1.54815205, -1.36651894],
       [-1.06573534, -1.54815205,  1.04041783],
       [-0.13276838, -1.54815205, -1.44416206],
       [-1.20926872, -1.54815205,  1.11806095],
       [-0.27630176, -1.50998262, -0.59008772],
       [-1.3528021 , -1.50998262,  0.61338066],
       [ 0.94373197, -1.43364376, -0.82301709],
       [-0.27630176, -1.43364376,  1.8556706 ],
       [-0.27630176, -1.39547433, -0.590

In [150]:
df1 = pd.DataFrame(new_df, columns=X.columns)
df1

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
0,-1.424569,-1.738999,-0.434801
1,-1.281035,-1.738999,1.195704
2,-1.352802,-1.700830,-1.715913
3,-1.137502,-1.700830,1.040418
4,-0.563369,-1.662660,-0.395980
...,...,...,...
195,-0.276302,2.268791,1.118061
196,0.441365,2.497807,-0.861839
197,-0.491602,2.497807,0.923953
198,-0.491602,2.917671,-1.250054


In [151]:
X_train, X_test, y_train, y_test = train_test_split(df1, Y, test_size=0.3, random_state=42)
model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print('\n')
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.93      0.97        15
           2       1.00      1.00      1.00        15
           3       1.00      1.00      1.00        10

    accuracy                           0.98        60
   macro avg       0.99      0.98      0.99        60
weighted avg       0.98      0.98      0.98        60



[[20  0  0  0]
 [ 1 14  0  0]
 [ 0  0 15  0]
 [ 0  0  0 10]]


In [152]:
import pickle

file = 'XGBModel.pkl' 
with open(file, 'wb') as f:
    pickle.dump(model, f)