## Load and clean the shopping data

In [38]:
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [39]:
# Read the raw shopping data; the file is decoded as ISO-8859-1 (Latin-1).
shopping_df = pd.read_csv(
    "shopping_data.csv",
    encoding="ISO-8859-1",
)
# Show the dtype pandas inferred for each column.
shopping_df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [40]:
# Count missing values for every column in a single pass, then report them.
null_counts = shopping_df.isnull().sum()
for col, n_missing in null_counts.items():
    print(f'There are {n_missing} nulls in {col}')

# duplicated() flags rows that are exact repeats of an earlier row.
n_dupes = shopping_df.duplicated().sum()
print(f'# of dupes: {n_dupes}')

There are 0 nulls in CustomerID
There are 2 nulls in Card Member
There are 2 nulls in Age
There are 0 nulls in Annual Income
There are 1 nulls in Spending Score (1-100)
# of dupes: 0


In [41]:
# Remove the CustomerID column, then discard every row that still has a
# missing value in any remaining column.
shopping_df = (
    shopping_df
    .drop(columns="CustomerID")
    .dropna()
)

In [42]:
def to_bin(x):
    """Encode a yes/no value as an integer flag.

    Returns 1 only when ``x`` equals the exact string "Yes"; every other
    value (including different casing such as "yes") returns 0.
    """
    return 1 if x == "Yes" else 0
# Encode card membership as 0/1 and express annual income in thousands.
# NOTE: both columns are overwritten in place, so running this cell a second
# time would turn every membership flag into 0 and divide income by 1000 again.
shopping_df["Card Member"] = shopping_df["Card Member"].map(to_bin)
shopping_df["Annual Income"] = shopping_df["Annual Income"].div(1000)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [43]:
# Persist the cleaned data; index=False keeps the row index out of the file.
shopping_df.to_csv(
    "clean_shopping.csv",
    index=False,
)

## Unsupervised ML: K-Means clustering

In [44]:
# Plot annual income against spending score before any clustering.
shopping_df.hvplot.scatter(
    x="Annual Income",
    y="Spending Score (1-100)",
)

In [45]:
# Function to cluster a dataset and label it in place
def test_cluster_amount(df, clusters):
    """Fit K-Means on ``df`` and store the cluster labels in ``df["class"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to cluster. It is modified in place: a "class" column holding
        the cluster label of each row is added (or overwritten).
    clusters : int
        Number of clusters passed to ``KMeans(n_clusters=...)``.

    Returns
    -------
    None
    """
    # "class" is this function's output column. If a previous call already
    # added it, exclude it so old labels are not fitted as a feature.
    features = df.drop(columns="class", errors="ignore")
    model = KMeans(n_clusters=clusters, random_state=5)
    # Fitting model
    model.fit(features)
    # Add (or refresh) the class column on the caller's DataFrame
    df["class"] = model.labels_

In [46]:
# Label shopping_df with two clusters (this adds the "class" column in place),
# then colour the income/spending scatter by that label.
test_cluster_amount(shopping_df, clusters=2)
shopping_df.hvplot.scatter(
    x="Annual Income",
    y="Spending Score (1-100)",
    by="class",
)

In [47]:
# 3D view of the clusters: income, spending score and age on the three axes,
# with both marker colour and marker symbol driven by the "class" label.
fig = px.scatter_3d(
    shopping_df,
    x="Annual Income",
    y="Spending Score (1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
# Anchor the legend at x=0, y=1 in the figure's layout coordinates.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [48]:
inertia = []
k = list(range(1, 11))
# A "class" column in shopping_df holds labels from an earlier clustering run,
# not a customer feature; leave it out so it does not distort the inertia.
elbow_features = shopping_df.drop(columns="class", errors="ignore")
# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

In [49]:
# Pair each tested k with its inertia and draw the elbow curve with hvPlot.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)

In [50]:
def get_clusters(k, data):
    """Cluster ``data`` into ``k`` groups and return a labelled copy.

    Parameters
    ----------
    k : int
        Number of clusters passed to ``KMeans(n_clusters=...)``.
    data : pandas.DataFrame
        Data to cluster. The caller's DataFrame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose "class" column holds each row's cluster
        label (added, or overwritten if the column already existed).
    """
    # Create a copy of the DataFrame so the caller's data stays untouched
    data = data.copy()
    # "class" is this function's output column. If the input already carries
    # one from an earlier run, exclude it so old labels are not fitted as a
    # feature.
    features = data.drop(columns="class", errors="ignore")
    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)
    # Fit the model
    model.fit(features)
    # Store the cluster assigned to each row during fitting
    data["class"] = model.labels_
    return data

In [51]:
# Build one labelled copy of the data for each candidate cluster count.
five_clusters, six_clusters = (
    get_clusters(n_groups, shopping_df) for n_groups in (5, 6)
)

In [52]:
# Income vs. spending score, coloured by the five-cluster labels.
five_clusters.hvplot.scatter(
    x="Annual Income",
    y="Spending Score (1-100)",
    by="class",
)

In [53]:
# Income vs. spending score, coloured by the six-cluster labels.
six_clusters.hvplot.scatter(
    x="Annual Income",
    y="Spending Score (1-100)",
    by="class",
)