In [55]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [56]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")

# Display sample data
df_market_data.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384


In [57]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [58]:
# Plot your data to see what's in your DataFrame
df_market_data.hvplot.line(
    width=800,
    height=300,
    rot=45
)

---

### Prepare the Data

In [59]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler to the data
scaled_data = scaler.fit_transform(df_market_data)

# Create a new DataFrame with the scaled data
df_scaled_data = pd.DataFrame(scaled_data, columns=df_market_data.columns)

# Display the first 10 rows of the scaled DataFrame
df_scaled_data.head(10)


Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
0,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
1,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
2,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
3,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
4,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317
5,0.891871,1.327295,0.800214,-0.057148,0.778653,-0.188232,-0.225533
6,0.011397,2.572251,1.101647,-0.490495,-0.931954,0.387759,-0.018284
7,0.10253,1.508001,0.648885,0.328959,-0.486349,0.06508,-0.155428
8,0.077497,0.334297,0.85852,-0.012646,-0.366477,-0.486266,-0.292351
9,0.448952,-0.190684,-0.248043,0.051634,-0.529666,-0.532961,-0.206029


In [60]:
# Create a DataFrame with the scaled data

df_scaled = pd.DataFrame(scaled_data, columns=df_market_data.columns, index=df_market_data.index)

# Combine the scaled data and the original index (coin names) into a single DataFrame
df_final = pd.concat([df_market_data.index.to_frame(), df_scaled], axis=1)

# Set the coin_id column as the index
df_final.set_index('coin_id', inplace=True)

# Display sample data
df_final.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Data.

In [61]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 12))

# Display the list
# print(k_values)


In [62]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
for k in k_values:
    # 1. Create a KMeans model using the loop counter for the n_clusters
    model = KMeans(n_clusters=k, random_state=0)

    # 2. Fit the model to the data using `df_scaled`
    model.fit(df_final)

    # 3. Append the model.inertia_ to the inertia list
    inertia.append(model.inertia_)


  "KMeans is known to have a memory leak on Windows "


In [63]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k_values, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)
# df_elbow.set_index('k', inplace=True)

# Display sample data
df_elbow.head(11)


Unnamed: 0,k,inertia
0,1,287.0
1,2,195.820218
2,3,123.190482
3,4,79.022435
4,5,65.302379
5,6,52.888518
6,7,43.91469
7,8,37.517032
8,9,32.485241
9,10,28.222899


In [64]:
# Plot a line chart with all the inertia values computed with the different values of k
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k_values,
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    width=600,
    height=400,
)


#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The best value for k is 3 

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [65]:
# Initialize the K-Means model using the best value for k
k_best = 3  # replace with the optimal value of k
kmeans = KMeans(n_clusters=k_best, random_state=0)

In [66]:
# Fit the K-Means model using the scaled data
kmeans = KMeans(n_clusters=3, random_state=0).fit(df_final)

In [67]:
# Predict the clusters to group the cryptocurrencies using the scaled data
crypto_clusters = kmeans.predict(df_final)

# Print the resulting array of cluster values
print(crypto_clusters)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
 0 0 0 0]


In [68]:
# Create a copy of the DataFrame
df_clustered = df_market_data.copy()

In [69]:
# Add a new column to the DataFrame with the predicted clusters
df_clustered['cluster'] = crypto_clusters

# Display sample data
df_clustered.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761,0
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023,0
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954,0
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193,0
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384,0


In [70]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.

df_clustered.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="cluster",
    hover_cols=["coin_id"],
    width=800,
    height=400,
    legend="top_left"
)


---

### Optimize Clusters with Principal Component Analysis.

In [71]:
# Create a PCA model instance and set `n_components=3`.
pca = PCA(n_components=3)

In [72]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.

# Apply the PCA model to the scaled data
pca_features = pca.fit_transform(df_final)

# Create a DataFrame with the PCA features
df_pca = pd.DataFrame(
    data=pca_features,
    columns=["PCA1", "PCA2", "PCA3"],
    
)


# View the first five rows of the DataFrame. 
df_pca.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,-0.600667,0.84276,0.461595
1,-0.458261,0.458466,0.952877
2,-0.43307,-0.168126,-0.641752
3,-0.471835,-0.22266,-0.479053
4,-1.1578,2.041209,1.859715


In [73]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.

explained_variance = pca.explained_variance_ratio_

for i, ev in enumerate(explained_variance):
    print(f"Explained variance of PCA{i+1}: {ev:.2%}")

Explained variance of PCA1: 37.20%
Explained variance of PCA2: 34.70%
Explained variance of PCA3: 17.60%


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** The total explained variance of the three principal components is 89.5%. This means that these three principal components account for 89.50% of the total variance in the data.

In this case, a ratio of 3 principal components to 7 features is relatively fair, but the amount of variance explained is high (89.50%). This suggests that the three principal components capture the most important patterns in the data and may be sufficient for many applications. 

In [74]:
# Create a new DataFrame with the PCA data
df_pca = pd.DataFrame(
    data=pca_features,
    columns=["PC1", "PC2", "PC3"],
    index=df_market_data.index
)

# Set the index name to "coin_id"
df_pca.index.name = "coin_id"

# Display the first five rows of the DataFrame
df_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the PCA Data

In [75]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 12))

In [76]:
# Create an empy list to store the inertia values
inertia_pca = []

# Create a for loop to compute the inertia with each possible value of k

for k in k_values:
    
# Inside the loop:
# Create a KMeans model using the loop counter for the n_clusters
  model = KMeans(n_clusters=k, random_state=42)
# Fit the model to the data using `df_market_data_pca`
  model.fit(df_pca)

# Append the model.inertia_ to the inertia list
  inertia_pca.append(model.inertia_)

  "KMeans is known to have a memory leak on Windows "


In [77]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_pca = {"k": k_values, "inertia": inertia_pca}
# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data_pca)
df_elbow_pca.head(11)

Unnamed: 0,k,inertia
0,1,256.874086
1,2,168.811896
2,3,93.774626
3,4,49.665497
4,5,37.878747
5,6,27.618972
6,7,21.182776
7,8,17.091637
8,9,13.667065
9,10,10.559358


In [78]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow_pca.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve for PCA Data",
    xticks=k_values,
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    width=600,
    height=400,
)

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** The best value where the point of inflection is obvious is 3


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [79]:
# Initialize the K-Means model using the best value for k

kmeans_pca = KMeans(n_clusters=3, random_state=42)

In [80]:
# Fit the K-Means model using the PCA data
kmeans_pca.fit(pca_features)

KMeans(n_clusters=3, random_state=42)

In [81]:
# Predict the clusters to group the cryptocurrencies using the PCA data
predicted_pca_clusters = kmeans_pca.predict(pca_features)
# Print the resulting array of cluster values.
print(predicted_pca_clusters)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1
 0 0 0 0]


In [82]:
# Create a copy of the DataFrame with the PCA data
df_pca_clusters = df_pca.copy()

# Add a new column to the DataFrame with the predicted clusters
df_pca_clusters["predicted_cluster"] = predicted_pca_clusters

# Display sample data
df_pca_clusters.head()

Unnamed: 0_level_0,PC1,PC2,PC3,predicted_cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,0
ethereum,-0.458261,0.458466,0.952877,0
tether,-0.43307,-0.168126,-0.641752,0
ripple,-0.471835,-0.22266,-0.479053,0
bitcoin-cash,-1.1578,2.041209,1.859715,0


In [83]:
# Create a scatter plot using hvPlot by setting 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.

df_pca_clusters.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="predicted_cluster",
    hover_cols=["coin_id"]
)

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [84]:
# Composite plot to contrast the Elbow curves
# Create the first line plot for k=3
elbow_plot = df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k_values,
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    width=450,
    height=300,
)

# Create the second line plot for k=3
elbow_plot_pca = df_elbow_pca.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve for PCA Data",
    xticks=k_values,
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    width=450,
    height=300,
)

# Combine the two line plots into a composite plot
elbows_composite_plot = elbow_plot + elbow_plot_pca

# Display the composite plot
elbows_composite_plot

In [85]:
# Create a composite plot to contrast the clusters
plot1 = df_clustered.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="cluster",
    hover_cols=["coin_id"],
    title="Price Change Analysis",
    width=450,
    height=300,
    legend="top_left"
)

plot2 = df_pca_clusters.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="predicted_cluster",
    hover_cols=["coin_id"],
    title="PCA Analysis",
    width=450,
    height=300,
    legend="top_right"
)


# Combine the plots into a single figure
clusters_composite_plot = plot1 + plot2

# Show the composite plot
clusters_composite_plot


#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** Using fewer features or using PCA for dimensionality reduction to cluster the data using K-Means can have several impacts:
  
Positive impacts:

--it results in faster and more efficient clustering(better performance)

--reduced the risk of overfitting.

Negative impacts:

--the resulting clusters may not be meaningful or useful as some features are left out and it leads also to difficulty in interpreting them.

--loss of information

In this specific dataset, the impact of using fewer features to cluster the data using K-Means has positive impacts as 89.5% of the data are retained. Using PCA to reduced the features of the data resulted in improved performance in clustering.

# Evaluate the quality of the clustering results 

Using K-means without dimensionality reduction(PCA)

In [86]:
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score


# Create a KMeans model with the optimal number of clusters (3)
kmeans = KMeans(n_clusters=3, random_state=0)   # change the n_clusters value to test

# Fit the model to the PCA data
kmeans.fit(df_final)

# Compute the inertia
inertia = kmeans.inertia_
print(f"Inertia: {inertia:.2f}")

# Compute the Dunn index
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

def dunn_index(X, labels):
    """
    Calculate the Dunn Index for a clustering solution.
    """
    # Calculate the inter-cluster distance
    centers = np.array([np.mean(X[labels == i], axis=0) for i in range(len(np.unique(labels)))])
    inter_cluster_dist = euclidean_distances(centers)
    np.fill_diagonal(inter_cluster_dist, np.inf)
    min_inter_cluster_dist = np.min(inter_cluster_dist)

    # Calculate the intra-cluster distance
    max_intra_cluster_dist = 0
    for i in range(len(np.unique(labels))):
        intra_cluster_dist = euclidean_distances(X[labels == i])
        max_intra_cluster_dist = max(max_intra_cluster_dist, np.max(intra_cluster_dist))

    return min_inter_cluster_dist / max_intra_cluster_dist

dunn = dunn_index(df_final,  kmeans.labels_)
print("Dunn Index:", dunn)

# Compute the Silhouette score
silhouette = silhouette_score(df_final, kmeans.labels_)
print(f"Silhouette Score: {silhouette:.2f}")

# Compute the Calinski-Harabasz index
calinski = calinski_harabasz_score(df_final, kmeans.labels_)
print(f"Calinski-Harabasz Index: {calinski:.2f}")

Inertia: 123.19
Dunn Index: 1.4498918745786844
Silhouette Score: 0.70
Calinski-Harabasz Index: 25.26


Dimensionality reduction(PCA)

In [87]:

# Create a KMeans model with the optimal number of clusters (3)
kmeans_pca = KMeans(n_clusters=3, random_state=42)   # change the n_clusters value to test

# Fit the model to the PCA data
kmeans_pca.fit(df_pca)

# Compute the inertia
inertia = kmeans_pca.inertia_
print(f"Inertia: {inertia:.2f}")

# Compute the Dunn index
dunn = dunn_index(df_final,  kmeans_pca.labels_)
print("Dunn Index:", dunn)

# Compute the Silhouette score
silhouette = silhouette_score(df_pca, kmeans_pca.labels_)
print(f"Silhouette Score: {silhouette:.2f}")

# Compute the Calinski-Harabasz index
calinski = calinski_harabasz_score(df_pca, kmeans_pca.labels_)
print(f"Calinski-Harabasz Index: {calinski:.2f}")

Inertia: 93.77
Dunn Index: 1.4498918745786844
Silhouette Score: 0.74
Calinski-Harabasz Index: 33.05


Interpretation

Based on the information provided, the same dataset was used for both the K-means algorithm and the PCA and K-means algorithm combination. The K-means algorithm was applied to the dataset to partition it into K clusters based on similarity of the data points. Then, the PCA algorithm was applied to the dataset to reduce the dimensionality of the dataset, and the K-means algorithm was applied again to the reduced dataset to partition it into K clusters.

The Silhouette Score measures the compactness and separation of the clusters, with a score ranging from -1 to 1. A score closer to 1 indicates better clustering performance. In this case, both algorithms achieved a Silhouette Score of 70% and 74%, respectively, which suggests good performance in clustering. However, the PCA and K-means combination solution had a slightly higher score, indicating better separation and compactness of the clusters.

The Inertia measures the within-cluster sum of squared distances from each point to its centroid. A lower value indicates better clustering performance. In this case, the PCA and K-means combination algorithm achieved a lower Inertia value of 93.77, compared to 123.19 the K-means algorithm alone. This suggests that the PCA and K-means combination algorithm was able to create more compact clusters.

The Calinski-Harabasz Index measures the ratio of between-cluster variance to within-cluster variance. A higher value indicates better clustering performance. In this case, the PCA and K-means combination algorithm achieved a higher CH Index score of 33.05, compared to 25.26 to the other one. This suggests that the PCA and K-means combination algorithm was able to create more separated clusters.

Overall, it appears that both algorithms performed well in clustering, but the PCA algorithm had slightly better performance, with better separation and compactness of the clusters. The k value of 3 was used for both algorithms, and it seems to have resulted in well-separated and compact clusters.