In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
import holoviews as hv
hv.extension('bokeh')
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the crypto price-change table; every row is keyed by its `coin_id`
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Preview the first ten coins
df_market_data.head(n=10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for each
# price-change column; the wide spread between columns motivates scaling later on
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [4]:
# First look at the raw data: one line per price-change column, coins along the
# x-axis (labels rotated 90 degrees so the coin ids stay readable)
plot = df_market_data.hvplot(kind="line", width=800, height=400, rot=90)

plot

On the visualisation, it is noticeable that one of the cryptocurrencies (Ethlend) experiences significant price changes, especially over long time intervals (60 days and 1 year), reflected as a sharp peak. For other cryptocurrencies, the changes are less pronounced and exhibit a more stable pattern. By using clustering methods, cryptocurrencies can be grouped based on their price changes. This will help identify cryptocurrencies with similar price change patterns. For example, cryptocurrencies that demonstrate stable behavior or similar trends can be grouped together. This can assist in creating investment strategies or identifying interrelated markets. Dimensionality reduction methods, such as PCA (Principal Component Analysis), can be used to simplify the visualization and analysis of the data. This will help reveal the main directions of changes and uncover hidden patterns.

---

### Prepare the Data

In [5]:
# Standardise every column with scikit-learn's `StandardScaler`:
# first learn the per-column statistics, then apply them to the same data
scaler = StandardScaler().fit(df_market_data)
scaled_data = scaler.transform(df_market_data)




In [6]:
# `scaled_data` is a plain array, so wrap it in a DataFrame that reuses the
# original column names. The row labels are the coin ids from the original
# data, with the index renamed to `crypto_name`.
df_scaled_data = pd.DataFrame(
    data=scaled_data,
    index=df_market_data.index.rename('crypto_name'),
    columns=df_market_data.columns,
)

# Display sample data
df_scaled_data.head(n=10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
crypto_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317
binancecoin,0.891871,1.327295,0.800214,-0.057148,0.778653,-0.188232,-0.225533
chainlink,0.011397,2.572251,1.101647,-0.490495,-0.931954,0.387759,-0.018284
cardano,0.10253,1.508001,0.648885,0.328959,-0.486349,0.06508,-0.155428
litecoin,0.077497,0.334297,0.85852,-0.012646,-0.366477,-0.486266,-0.292351
bitcoin-cash-sv,0.448952,-0.190684,-0.248043,0.051634,-0.529666,-0.532961,-0.206029


---

### Find the Best Value for k Using the Original Data.

In [7]:
# Candidate cluster counts to evaluate: k = 1 .. 10
k_values = [*range(1, 11)]
k_values


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [8]:
# Collect the inertia of a K-Means model for each candidate value of k
inertia_values = []

for k in k_values:
    # 1. Create a KMeans model using the loop counter for n_clusters.
    #    `n_init` is pinned to 10 (the default at the time this notebook was run)
    #    so the results stay the same when scikit-learn changes its default,
    #    and so the FutureWarning about `n_init` is no longer emitted.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    # 2. Fit the model to the scaled data in `df_scaled_data`
    kmeans.fit(df_scaled_data)
    # 3. Append the model's inertia_ to the inertia list
    inertia_values.append(kmeans.inertia_)

# Display the inertia values
inertia_values


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


[287.0,
 195.82021818036046,
 123.19048183836959,
 79.02243535120975,
 65.302379141625,
 54.7399219421002,
 43.48160605008147,
 37.517032490103574,
 32.59189097672458,
 28.22289929060893]

In [9]:
# Pair each k with its inertia so the Elbow curve can be plotted
elbow_data = dict(k=k_values, inertia=inertia_values)

# Tabular form of the same data, ready for hvPlot
df_elbow_data = pd.DataFrame(data=elbow_data)

In [10]:

# Elbow curve for the scaled (original-feature) data: inertia against k.
# The bend in the line is used to pick the number of clusters visually.
elbow_plot = df_elbow_data.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Curve for Original Data',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    width=600,
    height=400,
    grid=True,
)

elbow_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** From the Elbow curve, we can observe that significant changes in inertia occur up to `𝑘 = 4`. Beyond this value, the improvements in inertia become less substantial. Inspecting the list of inertia values confirms this: the decrease starts to slow down once the inertia reaches about 79 (at `𝑘 = 4`), which matches the elbow on the graph. Up to 4 there is a significant decrease in inertia, while the range from 4 to 8 leaves a window for investigation, so 4, 5, 6, or 7 could be selected. The final choice should be linked to the business question, because an additional cluster can in some cases uncover an interesting group of cryptocurrencies for analysis.

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [11]:
# Initialize the K-Means model using the best value for k (k=4 from the Elbow curve).
# `n_init` is set explicitly to 10 so the result does not depend on the
# scikit-learn version's default and the `n_init` FutureWarning is not raised.
kmeans_model = KMeans(n_clusters=4, n_init=10, random_state=42)

In [12]:
# Fit the K-Means model using the scaled data (all seven standardised
# price-change columns in `df_scaled_data`)

kmeans_model.fit(df_scaled_data)

  super()._check_params_vs_input(X, default_n_init=10)


In [13]:
# Predict the clusters to group the cryptocurrencies using the scaled data.
# The result holds one cluster label per row of `df_scaled_data`, in row order.
predicted_clusters = kmeans_model.predict(df_scaled_data)

# Print the resulting array of cluster values
print(predicted_clusters)


[0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 3 0 2 2 1
 2 2 2 2]


In [14]:
# Work on an independent copy so the raw market data stays untouched
df_market_data_copy = df_market_data.copy(deep=True)

In [15]:
# Attach the K-Means labels (computed on the scaled data) to the copied frame
df_market_data_copy = df_market_data_copy.assign(Predicted_Cluster=predicted_clusters)

# Show the first rows to confirm the new column is present
df_market_data_copy.head(5)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,Predicted_Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761,0
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023,0
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954,2
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193,2
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384,0


In [34]:
# Scatter of 24h vs 7d price change, one colour group per K-Means label.
# `coin_id` is added to the hover tooltip so each point can be traced back
# to its cryptocurrency.
scatter_plot = df_market_data_copy.hvplot(
    kind='scatter',
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    by='Predicted_Cluster',
    hover_cols=['coin_id'],
    title='Cryptocurrency Price Changes (24h vs 7d)',
    xlabel='Price Change Percentage (24h)',
    ylabel='Price Change Percentage (7d)',
    width=600,
    height=400,
)
scatter_plot


From the graph, we can see that one of the clusters provides interesting information. Most points are concentrated around the zero values on the X-axis, indicating that there have been no significant price changes in the last 24 hours. However, the points in the blue cluster show significant weekly fluctuations. Additionally, the one-day changes for this cluster are more uniformly distributed.











---

### Optimize Clusters with Principal Component Analysis.

In [17]:
# Create a PCA model instance and set `n_components=3`, i.e. the seven scaled
# price-change columns will be reduced to three principal components.

pca_model = PCA(n_components=3)

In [18]:
# Fit the PCA model on the scaled data and project it onto the three
# principal components in a single call
pca_transformed_data = pca_model.fit_transform(df_scaled_data)

# Wrap the projected array in a DataFrame that keeps the coin labels as index
df_pca = pd.DataFrame(
    data=pca_transformed_data,
    index=df_scaled_data.index,
    columns=['PC1', 'PC2', 'PC3'],
)

# Preview the first five rows
df_pca.head(5)

Unnamed: 0_level_0,PC1,PC2,PC3
crypto_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


In [19]:
# Retrieve the explained variance to determine how much information
# can be attributed to each principal component (one ratio per component,
# in the order PC1, PC2, PC3).

explained_variance_ratio = pca_model.explained_variance_ratio_

# Display the explained variance ratio
explained_variance_ratio


array([0.3719856 , 0.34700813, 0.17603793])

In [20]:
# Calculate the total explained variance: the sum of the three per-component
# ratios, i.e. the share of variance kept after the reduction
total_explained_variance = explained_variance_ratio.sum()

# Display the total explained variance
total_explained_variance


0.8950316570309841

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** The total explained variance of the three principal components is approximately `89.50%`. This means that these three components together capture about 89.50% of the total variance in the data. Initially, the original data had 7 columns, which were compressed into a three-dimensional representation. Losing about 10.5% of the variance (not of the data points themselves) is acceptable given the reduction in dimensionality by 4 columns, which makes the training more stable.

In [21]:
# Create a new DataFrame with the PCA data in one step. The index carries the
# coin ids from the original data under the name `crypto_name`, so no
# temporary column or in-place `set_index` is needed.
df_pca = pd.DataFrame(
    pca_transformed_data,
    columns=['PC1', 'PC2', 'PC3'],
    index=df_market_data.index.rename('crypto_name'),
)

# Display sample data (the first ten rows rather than the whole frame)
df_pca.head(10)

Unnamed: 0_level_0,PC1,PC2,PC3
crypto_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715
binancecoin,-0.516534,1.388377,0.804071
chainlink,-0.450711,0.517699,2.846143
cardano,-0.3456,0.729439,1.478013
litecoin,-0.649468,0.432165,0.600303
bitcoin-cash-sv,-0.759014,-0.2012,-0.217653


---

### Find the Best Value for k Using the PCA Data

In [22]:
# Candidate cluster counts for the PCA data: k = 1 .. 10
k_values = [*range(1, 11)]

In [23]:
# Collect the inertia of a K-Means model for each candidate value of k,
# this time fitted on the PCA-reduced data
inertia_values = []

for k in k_values:
    # 1. Create a KMeans model using the loop counter for n_clusters.
    #    `n_init` is pinned to 10 (the default at the time this notebook was run)
    #    so the results stay the same when scikit-learn changes its default,
    #    and so the FutureWarning about `n_init` is no longer emitted.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    # 2. Fit the model to the PCA data in `df_pca`
    kmeans.fit(df_pca)
    # 3. Append the model's inertia_ to the inertia list
    inertia_values.append(kmeans.inertia_)

# Display the inertia values
print(inertia_values)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


[256.87408556789256, 165.90199402036012, 93.77462568057294, 49.66549665179736, 37.83946598681243, 30.777746141441188, 21.134056037473602, 17.091636643864742, 13.681139692992751, 10.630647573870965]


In [24]:
# Pair each k with the inertia obtained on the PCA data
elbow_data = dict(k=k_values, inertia=inertia_values)

# Tabular form of the same data, ready for hvPlot
df_elbow_data_pca = pd.DataFrame(data=elbow_data)

In [25]:
# Elbow curve for the PCA-reduced data: inertia against k.
# The bend in the line is used to pick the number of clusters visually.
elbow_plot_pca = df_elbow_data_pca.hvplot(
    kind='line',
    x='k',
    y='inertia',
    title='Elbow Curve for PCA Data',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    width=600,
    height=400,
    grid=True,
)

elbow_plot_pca

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** Based on the Elbow curve the optimal value for `𝑘` appears to be around 4. This is where the inertia begins to decrease at a slower rate, indicating that adding more clusters beyond this point yields diminishing returns.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** The optimal value of `𝑘` is the same for both the original data and the PCA-transformed data, namely `k=4`. This indicates that the choice of `k` is consistent regardless of the dimensionality reduction applied by PCA.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [26]:
# Initialize the K-Means model using the best value for k (k=4 from the PCA Elbow curve).
# `n_init` is set explicitly to 10 so the result does not depend on the
# scikit-learn version's default and the `n_init` FutureWarning is not raised.
kmeans_model = KMeans(n_clusters=4, n_init=10, random_state=42)

In [27]:
# Fit the K-Means model using the PCA data (the three principal components
# in `df_pca`)

kmeans_model.fit(df_pca)

  super()._check_params_vs_input(X, default_n_init=10)


In [28]:
# Predict the clusters to group the cryptocurrencies using the PCA data.
# NOTE: this rebinds `predicted_clusters`, which previously held the labels
# obtained from the scaled data; from here on it holds the PCA-based labels.

predicted_clusters = kmeans_model.predict(df_pca)

# Print the resulting array of cluster values
print(predicted_clusters)

[0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 3 0 2 2 1
 2 2 2 2]


In [29]:
# Build a separate frame holding the PCA components plus the K-Means label
# of each coin; `df_pca` itself is left unchanged
df_pca_copy = df_pca.assign(Cluster=predicted_clusters)

# Show the first rows to confirm the new column is present
print(df_pca_copy.head())


                   PC1       PC2       PC3  Cluster
crypto_name                                        
bitcoin      -0.600667  0.842760  0.461595        0
ethereum     -0.458261  0.458466  0.952877        0
tether       -0.433070 -0.168126 -0.641752        2
ripple       -0.471835 -0.222660 -0.479053        2
bitcoin-cash -1.157800  2.041209  1.859715        0


In [35]:
# Scatter of the first two principal components, one colour group per
# K-Means label. `crypto_name` is added to the hover tooltip so each point
# can be traced back to its cryptocurrency.
scatter_plot_pca = df_pca_copy.hvplot(
    kind='scatter',
    x='PC1',
    y='PC2',
    by='Cluster',
    hover_cols=['crypto_name'],
    title='Cryptocurrency Clusters (PC1 vs PC2)',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
    width=600,
    height=400,
    cmap='viridis',
)

# Display the scatter plot
scatter_plot_pca


After scaling and applying the Principal Component Analysis (PCA), two clusters are still distinguished, and two others, as in the case with the original data, are outliers. The difference is that the two main clusters have shifted from the zero mark towards negative values on the PC1 component. The principal components allowed highlighting the key differences between the cryptocurrencies, making the visual recognition of clusters easier. As a result, unique clusters, such as the red and green points, emerged, demonstrating the unique characteristics of the cryptocurrencies.

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [31]:
# Composite plot to contrast the Elbow curves

# Place the Elbow curve for the original data next to the one for the PCA data
elbow_plot_combined = elbow_plot + elbow_plot_pca

elbow_plot_combined

The two Elbow curves are similar to each other, which aligns well with the fact that the PCA retains about 90% (89.5%) of the variance in our dataset.

In [36]:
# Composite plot to contrast the clusters: the scatter in the original
# 24h/7d coordinates next to the scatter in PC1/PC2 coordinates

scatter_plots_combined = scatter_plot + scatter_plot_pca
scatter_plots_combined


#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** In this case, the PCA method was applied to the original data, and clustering was performed afterward. This allowed us to identify two unique cryptocurrencies that stood out noticeably on the graph to the right. In this case, the graphs are plotted in the coordinates of the original data and the PCA coordinates. The goal is to better visualize the differences when clustering. We are looking at our data from a different perspective.

In [38]:
# Composite plot to contrast the clusters in the SAME coordinates
# (24h vs 7d price change), so only the cluster labels differ between panels.

# Original data with the clusters predicted from the PCA data.
# NOTE: at this point `predicted_clusters` holds the PCA-based labels, because
# it was reassigned in the PCA prediction cell above.
df_market_data_with_pca_clusters = df_market_data.copy()
df_market_data_with_pca_clusters['Predicted_Cluster_PCA'] = predicted_clusters

# Options shared by both panels, kept in one place so the panels stay comparable
common_scatter_opts = dict(
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    hover_cols=['coin_id'],
    xlabel='Price Change Percentage (24h)',
    ylabel='Price Change Percentage (7d)',
    width=600,
    height=400,
)

# Left panel: clusters found on the scaled original features
scatter_plot = df_market_data_copy.hvplot.scatter(
    by='Predicted_Cluster',
    title='Cryptocurrency Price Changes (24h vs 7d)',
    **common_scatter_opts,
)

# Right panel: same coordinates, coloured by the clusters found on the PCA data
scatter_plot_pca_on_original = df_market_data_with_pca_clusters.hvplot.scatter(
    by='Predicted_Cluster_PCA',
    title='Cryptocurrency Price Changes (24h vs 7d) - PCA Clusters on Original Data',
    cmap='viridis',
    **common_scatter_opts,
)

# Composite plot to contrast the scatter plots, laid out in two columns
scatter_composite_corrected = scatter_plot + scatter_plot_pca_on_original
scatter_composite_corrected.cols(2)

In this case, we see that applying the PCA method does not affect the clustering. In this approach, we build clusters in the original coordinates. Since the graphs match, we see that applying PCA does not impact the clustering.

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** The main conclusion in this task is that applying the PCA method does not affect the clustering using K-Means. However, it improves visualization, making the identified clusters more noticeable.