In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")

# Display sample data
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [4]:
# Plot your data to see what's in your DataFrame
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Prepare the Data

In [5]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
# All data in this df is numerical, so I don't need to select for specific (numerical) columns here
scaled_data = StandardScaler().fit_transform(df_market_data)

In [6]:
# Create a DataFrame with the scaled data
# The output of the StandardScaler().fit_transform() is not a dataframe, so we must convert it here
# and define the columns that we want (as those are not included in the list format of the output)
scaled_data_df = pd.DataFrame(
    scaled_data,
    columns = [
        'price_change_percentage_24h',
        'price_change_percentage_7d',
        'price_change_percentage_14d',
        'price_change_percentage_30d',
        'price_change_percentage_60d',
        'price_change_percentage_200d',
        'price_change_percentage_1y'
    ]
)

# Copy the crypto names from the original DataFrame
# The'reset_index()' was added so that we can get the coin_id column (because it is the index of the df)
crypto_names = df_market_data.reset_index()["coin_id"]

# Set the coin_id column as index
scaled_data_df["coin_id"] = crypto_names
scaled_data_df = scaled_data_df.set_index('coin_id')

# Display the scaled DataFrame
scaled_data_df.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Scaled DataFrame.

In [44]:
# This function was created to follow DRY. Because we generate more than one elbow plot over the course of this 
# challenge, I created this function as a way to reduce the amount of code repetition
# The 'fit_data' parameter should be set to the data that we're training the model on, while the `plot_title` parameter
# defines te title of the graph that is output by this function.
# The function outputs the hvplot requested by the assignment.
def generate_elbow_plot(fit_data, plot_title):
    # Create a list with the number of k-values from 1 to 11
    k = list(range(1, 12))

    # Create an empty list to store the inertia values
    inertia = []
    
    # Create a for loop to compute the inertia with each possible value of k
    # Inside the loop:
    # 1. Create a KMeans model using the loop counter for the n_clusters
    # 2. Fit the model to the data using `fit_data`
    # 3. Append the model.inertia_ to the inertia list
    for index in tqdm(k, desc="Calculating KMeans..."):
        k_model = KMeans(n_clusters=index, random_state=1)
        k_model.fit(fit_data)
        inertia.append(k_model.inertia_)

    # Create a dictionary with the data to plot the Elbow curve
    elbow_data = {"k" : k, "inertia" : inertia}

    # Create a DataFrame with the data to plot the Elbow curve
    elbow_data_df = pd.DataFrame(elbow_data)

    # Plot a line chart with all the inertia values computed with
    # the different values of k to visually identify the optimal value for k.
    output_plot = elbow_data_df.hvplot.line(
        x="k",
        y="inertia",
        title=plot_title,
        xticks=k
    )

    # Outputting the plot requested by the assignment.
    return output_plot

In [None]:
# Generating the elbow plot requested by the assignment for the non-PCA data. The data is 'raw' in the sense that it has 
# not been passed through the PCA algorithm, not in the sense that it has not been scaled.
# We are generating this elbow plot using the function defined above (generate_elbow_plot()).
raw_elbow_plot = generate_elbow_plot(scaled_data_df, "Raw Data Elbow Graph")

# Displaying the plot
raw_elbow_plot

Calculating KMeans...:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating KMeans...: 100%|██████████| 11/11 [00:00<00:00, 63.64it/s]


#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The best value for k was 4 clusters (as it was the most obvious 'elbow' value).

---

### Cluster Cryptocurrencies with K-means Using the Original Scaled DataFrame

In [47]:
# This function was created to adhere to DRY. Because we generate multiple scatter plots to display our clusters,
# this function was added to reduce the amount of code replication in this assignment.
# The `fit_data` parameter defines the data that we will train the model on, the `k` parameter defines the ideal number
# of clusters that we found using the elbow method, `x/y_label` defines the x and y labels for the output graph, 'hov_cols' 
# defines an attribute that we want to be able to see when hovering over points in the scatter plot, `graph_title` defines the
# title of the output graph, and `output_intermediary` defines whether or not we want to print out the intermediary steps of
# generating the graph
def generate_cluster_graph(fit_data, k, x_label, y_label, hov_cols, graph_title, output_intermediary):
    # Initialize the K-Means model using the best value for k
    k_model = KMeans(n_clusters=k, random_state=1)

    # Fit the K-Means model using the scaled DataFrame
    k_model.fit(fit_data)

    # Predict the clusters to group the cryptocurrencies using the scaled DataFrame
    clusters = k_model.predict(fit_data)

    # Create a copy of the scaled DataFrame
    output_scaled_data_df = fit_data.copy()

    # Add a new column to the copy of the scaled DataFrame with the predicted clusters
    output_scaled_data_df["cluster"] = clusters

    # Create a scatter plot using hvPlot by setting
    # `x="x_label"` and `y="y_label"`.
    # Color the graph points with the labels found using K-Means and
    # add the crypto name in the `hover_cols` parameter to identify
    # the cryptocurrency represented by each data point.
    output_cluster_plot = output_scaled_data_df.hvplot.scatter(
        x=x_label,
        y=y_label,
        by="cluster",
        hover_cols=hov_cols,
        title=graph_title
    )

    # I added this if statement so that we can choose not to print out these steps (as it can appear messy)
    if output_intermediary:
        print("Generating intermediate steps.......")
        print("--------------------------------------------------------------------")
        # Print the resulting array of cluster values.
        print(clusters)

        # Display the copy of the scaled DataFrame
        print(output_scaled_data_df.head())
        print("--------------------------------------------------------------------")
    
    # Outputting the cluster plot
    return output_cluster_plot


In [None]:
# Generating a scatter plot to display the clusters generated by our k_means algorithm (fit
# with our 'raw' data) The data is 'raw' in the sense that it has not been passed through the
# PCA algorithm, not in the sense that it has not been scaled. We are generating this scatter
# plot using the function defined above (generate_cluster_graph()).
# We assigned the graph to a variable so that we can later create a composite graph.
raw_cluster_plot =  generate_cluster_graph(
    fit_data = scaled_data_df,
    k = 4,
    x_label = "price_change_percentage_24h",
    y_label = "price_change_percentage_7d",
    hov_cols = "coin_id",
    graph_title = "Raw Cluster Graph",
    output_intermediary = True
)

# Displaying the plot
raw_cluster_plot

[2 2 0 0 2 2 2 2 2 0 0 0 0 2 0 2 0 0 2 0 0 2 0 0 0 0 0 0 2 0 0 0 3 2 0 0 1
 0 0 0 0]
              price_change_percentage_24h  price_change_percentage_7d  \
coin_id                                                                 
bitcoin                          0.508529                    0.493193   
ethereum                         0.185446                    0.934445   
tether                           0.021774                   -0.706337   
ripple                          -0.040764                   -0.810928   
bitcoin-cash                     1.193036                    2.000959   

              price_change_percentage_14d  price_change_percentage_30d  \
coin_id                                                                  
bitcoin                          0.772200                     0.235460   
ethereum                         0.558692                    -0.054341   
tether                          -0.021680                    -0.061030   
ripple                           

---

### Optimize Clusters with Principal Component Analysis.

In [17]:
# Create a PCA model instance and set `n_components=3`.
pca = PCA(n_components=3)

In [None]:
# Use the PCA model with `fit_transform` to reduce the original scaled DataFrame
# down to three principal components.
pca_data = pca.fit_transform(scaled_data_df)

# Generating a dataframe from our PCA data so that it can be output below (and further
# processed later on)
pca_data_df = pd.DataFrame(
    pca_data,
    columns=["PCA1", "PCA2", "PCA3"]
)

# View the scaled PCA data
pca_data_df.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,-0.600667,0.84276,0.461595
1,-0.458261,0.458466,0.952877
2,-0.43307,-0.168126,-0.641752
3,-0.471835,-0.22266,-0.479053
4,-1.1578,2.041209,1.859715


In [19]:
# Retrieve the explained variance to determine how much information
# can be attributed to each principal component.
pca_explained_variance = pca.explained_variance_ratio_

# Printing out our PCA variance values for easier viewing
print(f"Individual pca variance values: {pca_explained_variance}")
print(f"Total PCA Variance: {sum(pca_explained_variance)}")

Individual pca variance values: [0.3719856  0.34700813 0.17603793]
Total PCA Variance: 0.8950316570309841


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** The total explained variance of the three principal components was 0.8950316570309841

In [20]:
# Create a new DataFrame with the PCA data.
pca_data_df_two = pd.DataFrame(
    pca_data,
    columns=["PCA1", "PCA2", "PCA3"]
)

# Copy the crypto names from the original scaled DataFrame
# (note that we already found these name above)
pca_data_df_two["coin_id"] = crypto_names

# Set the coin_id column as index
pca_data_df_two = pca_data_df_two.set_index('coin_id')

# Display the scaled PCA DataFrame
pca_data_df_two.head()

Unnamed: 0_level_0,PCA1,PCA2,PCA3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the Scaled PCA DataFrame

In [50]:
# Generating the elbow plot requested by the assignment for the PCA data.
# We are generating this elbow plot using the function defined above (generate_elbow_plot()).
# We assigned the graph to a variable so that we can later create a composite graph.
pca_elbow_plot = generate_elbow_plot(pca_data_df_two, "PCA Data Elbow Graph")

# Displaying the plot
pca_elbow_plot

Calculating KMeans...: 100%|██████████| 11/11 [00:00<00:00, 133.30it/s]


#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** The best value for k when using the PCA data is still 4 clusters.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** Although the actual inertia value between the original dataset and the PCA dataset does differ, the best k value for both the original dataset and the PCA dataset remains 4 clusters (i.e. it does not differ).

### Cluster Cryptocurrencies with K-means Using the Scaled PCA DataFrame

In [51]:
# Generating a scatter plot to display the clusters generated by our k_means algorithm (fit
# with our 'PCA' data).  We are generating this scatter plot using the function defined
# above (generate_cluster_graph()). We assigned the graph to a variable so that we can later create a composite graph.
pca_cluster_plot =  generate_cluster_graph(
    fit_data = pca_data_df,
    k = 4,
    x_label = "PCA1",
    y_label = "PCA2",
    hov_cols = "coin_id",
    graph_title = "PCA Cluster Graph",
    output_intermediary = True
)

# Displaying the cluster plot
pca_cluster_plot

[2 2 0 0 2 2 2 2 2 0 0 0 0 2 0 2 0 0 2 0 0 2 0 0 0 0 0 0 2 0 0 0 3 2 0 0 1
 0 0 0 0]
       PCA1      PCA2      PCA3  cluster
0 -0.600667  0.842760  0.461595        2
1 -0.458261  0.458466  0.952877        2
2 -0.433070 -0.168126 -0.641752        0
3 -0.471835 -0.222660 -0.479053        0
4 -1.157800  2.041209  1.859715        2


### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [None]:
# Composite plot to contrast the Elbow curves
# Generating a composite plot using the elbow plots generated over the course of this assignment
composite_elbow_plot = raw_elbow_plot + pca_elbow_plot

# Displaying the composite plot
composite_elbow_plot

In [None]:
# Composite plot to contrast the clusters
# Generating a composite plot using the cluster plots generated over the course of this assignment
composite_cluster_plot = raw_cluster_plot + pca_cluster_plot

# Displaying the composite plot
composite_cluster_plot

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** The main impact of using fewer features to cluster the data using K-Means is that we are able to view more of the data using fewer dimensions than if we were to just look at the raw cluster data. In the above scatter plots of the clusters, we can see how applying PCA more cleanly defined the clusters as compared to our first (raw) cluster plot. This could be because we are viewing more of the data that separates these data points at once when using PCA.