# Clustering Crypto

In [23]:
# Initial imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [24]:
# Load the crypto_data.csv dataset.
# YOUR CODE HERE
file_path = "crypto_data.csv"
df_crypto = pd.read_csv(file_path, encoding="ISO-8859-1", header='infer', index_col =0)


df_crypto.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159000000.0
2015,2015 coin,X11,True,PoW/PoS,,0.0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,True,PoW,107684200.0,0.0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000.0


In [25]:
# Keep all the cryptocurrencies that are being traded.
# YOUR CODE HERE

# Find null values
for column in df_crypto.columns:
    print(f"Column {column} has {df_crypto[column].isnull().sum()}null values")

# Drop null rows
df_crypto = df_crypto.dropna()
df_crypto.shape

Column CoinName has 0null values
Column Algorithm has 0null values
Column IsTrading has 0null values
Column ProofType has 0null values
Column TotalCoinsMined has 508null values
Column TotalCoinSupply has 0null values


(744, 6)

In [26]:
# Remove the "IsTrading" column. 
# YOUR CODE HERE
df_crypto.drop(columns=["IsTrading"], inplace=True)
df_crypto.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
808,808,SHA-256,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159000000.0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0


In [27]:
# Remove rows that have at least 1 null value.
# YOUR CODE HERE
# Dropping null values
df_crypto = df_crypto.dropna()
df_crypto.head(20)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
808,808,SHA-256,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159000000.0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,PoW,107684200.0,0.0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000.0
DASH,Dash,X11,PoW/PoS,9031294.0,22000000.0
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0.0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000.0


In [28]:
# Keep the rows where coins are mined.
# YOUR CODE HERE

In [29]:
# Create a new DataFrame that holds only the cryptocurrencies names.
# YOUR CODE HERE
# New dataframe to hold cryptocurrencies names
df_coinname = pd.DataFrame(
    data=df_crypto, columns=["CoinName"])
df_coinname.head()


Unnamed: 0,CoinName
42,42 Coin
404,404Coin
808,808
1337,EliteCoin
BTC,Bitcoin


In [30]:
# New dataframe shape
df_coinname.shape

(744, 1)

In [31]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# YOUR CODE HERE
df_crypto.drop(columns=["CoinName"], inplace=True)
df_crypto.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
808,SHA-256,PoW/PoS,0.0,0.0
1337,X13,PoW/PoS,29279420000.0,314159000000.0
BTC,SHA-256,PoW,17927180.0,21000000.0


In [32]:
# Use get_dummies() to create variables for text features.
# YOUR CODE HERE
df_newcrypto = pd.get_dummies(df_crypto, columns=["Algorithm"])
df_newcrypto.head()

Unnamed: 0,ProofType,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,...,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_X16S,Algorithm_XEVAN,Algorithm_Zhash,Algorithm_vDPOS
42,PoW/PoS,41.99995,42.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,PoW/PoS,1055185000.0,532000000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
808,PoW/PoS,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,PoW/PoS,29279420000.0,314159000000.0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
BTC,PoW,17927180.0,21000000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Drop Product Type 
df_newcrypto.drop(columns=["ProofType"], inplace=True)
df_newcrypto.head(20)


Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_X16S,Algorithm_XEVAN,Algorithm_Zhash,Algorithm_vDPOS
42,41.99995,42.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
808,0.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159000000.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DASH,9031294.0,22000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XMR,17201140.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETC,113359700.0,210000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Standardize the data with StandardScaler().
# YOUR CODE HERE
crypto_scaled = StandardScaler().fit_transform(df_newcrypto)
print(crypto_scaled[0:5])

[[-0.10047713 -0.03668886 -0.03668644 -0.03668644 -0.03668644 -0.03668644
  -0.05191741 -0.09016696 -0.03668644 -0.05191741 -0.05191741 -0.03668644
  -0.03668644 -0.18257419 -0.05191741 -0.03668644 -0.03668644 -0.08225509
  -0.03668644 -0.0974575  -0.06362848 -0.03668644 -0.03668644 -0.16620562
  -0.03668644 -0.03668644 -0.13848495 -0.03668644 -0.03668644 -0.07352146
  -0.05191741 -0.03668644 -0.03668644 -0.03668644 -0.06362848 -0.03668644
  -0.07352146 -0.09016696 -0.09016696 -0.03668644 -0.03668644 -0.12250233
  -0.12803688 -0.15291752 -0.03668644 -0.08225509 -0.03668644 -0.03668644
  -0.06362848 -0.16188544 -0.03668644 -0.03668644 -0.03668644 -0.07352146
  -0.1786061  -0.32732684 -0.03668644 -0.0974575  -0.08225509 -0.05191741
  -0.03668644  1.42714214 -0.06362848 -0.03668644 -0.03668644 -0.03668644
  -0.08225509 -0.06362848 -0.03668644 -0.03668644 -0.03668644 -0.03668644
  -0.05191741 -0.03668644 -0.03668644 -0.40984739 -0.03668644 -0.18257419
  -0.03668644 -0.10425721 -0.07352146 

### Deliverable 2: Reducing Data Dimensions Using PCA

In [62]:
# Initialize PCA model
pca = PCA(n_components=3)

In [110]:
# Using PCA to reduce dimension to three principal components.
# YOUR CODE HERE
# Get two principal components for the crypto data.
crypto_pca = pca.fit_transform(crypto_scaled)
print(crypto_pca[0:5])

[[-0.07928264  1.55104577 -0.11595692]
 [-0.05999205  1.55100846 -0.11602922]
 [-0.11579867 -0.87861123  1.54761304]
 [ 0.5121656  -0.33930121  0.38265535]
 [-0.11547094 -0.87861186  1.54761181]]


In [114]:
# Create a DataFrame with the three principal components.
# YOUR CODE HERE
df_crypto_pca = pd.DataFrame(data=crypto_pca, columns=["PC 1", "PC 2", "PC 3"])
df_crypto_pca.dropna
df_crypto_pca.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.079283,1.551046,-0.115957
1,-0.059992,1.551008,-0.116029
2,-0.115799,-0.878611,1.547613
3,0.512166,-0.339301,0.382655
4,-0.115471,-0.878612,1.547612
5,-0.090197,-0.370468,-0.066334
6,-0.07813,1.551044,-0.115961
7,-0.168694,-1.176737,-1.461854
8,-0.114463,-1.049776,-0.533821
9,-0.090093,-0.370469,-0.066334


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [115]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE
# Store Values of K to Plot
inertia = []
k = list(range(1, 11))

In [116]:
# Looking for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_crypto_pca)
    inertia.append(km.inertia_)



In [117]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [118]:
def get_clusters(k, data):
    # Create a copy of the DataFrame
    data = data.copy()

    # Initialize the K-Means model
    # YOUR CODE HERE
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    # YOUR CODE HERE
    model.fit(data)

    # Predict clusters
    # YOUR CODE HERE
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["Class"] = model.labels_

    return data


In [159]:

five_clusters =  get_clusters(5, df_crypto_pca)
five_clusters.head()

Unnamed: 0,PC 1,PC 2,PC 3,Class
0,-0.079283,1.551046,-0.115957,2
1,-0.059992,1.551008,-0.116029,2
2,-0.115799,-0.878611,1.547613,3
3,0.512166,-0.339301,0.382655,0
4,-0.115471,-0.878612,1.547612,3


In [16]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
# YOUR CODE HERE

clustered_df = pd.concat([df_crypto, df_crypto_pca], axis=1)

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
# YOUR CODE HERE
clustered_df = pd.concat([df_crypto, df_crypto_pca, df_coinname], axis=1)

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# YOUR CODE HERE

clustered_df = pd.concat([df_crypto, five_clusters, df_coinname], axis=1)
clustered_df.dropna

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.332855,1.038358,-0.564944,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.316201,1.038515,-0.565371,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.30004,1.643532,-0.570651,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.149023,-1.309646,0.18262,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.162646,-2.019908,0.380155,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000,-0.159391,-1.123165,-0.021041,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000,-0.410793,1.224033,-0.517184,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.148242,-2.196597,0.375973,Monero,1
ETC,Ethash,PoW,113359700.0,210000000,-0.161087,-2.02001,0.380143,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000,-0.179011,-2.0247,0.433256,ZCash,1


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [187]:
# Creating a 3D-Scatter with the PCA data and the clusters
# YOUR CODE HERE
six_clusters =  get_clusters(6, df_crypto_pca)
six_clusters.dropna


# Plotting the 2D-Scatter with x="PC 1" and y="PC2"
fig = px.scatter_3d(
    six_clusters,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [185]:
# Create a table with tradable cryptocurrencies.
# YOUR CODE HERE
df_tradable_crypto = pd.DataFrame(data=df_crypto, columns=["TotalCoinSupply","TotalCoinsMined"])
#df_tradable_crypto= pd.concat([df_tradable_crypto, df_coinname], axis=1)



In [186]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE
df_tradable_crypto.head()


Unnamed: 0,TotalCoinSupply,TotalCoinsMined
42,42.0,41.99995
404,532000000.0,1055185000.0
808,0.0,0.0
1337,314159000000.0,29279420000.0
BTC,21000000.0,17927180.0


In [20]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE
# Standardize data with StandardScaler
tradable_scaled = StandardScaler().fit_transform(df_tradable_crypto)
print(tradable_scaled)

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [199]:
df_scales_cluster = pd.concat([df_tradable_crypto, df_coinname])

In [200]:
df_scales_cluster.head()

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName
42,42.0,41.99995,
404,532000000.0,1055185000.0,
808,0.0,0.0,
1337,314159000000.0,29279420000.0,
BTC,21000000.0,17927180.0,


In [208]:
# Initialize PCA model
pca = PCA(n_components=2)


# Get two principal components for the coin data.
coin_pca = pca.fit_transform(tradable_scaled)


# Transform PCA data to a DataFrame
df_coin_pca = pd.DataFrame(data=coin_pca, columns=["TotalCoinSupply", "TotalCoinsMined"])
df_coin_pca.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined
0,0.045105,-0.096991
1,0.025799,-0.077684
2,0.045105,-0.096991
3,-0.490547,0.438792
4,0.044777,-0.096663
5,0.043135,-0.095021
6,0.043952,-0.095838
7,0.04494,-0.096826
8,0.04479,-0.096676
9,0.043031,-0.094917


In [21]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE
# Initialize PCA model
pca = PCA(n_components=2)
# Get two principal components for the coin data.
coin_pca = pca.fit_transform(tradable_scaled)
# Transform PCA data to a DataFrame
plot_df = pd.DataFrame(data=coin_pca, columns=["TotalCoinSupply", "TotalCoinsMined"])

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE
plot_df = pd.concat([plot-df, df_coinname])


# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE
def get_clusters(k, data):
    # Create a copy of the DataFrame
    data = data.copy()

    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_

    return data

plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,1
ETH,0.0,0.000109,Ethereum,1
LTC,8.4e-05,6.4e-05,Litecoin,1
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,1
ETC,0.00021,0.000115,Ethereum Classic,1
ZEC,2.1e-05,7e-06,ZCash,1


In [210]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
coin_clusters =  get_clusters(6, plot_df)
coin_clusters.dropna

# Plotting the 2D-Scatter with x="Annual Income" and y="Spending Score (1-100)"
coin_clusters.hvplot.scatter(x="TotalCoinSupply", y="TotalCoinsMined")