# Clustering Crypto

In [72]:
# Initial imports
import pandas as pd
import numpy as np
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [73]:
# Load the crypto_data.csv dataset.
crypto_df=pd.read_csv('data/crypto_data.csv')
crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [74]:
# Renamed "Unnamed: 0" column and then set it as the index
crypto_df = crypto_df.rename(columns={'Unnamed: 0': ''})
crypto_df = crypto_df.set_index('')
# crypto_df.sample(10)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,,
42.0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365.0,365Coin,X11,True,PoW/PoS,,2300000000.0
404.0,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611.0,SixEleven,SHA-256,True,PoW,,611000.0
808.0,808,SHA-256,True,PoW/PoS,0.0,0.0


In [75]:
# Keep all the cryptocurrencies that are being traded.
crypto_df = crypto_df[crypto_df['IsTrading'] == True]
crypto_df = crypto_df.reset_index(drop=True)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0
5,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015 coin,X11,True,PoW/PoS,,0
7,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,Ethereum,Ethash,True,PoW,107684200.0,0
9,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [76]:
# Remove the "IsTrading" column. 
crypto_df = crypto_df.drop("IsTrading", axis=1)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.0,0
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
6,2015 coin,X11,PoW/PoS,,0
7,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,Ethereum,Ethash,PoW,107684200.0,0
9,Litecoin,Scrypt,PoW,63039240.0,84000000


In [77]:
# Remove rows that have at least 1 null value.
crypto_df = crypto_df.dropna()
crypto_df = crypto_df.reset_index(drop=True)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
2,808,SHA-256,PoW/PoS,0.0,0
3,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
4,Bitcoin,SHA-256,PoW,17927180.0,21000000
5,Ethereum,Ethash,PoW,107684200.0,0
6,Litecoin,Scrypt,PoW,63039240.0,84000000
7,Dash,X11,PoW/PoS,9031294.0,22000000
8,Monero,CryptoNight-V7,PoW,17201140.0,0
9,Ethereum Classic,Ethash,PoW,113359700.0,210000000


In [78]:
# Keep the rows where coins are mined.
crypto_df = crypto_df[crypto_df['TotalCoinsMined'] > 0] 
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
4,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
5,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
680,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
681,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
682,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
683,Horizen,Equihash,PoW,7.296538e+06,21000000


In [79]:
# Create a new DataFrame that holds only the cryptocurrencies names.
crypto_names_df = pd.DataFrame(crypto_df['CoinName'])
crypto_names_df.set_index(crypto_df.index, inplace=True)
crypto_names_df.head()

Unnamed: 0,CoinName
0,42 Coin
1,404Coin
3,EliteCoin
4,Bitcoin
5,Ethereum


In [80]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
crypto_df = crypto_df.drop("CoinName", axis=1)
crypto_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
1,Scrypt,PoW/PoS,1055185000.0,532000000
3,X13,PoW/PoS,29279420000.0,314159265359
4,SHA-256,PoW,17927180.0,21000000
5,Ethash,PoW,107684200.0,0
6,Scrypt,PoW,63039240.0,84000000
7,X11,PoW/PoS,9031294.0,22000000
8,CryptoNight-V7,PoW,17201140.0,0
9,Ethash,PoW,113359700.0,210000000
10,Equihash,PoW,7383056.0,21000000


In [81]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(crypto_df[['Algorithm', 'ProofType']])
X.head()

Unnamed: 0,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# Standardize the data with StandardScaler().
X = StandardScaler().fit_transform(X)
X

array([[-0.0433963, -0.0433963, -0.0433963, ..., -0.0433963, -0.0433963,
        -0.0433963],
       [-0.0433963, -0.0433963, -0.0433963, ..., -0.0433963, -0.0433963,
        -0.0433963],
       [-0.0433963, -0.0433963, -0.0433963, ..., -0.0433963, -0.0433963,
        -0.0433963],
       ...,
       [-0.0433963, -0.0433963, -0.0433963, ..., -0.0433963, -0.0433963,
        -0.0433963],
       [-0.0433963, -0.0433963, -0.0433963, ..., -0.0433963, -0.0433963,
        -0.0433963],
       [-0.0433963, -0.0433963, -0.0433963, ..., -0.0433963, -0.0433963,
        -0.0433963]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [83]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
principal_components = pca.fit_transform(X)

In [84]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2', 'PC3'], index=crypto_df.index)
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3
0,1.004832,-0.640286,-0.009561
1,1.004832,-0.640286,-0.009561
3,1.81191,-0.665648,-0.055728
4,-1.304202,0.220424,-0.006754
5,-2.027426,0.40532,-0.004945


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [85]:
# Create an elbow curve to find the best value for K.
inertias = []

for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(pcs_df)
    inertias.append(kmeans.inertia_)

# Create an elbow curve using hvplot
elbow_df = pd.DataFrame({'K': range(1, 11), 'Inertia': inertias})
elbow_curve = elbow_df.hvplot.line(x='K', y='Inertia', 
                                    title='Elbow Curve for K-means Clustering', 
                                    xlabel='Number of Clusters (K)', 
                                    ylabel='Inertia', 
                                    xticks=range(1, 11), 
                                    yticks=[int(x) for x in np.linspace(0, max(inertias), num=10)]
                                ).opts(width=600, height=400)
elbow_curve


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



Running K-Means with `k=4`

In [86]:
# Initialize the K-Means model.
k = 4
model = KMeans(n_clusters=k, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
cluster_preds = model.predict(pcs_df)
cluster_preds

array([0, 0, 0, 3, 3, 3, 0, 3, 3, 3, 0, 3, 0, 0, 3, 0, 3, 3, 0, 0, 3, 3,
       3, 3, 3, 0, 3, 3, 3, 0, 3, 0, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0,
       3, 3, 3, 3, 3, 0, 0, 3, 0, 3, 3, 3, 3, 0, 3, 3, 0, 3, 0, 0, 0, 3,
       3, 3, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 3, 0, 3, 0, 0, 3, 3, 3, 3, 0,
       0, 3, 0, 3, 3, 0, 0, 3, 0, 0, 3, 3, 0, 0, 3, 0, 0, 3, 0, 3, 0, 3,
       0, 3, 0, 0, 3, 3, 0, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 0,
       3, 0, 3, 3, 0, 3, 0, 3, 0, 0, 3, 3, 0, 3, 3, 0, 0, 3, 0, 3, 0, 0,
       0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 3, 0, 3, 0, 0, 3, 0, 3, 0, 0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0,
       3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 3, 0, 3, 0,
       0, 3, 0, 3, 3, 0, 3, 3, 0, 3, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0,
       0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 0, 3, 0, 3, 3, 3, 3, 0, 3, 0, 0, 3,
       0, 3, 3, 3, 0, 3, 0, 3, 3, 3, 0, 3, 0, 3, 0,

In [87]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = crypto_df.copy()

clustered_df['PC1'] = pcs_df['PC1']
clustered_df['PC2'] = pcs_df['PC2']
clustered_df['PC3'] = pcs_df['PC3']

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df['CoinName'] = crypto_names_df['CoinName']


#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df['Class'] = cluster_preds

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
0,Scrypt,PoW/PoS,41.99995,42,1.004832,-0.640286,-0.009561,42 Coin,0
1,Scrypt,PoW/PoS,1055185000.0,532000000,1.004832,-0.640286,-0.009561,404Coin,0
3,X13,PoW/PoS,29279420000.0,314159265359,1.81191,-0.665648,-0.055728,EliteCoin,0
4,SHA-256,PoW,17927180.0,21000000,-1.304202,0.220424,-0.006754,Bitcoin,3
5,Ethash,PoW,107684200.0,0,-2.027426,0.40532,-0.004945,Ethereum,3
6,Scrypt,PoW,63039240.0,84000000,-1.152794,-0.041043,0.012211,Litecoin,3
7,X11,PoW/PoS,9031294.0,22000000,1.243728,-0.54142,-0.007535,Dash,0
8,CryptoNight-V7,PoW,17201140.0,0,-2.216632,0.475658,-0.003589,Monero,3
9,Ethash,PoW,113359700.0,210000000,-2.027426,0.40532,-0.004945,Ethereum Classic,3
10,Equihash,PoW,7383056.0,21000000,-2.072394,0.455438,0.027159,ZCash,3


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [88]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(clustered_df, x='PC1', y='PC2', z='PC3', color='Class', 
                   hover_name='CoinName', hover_data=['Algorithm'])
fig.show()


In [89]:
# Create a table with tradable cryptocurrencies.
tradable_df = clustered_df[clustered_df['TotalCoinSupply'] != 0]
table = tradable_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'Class'],
                                 width=800, height=400)
table


In [90]:
# Print the total number of tradable cryptocurrencies.
total_tradable_cryptocurrencies = tradable_df.shape[0]
print("Total number of tradable cryptocurrencies: ", total_tradable_cryptocurrencies)

Total number of tradable cryptocurrencies:  532


In [91]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
clustered_df[['TotalCoinSupply', 'TotalCoinsMined']] = MinMaxScaler().fit_transform(clustered_df[['TotalCoinSupply', 'TotalCoinsMined']])
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
0,Scrypt,PoW/PoS,0.0,4.2e-11,1.004832,-0.640286,-0.009561,42 Coin,0
1,Scrypt,PoW/PoS,0.001066,0.000532,1.004832,-0.640286,-0.009561,404Coin,0
3,X13,PoW/PoS,0.029576,0.3141593,1.81191,-0.665648,-0.055728,EliteCoin,0
4,SHA-256,PoW,1.8e-05,2.1e-05,-1.304202,0.220424,-0.006754,Bitcoin,3
5,Ethash,PoW,0.000109,0.0,-2.027426,0.40532,-0.004945,Ethereum,3


In [92]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
plot_df = clustered_df[['TotalCoinSupply', 'TotalCoinsMined', 'CoinName', 'Class']].copy()

plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
0,4.2e-11,0.0,42 Coin,0
1,0.000532,0.001066,404Coin,0
3,0.3141593,0.029576,EliteCoin,0
4,2.1e-05,1.8e-05,Bitcoin,3
5,0.0,0.000109,Ethereum,3
6,8.4e-05,6.4e-05,Litecoin,3
7,2.2e-05,9e-06,Dash,0
8,0.0,1.7e-05,Monero,3
9,0.00021,0.000115,Ethereum Classic,3
10,2.1e-05,7e-06,ZCash,3


In [93]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
scatter_plot = plot_df.hvplot.scatter(x='TotalCoinsMined', y='TotalCoinSupply', by='Class', hover_cols=['CoinName'])
scatter_plot