# Clustering Crypto

In [460]:
# Initial imports
import os
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [461]:
# Load the crypto_data.csv dataset.
file = Path=('Resources/crypto_data.csv')

crypto_df = pd.read_csv(file, index_col=0)

crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [462]:
# Look at the column data types
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [463]:
# Find null values
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values.")

Column CoinName has 0 null values.
Column Algorithm has 0 null values.
Column IsTrading has 0 null values.
Column ProofType has 0 null values.
Column TotalCoinsMined has 508 null values.
Column TotalCoinSupply has 0 null values.


In [464]:
# Keep all the cryptocurrencies that are being traded.
crypto_df = crypto_df[crypto_df["IsTrading"] == True]

crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [465]:
# Keep all the cryptocurrencies that have a working algorithm

# Printed each unique algorithm to get a list of the coin names
print(sorted((crypto_df["Algorithm"].unique())))

# Used pandas .notna to find all cryptocurrencies that have a working algorithm
pd.notna(crypto_df["Algorithm"])

['1GB AES Pattern Search', '536', 'Argon2', 'Argon2d', 'BLAKE256', 'Blake', 'Blake2S', 'Blake2b', 'C11', 'Cloverhash', 'Counterparty', 'CryptoNight', 'CryptoNight Heavy', 'CryptoNight Heavy X', 'CryptoNight-Lite', 'CryptoNight-V7', 'CryptoNight-lite', 'Cryptonight-GPU', 'Curve25519', 'DPoS', 'Dagger', 'Dagger-Hashimoto', 'ECC 256K1', 'Equihash', 'Equihash+Scrypt', 'Equihash1927', 'Ethash', 'Exosis', 'Green Protocol', 'Groestl', 'HMQ1725', 'HybridScryptHash256', 'IMesh', 'Jump Consistent Hash', 'Keccak', 'Leased POS', 'Lyra2RE', 'Lyra2REv2', 'Lyra2Z', 'M7 POW', 'Momentum', 'Multiple', 'NIST5', 'NeoScrypt', 'Ouroboros', 'PHI1612', 'POS 2.0', 'POS 3.0', 'PoS', 'Progressive-n', 'Proof-of-Authority', 'Proof-of-BibleHash', 'QUAIT', 'QuBit', 'Quark', 'QuarkTX', 'Rainforest', 'SHA-256', 'SHA-256 + Hive', 'SHA-256D', 'SHA-512', 'SHA3', 'SHA3-256', 'Scrypt', 'Scrypt-n', 'Semux BFT consensus', 'Shabal256', 'Skein', 'SkunkHash', 'SkunkHash v2 Raptor', 'Slatechain', 'Stanford Folding', 'T-Inside', 

42      True
365     True
404     True
611     True
808     True
        ... 
SERO    True
UOS     True
BDX     True
ZEN     True
XBC     True
Name: Algorithm, Length: 1144, dtype: bool

In [466]:
# Remove the "IsTrading" column. 
crypto_df.drop(columns=["IsTrading"], inplace=True)

crypto_df



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [467]:
# Find null values

for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values.")

Column CoinName has 0 null values.
Column Algorithm has 0 null values.
Column ProofType has 0 null values.
Column TotalCoinsMined has 459 null values.
Column TotalCoinSupply has 0 null values.


In [468]:
# Remove rows that have at least 1 null value.
crypto_drop_df = crypto_df.dropna()

# Check again for null values

for column in crypto_drop_df.columns:
    print(f"Column {column} has {crypto_drop_df[column].isnull().sum()} null values.")

crypto_drop_df

Column CoinName has 0 null values.
Column Algorithm has 0 null values.
Column ProofType has 0 null values.
Column TotalCoinsMined has 0 null values.
Column TotalCoinSupply has 0 null values.


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [469]:
# Keep the rows where coins are mined.
crypto_df = crypto_drop_df[crypto_drop_df["TotalCoinsMined"] > 0]

crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [470]:
# Create a new DataFrame that holds only the cryptocurrencies names.
coinName_df = crypto_df[["CoinName"]]

coinName_df

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [471]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
noCoin_crypto_df = crypto_df.copy()

noCoin_crypto_df.drop(columns=["CoinName"], axis=1, inplace=True)

noCoin_crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [472]:
# Use get_dummies() to create variables for text features.
X_df = pd.get_dummies(noCoin_crypto_df, columns=["Algorithm", "ProofType"])

X_df.head(10)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DASH,9031294.0,22000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XMR,17201140.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETC,113359700.0,210000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEC,7383056.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [473]:
# Standardize the data with StandardScaler().
crypto_scaled = StandardScaler().fit_transform(X_df)

print(crypto_scaled)

[[-0.11710817 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.145009   -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561  4.48942416 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 ...
 [-0.09561336 -0.13217937 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11694817 -0.15255998 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11710536 -0.15285552 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


### Deliverable 2: Reducing Data Dimensions Using PCA

In [474]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)

pca_crypto = pca.fit_transform(crypto_scaled)

pca_crypto

array([[-0.33520526,  1.01694793, -0.55156874],
       [-0.31855488,  1.01711744, -0.55212017],
       [ 2.29986816,  1.69741162, -0.56589703],
       ...,
       [ 0.32718944, -2.29719141,  0.37741337],
       [-0.15075612, -2.06898758,  0.41785969],
       [-0.29258036,  0.7735847 , -0.39608028]])

In [475]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(
    data = pca_crypto, columns = ["PC 1", "PC 2", "PC 3"],
    index = crypto_df.index
)
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.335205,1.016948,-0.551569
404,-0.318555,1.017117,-0.552120
1337,2.299868,1.697412,-0.565897
BTC,-0.142103,-1.260907,0.198560
ETH,-0.156826,-2.103223,0.312866
...,...,...,...
ZEPH,2.483088,0.859200,-0.152460
GAP,-0.333252,1.016833,-0.551608
BDX,0.327189,-2.297191,0.377413
ZEN,-0.150756,-2.068988,0.417860


In [476]:
# Checking how much information can be attributed to each principal component
# pc1 = .03%, pc2 = .02%, pc3 = .02% for a total of .07% of the information
pca.explained_variance_ratio_

array([0.02793146, 0.02140286, 0.02047911])

### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [477]:
NUM_THREADS = '3'
os.environ['OMP_NUM_THREADS'] = NUM_THREADS

# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Calculating inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state = 42)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Creating the actual elbow curve
elbow_data = {'k': k, 'inertia': inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x = 'k', y = 'inertia', xticks = k, title = "Elbow Curve")


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



Running K-Means with `k=4`

In [478]:
# Initialize the K-Means model.
km_model = KMeans(n_clusters = 4, random_state = 42)

# Fit the model
km_model.fit(pcs_df)

# Predict clusters
predictions = km_model.predict(pcs_df)

predictions

array([1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,

In [479]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([crypto_df, pcs_df], axis = 1)

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df["CoinName"] = coinName_df

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df["Class"] = km_model.labels_

# Giving the crytocurrencies abbreviations a column name of "Coin Abbr."
clustered_df.index.rename('Coin Abbr.', inplace = True)


# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,Class
Coin Abbr.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,42 Coin,Scrypt,PoW/PoS,41.99995,42,-0.335205,1.016948,-0.551569,1
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000,-0.318555,1.017117,-0.55212,1
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359,2.299868,1.697412,-0.565897,1
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000,-0.142103,-1.260907,0.19856,0
ETH,Ethereum,Ethash,PoW,107684200.0,0,-0.156826,-2.103223,0.312866,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000,-0.165661,-1.158274,-0.079032,0
DASH,Dash,X11,PoW/PoS,9031294.0,22000000,-0.39512,1.297201,-0.323142,1
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0,-0.151655,-2.166194,0.413218,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000,-0.155269,-2.103322,0.312836,0
ZEC,ZCash,Equihash,PoW,7383056.0,21000000,-0.150755,-2.068988,0.41786,0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [480]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,    
    x = "PC 1",
    y = "PC 2",
    z = "PC 3",
    hover_name = "CoinName",
    hover_data = ["Algorithm"],
    color = "Class",
    symbol = "Class",
    width = 800
)

fig.update_layout(legend = dict(x =0, y = 1))
fig.show()

In [481]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns=["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "Class"])

In [482]:
# Print the total number of tradable cryptocurrencies.
print(f":. There are {clustered_df.CoinName.size} cryptocurrencies that are tradable .:")

:. There are 532 cryptocurrencies that are tradable .:


In [483]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
crypto_scaler = MinMaxScaler().fit_transform(clustered_df[["TotalCoinSupply", "TotalCoinsMined"]])

print(crypto_scaler)

[[4.20000000e-11 0.00000000e+00]
 [5.32000000e-04 1.06585544e-03]
 [3.14159265e-01 2.95755135e-02]
 ...
 [1.40022261e-03 9.90135079e-04]
 [2.10000000e-05 7.37028150e-06]
 [1.00000000e-06 1.29582282e-07]]


In [484]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
combined_df = pd.DataFrame(
    data = crypto_scaler, columns = ["TotalCoinSupply", "TotalCoinsMined"],
    index = clustered_df.index
)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
combined_df = pd.concat([combined_df, clustered_df["CoinName"]], axis = 1) 

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# Already done above? Check rubric.

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df = pd.concat([combined_df, clustered_df["Class"]], axis = 1)


plot_df.head(10)

Unnamed: 0_level_0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
Coin Abbr.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,4.2e-11,0.0,42 Coin,1
404,0.000532,0.001066,404Coin,1
1337,0.3141593,0.029576,EliteCoin,1
BTC,2.1e-05,1.8e-05,Bitcoin,0
ETH,0.0,0.000109,Ethereum,0
LTC,8.4e-05,6.4e-05,Litecoin,0
DASH,2.2e-05,9e-06,Dash,1
XMR,0.0,1.7e-05,Monero,0
ETC,0.00021,0.000115,Ethereum Classic,0
ZEC,2.1e-05,7e-06,ZCash,0


In [485]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(
                    x = "TotalCoinsMined", 
                    y = "TotalCoinSupply",
                    by = "Class"         
)