# Clustering Crypto

In [91]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

### Deliverable 1: Preprocessing the Data for PCA

In [92]:
# Load the crypto_data.csv dataset.
df = pd.read_csv("crypto_data.csv", index_col=None)
df.head(100)

Unnamed: 0,Name,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
95,FRK,Franko,Scrypt,True,PoW,1.142732e+06,11235813
96,FRAC,FractalCoin,X11,True,PoW,,1000000
97,FSTC,FastCoin,Scrypt,False,PoW,,165888000
98,FTC,FeatherCoin,NeoScrypt,True,PoW,2.086691e+08,336000000


In [93]:
df= df.set_index('Name')
df

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [94]:
#keep the is trading true values

df = df[df['IsTrading']]
df

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [95]:
# Keep all the cryptocurrencies that have a working algorithm.
df.Algorithm.unique()


array(['Scrypt', 'X11', 'SHA-256', 'X13', 'Ethash', 'CryptoNight-V7',
       'Equihash', 'SHA-512', 'Multiple', 'X15', 'NIST5', 'Quark',
       'Groestl', 'PoS', 'NeoScrypt', 'SHA3', 'HybridScryptHash256',
       'Scrypt-n', 'PHI1612', 'Lyra2REv2', 'CryptoNight', 'Shabal256',
       'Counterparty', 'Blake', 'Momentum', 'Stanford Folding', 'QuBit',
       'XG Hash', 'M7 POW', 'Curve25519', 'Lyra2RE', 'QUAIT', 'vDPOS',
       'Blake2b', 'BLAKE256', '1GB AES Pattern Search', 'Dagger',
       'CryptoNight-Lite', 'X11GOST', 'SHA-256D', 'POS 3.0',
       'Progressive-n', 'DPoS', 'Lyra2Z', 'X14', 'Time Travel', 'Argon2',
       'Keccak', 'Blake2S', 'Dagger-Hashimoto', '536', 'Argon2d',
       'Cloverhash', 'Skein', 'SkunkHash v2 Raptor',
       'VeChainThor Authority', 'Ouroboros', 'POS 2.0', 'SkunkHash',
       'C11', 'Proof-of-BibleHash', 'SHA-256 + Hive',
       'Proof-of-Authority', 'XEVAN', 'VBFT', 'YescryptR16', 'IMesh',
       'Green Protocol', 'Semux BFT consensus', 'X16R', 'Tribus',


In [96]:
# Remove the "IsTrading" column. 
df = df.drop(columns = ["IsTrading"])
df

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [119]:
# Remove rows that have at least 1 null value.
df = df.dropna()
df

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,3.14159E+11
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [120]:
# Keep the rows where coins are mined.
df = df[df["TotalCoinsMined"] > 0]
df

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,3.14159E+11
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [99]:
df_CoinName =df["CoinName"]
df_CoinName.head()

Name
42        42 Coin
404       404Coin
1337    EliteCoin
BTC       Bitcoin
ETH      Ethereum
Name: CoinName, dtype: object

In [100]:
# Create a new DataFrame that holds only the cryptocurrencies names.
CryptoNames = df.index
CryptoNames

Index(['42', '404', '1337', 'BTC', 'ETH', 'LTC', 'DASH', 'XMR', 'ETC', 'ZEC',
       ...
       'TCH', 'WAVES', 'PART', 'BTT', 'NXT', 'ZEPH', 'GAP', 'BDX', 'ZEN',
       'XBC'],
      dtype='object', name='Name', length=532)

In [122]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
df = df.drop(columns = ["CoinName"])
df.head()


Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159000000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


In [102]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(df, columns=["Algorithm", "ProofType"])

X.head()

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:2])


[[-0.11710816 -0.15287029 -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

### Deliverable 2: Reducing Data Dimensions Using PCA

In [104]:
# Using PCA to reduce dimension to three principal components.
#initialize a PCA model 
pca = PCA(n_components=3)

#get 3 principal components 
X_pca = pca.fit_transform(X_scaled)

In [105]:
# Create a DataFrame with the three principal components.
#transform PCA data to DF
df_X_pca = pd.DataFrame(data= X_pca, columns = 
                           ["PC1", "PC2", "PC3"], index = df.index)

df_X_pca

Unnamed: 0_level_0,PC1,PC2,PC3
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.340240,0.984728,-0.573656
404,-0.323582,0.984806,-0.574071
1337,2.294462,1.573018,-0.723921
BTC,-0.139392,-1.365181,0.158454
ETH,-0.143906,-2.038238,0.433191
...,...,...,...
ZEPH,2.478768,0.790086,-0.100865
GAP,-0.338286,0.984608,-0.573684
BDX,0.327426,-2.298078,0.407649
ZEN,-0.145149,-2.022417,0.398572


In [106]:
pca.explained_variance_ratio_

array([0.0279313 , 0.02138738, 0.02050814])

### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [107]:
#plot elbow curve, calc inertia values
inertia = []
k = list(range(1, 9))

# Calculate the inertia for the range of K values
for i in k:
   km = KMeans(n_clusters=i, random_state=0)
   km.fit(df_X_pca)
   inertia.append(km.inertia_)

#create plot

elbow_data = {"k": k, "inertia": inertia}

df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y = "inertia", xticks = k, title = "elbow curve")

#plt.plot(elbow_data["k"], elbow_data["inertia"])



Running K-Means with `k=4`

In [108]:
# Initialize the K-Means model.
# Initialize the K-means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(df_X_pca)

# Predict clusters
predictions = model.predict(df_X_pca)

# Add the predicted class columns
df_X_pca["class"] = model.labels_
df_X_pca



Unnamed: 0_level_0,PC1,PC2,PC3,class
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,-0.340240,0.984728,-0.573656,0
404,-0.323582,0.984806,-0.574071,0
1337,2.294462,1.573018,-0.723921,0
BTC,-0.139392,-1.365181,0.158454,1
ETH,-0.143906,-2.038238,0.433191,1
...,...,...,...,...
ZEPH,2.478768,0.790086,-0.100865,0
GAP,-0.338286,0.984608,-0.573684,0
BDX,0.327426,-2.298078,0.407649,1
ZEN,-0.145149,-2.022417,0.398572,1


In [109]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
frames = [df_X_pca, df]

clustered_df = pd.concat(frames, axis=1, join='inner')
clustered_df

frames2=[clustered_df,df_CoinName]
clustered_df2=pd.concat(frames2,axis=1, join='inner')



# Print the shape of the clustered_df
print(clustered_df2.shape)

(532, 9)


In [110]:
clustered_df2

Unnamed: 0_level_0,PC1,PC2,PC3,class,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,CoinName
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,-0.340240,0.984728,-0.573656,0,Scrypt,PoW/PoS,4.199995e+01,42,42 Coin
404,-0.323582,0.984806,-0.574071,0,Scrypt,PoW/PoS,1.055185e+09,532000000,404Coin
1337,2.294462,1.573018,-0.723921,0,X13,PoW/PoS,2.927942e+10,3.14159E+11,EliteCoin
BTC,-0.139392,-1.365181,0.158454,1,SHA-256,PoW,1.792718e+07,21000000,Bitcoin
ETH,-0.143906,-2.038238,0.433191,1,Ethash,PoW,1.076842e+08,0,Ethereum
...,...,...,...,...,...,...,...,...,...
ZEPH,2.478768,0.790086,-0.100865,0,SHA-256,DPoS,2.000000e+09,2000000000,ZEPHYR
GAP,-0.338286,0.984608,-0.573684,0,Scrypt,PoW/PoS,1.493105e+07,250000000,Gapcoin
BDX,0.327426,-2.298078,0.407649,1,CryptoNight,PoW,9.802226e+08,1400222610,Beldex
ZEN,-0.145149,-2.022417,0.398572,1,Equihash,PoW,7.296538e+06,21000000,Horizen


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [111]:
# Creating a 3D-Scatter with the PCA data and the clusters
#
fig = px.scatter_3d(
    clustered_df2, 
     x = "PC1",
     y = "PC2", 
     z = "PC3", 
     color="class", 
     symbol = "class", 
     width=800,
     hover_name="CoinName",
     hover_data = ["Algorithm"])

fig.update_layout(legend=dict(x = 0, y =1))
fig.show()

In [113]:
# Create a table with tradable cryptocurrencies.
clustered_df2.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 
                        'TotalCoinsMined', 'class'], sortable=True, selectable=True)



In [114]:
# Print the total number of tradable cryptocurrencies.
number = len(clustered_df2["CoinName"].unique())
print(f"The number of tradable crypto currencies is {number}")

The number of tradable crypto currencies is 531


In [115]:
scaler1 = MinMaxScaler(feature_range=(-1,1))
print(scaler.fit(clustered_df2[["TotalCoinsMined", "TotalCoinSupply"]]))
MinMaxScaler()
scaled_data = scaler.transform(data)
scaled_data
scaled1_df = pd.DataFrame({'Column1': scaled_data[:, 0], 
                           'Column2': scaled_data[:, 1]}, 
                            index = clustered_df2.index )

scaled1_df.columns = ["TotalCoinsMined", "TotalCoinSupply"]
scaled1_df.head()


MinMaxScaler(feature_range=(-1, 1))


Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
42,-1.0,-1.0
404,-0.997868,-0.998936
1337,-0.940849,-0.371682
BTC,-0.999964,-0.999958
ETH,-0.999782,-1.0


In [116]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
df_temp = clustered_df2[["CoinName", "class", "Algorithm"]]
frames = [scaled1_df, df_temp]

result2 = pd.concat(frames, axis = 1, join = 'inner')
result2


Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,CoinName,class,Algorithm
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,-1.000000,-1.000000,42 Coin,0,Scrypt
404,-0.997868,-0.998936,404Coin,0,Scrypt
1337,-0.940849,-0.371682,EliteCoin,0,X13
BTC,-0.999964,-0.999958,Bitcoin,1,SHA-256
ETH,-0.999782,-1.000000,Ethereum,1,Ethash
...,...,...,...,...,...
ZEPH,-0.995960,-0.996000,ZEPHYR,0,SHA-256
GAP,-0.999970,-0.999500,Gapcoin,0,Scrypt
BDX,-0.998020,-0.997200,Beldex,1,CryptoNight
ZEN,-0.999985,-0.999958,Horizen,1,Equihash


In [117]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
result2.hvplot.scatter(x='TotalCoinsMined', y='TotalCoinSupply', by='class', 
                  legend='top', height=400, width=400, hover_cols = ["CoinName", "Algorithm"])
