# Clustering Crypto

In [160]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [161]:
# Load the crypto_data.csv dataset.
file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path,index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [162]:
# Keep all the cryptocurrencies that are being traded
crypto_df = crypto_df[crypto_df.IsTrading != False]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [163]:
# Drop the IsTrading column
crypto_df = crypto_df.drop(['IsTrading'], axis=1)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [164]:
# Remove rows that have at least one null value
crypto_df = crypto_df.dropna()
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000


In [165]:
# Filter the crypto_df DataFrame so it only has rows where coins have been mined.
crypto_df = crypto_df[crypto_df.TotalCoinsMined != 0]
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,ZCash,Equihash,PoW,7383056.0,21000000


In [166]:
# Create a new DataFrame that holds only the cryptocurrency names, and use the crypto_df DataFrame index
# as the index for this new DataFrame
CoinName_df = pd.DataFrame(crypto_df[['CoinName']])
CoinName_df.head(10)

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
LTC,Litecoin
DASH,Dash
XMR,Monero
ETC,Ethereum Classic
ZEC,ZCash


In [167]:
# Remove the CoinName column from the crypto_df DataFrame
# since it's not going to be used on the clustering algorithm.
crypto_df = crypto_df.drop(['CoinName'], axis=1)
crypto_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0
LTC,Scrypt,PoW,63039240.0,84000000
DASH,X11,PoW/PoS,9031294.0,22000000
XMR,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethash,PoW,113359700.0,210000000
ZEC,Equihash,PoW,7383056.0,21000000


In [168]:
# Use the get_dummies() method to create variables for the two text features, Algorithm
# and ProofType, and store the resulting data in a new DataFrame named X.
X = pd.get_dummies(crypto_df, columns = ["Algorithm","ProofType"])
X.head(10)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DASH,9031294.0,22000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XMR,17201140.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETC,113359700.0,210000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEC,7383056.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [169]:
# Use the StandardScaler fit_transform() function to standardize the features from the X DataFrame
data_scaler = StandardScaler()
X = data_scaler.fit_transform(X)
X

array([[-0.11674788, -0.15286468, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.09358885, -0.14499604, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [ 0.52587231,  4.4937636 , -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       ...,
       [-0.09523411, -0.13215444, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11658774, -0.15255408, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11674507, -0.15284989, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [170]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
pcs = pca.fit_transform(X)
pcs

array([[-0.33772464,  1.01243127, -0.56021314],
       [-0.32097024,  1.01286006, -0.5603564 ],
       [ 2.32062882,  1.7808187 , -0.61466943],
       ...,
       [ 0.33679561, -2.17548417,  0.44368783],
       [-0.17328706, -1.96000813,  0.4276179 ],
       [-0.28275275,  0.86112147, -0.23539396]])

In [173]:
# Create a new DataFrame named pcs_df that includes the following columns, PC 1, PC 2, and PC 3,
# and uses the index of the crypto_df DataFrame as the index.

pcs_df = pd.DataFrame(
    data=pcs, columns=["P1", "P2", "P3"],
    index=crypto_df.index
)
pcs_df.head(10)

Unnamed: 0,P1,P2,P3
42,-0.337725,1.012431,-0.560213
404,-0.32097,1.01286,-0.560356
1337,2.320629,1.780819,-0.614669
BTC,-0.145377,-1.426914,0.129918
ETH,-0.150493,-2.05166,0.359429
LTC,-0.162367,-1.144863,-0.006955
DASH,-0.402005,1.262917,-0.520097
XMR,-0.151874,-2.240551,0.391957
ETC,-0.148925,-2.051718,0.359435
ZEC,-0.173286,-1.960008,0.427618


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [176]:
# Create an elbow curve to find the best value for K.
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=4`

In [177]:
# Initialize the K-means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Add the predicted class columns
pcs_df["class"] = model.labels_
pcs_df.head()

Unnamed: 0,P1,P2,P3,class
42,-0.337725,1.012431,-0.560213,0
404,-0.32097,1.01286,-0.560356,0
1337,2.320629,1.780819,-0.614669,0
BTC,-0.145377,-1.426914,0.129918,1
ETH,-0.150493,-2.05166,0.359429,1


In [185]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.

clustered_df = pd.concat([crypto_df,pcs_df,CoinName_df],axis=1)
columns_titles = ["Algorithm","ProofType","TotalCoinsMined","TotalCoinSupply","P1","P2","P3","CoinName","class"]
clustered_df=clustered_df.reindex(columns=columns_titles)

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(533, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,P1,P2,P3,CoinName,class
42,Scrypt,PoW/PoS,41.99995,42,-0.337725,1.012431,-0.560213,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.32097,1.01286,-0.560356,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.320629,1.780819,-0.614669,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.145377,-1.426914,0.129918,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.150493,-2.05166,0.359429,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000,-0.162367,-1.144863,-0.006955,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000,-0.402005,1.262917,-0.520097,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.151874,-2.240551,0.391957,Monero,1
ETC,Ethash,PoW,113359700.0,210000000,-0.148925,-2.051718,0.359435,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000,-0.173286,-1.960008,0.427618,ZCash,1


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [193]:
# Creating a 3D-Scatter with the PCA data and the clusters

fig = px.scatter_3d(clustered_df,
                   x='P3',
                   y='P2',
                   z='P1',
                    color='class',
                   hover_name='CoinName',
                   hover_data=['Algorithm'],
                   width=800)
fig.show()

In [186]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType',
                                  'TotalCoinSupply', 'TotalCoinsMined',
                                  'class']
                         )

In [187]:
# Print the total number of tradable cryptocurrencies.
print(f'There are {len(clustered_df)} tradable cryptocurrenices.')

There are 533 tradable cryptocurrenices. 


In [188]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
scaler = MinMaxScaler()

normalize_columns = ['TotalCoinSupply','TotalCoinsMined']
X = clustered_df[normalize_columns].values
X_scaled = scaler.fit_transform(X)
X_scaled


array([[4.20000000e-11, 5.94230127e-03],
       [5.32000000e-04, 7.00182308e-03],
       [3.14159265e-01, 3.53420682e-02],
       ...,
       [1.40022261e-03, 6.92655266e-03],
       [2.10000000e-05, 5.94962775e-03],
       [1.00000000e-06, 5.94243008e-03]])

In [190]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
plot_df = pd.DataFrame(X_scaled, columns=normalize_columns, index=crypto_df.index)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df["CoinName"] = clustered_df["CoinName"]

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df["class"] = clustered_df["class"]

plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,class
42,4.2e-11,0.005942,42 Coin,0
404,0.000532,0.007002,404Coin,0
1337,0.3141593,0.035342,EliteCoin,0
BTC,2.1e-05,0.00596,Bitcoin,1
ETH,0.0,0.00605,Ethereum,1
LTC,8.4e-05,0.006006,Litecoin,1
DASH,2.2e-05,0.005951,Dash,0
XMR,0.0,0.00596,Monero,1
ETC,0.00021,0.006056,Ethereum Classic,1
ZEC,2.1e-05,0.00595,ZCash,1


In [191]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".

df_tradable.hvplot.scatter(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    hover_cols=['CoinName'],
    by='class'
)
