# Clustering Crypto

In [22]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
import holoviews as hv
from holoviews import dim, opts
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
from pathlib import Path

### Fetching Cryptocurrency Data

In [56]:
# Data Path
file_path = Path("crypto_data.csv")

# Create a DataFrame
crypto = pd.read_csv("Resources/crypto_data.csv")

In [57]:
# Display Data Frame 
crypto.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


### Data Preprocessing

In [58]:
# Check / Preview DataFrame Columns
crypto.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [59]:
# Keep only cryptocurrencies that are trading
crypto = crypto[crypto['IsTrading']==True].copy()

In [60]:
# Check Data Frame
crypto.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [61]:
# Keep only cryptocurrencies with a working algorithm
crypto = crypto[crypto.Algorithm != 'N/A']


In [62]:
# Display
crypto.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [63]:
# Remove the 'IsTrading' column
crypto.drop(['IsTrading'], axis=1, inplace=True)


In [64]:
# Check Data Frame
crypto.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [65]:
# Remove rows with at least 1 null value
crypto.dropna()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
5,1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [66]:
# Remove rows with cryptocurrencies having no coins mined
crypto = crypto[crypto['TotalCoinsMined']!=0].copy()
crypto = crypto[crypto.TotalCoinsMined.notna() ]


In [67]:
# Check Data Frame
crypto.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [68]:
# Drop rows where there are 'N/A' text values

# 'Total Coins Mined'
crypto = crypto[crypto.TotalCoinsMined != "N/A"]

# 'Total Coin Supply'
crypto = crypto[crypto.TotalCoinSupply != "N/A"]

In [69]:
# Check Data Frame
crypto.isna().sum()

Unnamed: 0         0
CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [70]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df 
coinname = pd.DataFrame(crypto['CoinName'])
coinname.head()


Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum


In [71]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
cluster = crypto.drop(columns=['CoinName'])


In [72]:
# New Data Frame
cluster.tail(10)

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
1226,TCH,SHA-256,PoS,1000000000.0,1000000000
1230,WAVES,Leased POS,LPoS,100000000.0,100000000
1231,PART,PoS,PoS,9283138.0,8634140
1234,BTT,TRC10,DPoS,989988700000.0,990000000000
1237,NXT,PoS,PoS/LPoS,1000000000.0,1000000000
1238,ZEPH,SHA-256,DPoS,2000000000.0,2000000000
1242,GAP,Scrypt,PoW/PoS,14931050.0,250000000
1245,BDX,CryptoNight,PoW,980222600.0,1400222610
1246,ZEN,Equihash,PoW,7296538.0,21000000
1247,XBC,Scrypt,PoS,128327.0,1000000


In [73]:
# Create dummy variables for text features
crypto_dummy = pd.get_dummies(data=cluster, columns=["Algorithm", "ProofType"])


In [74]:
# Standardize data
crypto_scaled = StandardScaler().fit_transform(crypto_dummy)

ValueError: could not convert string to float: 'BTC'

### Reducing Dimensions Using PCA

In [75]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)

crypto_scaled_pca = pca.fit_transform(crypto_scaled)


NameError: name 'crypto_scaled' is not defined

In [67]:
# Create a DataFrame with the principal components data
crypto_pca = pd.DataFrame(data=crypto_scaled_pca, columns=["principal component 1", "principal component 2", "principal component 3"])

NameError: name 'crypto_scaled_pca' is not defined

### Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [68]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range ok k values
for i in k:
    km = KMeans(n_clusters = i, random_state = 0)
    km.fit(crypto_scaled_pca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k":k, "inertia" : inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x = "k", y="inertia", xticks =k, title= "Elbow Curve")

NameError: name 'crypto_scaled_pca' is not defined

Running K-Means with `k=<your best value for k here>`

In [69]:
# Initialize the K-Means model
model = KMeans(n_clusters=3, random_state=0)
# Fit model
model.fit(crypto_scaled_pca)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
crypto_scaled_pca["class"] = model.labels_

crypto = pd.concat([crypto_pca, crypto_df])

crypto

NameError: name 'crypto_scaled_pca' is not defined

### Visualizing Results

#### 3D-Scatter with Clusters

In [70]:
# Create a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    crypto_scale_pca,
    x="PC3",
    y="PC2",
    z="PC1",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


NameError: name 'crypto_scale_pca' is not defined

#### Table of Tradable Cryptocurrencies

In [71]:
# Table with tradable cryptos
Name = pd.DataFrame(Name)
crypto_trade = pd.concat([crypto, crypto_scaled_pca, Name], axis = 1, sort = False)
crypto_trade.head()


NameError: name 'Name' is not defined

In [72]:
# Print the total number of tradable cryptocurrencies
len(crypto_trade)

NameError: name 'crypto_trade' is not defined

#### Scatter Plot with Tradable Cryptocurrencies

In [73]:
# Scale data to create the scatter plot


In [74]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"

