# Clustering Crypto

In [203]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pathlib import Path

### Fetching Cryptocurrency Data

In [204]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [205]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

In [206]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")
file_path = Path("Resources/crypto_data.csv")

# Creating a DataFrame
crypto_df = pd.read_csv(file_path)
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


### Data Preprocessing

In [207]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'

crypto_df = crypto_df.drop(columns=['Unnamed: 0'])
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [208]:
# Keep only cryptocurrencies that are trading

crypto_df = crypto_df[crypto_df["IsTrading"] == True]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [209]:
# Keep only cryptocurrencies with a working algorithm

crypto_df = crypto_df[crypto_df['Algorithm'].isnull() == False]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [210]:
# Remove the "IsTrading" column

crypto_df = crypto_df.drop(columns=['IsTrading'])
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.0,0


In [211]:
# Count null values in dataframe
crypto_df.isnull().sum()

CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64

In [212]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna()
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,SHA-256,PoW/PoS,0.0,0
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [213]:
# Check null values have been dropped
crypto_df.isnull().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [214]:
# Remove rows with cryptocurrencies having no coins mined

crypto_df = crypto_df.loc[crypto_df['TotalCoinsMined'] != 0]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,Ethereum,Ethash,PoW,107684200.0,0


In [215]:
# Drop rows where there are 'N/A' text values
for column in crypto_df:
    crypto_df = crypto_df.loc[crypto_df[column] != 'N/A']

In [216]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name_df = pd.DataFrame(crypto_df['CoinName'])
coin_name_df.head()

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum


In [217]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df.drop(columns=['CoinName'])
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,Ethereum,Ethash,PoW,107684200.0,0


In [218]:
# Check data types to see which one need to be converted to numbers
crypto_df.select_dtypes(include='object').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533 entries, 0 to 1247
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CoinName         533 non-null    object
 1   Algorithm        533 non-null    object
 2   ProofType        533 non-null    object
 3   TotalCoinSupply  533 non-null    object
dtypes: object(4)
memory usage: 20.8+ KB


In [219]:
# Convert TotalCoinSupply into float
crypto_df = crypto_df.astype({"TotalCoinsMined": float})

In [222]:
# Create dummy variables for text features

text_feature_variable = ''

# Import Label Encoder from SKLearn and encode categorical data:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le.fit(crypto_df['Algorithm'])
crypto_df['Algorithm'] = le.transform(crypto_df['Algorithm'])

le.fit(crypto_df['ProofType'])
crypto_df['ProofType'] = le.transform(crypto_df['ProofType'])

le.fit(crypto_df['CoinName'])
crypto_df['CoinName'] = le.transform(crypto_df['CoinName'])

crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,4,53,16,41.99995,42
2,3,53,16,1055185000.0,532000000
5,159,67,16,29279420000.0,314159265359
7,55,47,13,17927180.0,21000000
8,171,20,13,107684200.0,0


In [223]:
# Standardize data

scaler = StandardScaler()

crypto_scaled = scaler.fit_transform(crypto_df)
crypto_scaled

array([[-1.70192413,  0.39335561,  0.8871567 , -0.11674788, -0.15286468],
       [-1.70843802,  0.39335561,  0.8871567 , -0.09358885, -0.14499604],
       [-0.6922698 ,  1.21439604,  0.8871567 ,  0.52587231,  4.4937636 ],
       ...,
       [-1.48696546, -2.12841143,  0.00878917, -0.09523411, -0.13215444],
       [-0.22978298, -1.65924547,  0.00878917, -0.11658774, -0.15255408],
       [-1.29806239,  0.39335561, -1.45515672, -0.11674507, -0.15284989]])

### Reducing Dimensions Using PCA

In [224]:
# Use PCA to reduce dimensions to 3 principal components

pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(crypto_scaled)

In [225]:
# Create a DataFrame with the principal components data

pcs_df = pd.DataFrame(data=crypto_pca, 
                      columns=["PC_1", "PC_2", "PC_3"], 
                      index=crypto_df.index)

pcs_df.head()

Unnamed: 0,PC_1,PC_2,PC_3
0,-0.36964,0.849222,-1.71469
2,-0.348115,0.85404,-1.721216
5,3.157987,2.193787,-0.450694
7,-0.155874,0.017015,-1.369521
8,-0.035364,-1.151418,-0.687431


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [226]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(crypto_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=5`

In [227]:
# Initialize the K-Means model
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features

clustered_df = crypto_df.join(pcs_df)
clustered_df['CoinName'] = coin_name_df['CoinName']
clustered_df['Class'] = predictions
clustered_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC_1,PC_2,PC_3,Class
0,42 Coin,53,16,41.99995,42,-0.36964,0.849222,-1.71469,0
2,404Coin,53,16,1055185000.0,532000000,-0.348115,0.85404,-1.721216,0
5,EliteCoin,67,16,29279420000.0,314159265359,3.157987,2.193787,-0.450694,4
7,Bitcoin,47,13,17927180.0,21000000,-0.155874,0.017015,-1.369521,0
8,Ethereum,20,13,107684200.0,0,-0.035364,-1.151418,-0.687431,3


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [228]:
# Scale data to create the scatter plot
clustered_scaled_df = pd.DataFrame(StandardScaler().fit_transform(clustered_df[["TotalCoinsMined", "TotalCoinSupply"]]), index=clustered_df.index)
clustered_scaled_df.head()

Unnamed: 0,0,1
0,-0.116748,-0.152865
2,-0.093589,-0.144996
5,0.525872,4.493764
7,-0.116354,-0.152554
8,-0.114384,-0.152865


In [229]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    hover_cols=["CoinName"],
    by="Class",
    width=800,
    height=200
)

#### Table of Tradable Cryptocurrencies

In [230]:
# Table with tradable cryptos
columns = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class']

In [231]:
# Print the total number of tradable cryptocurrencies
clustered_df.hvplot.table(columns)