# Clustering Crypto

In [95]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [96]:
# Use the following endpoint to fetch json data
# (CryptoCompare "coinlist" route; the response is fetched and parsed as JSON in the next cell)
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [97]:
# Create a DataFrame
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON/KeyError failure on the lines below.
http_resp = requests.get(url, timeout=30)
http_resp.raise_for_status()
resp = http_resp.json()
data = resp['Data']

In [98]:
# Build the frame from the raw payload and transpose it in one expression
df = pd.DataFrame(data).transpose()

In [99]:
# Preview the first rows of the API-derived frame
df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,1.020476,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [100]:
# Alternatively, use the provided csv file:
# (the preprocessing cells shown below operate on this CSV frame, `data_df`)
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
data_df = pd.read_csv(file_path)

### Data Preprocessing

In [101]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
# Selecting the wanted columns by name (instead of dropping 'Unnamed: 0' in
# place) keeps this cell safe to re-run and also discards any other
# unexpected column that might be present in the file.
data_df = data_df[
    ['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply']
]
data_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [102]:
# Keep only the rows whose IsTrading flag equals True
data_df = data_df[data_df["IsTrading"].eq(True)]
data_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [103]:
# Keep only cryptocurrencies with a working algorithm
# (rows whose Algorithm is the literal text 'N/A' are filtered out)
data_df = data_df[data_df["Algorithm"].ne('N/A')]
data_df.shape

(1144, 6)

In [104]:
# The trading flag has served its purpose as a filter; remove the column
data_df = data_df.drop(columns='IsTrading')

In [105]:
# Discard every row that contains one or more null values
data_df = data_df.dropna(axis=0, how='any')
data_df.shape

(685, 5)

In [106]:
# Keep only cryptocurrencies with a strictly positive number of coins mined
data_df = data_df[data_df["TotalCoinsMined"].gt(0)]
data_df.shape

(532, 5)

In [107]:
# Drop rows where there are 'N/A' text values
# Indexing with the cell-wise mask alone (data_df[~mask]) only blanks the
# matching cells to NaN and keeps every row. Collapse the mask per row so
# that any row containing 'N/A' in any column is actually removed.
has_na_text = data_df.isin(['N/A']).any(axis=1)
data_df = data_df[~has_na_text]
data_df.shape

(532, 5)

In [108]:
# Keep the coin names in a separate one-column DataFrame before the column is dropped from data_df
cn_df = data_df.loc[:, ['CoinName']]
cn_df

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum
...,...
1238,ZEPHYR
1242,Gapcoin
1245,Beldex
1246,Horizen


In [109]:
# CoinName is an identifier, not a feature, so it is left out of the clustering input
data_df = data_df.drop(columns='CoinName')

In [110]:
# Preview the remaining feature columns after CoinName was dropped
data_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
2,Scrypt,PoW/PoS,1055185000.0,532000000
5,X13,PoW/PoS,29279420000.0,314159265359
7,SHA-256,PoW,17927180.0,21000000
8,Ethash,PoW,107684200.0,0


In [111]:
def label_encoding(data, feature):
    """Replace the values of the given columns with integer codes, in place.

    For each column, the distinct values are ordered descending and numbered
    starting at 1, so the largest value receives code 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to encode. It is modified in place.
    feature : str or iterable of str
        Column name(s) to encode. A single name may be given as a plain
        string.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the listed columns encoded.
    """
    # A bare string would otherwise be iterated character by character and
    # fail with a KeyError on the first character.
    features = [feature] if isinstance(feature, str) else feature
    for feat in features:
        # Build a fresh mapping per column so codes from one column can never
        # leak into another.
        ordered_values = data[feat].sort_values(ascending=False).unique()
        codes = {value: code for code, value in enumerate(ordered_values, start=1)}
        data[feat] = data[feat].map(lambda value: codes[value])
    return data

In [112]:
# Encode the text features as integer labels (label encoding, not one-hot dummy columns).
# A copy is passed in because label_encoding modifies the frame it receives.
data_df_enc = label_encoding(data_df.copy(), ['Algorithm', 'ProofType'])
data_df_enc.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,19,10,41.99995,42
2,19,10,1055185000.0,532000000
5,5,10,29279420000.0,314159265359
7,24,13,17927180.0,21000000
8,51,13,107684200.0,0


In [113]:
# Standardize every column; the result is rebuilt as a DataFrame with the
# same column names but a fresh default row index
std = StandardScaler().fit_transform(data_df_enc.to_numpy())
data_df_std = pd.DataFrame(data=std, columns=data_df_enc.columns)
data_df_std.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,-0.376459,-0.891357,-0.117108,-0.15287
1,-0.376459,-0.891357,-0.09397,-0.145009
2,-1.215438,-0.891357,0.524946,4.489424
3,-0.076824,-0.001672,-0.116715,-0.15256
4,1.541207,-0.001672,-0.114747,-0.15287


### Reducing Dimensions Using PCA

In [114]:
# Fit a 3-component PCA on the standardized features, then project them
pca = PCA(n_components=3).fit(data_df_std)
data_pca = pca.transform(data_df_std)

In [115]:
# Wrap the projected array in a DataFrame, one named column per principal component
df_data_pca = pd.DataFrame(data=data_pca, columns=["PCA1", "PCA2", "PCA3"])
df_data_pca

Unnamed: 0,PCA1,PCA2,PCA3
0,-0.417875,0.810296,0.372138
1,-0.396564,0.815135,0.373256
2,3.124076,2.209780,0.504335
3,-0.192083,0.016266,-0.072910
4,-0.044116,-1.167492,1.012525
...,...,...,...
527,0.584290,-2.080384,-2.437705
528,-0.415122,0.810892,0.372691
529,0.037594,-1.599930,1.417612
530,-0.034543,-1.255503,1.092984


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [116]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit makes the curve depend on the
# installed version (and emits a FutureWarning on 1.2/1.3).
for n_clusters in k:
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    km.fit(df_data_pca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow = {'k': k, 'inertia': inertia}
df_elbow = pd.DataFrame(elbow)
df_elbow.hvplot.line(x='k', y='inertia', xticks=k, title="Elbow Curve")


Running K-Means with `k=4`

In [117]:
# Initialize the K-Means model
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit makes the clusters depend on
# the installed version.
km4 = KMeans(n_clusters=4, n_init=10, random_state=1)
# Fit the model
km4.fit(df_data_pca)
# Predict clusters
pred_y = km4.predict(df_data_pca)
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# The predicted labels are attached by row position, which relies on
# df_data_pca having been derived from data_df without reordering rows.
pred_df = data_df.copy()
pred_df['class'] = pred_y

In [120]:
# Attach the coin names as a Series so they align on the row index.
pred_df['CoinName'] = cn_df['CoinName']
# Put CoinName first by building the column order explicitly. The previous
# "move the last column to the front" rotation moved a different column
# (e.g. 'class') to the front each time the cell was re-run.
pred_df = pred_df[['CoinName'] + [col for col in pred_df.columns if col != 'CoinName']]

### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [121]:
# Rescale the two supply columns with MinMaxScaler before plotting
scaler = MinMaxScaler()
supply_cols = ['TotalCoinsMined', 'TotalCoinSupply']
pred_df[supply_cols] = scaler.fit_transform(pred_df[supply_cols])

In [122]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
# (both columns were min-max scaled in the previous cell).
# CoinName is added to the hover so individual points can be identified;
# the cluster is already conveyed by colour via `by`.
pred_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    hover_cols=["CoinName", "class"],
    by="class",
    title="Tradable Cryptocurrencies by Cluster",
)

#### Table of Tradable Cryptocurrencies

In [123]:
# Table with tradable cryptos
# Renders pred_df (coin name, features, and assigned cluster) as an hvPlot table
pred_df.hvplot.table()

In [124]:
# Print the total number of tradable cryptocurrencies
# (the row count of pred_df, i.e. the coins left after the filtering cells)
print(len(pred_df))

532
