# Clustering Crypto

In [28]:
# Initial imports
import requests
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [29]:
!pip install -U altair
import altair as alt





### Fetching Cryptocurrency Data

In [30]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response = requests.get(url).json()

In [31]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
crypto_df = pd.DataFrame(response['Data']).T
crypto_df.columns

Index(['Id', 'Url', 'ImageUrl', 'ContentCreatedOn', 'Name', 'Symbol',
       'CoinName', 'FullName', 'Description', 'AssetTokenStatus', 'Algorithm',
       'ProofType', 'SortOrder', 'Sponsored', 'Taxonomy', 'Rating',
       'IsTrading', 'TotalCoinsMined', 'CirculatingSupply', 'BlockNumber',
       'NetHashesPerSecond', 'BlockReward', 'BlockTime', 'AssetLaunchDate',
       'AssetWhitepaperUrl', 'AssetWebsiteUrl', 'MaxSupply', 'MktCapPenalty',
       'IsUsedInDefi', 'IsUsedInNft', 'PlatformType', 'BuiltOn',
       'SmartContractAddress', 'DecimalPoints', 'Difficulty', 'AlgorithmType'],
      dtype='object')

In [32]:
# Alternatively, use the provided csv file:
from pathlib import Path
file_path = Path("../Data/crypto_data.csv")

# Create a DataFrame
crypto_df_CSV = pd.read_csv(file_path)
crypto_df_CSV.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [33]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
stripped_crypto_df = crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','MaxSupply']]
stripped_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0.0,0.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,0.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,0.0
433,433 Token,,False,,,


In [34]:
# Keep only cryptocurrencies that are trading
stripped_crypto_df = stripped_crypto_df.loc[stripped_crypto_df['IsTrading'] == True]
stripped_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0,0
300,300 token,,True,,300,300
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
611,SixEleven,SHA-256,True,PoW,0,0


In [35]:
# Keep only cryptocurrencies with a working algorithm
stripped_crypto_df = stripped_crypto_df[stripped_crypto_df['Algorithm'] != 'N/A']
stripped_crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0,0
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
611,SixEleven,SHA-256,True,PoW,0,0
808,808,SHA-256,True,PoW/PoS,0,0
...,...,...,...,...,...,...
YAYCOIN,YAYcoin,Scrypt,True,PoW/PoS,,
ACTIN,Actinium,Lyra2Z,True,PoW,,
RDD,Reddcoin,Scrypt,True,PoW/PoS,3.04122e+10,-1
GRIN,Grin,C31,True,PoW,7.73372e+07,-1


In [36]:
# Remove the "IsTrading" column
stripped_crypto_df = stripped_crypto_df.drop('IsTrading', axis=1)
stripped_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,0,0
365,365Coin,X11,PoW/PoS,0,0
404,404Coin,Scrypt,PoW/PoS,0,0
611,SixEleven,SHA-256,PoW,0,0
808,808,SHA-256,PoW/PoS,0,0


In [37]:
# Remove rows with at least 1 null value
stripped_crypto_df = stripped_crypto_df.dropna()
stripped_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,0,0
365,365Coin,X11,PoW/PoS,0,0
404,404Coin,Scrypt,PoW/PoS,0,0
611,SixEleven,SHA-256,PoW,0,0
808,808,SHA-256,PoW/PoS,0,0


In [38]:
# Remove rows with cryptocurrencies having no coins mined
stripped_crypto_df = stripped_crypto_df.loc[stripped_crypto_df['TotalCoinsMined'] > 0]
stripped_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
NVC,NovaCoin,Scrypt,PoW/PoS,3379860.0,-1
NSR,NuShares,PoS,PoS,6163400000.0,0
MONA,MonaCoin,Scrypt,PoW,82912700.0,-1
TRI,Triangles Coin,X13,PoW/PoS,185362.0,0
SAFEX,SafeExchangeCoin,Scrypt,PoC,2147483647.0,-1


In [39]:
# Drop rows where there are 'N/A' text values
stripped_crypto_df = stripped_crypto_df.drop(stripped_crypto_df[stripped_crypto_df.values == 'N/A'].index)
stripped_crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
NVC,NovaCoin,Scrypt,PoW/PoS,3.37986e+06,-1
NSR,NuShares,PoS,PoS,6.1634e+09,0
MONA,MonaCoin,Scrypt,PoW,8.29127e+07,-1
TRI,Triangles Coin,X13,PoW/PoS,185362,0
SAFEX,SafeExchangeCoin,Scrypt,PoC,2147483647,-1
...,...,...,...,...,...
AION,Aion,"Equihash210,9",PoW/PoS,495239558,-1
ACT,Achain,DPoS,DPoS,1000000000,0
ETC,Ethereum Classic,EtcHash,PoW,1.28113e+08,210700000
RDD,Reddcoin,Scrypt,PoW/PoS,3.04122e+10,-1


In [40]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name_df = stripped_crypto_df['CoinName'].to_frame()
coin_name_df

Unnamed: 0,CoinName
NVC,NovaCoin
NSR,NuShares
MONA,MonaCoin
TRI,Triangles Coin
SAFEX,SafeExchangeCoin
...,...
AION,Aion
ACT,Achain
ETC,Ethereum Classic
RDD,Reddcoin


In [41]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
stripped_crypto_df = stripped_crypto_df.drop('CoinName', axis=1)
stripped_crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
NVC,Scrypt,PoW/PoS,3379860.0,-1
NSR,PoS,PoS,6163400000.0,0
MONA,Scrypt,PoW,82912700.0,-1
TRI,X13,PoW/PoS,185362.0,0
SAFEX,Scrypt,PoC,2147483647.0,-1


In [42]:
# Create dummy variables for text features
dummy_crypto_df = pd.get_dummies(stripped_crypto_df)

In [43]:
# Standardize data
scaled_crypto_df = StandardScaler().fit_transform(dummy_crypto_df)

### Reducing Dimensions Using PCA

In [44]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)

In [45]:
# Create a DataFrame with the principal components data
pca_crypto_df = pd.DataFrame(pca.fit_transform(scaled_crypto_df), columns=['PC1', 'PC2', 'PC3'])

### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [46]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(pca_crypto_df)
    inertia.append(kmeans.inertia_)

# Create the Elbow Curve using hvPlot
data = {'k': k, 'inertia': inertia}
elbow_df = pd.DataFrame(data)
alt.Chart(elbow_df).mark_line().encode(
    x='k',
    y='inertia'
)

  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=<your best value for k here>`

In [47]:
# Initialize the K-Means model
k_model = KMeans(n_clusters=4, random_state=42)
# Fit the model
k_model.fit(pca_crypto_df)
# Predict clusters
predictions = k_model.predict(pca_crypto_df)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
pca_crypto_df['target'] = k_model.labels_
pca_crypto_df['predictions'] = predictions

### Visualizing Results

#### Altair Scatter Plot

In [48]:
# New DataFrame for scatter plots
new_crypto_df = pd.DataFrame(stripped_crypto_df)
new_crypto_df['CoinName'] = coin_name_df['CoinName']
new_crypto_df['target'] = pca_crypto_df['target'].values
new_crypto_df['PC1'] = pca_crypto_df['PC1'].values
new_crypto_df['PC2'] = pca_crypto_df['PC2'].values
new_crypto_df['PC3'] = pca_crypto_df['PC3'].values

In [53]:
new_crypto_df.display()

AttributeError: 'DataFrame' object has no attribute 'display'

In [54]:
# Create a Scatter Plot using altair
alt.Chart(new_crypto_df).mark_circle().encode(
    x='PC1',
    y='PC2',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'MaxSupply']
)


#### Scatter Plot with Tradable Cryptocurrencies

In [50]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
scatter_crypto = mm_scaler.fit_transform(new_crypto_df[['TotalCoinsMined', 'MaxSupply']].copy())

In [51]:
scatter_crypto_df = pd.DataFrame(scatter_crypto, index=new_crypto_df['CoinName'], columns=['TotalCoinsMined', 'MaxSupply'])
scatter_crypto_df

Unnamed: 0_level_0,TotalCoinsMined,MaxSupply
CoinName,Unnamed: 1_level_1,Unnamed: 2_level_1
NovaCoin,3.377797e-06,0.000000e+00
NuShares,6.163293e-03,9.999833e-13
MonaCoin,8.290928e-05,0.000000e+00
Triangles Coin,1.833495e-07,9.999833e-13
SafeExchangeCoin,2.147446e-03,0.000000e+00
...,...,...
Aion,4.952293e-04,0.000000e+00
Achain,9.999813e-04,9.999833e-13
Ethereum Classic,1.281092e-04,2.106965e-04
Reddcoin,3.041165e-02,0.000000e+00


In [52]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
alt.Chart(scatter_crypto_df).mark_circle().encode(
    x='TotalCoinsMined',
    y='MaxSupply'
)