# Clustering Crypto

In [42]:
# Initial imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [2]:
# Load the crypto_data.csv dataset.
file_path = "./Data/crypto_data.csv"
crypto_df= pd.read_csv(file_path)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
crypto_df.shape

(1252, 7)

In [4]:
crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1252 non-null   object 
 1   CoinName         1252 non-null   object 
 2   Algorithm        1252 non-null   object 
 3   IsTrading        1252 non-null   bool   
 4   ProofType        1252 non-null   object 
 5   TotalCoinsMined  744 non-null    float64
 6   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 60.0+ KB


In [5]:
#set_index to unnamed:0 columns
#crypto_df.set_index("Unnamed:0", inplace= True)
crypto_df = crypto_df.set_index("Unnamed: 0")
crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [6]:
# Keep all the cryptocurrencies that are being traded.c
crypto_df["IsTrading"].value_counts()

True     1144
False     108
Name: IsTrading, dtype: int64

In [7]:
# get conditional statement
(crypto_df["IsTrading"] == True)

Unnamed: 0
42       True
365      True
404      True
611      True
808      True
        ...  
XBC      True
DVTC    False
GIOT    False
OPSC    False
PUNK    False
Name: IsTrading, Length: 1252, dtype: bool

In [8]:
# use.loc to get all IsTrading == True
crypto_trading_df = crypto_df.loc[(crypto_df["IsTrading"] == True)]
crypto_trading_df

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [9]:
crypto_trading_df["IsTrading"].value_counts()

True    1144
Name: IsTrading, dtype: int64

In [10]:
# Remove the "IsTrading" column. 
#use drop method 
crypto_trading_df = crypto_trading_df.drop(columns=['IsTrading'])

In [11]:
crypto_trading_df

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [12]:
# Remove rows that have at least 1 null value.
crypto_trading_df["TotalCoinsMined"].isnull().value_counts()

False    685
True     459
Name: TotalCoinsMined, dtype: int64

In [13]:
#drop coints where totalcoinsmined isnull
clean_crypto_trading_df = crypto_trading_df.dropna(how='any', axis='rows')
clean_crypto_trading_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 685 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         685 non-null    object 
 1   Algorithm        685 non-null    object 
 2   ProofType        685 non-null    object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  685 non-null    object 
dtypes: float64(1), object(4)
memory usage: 32.1+ KB


In [14]:
# Keep the rows where coins are mined.
(clean_crypto_trading_df["TotalCoinsMined"] > 0)

Unnamed: 0
42       True
404      True
808     False
1337     True
BTC      True
        ...  
ZEPH     True
GAP      True
BDX      True
ZEN      True
XBC      True
Name: TotalCoinsMined, Length: 685, dtype: bool

In [15]:
#remove all rows that do not have coint being mined
clean_crypto_df = clean_crypto_trading_df[clean_crypto_trading_df["TotalCoinsMined"] > 0]
clean_crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [16]:
# Create a new DataFrame that holds only the cryptocurrencies names.
crypto_names_df = clean_crypto_df.copy()
crypto_names_df = pd.DataFrame(crypto_names_df["CoinName"],index=crypto_names_df.index)
crypto_names_df

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [17]:
#save crypto_names_df as csv file for other
output_file_path ="./Data/crypto_names.csv"
crypto_names_df.to_csv(output_file_path, index=True)

In [18]:

# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# YOUR CODE HERE
clean_crypto_df = clean_crypto_df.drop(columns=['CoinName'])
clean_crypto_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [19]:
#save our clean_crpyto as clean_crypto.csv
output_file_path = "./Data/clean_crypto.csv"
clean_crypto_df.to_csv(output_file_path, index=False)

In [20]:
#check data types
clean_crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    object 
dtypes: float64(1), object(3)
memory usage: 20.8+ KB


In [21]:
# Use get_dummies() to create variables for text features.
#encoded_crypto_df = pd.read_csv("./Data/clean_crypto.csv")
#encoded_crypto_df
X_encoded = pd.get_dummies(clean_crypto_df, columns=["Algorithm", "ProofType"])
X_encoded.shape

(532, 98)

In [22]:
# Standardize the data with StandardScaler().
scaler = StandardScaler()

#train/fit our data to the model
X_scaled = scaler.fit_transform(X_encoded)
X_scaled[:5]
X_encoded

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Deliverable 2: Reducing Data Dimensions Using PCA

In [23]:
# Using PCA to reduce dimension to three principal components.
# YOUR CODE HERE

#inititialize PCA for 3 prinicipal components 
pca = PCA(n_components=3)

# get 3 principles for X_scaled
X_pca = pca.fit_transform(X_scaled)

X_pca

array([[-0.34256277,  1.09864548, -0.52182475],
       [-0.32591251,  1.09891765, -0.52231475],
       [ 2.313018  ,  1.70785984, -0.62526076],
       ...,
       [ 0.33002986, -2.31705223,  0.38399928],
       [-0.15374413, -2.07192105,  0.35074322],
       [-0.30339709,  0.79223861, -0.25343573]])

In [24]:
# Create a DataFrame with the three principal components.
X_pca_df = pd.DataFrame(
            data=X_pca,
            columns = ["PC 1","PC 2", "PC 3"],
            index=X_encoded.index)

In [25]:
X_pca_df

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.342563,1.098645,-0.521825
404,-0.325913,1.098918,-0.522315
1337,2.313018,1.707860,-0.625261
BTC,-0.141309,-1.331977,0.160574
ETH,-0.151213,-2.031623,0.380555
...,...,...,...
ZEPH,2.484179,0.714303,-0.249705
GAP,-0.340609,1.098545,-0.521853
BDX,0.330030,-2.317052,0.383999
ZEN,-0.153744,-2.071921,0.350743


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [26]:
# Create an elbow curve to find the best value for K.
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(X_pca_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")



  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [27]:
# Initialize the K-Means model.

model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(X_pca_df)

# Predict clusters
predictions = model.predict(X_pca_df)

# Add the predicted class columns
X_pca_df["class"] = model.labels_
X_pca_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,-0.342563,1.098645,-0.521825,0
404,-0.325913,1.098918,-0.522315,0
1337,2.313018,1.70786,-0.625261,0
BTC,-0.141309,-1.331977,0.160574,3
ETH,-0.151213,-2.031623,0.380555,3


In [28]:
X_pca_df["class"].value_counts()

0    283
3    243
1      5
2      1
Name: class, dtype: int64

### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [29]:
# Create a table with tradable cryptocurrencies.
crypto_df = pd.read_csv("./Data/clean_crypto.csv")
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,4.200000e+01
1,Scrypt,PoW/PoS,1.055185e+09,5.320000e+08
2,X13,PoW/PoS,2.927942e+10,3.141593e+11
3,SHA-256,PoW,1.792718e+07,2.100000e+07
4,Ethash,PoW,1.076842e+08,0.000000e+00
...,...,...,...,...
527,SHA-256,DPoS,2.000000e+09,2.000000e+09
528,Scrypt,PoW/PoS,1.493105e+07,2.500000e+08
529,CryptoNight,PoW,9.802226e+08,1.400223e+09
530,Equihash,PoW,7.296538e+06,2.100000e+07


In [30]:
#we will concatonante this with our X_pca_df
clustered_df = pd.concat([crypto_df, X_pca_df],axis=1)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class
0,Scrypt,PoW/PoS,41.99995,42.0,,,,
1,Scrypt,PoW/PoS,1055185000.0,532000000.0,,,,
2,X13,PoW/PoS,29279420000.0,314159300000.0,,,,
3,SHA-256,PoW,17927180.0,21000000.0,,,,
4,Ethash,PoW,107684200.0,0.0,,,,


In [31]:
#read in the crypto_names.csv file
crypto_names = pd.read_csv("./Data/crypto_names.csv")
crypto_names.head()

Unnamed: 0.1,Unnamed: 0,CoinName
0,42,42 Coin
1,404,404Coin
2,1337,EliteCoin
3,BTC,Bitcoin
4,ETH,Ethereum


In [32]:
crypto_names["CoinName"]

0          42 Coin
1          404Coin
2        EliteCoin
3          Bitcoin
4         Ethereum
          ...     
527         ZEPHYR
528        Gapcoin
529         Beldex
530        Horizen
531    BitcoinPlus
Name: CoinName, Length: 532, dtype: object

In [33]:
#since in the same order
clustered_df["CoinName"] = crypto_names["CoinName"]
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class,CoinName
0,Scrypt,PoW/PoS,41.99995,42.0,,,,,42 Coin
1,Scrypt,PoW/PoS,1055185000.0,532000000.0,,,,,404Coin
2,X13,PoW/PoS,29279420000.0,314159300000.0,,,,,EliteCoin
3,SHA-256,PoW,17927180.0,21000000.0,,,,,Bitcoin
4,Ethash,PoW,107684200.0,0.0,,,,,Ethereum


In [34]:
# Print the total number of tradable cryptocurrencies.
print(f"Thetotalnumber of tradable cryptocurrencies: {crypto_df.shape[0]}")

Thetotalnumber of tradable cryptocurrencies: 532


In [35]:
# Creating a 3D-Scatter with the PCA data and the clusters
# Plotting the clusters with three features
fig = px.scatter_3d(clustered_df, x="PC 1", y="PC 2", z="PC 3", color="class", symbol="class",width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

KeyError: (nan, '')

In [36]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# use minmax scaler
mm_scaler = MinMaxScaler()

plot_data = mm_scaler.fit_transform(
    clustered_df[["TotalCoinsMined", "TotalCoinSupply"]]
)

plot_data[:5]

array([[0.00000000e+00, 4.20000000e-11],
       [1.06585544e-03, 5.32000000e-04],
       [2.95755135e-02, 3.14159265e-01],
       [1.81084216e-05, 2.10000000e-05],
       [1.08773140e-04, 0.00000000e+00]])

In [37]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
plot_df = pd.DataFrame(
    plot_data, columns=["TotalCoinsMined", "TotalCoinSupply"], index=clustered_df.index
)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df["CoinName"] = clustered_df["CoinName"]

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df["Class"] = clustered_df["Class"]

plot_df.head(10)

KeyError: 'Class'

In [40]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    hover_cols=["CoinName"],
    by="Class"
)


DataError: Supplied data does not contain specified dimensions, the following dimensions were not found: ['Class']

PandasInterface expects tabular data, for more information on supported datatypes see http://holoviews.org/user_guide/Tabular_Datasets.html

In [58]:
# save our clusted_df as a csv file and an image
clustered_df.to_csv("./Data/crypto_clustered_.csv")