In [35]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import hvplot.pandas
import plotly.express as px
import os

In [66]:
# read the file
file_path = os.path.join(".", "Resources", "crypto_data.csv")
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
crypto_df.shape

(1252, 6)

In [4]:
# number of rows before aaren't trading
crypto_df.count()

CoinName           1252
Algorithm          1252
IsTrading          1252
ProofType          1252
TotalCoinsMined     744
TotalCoinSupply    1252
dtype: int64

In [5]:
crypto_df.columns

Index(['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined',
       'TotalCoinSupply'],
      dtype='object')

In [6]:
# Remove all cryptocurrencies that aren’t trading.
crypto_removed_df = crypto_df[crypto_df.IsTrading != False]

In [7]:
crypto_removed_df.count()

CoinName           1144
Algorithm          1144
IsTrading          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1144
dtype: int64

In [8]:
crypto_removed_df.columns

Index(['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined',
       'TotalCoinSupply'],
      dtype='object')

In [9]:
crypto_removed_df.shape

(1144, 6)

In [10]:
# Remove all cryptocurrencies that don’t have an algorithm defined.
crypto_removed_df = crypto_removed_df.dropna(axis=0, subset=['Algorithm'] )
crypto_removed_df.count()

CoinName           1144
Algorithm          1144
IsTrading          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1144
dtype: int64

In [11]:
crypto_removed_df.shape

(1144, 6)

In [12]:
# Remove the IsTrading column
crypto_removed_df = crypto_removed_df.drop(['IsTrading'], axis=1)
crypto_removed_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [13]:
crypto_removed_df.count()

CoinName           1144
Algorithm          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1144
dtype: int64

In [14]:
# Remove all cryptocurrencies with at least one null value.
crypto_removed_df = crypto_removed_df.dropna(axis=0, how='any')

In [15]:
crypto_removed_df.count()

CoinName           685
Algorithm          685
ProofType          685
TotalCoinsMined    685
TotalCoinSupply    685
dtype: int64

In [16]:
crypto_removed_df.shape

(685, 5)

In [17]:
crypto_removed_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [18]:
# Remove all cryptocurrencies without coins mined
crypto_removed_df = crypto_removed_df.dropna(axis = 0, subset=['TotalCoinsMined'])
crypto_removed_df = crypto_removed_df[crypto_removed_df.TotalCoinsMined != 0]
crypto_removed_df.count()

CoinName           533
Algorithm          533
ProofType          533
TotalCoinsMined    533
TotalCoinSupply    533
dtype: int64

In [19]:
crypto_removed_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0


In [20]:
crypto_removed_df.shape

(533, 5)

In [21]:
# Store the names of all cryptocurrencies on a DataFramed named coins_name, and use the crypto_df.index as the index for this new DataFrame
coins_name = pd.DataFrame(crypto_removed_df, columns=['CoinName'], index=crypto_removed_df.index)
print(coins_name.shape)
coins_name.head()

(533, 1)


Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [22]:
# Remove the CoinName column
crypto_removed_df = crypto_removed_df.drop(['CoinName'], axis=1)
crypto_removed_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [23]:
crypto_removed_df.shape

(533, 4)

In [24]:
# Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X.
X = pd.get_dummies(data=crypto_removed_df, columns=["Algorithm", "ProofType"])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# crypto_removed_df = StandardScaler().fit_transform(iris_df)
# print(iris_scaled[:10])
type(X)

pandas.core.frame.DataFrame

In [26]:
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[:10])

[[-0.11674788 -0.15286468 -0.0433555  -0.0433555  -0.0433555  -0.06137164
  -0.07523548 -0.0433555  -0.06137164 -0.06137164 -0.0433555  -0.0433555
  -0.19226279 -0.06137164 -0.09731237 -0.0433555  -0.11536024 -0.07523548
  -0.0433555  -0.0433555  -0.15176505 -0.0433555  -0.13105561 -0.0433555
  -0.0433555  -0.08695652 -0.0433555  -0.0433555  -0.0433555  -0.0433555
  -0.06137164 -0.0433555  -0.08695652 -0.08695652 -0.08695652 -0.0433555
  -0.13105561 -0.13827675 -0.13827675 -0.0433555  -0.06137164 -0.0433555
  -0.07523548 -0.1815096  -0.0433555  -0.0433555  -0.0433555  -0.07523548
  -0.15811388 -0.3145935  -0.0433555  -0.08695652 -0.07523548 -0.06137164
  -0.0433555   1.38873015 -0.0433555  -0.0433555  -0.06137164 -0.0433555
  -0.0433555  -0.0433555  -0.0433555  -0.0433555  -0.0433555  -0.0433555
  -0.0433555  -0.39836623 -0.0433555  -0.1815096  -0.0433555  -0.08695652
  -0.08695652 -0.10670145 -0.0433555  -0.0433555  -0.13105561 -0.0433555
  -0.0433555  -0.0433555  -0.0433555  -0.07523

In [27]:
# Initialize PCA model
pca = PCA(n_components=3)

In [28]:
# get principal componants
crypto_pca = pca.fit_transform(X_scaled)

In [29]:
pcs_df = pd.DataFrame(data=crypto_pca, columns=['PC1', 'PC2', 'PC3'], index=crypto_removed_df.index)
pcs_df.head(10)

Unnamed: 0,PC1,PC2,PC3
42,-0.326493,1.04087,-0.520487
404,-0.309823,1.041335,-0.520532
1337,2.295519,1.715463,-0.526829
BTC,-0.140403,-1.31538,0.199101
ETH,-0.153966,-2.055358,0.332408
LTC,-0.15771,-1.155661,-0.049069
DASH,-0.407463,1.298317,-0.43464
XMR,-0.150434,-2.254634,0.335218
ETC,-0.152408,-2.055435,0.332407
ZEC,-0.159453,-1.908588,0.384072


In [30]:
pca.explained_variance_

array([2.74239268, 2.09695225, 2.01178505])

In [31]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

print(inertia)
# # Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

[3644.8011518653716, 2475.775603999379, 1491.4465058150051, 598.2673899249094, 401.00383359267033, 330.3095214860783, 269.7208392022655, 210.5863939861126, 177.3844867494154, 141.5610522533104]


In [32]:
# we will consider k=4 as direction changes more after 4 curve starts to become flat after 4
# Initialize the K-means model
model = KMeans(n_clusters=4, random_state=0)
# Fit the model
model.fit(pcs_df)
# Predict clusters
predictions = model.predict(pcs_df)
# Add the predicted class columns
pcs_df["Class"] = model.labels_
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3,Class
42,-0.326493,1.04087,-0.520487,0
404,-0.309823,1.041335,-0.520532,0
1337,2.295519,1.715463,-0.526829,0
BTC,-0.140403,-1.31538,0.199101,1
ETH,-0.153966,-2.055358,0.332408,1


In [33]:
# Algorithm, ProofType,TotalCoinsMined,TotalCoinSupply, PC 1, PC 2,PC 3, CoinName, and Class.
clustered_df = pd.concat([crypto_removed_df,  pcs_df, coins_name], axis=1)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,Class,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.326493,1.04087,-0.520487,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.309823,1.041335,-0.520532,0,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.295519,1.715463,-0.526829,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.140403,-1.31538,0.199101,1,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.153966,-2.055358,0.332408,1,Ethereum


In [34]:
clustered_df = clustered_df[['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'PC1', 'PC2', 'PC3', 'CoinName', 'Class']]
clustered_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.326493,1.04087,-0.520487,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.309823,1.041335,-0.520532,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.295519,1.715463,-0.526829,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.140403,-1.31538,0.199101,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.153966,-2.055358,0.332408,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000,-0.15771,-1.155661,-0.049069,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000,-0.407463,1.298317,-0.43464,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.150434,-2.254634,0.335218,Monero,1
ETC,Ethash,PoW,113359700.0,210000000,-0.152408,-2.055435,0.332407,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000,-0.159453,-1.908588,0.384072,ZCash,1


In [38]:
# Scatter Plot 1 ✓Create a 3D-Scatter using Plotly Express to plot the clusters using the clustered_df DataFrame.
fig = px.scatter_3d(clustered_df,
                    x = 'PC1',
                    y = 'PC2',
                    z = 'PC3',
                    hover_name="CoinName",
                    hover_data=["Algorithm"],
                    color = 'Class',
                    symbol = 'Class',
                    width=1000
                   )
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [46]:
# create a data table with all the current tradable cryptocurrencies. The table should have the following columns: CoinName, Algorithm, ProofType, TotalCoinSupply, TotalCoinsMined, and Class.
crypto_table = clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class'], width=600)
crypto_table

In [51]:
from sklearn.preprocessing import MinMaxScaler

In [54]:
# as datarange is wide need scaling
data = clustered_df[['TotalCoinsMined', 'TotalCoinSupply']]

In [56]:
# create an instace of MinMaxScaler()
scaler = MinMaxScaler()

In [60]:
# perform fit
scaled_data = scaler.fit_transform(data)

numpy.ndarray

In [62]:
# create a dataframe 
data_df = pd.DataFrame(scaled_data, columns=["TotalCoinSupply", "TotalCoinsMined"], index=clustered_df.index)
data_df.head()

Unnamed: 0,TotalCoinSupply,TotalCoinsMined
42,0.005942,4.2e-11
404,0.007002,0.000532
1337,0.035342,0.3141593
BTC,0.00596,2.1e-05
ETH,0.00605,0.0


In [63]:
data_df["CoinName"] = clustered_df["CoinName"]
data_df["Class"] = clustered_df["Class"]
data_df.head()

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,0.005942,4.2e-11,42 Coin,0
404,0.007002,0.000532,404Coin,0
1337,0.035342,0.3141593,EliteCoin,0
BTC,0.00596,2.1e-05,Bitcoin,1
ETH,0.00605,0.0,Ethereum,1


In [64]:
# Create a scatter plot to present the clustered data about cryptocurrencies 
# having x="TotalCoinsMined" and y="TotalCoinSupply" to contrast the number of available coins 
# versus the total number of mined coins. Use the hover_cols=["CoinName"] parameter 
# to include the cryptocurrency name on each data point.

data_df.hvplot.scatter(x='TotalCoinsMined', y='TotalCoinSupply', hover_cols=["CoinName"], by='Class')