# Clustering Crypto

In [None]:
# Initial imports
import pandas as pd
import hvplot.pandas
#from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [None]:
# Load the crypto_data.csv dataset from the local Resources folder.
file_path ="./Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
# Preview the first ten rows of the raw table.
crypto_df.head(10)

In [None]:
# crypto_df.info()

In [None]:
# Use the unnamed leading CSV column as the row index.
crypto_df = crypto_df.set_index("Unnamed: 0")

In [None]:
# Keep all the cryptocurrencies that are being traded.
# crypto_df["IsTrading"].value_counts()

In [None]:
# Keep only the rows whose "IsTrading" flag is True.
crypto_trading_df = crypto_df[crypto_df["IsTrading"].eq(True)]
crypto_trading_df

In [None]:
# Inspect how often each algorithm occurs in the unfiltered table.
# This cell only counts values; it does not filter any rows.
crypto_df.loc[:, "Algorithm"].value_counts()

In [None]:
# "IsTrading" is no longer needed once the rows have been filtered on it.
crypto_trading_df = crypto_trading_df.drop(labels="IsTrading", axis="columns")
crypto_trading_df

In [None]:
# Count how many "TotalCoinsMined" entries are null versus populated.
# Nothing is removed here; the rows are dropped in the next cell.
crypto_trading_df["TotalCoinsMined"].isna().value_counts()

In [None]:
# Remove rows that have at least one null value (dropna defaults:
# row-wise, drop when any value is missing).
clean_crypto_trading_df = crypto_trading_df.dropna()
clean_crypto_trading_df.info()

In [None]:
# Preview the boolean mask marking rows where coins have been mined.
clean_crypto_trading_df["TotalCoinsMined"].gt(0)

In [None]:
# Keep only the rows with a strictly positive number of mined coins.
clean_crypto_df = clean_crypto_trading_df.loc[
    clean_crypto_trading_df["TotalCoinsMined"].gt(0)
]
clean_crypto_df

In [None]:
# Build a one-column DataFrame of coin names that shares the cleaned index.
# The copy keeps it independent of later changes to clean_crypto_df.
crypto_names_df = clean_crypto_df[["CoinName"]].copy()
crypto_names_df

In [None]:
# "CoinName" is a label, not a feature, so exclude it from the clustering input.
clean_crypto_df = clean_crypto_df.drop(labels="CoinName", axis="columns")
clean_crypto_df

In [None]:
# Summarize column dtypes and non-null counts of the cleaned feature table.
clean_crypto_df.info()

In [None]:
# Recast "TotalCoinSupply" to a numeric dtype.
# The conversion must read from "TotalCoinSupply" itself: reading from
# "TotalCoinsMined" would overwrite the supply figures with the mined counts
# and feed the same feature to the model twice.
# Work on an explicit copy so the column assignment never targets a view of
# the filtered frame this table was derived from.
clean_crypto_df = clean_crypto_df.copy()
# errors="coerce" turns any value that cannot be parsed into NaN.
clean_crypto_df["TotalCoinSupply"] = pd.to_numeric(
    clean_crypto_df["TotalCoinSupply"], errors="coerce"
)
clean_crypto_df.head()

In [None]:
# Re-check column dtypes and non-null counts after the numeric recast.
clean_crypto_df.info()

In [None]:
# Persist the coin names, including the index column (to_csv writes the
# index by default), so they can be reloaded later.
output_file_path = "./Resources/crypto_names.csv"
crypto_names_df.to_csv(output_file_path)

In [None]:
# One-hot encode the two text features with get_dummies().
X_encoded = pd.get_dummies(
    data=clean_crypto_df,
    columns=["Algorithm", "ProofType"],
)
X_encoded.shape

In [None]:
# Standardize the encoded features with StandardScaler().
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# The cell output is the encoded (unscaled) feature table.
X_encoded

### Deliverable 2: Reducing Data Dimensions Using PCA

In [None]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)

# Fit on the standardized features and project them in a single call.
X_pca = pca.fit_transform(X_scaled)

# Display the projected array.
X_pca

In [None]:
# Wrap the three principal components in a DataFrame that reuses the
# index of the encoded feature table.
X_pca_df = pd.DataFrame(
    X_pca,
    index=X_encoded.index,
    columns=["PC 1", "PC 2", "PC 3"],
)
X_pca_df

### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Create an elbow curve to find the best value for K.
k = list(range(1, 11))

# Fit one K-Means model per candidate K and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(X_pca_df).inertia_
    for n_clusters in k
]

# Plot inertia against K.
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=3`

In [None]:
# Cluster on the three principal-component columns only. Selecting them
# explicitly keeps a previously added "class" column out of the features
# when this cell is re-run.
pc_columns = ["PC 1", "PC 2", "PC 3"]

# Initialize the K-Means model.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model and obtain the cluster labels in a single pass;
# fit_predict returns the labels of the fitted model.
predictions = model.fit_predict(X_pca_df[pc_columns])

# Add the predicted class column.
X_pca_df["class"] = predictions
X_pca_df.head()

In [None]:
# Reload the coin names that were saved to crypto_names.csv earlier.
# file_path ="./Resources/crypto_data.csv"
crypto_names = pd.read_csv("./Resources/crypto_names.csv")
# Preview the first rows of the reloaded table.
crypto_names.head()

In [None]:
# Restore the saved index column as the row index of the names table.
crypto_names = crypto_names.set_index("Unnamed: 0")
crypto_names

In [None]:
# Show the coin names as a Series.
crypto_names.loc[:, "CoinName"]

In [None]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatenate the cleaned feature table with the principal components on the
# shared index. The unfiltered crypto_df must not be used here: it still holds
# the rows removed during preprocessing (which would come back with NaN
# components and labels) as well as the "IsTrading" column.
# join="inner" keeps only rows present in both frames.
clustered_df = pd.concat([clean_crypto_df, X_pca_df], axis=1, join="inner")

# Add a new column, "CoinName", to the clustered_df DataFrame that holds the
# names of the cryptocurrencies (aligned on the index).
clustered_df["CoinName"] = crypto_names["CoinName"]

# The predictions are already present: they arrive as the "class" column
# of X_pca_df, which is the column the 3D scatter plot colors by.

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [None]:
# # Create a table with tradable cryptocurrencies.
# file_path ="./Resources/crypto_data.csv"
# crypto_df = pd.read_csv(file_path)
# crypto_df.head()

In [None]:
# Report how many rows (cryptocurrencies) the clustered table holds.
print(f"The total number of clustered cryptocurrencies: {len(clustered_df)}")

In [None]:
# Plot the clusters in the space of the three principal components,
# using the "class" column for both color and marker symbol.
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
)
# Anchor the legend at the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()