Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.linalg import inv
from IPython.display import display
from scipy.spatial.distance import pdist

Fetch and filter the data

In [None]:
# Read the raw table from the CSV file (path is relative to the working directory)
file_path = 'Crypto.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

def convert_to_float(value):
    """Convert a raw cell value to ``float``, tolerating thousands separators.

    Parameters
    ----------
    value : str, int, float or None
        Raw value. Commas inside strings are treated as thousands
        separators and removed before parsing (``'1,234.5'`` -> ``1234.5``).

    Returns
    -------
    float
        The parsed number, or ``nan`` when the value is missing or cannot be
        parsed as a number.
    """
    if isinstance(value, str):
        value = value.replace(',', '')
    try:
        return float(value)
    except (TypeError, ValueError):
        # Placeholders such as '--' or '' and missing values (None) must not
        # abort a whole ``Series.apply``; NaN lets the caller drop or impute
        # the row explicitly instead.
        return float('nan')

# Convert the 'marketcap' column to float
df['marketcap'] = df['marketcap'].apply(convert_to_float)

# Set pandas display options to avoid scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Convert all relevant columns to numeric
features = ['volume24hrs', 'marketcap', 'circulatingsupply', 'maxsupply', 'totalsupply', 'price']
for feature in features:
    column = df[feature]
    if not pd.api.types.is_numeric_dtype(column):
        # Text columns may use thousands separators ('1,234,567'), exactly as
        # 'marketcap' does. Strip them first; otherwise errors='coerce' turns
        # perfectly valid numbers into NaN and the row is silently dropped below.
        column = column.map(lambda v: v.replace(',', '') if isinstance(v, str) else v)
    # Anything that still cannot be parsed (placeholders, blanks) becomes NaN
    df[feature] = pd.to_numeric(column, errors='coerce')

# Drop rows with any NaN values
data_selected = df[features].dropna()

Removing outliers

In [None]:
# Function to remove outliers using Z-score
def remove_outliers_zscore(data, threshold=3):
    """Return the rows of ``data`` whose every column lies within ``threshold`` z-scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table; z-scores are computed per column.
    threshold : float, default 3
        A row is kept only when the absolute z-score of each of its values is
        strictly below this limit.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows (original index preserved),
        so callers can add columns to it without touching ``data``.
    """
    z_scores = np.abs(np.asarray(zscore(data)))
    # A constant column has zero standard deviation, so its z-scores are NaN.
    # ``NaN < threshold`` is False, which would discard every row; such a
    # column cannot contain outliers, so treat NaN as "within the limit".
    within_limit = (z_scores < threshold) | np.isnan(z_scores)
    non_outliers = within_limit.all(axis=1)
    return data[non_outliers].copy()

# Keep only the rows whose features all fall inside the 3-sigma Z-score limit
data_selected_no_outliers = remove_outliers_zscore(data=data_selected, threshold=3)

VIF Calculation for Original Data

In [None]:
# Function to calculate VIF using correlation matrix
def calculate_vif(data):
    """Compute the variance inflation factor (VIF) of every column in ``data``.

    The VIF of a feature is read off the diagonal of the inverse of the
    feature correlation matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per feature, one row per observation.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'Feature'`` (the column names of ``data``) and ``'VIF'``.
    """
    # rowvar=False: columns are the variables, rows the observations
    correlations = np.corrcoef(data, rowvar=False)
    precision = inv(correlations)
    return pd.DataFrame({'Feature': data.columns, 'VIF': np.diag(precision)})

# VIF of the six original features.
# NOTE(review): this uses data_selected (outliers still included), while the
# PCA further down is fitted on data_selected_no_outliers - confirm intended.
vif_original = calculate_vif(data=data_selected)

Data Standardization

In [None]:
# Standardize the data (zero mean, unit variance per feature).
# Select the feature columns explicitly: a later cell adds a 'cluster' column
# to data_selected_no_outliers, and re-running this cell afterwards would
# otherwise standardize the cluster labels as if they were a feature.
x = StandardScaler().fit_transform(data_selected_no_outliers[features])

VIF Calculation for PCA Data

In [None]:
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
pca_df = pd.DataFrame(principalComponents, columns=['PC1', 'PC2'])

# VIF of the principal components (computed before plotting; output order is unchanged)
vif_pca = calculate_vif(pca_df)

# Correlation heatmaps: original features first, then the principal components
for corr_source, heatmap_title in (
    (data_selected, 'Correlation Matrix (Original Data)'),
    (pca_df, 'Correlation Matrix (PCA Data)'),
):
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_source.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title(heatmap_title)
    plt.show()

# Show the VIF tables side by side in the output: before and after PCA
print("VIF for Original Data:")
display(vif_original)
print("VIF for PCA Data:")
display(vif_pca)


Biplot Visualization for PCA Data

In [None]:
# Biplot function
def biplot(pca, principalComponents, feature_names):
    """Draw a PCA biplot: sample scores as points, feature loadings as arrows.

    Parameters
    ----------
    pca : object
        Fitted decomposition exposing ``components_``; rows 0 and 1 are used,
        with one column per entry of ``feature_names``.
    principalComponents : array-like of shape (n_samples, >= 2)
        Projected samples; the first two columns are plotted as PC1 and PC2.
    feature_names : sequence of str
        Label for each loading arrow, in the column order of ``components_``.
    """
    # Accept any array-like (ndarray, list of rows, DataFrame), not only ndarrays
    scores = np.asarray(principalComponents)

    # Arrows are stretched to the largest score on each axis so that they are
    # visible next to the points. The scale does not depend on the feature,
    # so compute it once instead of inside the loop.
    x_scale = scores[:, 0].max()
    y_scale = scores[:, 1].max()

    fig, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(scores[:, 0], scores[:, 1])
    for i, feature_name in enumerate(feature_names):
        tip_x = pca.components_[0, i] * x_scale
        tip_y = pca.components_[1, i] * y_scale
        ax.arrow(0, 0, tip_x, tip_y,
                 head_width=0.05, head_length=0.1, fc='r', ec='r')
        ax.text(tip_x, tip_y, feature_name, color='black', ha='center', va='center')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title('PCA Biplot')
    plt.show()

# Render the biplot of the fitted PCA, labelling each loading with its feature name
biplot(pca, principalComponents, feature_names=features)

K-Means Clustering

In [None]:
# Perform K-means clustering
def find_optimal_k(data, k_values=None, random_state=42, n_init=10):
    """Fit K-means for several cluster counts and collect model-selection scores.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations to cluster.
    k_values : iterable of int, optional
        Cluster counts to evaluate. Defaults to ``range(2, 11)``.
    random_state : int or None, default 42
        Seed for centroid initialisation. K-means starts from random
        centroids, so without a seed the scores change on every run.
    n_init : int, default 10
        Number of random restarts per ``k``. Set explicitly because the
        library default differs between scikit-learn versions.

    Returns
    -------
    K : range or list of int
        The evaluated cluster counts.
    inertia : list of float
        Within-cluster sum of squares for each ``k``.
    silhouette_scores : list of float
        Mean silhouette coefficient for each ``k``.
    """
    K = range(2, 11) if k_values is None else list(k_values)
    inertia = []
    silhouette_scores = []
    for k in K:
        kmeans = KMeans(n_clusters=k, n_init=n_init, random_state=random_state)
        kmeans.fit(data)
        inertia.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(data, kmeans.labels_))
    return K, inertia, silhouette_scores

# Optimal k for PCA data
K, inertia, silhouette_scores = find_optimal_k(principalComponents)

# Inertia is an unbounded sum of squares while the silhouette score lies in
# [-1, 1]; on a shared y-axis the silhouette curve collapses into a flat line.
# Give each metric its own y-axis so both shapes can actually be read.
fig, ax_inertia = plt.subplots(figsize=(10, 5))
ax_silhouette = ax_inertia.twinx()
inertia_line, = ax_inertia.plot(K, inertia, marker='o', color='tab:blue', label='Inertia')
silhouette_line, = ax_silhouette.plot(K, silhouette_scores, marker='s', color='tab:orange',
                                      label='Silhouette Score')
ax_inertia.set_xlabel('Number of clusters (k)')
ax_inertia.set_ylabel('Inertia')
ax_silhouette.set_ylabel('Silhouette Score')
# One combined legend, attached to the top-most axes so that no line covers it
ax_silhouette.legend(handles=[inertia_line, silhouette_line])
ax_inertia.set_title('Elbow Method and Silhouette Score for PCA Data')
plt.show()

# Apply K-means with optimal k (e.g., k=4)
optimal_k = 4
# Seed and restart count are fixed so cluster labels are reproducible across runs
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
clusters_pca = kmeans.fit_predict(principalComponents)

# Add clusters to DataFrame
pca_df['cluster'] = clusters_pca

K-means Accuracy Testing

In [None]:
# Function to print cluster details
def print_cluster_details(kmeans, data, clusters):
    """Print a summary of a fitted K-means model.

    Reports the cluster sizes, the cluster centres, the row positions that
    belong to each cluster, the within-cluster sum of squares of every
    cluster, and the share of total variance explained by the clustering.

    Parameters
    ----------
    kmeans : fitted K-means estimator
        Must expose ``n_clusters``, ``cluster_centers_``, ``labels_`` and
        ``inertia_``.
    data : pandas.DataFrame
        The observations the model was fitted on, in the same row order as
        ``kmeans.labels_``; its column names label the cluster centres.
    clusters : array-like of int
        Cluster label of every observation (used for the size table).
    """
    n_clusters = kmeans.n_clusters
    cluster_centers = np.asarray(kmeans.cluster_centers_)
    labels = np.asarray(kmeans.labels_)
    cluster_sizes = pd.Series(clusters).value_counts().sort_index()

    print(f"K-means clustering with {n_clusters} clusters:")
    print(f"Cluster sizes: {cluster_sizes.values}")
    print("\nCluster means:")
    cluster_means = pd.DataFrame(cluster_centers, columns=data.columns)
    display(cluster_means)

    print("\nClustering vector:")
    for i in range(n_clusters):
        print(f"Cluster {i + 1}:")
        print(np.where(labels == i)[0])

    values = np.asarray(data, dtype=float)
    # Squared distance of every observation to the centre of its own cluster,
    # summed per cluster. This line used to print only the overall inertia
    # under a heading that promises one value per cluster.
    squared_distances = ((values - cluster_centers[labels]) ** 2).sum(axis=1)
    within_ss_by_cluster = np.bincount(labels, weights=squared_distances, minlength=n_clusters)

    within_cluster_ss = kmeans.inertia_
    total_ss = ((values - values.mean(axis=0)) ** 2).sum()
    between_ss = total_ss - within_cluster_ss
    print("\nWithin cluster sum of squares by cluster:")
    print(within_ss_by_cluster)
    print(f"(between_ss / total_ss = {between_ss / total_ss * 100:.1f}%)")

# Report the cluster summary for the K-means model fitted on the PCA scores
print("\nK-means clustering with PCA")
print_cluster_details(kmeans=kmeans, data=pca_df[['PC1', 'PC2']], clusters=clusters_pca)

Choosing the Best Linkage Method for the Dendrogram Using Cophenetic Correlation

In [None]:
# Calculate cophenetic correlation for different linkage methods
# NOTE(review): scores are computed on the standardized features `x`, while the
# dendrogram cell clusters `principalComponents` - confirm which one is intended.
linkage_methods = ['single', 'complete', 'average', 'ward']
cophenetic_scores = {}

# The pairwise distances do not depend on the linkage method, so compute them
# once instead of once per method (pdist is quadratic in the number of rows).
pairwise_distances = pdist(x)

for method in linkage_methods:
    Z = linkage(x, method=method)
    # cophenet returns (correlation, cophenetic distances); only the first is needed
    coph_corr, _ = cophenet(Z, pairwise_distances)
    cophenetic_scores[method] = coph_corr

# Display cophenetic scores
cophenetic_df = pd.DataFrame(list(cophenetic_scores.items()), columns=['Linkage Method', 'Cophenetic Correlation'])
display(cophenetic_df)

# Select the best method (highest cophenetic correlation)
best_method = max(cophenetic_scores, key=cophenetic_scores.get)
print(f"The best linkage method is: {best_method} with a cophenetic correlation of {cophenetic_scores[best_method]:.4f}")


Dendrogram Visualization

In [None]:
# Hierarchical Clustering
# Use the linkage method selected by the cophenetic-correlation comparison
# instead of a hard-coded 'average', so the dendrogram follows the data if the
# ranking of the methods changes.
linked = linkage(principalComponents, best_method)
plt.figure(figsize=(10, 7))
# truncate_mode='lastp' with p=optimal_k shows only the last optimal_k merged clusters
dendrogram(linked, orientation='top', truncate_mode='lastp', p=optimal_k)
plt.title('Dendrogram for PCA Data')
plt.show()

Scatter Plot for PCA Data

In [None]:
# Visualization of clusters
def scatter_plot(data, x_feature, y_feature, clusters):
    """Scatter two columns of ``data`` against each other, coloured by cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the plotted columns.
    x_feature, y_feature : str
        Column names for the horizontal and vertical axes.
    clusters : str or array-like
        Passed to seaborn as ``hue``: a column name of ``data`` or a vector
        of cluster labels.
    """
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(data=data, x=x_feature, y=y_feature, hue=clusters,
                    palette='viridis', ax=ax)
    ax.set_title(f'Scatter Plot: {x_feature} vs {y_feature}')
    ax.set_xlabel(x_feature)
    ax.set_ylabel(y_feature)
    ax.legend(title='Cluster')
    plt.show()

# The PCA scores are already two-dimensional, so plot PC1 against PC2 directly
scatter_plot(pca_df, x_feature='PC1', y_feature='PC2', clusters='cluster')

Clustering Information

In [None]:
# Display PCA data with clusters
print("PCA Data with Clusters:")
display(pca_df)

# Display original data with clusters
# data_selected_no_outliers is a filtered selection of another frame; writing a
# column into it in place triggers pandas' SettingWithCopyWarning. `.assign`
# builds a new frame with the extra column instead (labels are attached by
# position, matching the row order used to fit the clustering).
data_selected_no_outliers = data_selected_no_outliers.assign(cluster=clusters_pca)
print("Original Data with Clusters:")
display(data_selected_no_outliers)