# Clustering Analysis

## DBSCAN

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- DBSCAN on the 2-D PCA projection of the standardized price/volume columns ---
features = ['Close', 'High', 'Low', 'Open', 'Volume']
X = balanced_df[features]

# Standardize each feature before PCA.
X_scaled = StandardScaler().fit_transform(X)

# The two retained components serve as both the clustering input and the plot axes.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Density-based clustering; the label -1 is what the code below treats as noise.
dbscan = DBSCAN(eps=0.4, min_samples=5)
clusters = dbscan.fit_predict(X_pca)
balanced_df['DBSCAN_Cluster'] = clusters

# One colour per distinct label (the noise label, if present, gets its own colour).
distinct_labels = set(clusters)
palette = sns.color_palette('husl', len(distinct_labels))

plt.figure(figsize=(10, 7))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=clusters,
    palette=palette,
    legend='full',
)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('DBSCAN Clustering Results (PCA-reduced Data)', fontsize=16)
plt.grid(True)
plt.legend(title='Cluster')
plt.show()

# Summary statistics: noise points are counted separately and are not a cluster.
noise_mask = clusters == -1
n_noise = int(noise_mask.sum())
n_clusters = len(distinct_labels) - (1 if n_noise else 0)

if n_clusters < 2:
    # The silhouette needs at least two clusters to compare against each other.
    print(f'DBSCAN found {n_clusters} cluster(s). Silhouette score not defined.')
    print(f'Number of noise points: {n_noise}')
else:
    print(f'DBSCAN found {n_clusters} clusters.')
    print(f' Number of noise points: {n_noise}')
    # Noise points are dropped before scoring.
    # NOTE(review): the score is computed on X_scaled although the labels were
    # fitted on X_pca -- confirm that evaluating in the full feature space is intended.
    mask = ~noise_mask
    sil_score = silhouette_score(X_scaled[mask], clusters[mask])
    print(f' Silhouette Score (excluding noise): {sil_score:.4f}')

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# K-distance graph used to pick DBSCAN's eps.
#
# The distances must be measured in the same space DBSCAN clusters in. The
# DBSCAN cell fits on the 2-component PCA projection of the standardized
# features, so the same preprocessing is rebuilt here; an elbow read from the
# raw 5-D standardized space would not be a valid eps for that model.

# Local import: this cell's import list does not bring PCA into scope.
from sklearn.decomposition import PCA

X_scaled = StandardScaler().fit_transform(X)
X_pca = PCA(n_components=2).fit_transform(X_scaled)

# k mirrors the min_samples=5 used for DBSCAN.
k = 5
neigh = NearestNeighbors(n_neighbors=k)
nbrs = neigh.fit(X_pca)
# Querying with the training points themselves returns each point as its own
# nearest neighbour (distance 0), so the last column is the radius at which a
# point has k samples in its neighbourhood counting itself -- the same way
# DBSCAN's min_samples counts.
distances, indices = nbrs.kneighbors(X_pca)

# Sorted ascending so the "elbow" of the curve suggests a candidate eps.
k_distances = np.sort(distances[:, -1])
plt.figure(figsize=(10, 6))
plt.plot(k_distances)
plt.title(f'K-distance Graph (k={k})')
plt.xlabel('Points sorted by distance')
plt.ylabel(f'{k}th Nearest Neighbor Distance')
plt.grid(True)
plt.show()

In [None]:
# Compare DBSCAN cluster membership with the class label.
cluster_labels = balanced_df[['DBSCAN_Cluster']].copy()
# Read the target from balanced_df rather than from a separate `y` variable so
# this cell does not depend on `y` having been defined by another cell.
cluster_labels['Target'] = balanced_df['Target'].values

# Per-cluster class proportions: one row per cluster label, one column per
# Target value (normalize=True makes each row sum to 1).
cluster_analysis = (
    cluster_labels.groupby('DBSCAN_Cluster')['Target']
    .value_counts(normalize=True)
    .unstack()
)
print('Class distribution in each cluster:\n')
# Cluster/class combinations that never occur come out of unstack() as NaN; show them as 0.
print(cluster_analysis.fillna(0).round(2))

## K-means

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score
import matplotlib.pyplot as plt
import seaborn as sns

# --- K-means on the standardized price/volume features ---
features = ['Close', 'High', 'Low', 'Open', 'Volume']
X = balanced_df[features]
X_scaled = StandardScaler().fit_transform(X)

# The PCA projection is used only as plot coordinates; K-means itself is
# fitted on the full standardized feature set.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

k = 4
# Fixed random_state and explicit n_init keep the result reproducible.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters_kmeans = kmeans.fit_predict(X_scaled)

balanced_df['KMeans_Cluster'] = clusters_kmeans

plt.figure(figsize=(10, 7))
palette = sns.color_palette('husl', k)
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters_kmeans, palette=palette, legend='full')
plt.title('K-Means Clustering Results (PCA-reduced Data)', fontsize=16)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.legend(title='Cluster')
plt.show()

# Internal quality (silhouette) and agreement with the known labels (ARI).
sil_score = silhouette_score(X_scaled, clusters_kmeans)
ari_score = adjusted_rand_score(balanced_df['Target'], clusters_kmeans)

print(f'Silhouette Score: {sil_score:.4f}')
# Report the ARI as well; it is computed above and would otherwise go unused.
print(f'Adjusted Rand Index: {ari_score:.4f}')

# Per-cluster class proportions; combinations that never occur are shown as 0.
cluster_target_dist = (
    balanced_df.groupby('KMeans_Cluster')['Target']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
print('\nTarget Proportions in Each K-Means Cluster:')
print(cluster_target_dist)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# --- Model selection for K-means: inertia (elbow) and silhouette for k = 2..10 ---

# Local import: silhouette_score is used below but is not among this cell's imports.
from sklearn.metrics import silhouette_score

features = ['Close', 'High', 'Low', 'Open', 'Volume']
X = balanced_df[features]
X_scaled = StandardScaler().fit_transform(X)

inertia_scores = []
silhouette_scores = []
# Starts at 2 because the silhouette is undefined for a single cluster.
k_range = range(2, 11)

# Sweep-specific names are used so the loop does not overwrite the notebook's
# `k`, `kmeans` and `clusters` variables set by other cells.
for n_k in k_range:
    sweep_model = KMeans(n_clusters=n_k, random_state=42, n_init=10)
    sweep_labels = sweep_model.fit_predict(X_scaled)
    inertia_scores.append(sweep_model.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, sweep_labels))

plt.figure(figsize=(14, 6))

# Left panel: inertia against k (look for the bend in the curve).
plt.subplot(1, 2, 1)
plt.plot(k_range, inertia_scores, marker='o', linestyle='-')
plt.title(' Elbow Method (Inertia)')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)

# Right panel: silhouette against k (higher is better).
plt.subplot(1, 2, 2)
plt.plot(k_range, silhouette_scores, marker='s', color='green')
plt.title(' Silhouette Scores')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.tight_layout()
plt.show()

## Agglomerative

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# --- Agglomerative (Ward) clustering on the standardized price/volume features ---
features = ['Close', 'High', 'Low', 'Open', 'Volume']
X, y = balanced_df[features], balanced_df['Target']

# Keep the fitted scaler object around alongside the standardized matrix.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# 2-D projection of the standardized data.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Cut the Ward hierarchy into a fixed number of flat clusters.
n_clusters = 3
agg = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
clusters = agg.fit(X_scaled).labels_
balanced_df['Hierarchical_Cluster'] = clusters

# Ward linkage matrix over the same standardized data.
# NOTE(review): `linked` is not used by the cells visible here -- presumably
# meant for a dendrogram; confirm it is still needed.
linked = linkage(X_scaled, method='ward')

In [None]:
# Scatter of the two PCA components, coloured by hierarchical cluster label.
fig, ax = plt.subplots(figsize=(10, 7))
palette = sns.color_palette('Set1', n_clusters)
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=clusters,
    palette=palette,
    legend='full',
    ax=ax,
)
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('Hierarchical Clustering Results (PCA-reduced Data)', fontsize=16)
ax.grid(True)
ax.legend(title='Cluster')
plt.show()

In [None]:
# Internal quality (silhouette on the standardized features) and agreement
# with the known Target labels (adjusted Rand index).
sil_score = silhouette_score(X_scaled, clusters)
ari_score = adjusted_rand_score(y, clusters)

print(f' Silhouette Score: {sil_score:.4f}')
# Report the ARI as well; it is computed above and would otherwise go unused.
print(f' Adjusted Rand Index: {ari_score:.4f}')

In [None]:
# Share of each Target value inside every hierarchical cluster
# (normalize='index' makes each cluster's row sum to 1).
cluster_counts = pd.crosstab(
    index=balanced_df['Hierarchical_Cluster'],
    columns=y,
    normalize='index',
)
print('Target Proportions in Each Hierarchical Cluster:')
print(cluster_counts)