In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from scipy import stats
import warnings
# Keep cell output readable: suppress every warning category for the whole session
warnings.simplefilter('ignore')

# Seed NumPy's global RNG so the randomly drawn datasets are the same on every run
np.random.seed(42)

print("Libraries imported successfully!")


In [None]:
# Build six toy datasets that stress different aspects of DBSCAN, then preview them
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# --- scikit-learn generators (each one is seeded through its own random_state) ---
# 1. Blobs: four compact, roughly spherical clusters
X_blobs, y_blobs = make_blobs(n_samples=300, centers=4, n_features=2,
                              cluster_std=0.8, random_state=42)
# 2. Moons: two interleaved half-circles (non-convex clusters)
X_moons, y_moons = make_moons(n_samples=300, noise=0.1, random_state=42)
# 3. Circles: one ring nested inside another
X_circles, y_circles = make_circles(n_samples=300, noise=0.05, factor=0.6, random_state=42)

# --- hand-built datasets drawn from NumPy's global RNG ---
# The draws below must stay in this order so the sampled values are unchanged.
np.random.seed(42)

# 4. A Gaussian cloud (label 0) with uniformly scattered outliers (label 1)
X_normal = np.random.normal(0, 1, (280, 2))
X_outliers = np.random.uniform(-4, 4, (20, 2))
X_with_outliers = np.vstack([X_normal, X_outliers])
outlier_labels = np.repeat([0, 1], [280, 20])

# 5. Three Gaussian clusters with clearly different spreads: (center, std, size)
X_var_density = np.vstack([
    np.random.normal(center, spread, (size, 2))
    for center, spread, size in (
        ([0, 0], 0.5, 100),
        ([3, 3], 1.5, 100),
        ([-2, 3], 0.3, 50),
    )
])

# 6. Pure uniform noise with no cluster structure
X_noise = np.random.uniform(-3, 3, (200, 2))

# --- preview: one panel per dataset, filled row by row ---
# Each entry: (title, points, point colours or None, colormap or None)
panels = [
    ('Blobs Dataset', X_blobs, y_blobs, 'viridis'),
    ('Moons Dataset (Non-convex)', X_moons, y_moons, 'viridis'),
    ('Circles Dataset (Nested)', X_circles, y_circles, 'viridis'),
    ('Dataset with Outliers', X_with_outliers, outlier_labels, 'coolwarm'),
    ('Varying Density Clusters', X_var_density, None, None),
    ('Random Noise', X_noise, None, None),
]

for ax, (title, points, colours, cmap_name) in zip(axes.flat, panels):
    # Unlabelled datasets are drawn with matplotlib's default colour
    colour_kwargs = {} if colours is None else {'c': colours, 'cmap': cmap_name}
    ax.scatter(points[:, 0], points[:, 1], alpha=0.7, **colour_kwargs)
    ax.set_title(title)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

plt.tight_layout()
plt.show()

# Store datasets for later use as name -> (points, reference labels or None)
datasets = dict([
    ('Blobs', (X_blobs, y_blobs)),
    ('Moons', (X_moons, y_moons)),
    ('Circles', (X_circles, y_circles)),
    ('With Outliers', (X_with_outliers, outlier_labels)),
    ('Varying Density', (X_var_density, None)),
    ('Noise', (X_noise, None)),
])

print("Created 6 different datasets for DBSCAN testing")


In [None]:
def find_optimal_eps(X, min_samples=5):
    """Compute the sorted k-distance curve used to pick DBSCAN's ``eps``.

    Despite its name, this function does not choose ``eps`` itself: it
    returns the curve in which the caller looks for an "elbow".

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to analyse.
    min_samples : int, default=5
        Number of neighbours requested per point (``k``). X is passed
        explicitly to ``kneighbors``, so each point is returned as its own
        first neighbour at distance 0; the k-th column is therefore the
        distance to the (k-1)-th *other* point.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        The k-th neighbour distance of every point, sorted in descending
        order (largest distance first).
    """
    knn = NearestNeighbors(n_neighbors=min_samples).fit(X)
    distances, _ = knn.kneighbors(X)

    # Columns are ordered nearest -> farthest, so the last one is the k-distance
    k_distances = distances[:, min_samples - 1]

    # np.sort always sorts ascending and has no `reverse` keyword, so reverse
    # the sorted result to obtain the descending curve.
    return np.sort(k_distances)[::-1]

# Demonstrate parameter selection for moons dataset
X_demo = X_moons
min_samples_options = [3, 5, 10]


def _elbow_index(curve):
    """Return the index of the elbow of a descending curve.

    Both axes are rescaled to [0, 1] and the point lying farthest below the
    straight line that joins the first and last samples is selected. Returns 0
    when the curve has fewer than three points or does not decrease overall.
    """
    curve = np.asarray(curve, dtype=float)
    n_points = curve.size
    if n_points < 3:
        return 0
    total_drop = curve[0] - curve[-1]
    if total_drop <= 0:
        return 0
    x_norm = np.linspace(0.0, 1.0, n_points)
    y_norm = (curve - curve[-1]) / total_drop
    # The chord runs from (0, 1) to (1, 0), i.e. y = 1 - x; measure the gap below it
    return int(np.argmax(1.0 - x_norm - y_norm))


# One column per min_samples value; squeeze=False keeps `axes` 2-D even for one column
fig, axes = plt.subplots(2, len(min_samples_options), figsize=(18, 12), squeeze=False)

for i, min_samples in enumerate(min_samples_options):
    curve_ax, cluster_ax = axes[0, i], axes[1, i]

    # Top row: k-distance graph (find_optimal_eps is expected to return it
    # sorted in descending order)
    k_distances = find_optimal_eps(X_demo, min_samples)
    curve_ax.plot(range(len(k_distances)), k_distances, 'b-', linewidth=2)
    curve_ax.set_title(f'K-Distance Graph (min_samples={min_samples})')
    curve_ax.set_xlabel('Points sorted by distance')
    curve_ax.set_ylabel(f'{min_samples}-NN Distance')
    curve_ax.grid(True, alpha=0.3)

    # Find elbow point (simplified approach).
    # On a descending curve every first difference is <= 0, so taking the
    # argmax of np.diff would select the flattest step rather than the elbow;
    # use the point of maximum deviation from the end-to-end chord instead.
    optimal_eps = k_distances[_elbow_index(k_distances)]

    curve_ax.axhline(y=optimal_eps, color='r', linestyle='--',
                     label=f'Suggested eps: {optimal_eps:.3f}')
    curve_ax.legend()

    # Apply DBSCAN with suggested parameters
    dbscan = DBSCAN(eps=optimal_eps, min_samples=min_samples)
    cluster_labels = dbscan.fit_predict(X_demo)

    # DBSCAN marks noise with the label -1; it does not count as a cluster
    noise_mask = cluster_labels == -1
    n_clusters = len(set(cluster_labels) - {-1})
    n_noise = int(noise_mask.sum())

    # Bottom row: clustering result
    cluster_ax.scatter(X_demo[:, 0], X_demo[:, 1], c=cluster_labels,
                       cmap='viridis', alpha=0.7)
    # '\n' (not '\\n') so the title actually wraps onto a second line
    cluster_ax.set_title(f'DBSCAN Results\nClusters: {n_clusters}, Noise: {n_noise}')
    cluster_ax.set_xlabel('Feature 1')
    cluster_ax.set_ylabel('Feature 2')

    # Highlight noise points
    if noise_mask.any():
        cluster_ax.scatter(X_demo[noise_mask, 0], X_demo[noise_mask, 1],
                           c='red', marker='x', s=50, label='Noise')
        cluster_ax.legend()

plt.tight_layout()
plt.show()

print("Parameter selection guidelines:")
print("- eps: Look for 'elbow' in k-distance graph")
print("- min_samples: Start with 2*dimensions, adjust based on noise tolerance")
# A larger min_samples demands denser neighbourhoods, so more points end up as noise
print("- Higher min_samples = more conservative clustering (more points labeled as noise)")
print("- Lower eps = more clusters, higher eps = fewer clusters")
