# DBSCAN: Density-Based Spatial Clustering of Applications with Noise

DBSCAN is an unsupervised machine learning algorithm used for clustering tasks. It is particularly effective for identifying clusters of arbitrary shape and handling noise (outliers).

## Key Concepts
- **Core Points**: Points with at least a minimum number of neighbors (`minPts`) within a specified radius (`eps`).
- **Border Points**: Points within the `eps` radius of a core point but with fewer than `minPts` neighbors.
- **Noise Points**: Points that are neither core points nor border points.

## Parameters
1. **`eps`**: The radius of the neighborhood around a point.
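2. **`minPts`**: The minimum number of points (in scikit-learn, the `min_samples` parameter, which counts the point itself) that must lie within `eps` of a point for it to be a core point, i.e. to seed a dense region (a cluster).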

## Algorithm Steps
1. Start with an unvisited point.
2. If the point is a core point (has at least `minPts` points within `eps`):
   - Create a new cluster.
   - Add all directly connected points (within `eps`) to this cluster.
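   - Recursively expand the cluster: for every newly added point that is itself a core point, add its `eps`-neighbors as well. Border points join the cluster but do not expand it further.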
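3. If the point is not a core point and is not within `eps` of any core point, label it as noise. (A point provisionally labeled as noise may later be relabeled as a border point if a cluster discovered afterwards reaches it.)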
4. Repeat until all points are visited.

## Advantages
- **No need to specify the number of clusters**: The algorithm determines clusters based on data distribution.
- **Handles arbitrary shapes**: Unlike k-means, DBSCAN identifies clusters of any shape.
- **Noise resilience**: Explicitly labels noise points for outlier detection.

## Limitations
- **Parameter sensitivity**: Proper selection of `eps` and `minPts` is crucial for good results.
- **Scalability**: DBSCAN can be computationally intensive for large datasets.
- **Difficulty with varying densities**: Struggles with clusters that have widely varying densities.

## Applications
- Anomaly detection (e.g., fraud detection, fault detection).
- Spatial data analysis (e.g., geographic clustering).
- Customer segmentation.
- Community detection in social networks.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

def preprocess_data(df, features=None):
    """
    Encode and standardize the categorical complaint columns for clustering.

    Each selected column has its missing values replaced by the string
    'Unknown', is cast to str, label-encoded to integer codes, and the
    resulting matrix is standardized with StandardScaler.

    Parameters:
    -----------
    df : pandas DataFrame
        Raw consumer complaints dataset. It is not modified.
    features : list of str, optional
        Columns to encode and cluster on. Defaults to the nine categorical
        complaint columns listed in the function body.

    Returns:
    --------
    tuple: (X_scaled, feature_names)
        X_scaled is the scaled feature matrix returned by
        StandardScaler.fit_transform (one column per feature);
        feature_names is the pandas Index of column names in the same order.

    Raises:
    -------
    KeyError
        If any requested feature column is missing from df.
    """
    if features is None:
        features = [
            'product', 'sub_product', 'issue', 'sub_issue',
            'state', 'submitted_via', 'company_response_to_consumer',
            'timely_response', 'consumer_disputed?',
        ]
    else:
        features = list(features)

    # Fail early with a message naming every missing column, rather than
    # a bare KeyError for the first one encountered mid-loop.
    missing = [col for col in features if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Copy only the columns we need so the caller's frame is untouched and
    # unrelated (potentially large) columns are not duplicated in memory.
    data = df[features].copy()

    # NOTE(review): label encoding maps categories to integer codes, which
    # imposes an arbitrary order on nominal values; distances between codes
    # are therefore only a rough proxy for dissimilarity.
    encoder = LabelEncoder()
    for col in features:
        # fit_transform refits the encoder, so one instance can be reused.
        data[col] = encoder.fit_transform(
            data[col].fillna('Unknown').astype(str)
        )

    # Scale the features
    X_scaled = StandardScaler().fit_transform(data)

    return X_scaled, data.columns

def perform_dbscan_clustering(X_scaled, eps_range=None, min_samples_range=None):
    """
    Grid-search DBSCAN parameters and keep the best silhouette score.

    Every (eps, min_samples) pair is fitted. Pairs that yield at least two
    clusters are scored with the silhouette coefficient computed on the
    non-noise points only; all other pairs receive a score of -1.

    Parameters:
    -----------
    X_scaled : array-like
        Scaled feature matrix
    eps_range : iterable of float, optional
        Epsilon values to test. Defaults to np.linspace(0.1, 2, 20).
    min_samples_range : iterable of int, optional
        min_samples values to test. Defaults to range(2, 10).

    Returns:
    --------
    tuple: (best_dbscan, best_labels, results, best_eps, best_min_samples)
        best_dbscan is the fitted DBSCAN model with the highest score,
        best_labels its cluster labels (-1 marks noise), results a list of
        dicts (one per parameter pair, with keys 'eps', 'min_samples',
        'n_clusters', 'n_noise', 'silhouette_score'), and best_eps /
        best_min_samples the winning parameters.

    Raises:
    -------
    ValueError
        If no parameter pair produces at least two clusters, so that no
        clustering can be scored.
    """
    # Default parameter ranges if not provided
    if eps_range is None:
        eps_range = np.linspace(0.1, 2, 20)
    if min_samples_range is None:
        min_samples_range = range(2, 10)
    # Materialize so a one-shot iterator is not exhausted after the first
    # pass of the outer loop.
    min_samples_range = list(min_samples_range)

    best_silhouette = -1
    best_eps = None
    best_min_samples = None
    best_labels = None
    best_dbscan = None

    results = []

    for eps in eps_range:
        for min_samples in min_samples_range:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            labels = dbscan.fit_predict(X_scaled)

            # DBSCAN marks noise with the label -1.
            clustered = labels != -1
            n_clusters = len(np.unique(labels[clustered]))
            n_noise = int(np.count_nonzero(~clustered))

            # The silhouette score needs at least two clusters; it is
            # computed on non-noise points only.
            silhouette_avg = -1
            if n_clusters > 1:
                try:
                    silhouette_avg = silhouette_score(
                        X_scaled[clustered],
                        labels[clustered]
                    )
                except ValueError:
                    # Raised for degenerate labelings (e.g. every point in
                    # its own cluster); treat as unscorable.
                    silhouette_avg = -1

            results.append({
                'eps': eps,
                'min_samples': min_samples,
                'n_clusters': n_clusters,
                'n_noise': n_noise,
                'silhouette_score': silhouette_avg
            })

            # Strict '>' keeps the first pair encountered on ties.
            if silhouette_avg > best_silhouette:
                best_silhouette = silhouette_avg
                best_eps = eps
                best_min_samples = min_samples
                best_labels = labels
                # Keep the already-fitted model instead of refitting later.
                best_dbscan = dbscan

    if best_dbscan is None:
        raise ValueError(
            "No (eps, min_samples) combination produced at least two "
            "clusters; widen eps_range / min_samples_range."
        )

    return best_dbscan, best_labels, results, best_eps, best_min_samples

def visualize_clustering_results(X_scaled, labels, feature_names,
                                 results=None, best_eps=None,
                                 best_min_samples=None):
    """
    Visualize DBSCAN clustering results in a 2x2 figure.

    Panels: PCA scatter of the clusters, cluster size bar chart, silhouette
    score versus epsilon from the parameter search, and the mean
    within-cluster variance of each feature.

    Parameters:
    -----------
    X_scaled : array-like
        Scaled feature matrix
    labels : array-like
        Cluster labels (-1 marks noise)
    feature_names : array-like
        Names of features used for clustering
    results : list of dict, optional
        Parameter search records with 'eps', 'silhouette_score' and
        'n_clusters' keys. When omitted, the tuning panel shows a
        placeholder message instead.
    best_eps : float, optional
        Selected epsilon; printed only when provided.
    best_min_samples : int, optional
        Selected min_samples; printed only when provided.
    """
    # Boolean masks such as `labels == k` only work element-wise on arrays.
    labels = np.asarray(labels)
    X_scaled = np.asarray(X_scaled)
    # np.unique returns sorted labels, giving a deterministic legend order.
    cluster_ids = np.unique(labels)

    fig, axes = plt.subplots(2, 2, figsize=(20, 15))
    ax_pca, ax_sizes, ax_tuning, ax_var = axes.ravel()

    # 1. PCA Visualization
    X_pca = PCA(n_components=2).fit_transform(X_scaled)
    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(cluster_ids))]
    for k, col in zip(cluster_ids, colors):
        is_noise = k == -1
        members = X_pca[labels == k]
        ax_pca.scatter(
            members[:, 0], members[:, 1],
            # Black used for noise
            color=(0, 0, 0, 1) if is_noise else col,
            label='Noise' if is_noise else f'Cluster {k}',
            alpha=0.7,
        )
    ax_pca.set_title('PCA Visualization of Clusters')
    ax_pca.set_xlabel('First Principal Component')
    ax_pca.set_ylabel('Second Principal Component')
    ax_pca.legend()

    # 2. Cluster Size Distribution
    pd.Series(labels).value_counts().plot(kind='bar', ax=ax_sizes)
    ax_sizes.set_title('Cluster Size Distribution')
    ax_sizes.set_xlabel('Cluster Label')
    ax_sizes.set_ylabel('Number of Samples')

    # 3. Hyperparameter Tuning Results
    if results is not None and len(results) > 0:
        tuning_results = pd.DataFrame(results)
        points = ax_tuning.scatter(
            tuning_results['eps'],
            tuning_results['silhouette_score'],
            c=tuning_results['n_clusters'],
            cmap='viridis'
        )
        fig.colorbar(points, ax=ax_tuning, label='Number of Clusters')
    else:
        ax_tuning.text(0.5, 0.5, 'No tuning results provided',
                       ha='center', va='center',
                       transform=ax_tuning.transAxes)
    ax_tuning.set_title('Hyperparameter Tuning')
    ax_tuning.set_xlabel('Epsilon')
    ax_tuning.set_ylabel('Silhouette Score')

    # 4. Per-feature variance inside clusters
    # For each feature, average its variance within every non-noise
    # cluster. This measures within-cluster spread, not separation
    # between clusters.
    real_clusters = cluster_ids[cluster_ids != -1]
    if real_clusters.size:
        within_cluster_var = np.mean(
            [X_scaled[labels == cluster].var(axis=0)
             for cluster in real_clusters],
            axis=0,
        )
        ax_var.barh(list(feature_names), within_cluster_var)
    else:
        # Every point is noise: there is nothing to average.
        ax_var.text(0.5, 0.5, 'No clusters found',
                    ha='center', va='center', transform=ax_var.transAxes)
    ax_var.set_title('Feature Importance in Clustering')
    ax_var.set_xlabel('Variance Contribution')

    fig.tight_layout()
    plt.show()

    # Print additional clustering statistics
    print("\nClustering Statistics:")
    print(f"Number of Clusters: {real_clusters.size}")
    print(f"Number of Noise Points: {int(np.count_nonzero(labels == -1))}")
    if best_eps is not None:
        print(f"Best Epsilon: {best_eps}")
    if best_min_samples is not None:
        print(f"Best Minimum Samples: {best_min_samples}")


def main(data_path='C:/Users/NANAYAW/OneDrive/Documents/GitHub/FinalProject/consumer_complaints.csv'):
    """
    Run the full pipeline: load the CSV, preprocess, cluster, visualize.

    Parameters:
    -----------
    data_path : str, optional
        Location of the consumer complaints CSV file. Defaults to the
        original author's local path; pass your own path to run elsewhere.
    """
    # Load the data
    df = pd.read_csv(data_path)

    # Preprocess the data
    X_scaled, feature_names = preprocess_data(df)

    # Perform DBSCAN clustering (the fitted model itself is not needed here)
    _model, labels, results, best_eps, best_min_samples = (
        perform_dbscan_clustering(X_scaled)
    )

    # Pass the search results explicitly; the plotting function cannot see
    # this function's local variables.
    visualize_clustering_results(
        X_scaled, labels, feature_names,
        results=results,
        best_eps=best_eps,
        best_min_samples=best_min_samples,
    )


# Run the main function
if __name__ == '__main__':
    main()