# DBSCAN

In [None]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [None]:
from sklearn.datasets import make_blobs

# Size and spread of the synthetic dataset.
n_rows, n_cols = 10000, 500
n_clusters_data = 10
cluster_std = 1.0
dtype = "float32"

# make_blobs creates a random dataset of isotropic gaussian blobs. Only the
# samples are used afterwards; the second return value is ignored.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
)
data = data.astype(dtype)

### Convert dataset to Spark DataFrame

In [None]:
# Store every row of `data` as one entry of a single "features" column, then
# convert the pandas frame into a Spark DataFrame.
# NOTE(review): `spark` is not created in this notebook — presumably the
# runtime provides the session; confirm.
pd_data = pd.DataFrame({"features": list(data)})
df = spark.createDataFrame(pd_data)

## Spark RAPIDS ML DBSCAN (GPU)

In [None]:
from spark_rapids_ml.clustering import DBSCAN
# Build the estimator with eps=50.0 and min_samples=3, reading its input from
# the "features" column.
gpu_dbscan = DBSCAN(eps=50.0, min_samples=3).setFeaturesCol("features")

The estimator can be persisted to disk and reloaded.

In [None]:
# Location used to save the (unfitted) estimator and to load it back.
estimator_path = "/tmp/dbscan-estimator"

In [None]:
# Save the estimator, replacing any previous copy at estimator_path, then load
# it back into a separate object that the following cells use.
gpu_dbscan.write().overwrite().save(estimator_path)
gpu_dbscan_loaded = DBSCAN.load(estimator_path)

### Fit

In [None]:
# Time the fit call on the reloaded estimator. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval is not affected by system
# clock adjustments the way time.time() can be.
start_time = time.perf_counter()
gpu_model = gpu_dbscan_loaded.fit(df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Display the eps value held by the reloaded estimator.
gpu_dbscan_loaded.getEps()

### Transform

In [None]:
# Location used to save the fitted model and to load it back.
model_path = "/tmp/dbscan-model"

In [None]:
# Persist the fitted model, replacing any previous copy at model_path.
gpu_model.write().overwrite().save(model_path)

In [None]:
# Load the saved model back through the fitted model's reader.
gpu_model_loaded = gpu_model.read().load(model_path)

In [None]:
# Run the reloaded model on df; its output column is named "transformed".
transformed_df = gpu_model_loaded.setPredictionCol("transformed").transform(df)

In [None]:
# Print the column names and types of the transformed DataFrame.
transformed_df.printSchema()

In [None]:
# Display the number of rows in the transformed DataFrame.
transformed_df.count()

In [None]:
# Print the first 10 rows of the transformed DataFrame.
transformed_df.show(10)

## Compare DBSCAN vs KMeans

### Create Ring-Shaped Dataset

In [None]:
def generate_random_points_in_ring(center, inner_radius, outer_radius, num_points, rng=None):
    """Sample random 2-D points inside a ring (annulus).

    Args:
        center: ``(x, y)`` coordinates of the ring centre.
        inner_radius: Inner radius of the ring.
        outer_radius: Outer radius of the ring.
        num_points: Number of points to generate.
        rng: Optional ``numpy.random.Generator`` or ``numpy.random.RandomState``
            to draw from. When omitted, NumPy's global random state is used,
            so ``np.random.seed`` keeps controlling the output as before.

    Returns:
        A ``(num_points, 2)`` array whose columns are the x and y coordinates.
    """
    sampler = np.random if rng is None else rng

    # Angles first, radii second: the draw order is kept so that seeded runs
    # on the global state reproduce the same points as before.
    angles = sampler.uniform(0, 2 * np.pi, num_points)

    # Drawing the squared radius uniformly and taking its square root spreads
    # the points evenly over the ring's area instead of crowding the inner edge.
    radii = np.sqrt(sampler.uniform(inner_radius**2, outer_radius**2, num_points))

    # Convert polar coordinates to Cartesian coordinates around the centre.
    x = center[0] + radii * np.cos(angles)
    y = center[1] + radii * np.sin(angles)

    return np.column_stack((x, y))

# Two rings of 500 points each around the origin: radii 1-2 and radii 4-5.
data_inner = generate_random_points_in_ring(
    center=(0, 0), inner_radius=1, outer_radius=2, num_points=500
)
data_outer = generate_random_points_in_ring(
    center=(0, 0), inner_radius=4, outer_radius=5, num_points=500
)

# Stack both rings row-wise, then shuffle the rows in place so the two rings
# are interleaved.
data = np.vstack((data_inner, data_outer))
np.random.shuffle(data)

# One "features" entry per point, converted to a Spark DataFrame.
pd_data = pd.DataFrame({"features": list(data)})
df = spark.createDataFrame(pd_data)

### Run DBSCAN

In [None]:
# Fit DBSCAN (eps=1.0, min_samples=5) on the ring data and run the fitted
# model on the same DataFrame.
dbscan = DBSCAN(eps=1.0, min_samples=5).setFeaturesCol("features")
dbscan_model = dbscan.fit(df)
dbscan_transformed = dbscan_model.transform(df)

# Collect the result to the driver as pandas, then as a NumPy array; the
# plotting cell reads the points from column 0 and the cluster id from column 1.
dbscan_pd = dbscan_transformed.toPandas()
dbscan_np = dbscan_pd.to_numpy()

### Run KMeans

In [None]:
from spark_rapids_ml.clustering import KMeans
# Fit KMeans with k=2 on the same ring data and run the fitted model on it.
kmeans =  KMeans(k=2).setFeaturesCol("features")
kmeans_model = kmeans.fit(df)
kmeans_transformed = kmeans_model.transform(df)

# Collect the result to the driver as pandas, then as a NumPy array; the
# plotting cell reads the points from column 0 and the cluster id from column 1.
kmeans_pd = kmeans_transformed.toPandas()
kmeans_np = kmeans_pd.to_numpy()

### Compare Clustering Result

In [None]:
import matplotlib.pyplot as plt

# Column 0 of kmeans_np holds each point's feature vector and column 1 its
# predicted cluster id. Splitting with a boolean mask replaces the per-row
# Python loop and keeps both groups two-dimensional even when one of them is
# empty, so the column slicing below cannot raise IndexError.
features = np.array(list(kmeans_np[:, 0]))
is_cluster0 = kmeans_np[:, 1].astype(int) == 0

cluster0 = features[is_cluster0]
cluster1 = features[~is_cluster0]

plt.scatter(cluster0[:, 0], cluster0[:, 1], s=5, label="cluster 0")
plt.scatter(cluster1[:, 0], cluster1[:, 1], s=5, label="cluster 1")

plt.xlabel('X')
plt.ylabel('Y')
plt.title('KMeans Clustering Result')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Column 0 of dbscan_np holds each point's feature vector and column 1 its
# cluster label.
features = np.array(list(dbscan_np[:, 0]))
labels = dbscan_np[:, 1].astype(int)

# Plot every cluster under its own label instead of folding everything that
# is not cluster 0 into "cluster 1". Label -1 (points DBSCAN left unassigned)
# is handled separately so noise is never drawn as part of a cluster.
for cluster_id in np.unique(labels[labels != -1]):
    members = features[labels == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=5, label=f"cluster {cluster_id}")

# Noise is drawn last so the colours of the real clusters stay the same
# whether or not any noise exists.
noise = features[labels == -1]
if len(noise):
    plt.scatter(noise[:, 0], noise[:, 1], s=5, color="gray", label="noise")

plt.xlabel('X')
plt.ylabel('Y')
plt.title('DBSCAN Clustering Result')
plt.legend()
plt.grid(True)
plt.show()

## Twitter Dataset

### Download Data and Store to Parquet

In [None]:
# Full dataset (uncomment to download it instead; the file name opened below
# must then be changed to match)
# !curl --output twitter.h5.h5 https://b2share.eudat.eu/api/files/189c8eaf-d596-462b-8a07-93b5922c4a9f/twitter.h5.h5

# Partial small dataset
!curl --output twitterSmall.h5.h5 https://b2share.eudat.eu/api/files/189c8eaf-d596-462b-8a07-93b5922c4a9f/twitterSmall.h5.h5

import h5py
import pyarrow
import pyarrow.parquet as pq

# Read the whole "DBSCAN" dataset of the downloaded HDF5 file into memory.
with h5py.File('twitterSmall.h5.h5', 'r') as f: 
    data = f["DBSCAN"][:]

# Name the two columns f1 and f2, then convert the pandas frame to an Arrow
# table so it can be written as Parquet.
df=pd.DataFrame(data, columns=['f1', 'f2'])
arrow_table = pyarrow.Table.from_pandas(df)

# REMEMBER to change the dbfs path to your designated space
#   Or to local like "./twitter.parquet"
# NOTE(review): the same string is used for the local write below and for the
# Spark read afterwards — confirm both resolve to the same location in your
# environment (local filesystem path vs. Spark/DBFS path).
dbfs_path = "/dbfs/temp/twitter.parquet"
pq.write_table(arrow_table, dbfs_path)

# From here on `df` is the Spark DataFrame read back from Parquet, no longer
# the pandas frame created above.
df = spark.read.parquet(dbfs_path)

### Run DBSCAN over Twitter Dataset

In [None]:
# Time fit + transform + show end to end. perf_counter() is a monotonic clock,
# so the interval is not affected by system clock adjustments.
start_time = time.perf_counter()

eps = 0.1
gpu_dbscan = DBSCAN(eps=eps, min_samples=40, metric="euclidean")
gpu_dbscan.setFeaturesCols(["f1", "f2"])
gpu_model = gpu_dbscan.fit(df)
gpu_model.setPredictionCol("prediction")
transformed = gpu_model.transform(df)
transformed.show()

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print("Time", elapsed_time)

transformed_pd = transformed.toPandas()

# Drop the rows labelled -1 (points not assigned to any cluster) and group the
# remaining points by label. Grouping on the named columns replaces the
# per-row Python loop and the positional column indexing, and only labels that
# actually occur produce a cluster, so the reported count has no empty slots.
clustered = transformed_pd[transformed_pd["prediction"] != -1]
clusters = [
    [members["f1"].to_numpy(), members["f2"].to_numpy()]
    for _, members in clustered.groupby("prediction")
]

# Largest cluster first; ties keep ascending label order (stable sort).
clusters.sort(key=lambda c: len(c[0]), reverse=True)
print("Number of clusters: ", len(clusters))

for i, c in enumerate(clusters):
    plt.scatter(c[0], c[1], s=0.5, label=f"cluster {i}")

plt.xlabel('X')
plt.ylabel('Y')
plt.title(f'Twitter API Geo Clusters with DBSCAN eps={eps}')
plt.show()
# To write the figure to a file, call savefig before plt.show():
# plt.savefig('plot.png', dpi=1200)