### CLUSTERING WITH DBSCAN

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

In [2]:
# Load the iris dataset and keep only its feature table (the target column is
# deliberately left out: clustering here is unsupervised).
iris_bunch = load_iris(as_frame=True)
data = iris_bunch['data']
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [30]:
# Fit DBSCAN on the feature table.
# NOTE(review): eps=1.3 / min_samples=11 match the pair reported by the grid
# search cell further down, so these values were presumably copied from a
# later, out-of-order run - confirm before relying on them.
model = DBSCAN(eps=1.3, min_samples=11).fit(data)

# One cluster label per row of `data`, in row order.
preds = model.labels_
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [4]:
# Mean silhouette coefficient of the DBSCAN labelling computed above.
silhouette_score(data, preds)

0.501833562046432

In [24]:
# Grid search over DBSCAN hyper-parameters, scored with the silhouette
# coefficient on the same data the model was fitted on.
mininmum_samples = list(range(3, 12))      # min_samples candidates: 3..11
epsilons = [x/10 for x in range(2, 14)]    # eps candidates: 0.2..1.3, step 0.1

# Maps silhouette score -> [eps, min_samples]. Because the key is the score,
# parameter pairs that produce an identical score share one entry and the
# pair evaluated last is the one that is kept.
score_dict = {}

for epsilon in epsilons:
    for sample in mininmum_samples:
        model = DBSCAN(eps=epsilon, min_samples=sample)
        model.fit(X=data)

        # Only the scoring step is guarded: silhouette_score raises ValueError
        # when the labelling has fewer than 2 distinct labels (e.g. every
        # point was marked as noise), so there is nothing to score.
        try:
            score = silhouette_score(X=data, labels=model.labels_)
        except ValueError:
            # Skip the combination instead of recording a made-up score of 0:
            # a sentinel 0 would outrank every valid negative silhouette and
            # could be reported as the "best" parameters.
            continue

        print('*' * 20)
        print(f"epsilon: {epsilon}, min_sample: {sample}.........score: {score}")
        score_dict[score] = [epsilon, sample]


********************
epsilon: 0.2, min_sample: 3.........score: -0.34352957424599934
********************
epsilon: 0.2, min_sample: 4.........score: -0.3298054383391397
********************
epsilon: 0.2, min_sample: 5.........score: 0.1825642832387009
********************
epsilon: 0.2, min_sample: 6.........score: 0.1584293948698758
********************
epsilon: 0.2, min_sample: 7.........score: 0.12266342750596144
********************
epsilon: 0.2, min_sample: 8.........score: 0.1085733936304099
********************
********************
********************
********************
epsilon: 0.3, min_sample: 3.........score: 0.03169672127881879
********************
epsilon: 0.3, min_sample: 4.........score: -0.046469123715262434
********************
epsilon: 0.3, min_sample: 5.........score: -0.0520642964109566
********************
epsilon: 0.3, min_sample: 6.........score: 0.040616684775775444
********************
epsilon: 0.3, min_sample: 7.........score: 0.008293473378023214
***********

In [29]:
# Look up the [eps, min_samples] pair stored under the highest silhouette score.
best_score = max(score_dict)
score_dict[best_score]

[1.3, 11]

### EXERCISE

Run a DBSCAN segmentation on the tips dataset.

In [31]:
# Exercise input: the tips sample dataset bundled with plotly express.
from plotly import express as px

# NOTE: this rebinds `data`, replacing the iris feature table loaded earlier.
data = px.data.tips()
data.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
