In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid  # used for hyperParam tuning of unsurpervised model
from sklearn.cluster import KMeans, DBSCAN, MeanShift, estimate_bandwidth

In [3]:
# Load the per-driver feature table from disk and preview its first rows.
csv_path = "./datasets/driver_details.csv"
drivers_df = pd.read_csv(csv_path)
drivers_df.head()

Unnamed: 0,Driver_ID,Distance_Feature,Speeding_Feature
0,3423311935,71.24,28
1,3423313212,52.53,25
2,3423313724,64.54,27
3,3423311373,55.69,22
4,3423310999,54.58,25


In [4]:
# Sanity check: list every row that contains at least one missing (NaN) value.
rows_with_nan = drivers_df.isna().any(axis="columns")
drivers_df[rows_with_nan]

Unnamed: 0,Driver_ID,Distance_Feature,Speeding_Feature


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
drivers_df.describe()

Unnamed: 0,Driver_ID,Distance_Feature,Speeding_Feature
count,4000.0,4000.0,4000.0
mean,3423312000.0,76.041523,10.721
std,1154.845,53.469563,13.708543
min,3423310000.0,15.52,0.0
25%,3423311000.0,45.2475,4.0
50%,3423312000.0,53.33,6.0
75%,3423313000.0,65.6325,9.0
max,3423314000.0,244.79,100.0


In [6]:
# Driver_ID is only an identifier, so leave it out of the clustering inputs.
drivers_features = drivers_df.drop(columns=["Driver_ID"])

In [8]:
# Hyperparameter search space for KMeans: the cluster counts to try.
parameters = dict(n_clusters=[2, 3, 4, 5, 10, 20])
parameter_grid = ParameterGrid(parameters)

# Expand the grid into a list to inspect every combination it will yield.
list(parameter_grid)

[{'n_clusters': 2},
 {'n_clusters': 3},
 {'n_clusters': 4},
 {'n_clusters': 5},
 {'n_clusters': 10},
 {'n_clusters': 20}]

In [9]:
# Starting point for the KMeans search: silhouette scores lie in [-1, 1],
# so -1 is improved on by any valid clustering.
best_score = -1

# Fix the seed so centroid initialisation, and therefore every score in the
# search, is reproducible when the notebook is re-run from a fresh kernel.
model = KMeans(random_state=42)

In [11]:
# Grid search: fit the model once per hyperparameter combination and remember
# the combination that yields the highest silhouette score.
for params in parameter_grid:
    # set_params and fit both return the estimator, so the calls can be chained.
    labels = model.set_params(**params).fit(drivers_features).labels_
    score = metrics.silhouette_score(drivers_features, labels)
    print("Parameter: ", params, "Score: ", score)
    if score > best_score:
        best_score, best_grid = score, params

Parameter:  {'n_clusters': 2} Score:  0.8490223286225532
Parameter:  {'n_clusters': 3} Score:  0.8231396834167266
Parameter:  {'n_clusters': 4} Score:  0.5911323766293183
Parameter:  {'n_clusters': 5} Score:  0.5131666382497846
Parameter:  {'n_clusters': 10} Score:  0.43562439209615356
Parameter:  {'n_clusters': 20} Score:  0.3646885426381393


In [15]:
# Hyperparameter combination with the highest silhouette score in the KMeans search.
best_grid

{'n_clusters': 2}

**Hyperparameter tuning for the DBSCAN algorithm**

In [20]:
# Hyperparameter search space for DBSCAN: neighbourhood radius (eps) and the
# minimum number of neighbours required for a core point (min_samples).
parameters = dict(
    eps=[0.9, 1.0, 5.0, 10.0, 12.0, 14.0, 20.0],
    min_samples=[5, 7, 10, 12],
)
parameter_grid = ParameterGrid(parameters)

# Expand the grid into a list to inspect every combination it will yield.
list(parameter_grid)

[{'eps': 0.9, 'min_samples': 5},
 {'eps': 0.9, 'min_samples': 7},
 {'eps': 0.9, 'min_samples': 10},
 {'eps': 0.9, 'min_samples': 12},
 {'eps': 1.0, 'min_samples': 5},
 {'eps': 1.0, 'min_samples': 7},
 {'eps': 1.0, 'min_samples': 10},
 {'eps': 1.0, 'min_samples': 12},
 {'eps': 5.0, 'min_samples': 5},
 {'eps': 5.0, 'min_samples': 7},
 {'eps': 5.0, 'min_samples': 10},
 {'eps': 5.0, 'min_samples': 12},
 {'eps': 10.0, 'min_samples': 5},
 {'eps': 10.0, 'min_samples': 7},
 {'eps': 10.0, 'min_samples': 10},
 {'eps': 10.0, 'min_samples': 12},
 {'eps': 12.0, 'min_samples': 5},
 {'eps': 12.0, 'min_samples': 7},
 {'eps': 12.0, 'min_samples': 10},
 {'eps': 12.0, 'min_samples': 12},
 {'eps': 14.0, 'min_samples': 5},
 {'eps': 14.0, 'min_samples': 7},
 {'eps': 14.0, 'min_samples': 10},
 {'eps': 14.0, 'min_samples': 12},
 {'eps': 20.0, 'min_samples': 5},
 {'eps': 20.0, 'min_samples': 7},
 {'eps': 20.0, 'min_samples': 10},
 {'eps': 20.0, 'min_samples': 12}]

In [21]:
# Start the DBSCAN search from a default estimator and reset the best score
# tracker to -1 before the grid search runs.
best_score, model = -1, DBSCAN()

In [22]:
# Grid-search DBSCAN: fit once per hyperparameter combination and keep the
# combination with the highest silhouette score.
# The trackers are reset here so that re-running this cell never compares
# against a stale best_score / best_grid left over from an earlier search.
best_score = -1
best_grid = None

for g in parameter_grid:
    model.set_params(**g)  # apply the hyperparameter combination under test
    model.fit(drivers_features)

    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels. DBSCAN can return a single label (one
    # big cluster, or every point marked as noise), so skip those fits
    # instead of aborting the whole search.
    n_labels = len(set(model.labels_))
    if not 2 <= n_labels < len(drivers_features):
        print("Parameter: ", g, "Score: ", "n/a (", n_labels, "distinct label(s))")
        continue

    # NOTE: points labelled -1 (noise) are scored as if they formed a cluster.
    ss = metrics.silhouette_score(drivers_features, model.labels_)
    print("Parameter: ", g, "Score: ", ss)
    if ss > best_score:
        best_score = ss
        best_grid = g

Parameter:  {'eps': 0.9, 'min_samples': 5} Score:  -0.6057173612292268
Parameter:  {'eps': 0.9, 'min_samples': 7} Score:  -0.4265046999507063
Parameter:  {'eps': 0.9, 'min_samples': 10} Score:  -0.39254168253371013
Parameter:  {'eps': 0.9, 'min_samples': 12} Score:  -0.4286838741223884
Parameter:  {'eps': 1.0, 'min_samples': 5} Score:  -0.6155746493060738
Parameter:  {'eps': 1.0, 'min_samples': 7} Score:  -0.41637001640330673
Parameter:  {'eps': 1.0, 'min_samples': 10} Score:  -0.3837814631696031
Parameter:  {'eps': 1.0, 'min_samples': 12} Score:  -0.38648235283744914
Parameter:  {'eps': 5.0, 'min_samples': 5} Score:  0.31011275260225
Parameter:  {'eps': 5.0, 'min_samples': 7} Score:  0.7820011223700856
Parameter:  {'eps': 5.0, 'min_samples': 10} Score:  0.7974222681120255
Parameter:  {'eps': 5.0, 'min_samples': 12} Score:  0.7914367881923341
Parameter:  {'eps': 10.0, 'min_samples': 5} Score:  0.7598056658175874
Parameter:  {'eps': 10.0, 'min_samples': 7} Score:  0.8157570071704705
Par

In [23]:
# Hyperparameter combination with the highest silhouette score in the DBSCAN search.
best_grid

{'eps': 20.0, 'min_samples': 5}

In [24]:
# Refit the model using the best hyperparameters found by the grid search.
# set_params returns the estimator, so the fit can be chained onto it.
model.set_params(**best_grid).fit(drivers_features)

DBSCAN(eps=20.0)

In [26]:
# labels_ holds one cluster label per input row, so its length matches the number of rows fitted.
model.labels_.shape[0]

4000

In [27]:
# Count the distinct clusters found, ignoring the -1 label that marks outliers.
cluster_ids = set(model.labels_) - {-1}
n_clusters = len(cluster_ids)

n_clusters

2

In [29]:
# Count the points labelled -1, i.e. the noise points / outliers.
n_noise = int(np.count_nonzero(model.labels_ == -1))

n_noise

0

**MeanShift**

In [30]:
# MeanShift is tuned through its bandwidth. Rather than grid-searching it with
# ParameterGrid, use scikit-learn's estimate_bandwidth helper (imported with
# the other sklearn.cluster names at the top of the notebook) to derive a
# value from the data.
estimate_bandwidth(drivers_features)

33.960524729584314

In [31]:
# Fit MeanShift with the data-driven bandwidth, then score the resulting
# clustering with the silhouette metric.
bandwidth = estimate_bandwidth(drivers_features)
model = MeanShift(bandwidth=bandwidth)
model.fit(drivers_features)
metrics.silhouette_score(drivers_features, model.labels_)

0.8231396834167266