# Clustering by neighbourhood

Objective: find which problems are most common in each neighbourhood

In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import PredictionErrorDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import silhouette_score

In [50]:
# Read the tab-separated dataset and discard the 'categories' and
# 'Unnamed: 0' columns, then print a schema / non-null summary.
df = (
    pd.read_csv('db_criminalBehaviour.csv', sep='\t')
    .drop(columns=['categories', 'Unnamed: 0'])
)

df.info()

  df = pd.read_csv('db_criminalBehaviour.csv', sep='\t')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90086 entries, 0 to 90085
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   neighbourhood            90046 non-null  object 
 1   urban_facility           44012 non-null  object 
 2   flag_urban_facility      90086 non-null  object 
 3   flag_flagrant            90086 non-null  object 
 4   street_name              90051 non-null  object 
 5   flag_civil_protection_1  90086 non-null  int64  
 6   problem_type_1           90086 non-null  object 
 7   flag_civil_protection_2  5717 non-null   float64
 8   problem_type_2           5717 non-null   object 
 9   flag_civil_protection_3  465 non-null    float64
 10  problem_type_3           465 non-null    object 
 11  flag_civil_protection_4  77 non-null     float64
 12  problem_type_4           77 non-null     object 
 13  flag_civil_protection_5  21 non-null     float64
 14  problem_type_5        

In [51]:
# Columns retained for the clustering analysis (order is preserved downstream).
columns2keep = [
    'neighbourhood',
    'flag_civil_protection_1',
    'problem_type_1',
    'year',
    'week_day',
    'month',
    'day',
]

# Restrict the frame to the retained columns, in the order given by columns2keep.
df = df[columns2keep]

In [52]:
# Print column names, non-null counts and dtypes of the reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90086 entries, 0 to 90085
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   neighbourhood            90046 non-null  object
 1   flag_civil_protection_1  90086 non-null  int64 
 2   problem_type_1           90086 non-null  object
 3   year                     90086 non-null  int64 
 4   week_day                 90086 non-null  object
 5   month                    90086 non-null  int64 
 6   day                      90086 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 4.8+ MB


In [53]:
# Column groups routed to the numeric and categorical preprocessing pipelines.
# NOTE(review): 'flag_civil_protection_1' is kept in columns2keep but appears in
# neither list — confirm whether leaving it out of the feature set is intentional.
num_features = [
    'year',
    'month',
    'day',
]
cat_features = [
    'problem_type_1',
    'neighbourhood',
    'week_day',
]

In [54]:
# Numeric columns: impute with a 4-neighbour KNN imputer, then standardise.
numeric_steps = [
    ('imputer', KNNImputer(n_neighbors=4)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing entries with the constant 'missing',
# then one-hot encode (handle_unknown='ignore' for categories not seen in fit).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each pipeline to its own group of columns.
column_routes = [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [55]:
# Fit the preprocessing transformers on the whole frame.
prepr = preprocessor.fit(df)

In [56]:
# Replace the DataFrame with its transformed feature matrix.
# NOTE(review): this rebinds `df`, so re-running this cell (or the fit cell)
# without reloading the data would operate on the already-transformed matrix.
df = prepr.transform(df)

In [61]:
# min_samples is set to twice the number of features
# (presumably the 7 retained columns -> 14; TODO confirm).
db = DBSCAN(eps=0.3, min_samples=14)
db.fit(df)

# One cluster id per row of the feature matrix.
labels = db.labels_

In [62]:
# Show the distinct cluster ids assigned by DBSCAN (-1 marks noise points).
np.unique(labels)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7], dtype=int64)

In [63]:
# Count the clusters found (the noise label -1 is not a cluster) and the
# number of points flagged as noise.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
n_noise_ = int(np.count_nonzero(np.asarray(labels) == -1))

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 8
Estimated number of noise points: 89953


In [64]:
# Mean silhouette coefficient of the DBSCAN labelling over all samples.
# NOTE(review): the noise label -1 is passed in with the real cluster ids, so
# noise points are scored as if they formed one cluster — confirm this is intended.
silhouette_score(df,labels)

-0.24879331805281502

In [None]:
# k-distance curve: for every point take the distance to the last of its
# 14 returned neighbours (column index 13), sort ascending and plot it.
nearest_neighbors = NearestNeighbors(n_neighbors=14)
neighbors = nearest_neighbors.fit(df)

distances, indices = neighbors.kneighbors(df)
distances = np.sort(distances[:, 13])

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(distances)
ax.set_xlabel("Points")
ax.set_ylabel("Distance")