### Goal :
Clustering for identification of 'hot-zones' of Uber pick-ups demand in NYC.

## Import

In [63]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from shapely.geometry import MultiPoint

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [64]:
%cd /content/drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [65]:
df1 = pd.read_csv('uber_data/uber-raw-data-jul14.csv')
df2 = pd.read_csv('uber_data/uber-raw-data-may14.csv')
df3 = pd.read_csv('uber_data/uber-raw-data-apr14.csv')
df4 = pd.read_csv('uber_data/uber-raw-data-jun14.csv')
df5 = pd.read_csv('uber_data/uber-raw-data-sep14.csv')
df6 = pd.read_csv('uber_data/uber-raw-data-aug14.csv')
df = pd.concat([df1,df2,df3,df4,df5,df6])
#Absolute Location of New York City (40.712784, -74005941)
Lat_origine = 40.712784
Lon_origine = -74.005941

## I. EDA

In [66]:
# Df infos
print('shape:', df.shape)

shape: (3186046, 4)


In [67]:
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,7/1/2014 0:03:00,40.7586,-73.9706,B02512
1,7/1/2014 0:05:00,40.7605,-73.9994,B02512
2,7/1/2014 0:06:00,40.732,-73.9999,B02512
3,7/1/2014 0:09:00,40.7635,-73.9793,B02512
4,7/1/2014 0:20:00,40.7204,-74.0047,B02512


In [68]:
df['Date/Time'] = pd.to_datetime(df['Date/Time'])
df['hour'] = df['Date/Time'].dt.hour
df['month'] = df['Date/Time'].dt.month
df['day'] = df['Date/Time'].dt.day
df['dayofweek'] = df['Date/Time'].dt.dayofweek

In [69]:
def finder_null_empty(df):
    lignes_manquantes = df[df.isnull().any(axis=1)]
    if not lignes_manquantes.empty:
        print("Valeurs manquantes ou nulles :")
        print(lignes_manquantes)
        df = df.dropna()
        print("Valeurs supprimées...")
    else:
        print("Aucune valeurs manquantes ou nulles trouvée.")
    return df

In [70]:
df_clean = finder_null_empty(df)

Valeurs manquantes ou nulles :
                Date/Time      Lat  Lon Base  hour  month  day  dayofweek
22763 2014-04-20 17:44:00  40.6841 -7.0  NaN    17      4   20          6
22745 2014-08-21 23:16:00  40.7000  NaN  NaN    23      8   21          3
Valeurs supprimées...


In [71]:
weekday_hourly_avg = (
    df_clean.groupby(['day', 'hour', 'dayofweek'])['Date/Time']
        .count()
        .reset_index()
        .rename(columns={'Date/Time': 'Count'})
        .groupby(['dayofweek', 'hour'])['Count']
        .mean()
        .reset_index()
        .rename(columns={'Count':'Avg'})
)
fig = px.bar(weekday_hourly_avg, x='hour', y='Avg', color='dayofweek',
             labels={'Hour': 'Hour of Day', 'Avg': 'Average Number of Pickups', 'dayofweek': 'Day of Week'},
             title='Uber Pickups Average Distribution by Hour and Weekday')
fig.show()

In [72]:
df_clean_light = df_clean.sample(n=10000, weights='hour', random_state=42)
base_mapping = {
    'B02512': 'Unter',
    'B02598': 'Hinter',
    'B02617': 'Weiter',
    'B02682': 'Schmecken',
    'B02764': 'Danach-NY',
    'B02765': 'Grun',
    'B02835': 'Dreist',
    'B02836': 'Drinnen',
    'unknown': 'Unknown'
}
df_clean_light['Base'] = df_clean_light['Base'].map(base_mapping)
fig = px.scatter_mapbox(df_clean_light, lat='Lat', lon='Lon', color='Base', zoom=9, mapbox_style="carto-positron",labels={'Base': 'Base'})
fig.show()

In [73]:
data_by_hour = df_clean_light.sort_values('hour')

heatmap = px.density_mapbox(
    data_by_hour,
    lat='Lat',
    lon='Lon',
    animation_frame='hour',
    center={'lat': Lat_origine, 'lon': Lon_origine},
    mapbox_style="carto-darkmatter",
    title='Uber pickups density overview',
    opacity=0.7,
    radius=5,
    zoom=8
)

heatmap.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            showactive=False,
            buttons=[
                dict(
                    label="Play",
                    method="animate",
                    args=[None, {"frame": {"duration": 500, "redraw": True}, "fromcurrent": True}]
                ),
                dict(
                    label="Pause",
                    method="animate",
                    args=[[None], {"frame": {"duration": 0, "redraw": False}, "mode": "immediate"}]
                )
            ]
        )
    ]
)
heatmap.show()


At this point, beyond the fact that most of it is concentrated in Manhattan, Brooklyn and the Bronx, we have nothing more convincing.

## II. Clustering

1. Kmeans

There are several ways (silhouette method and elbow method) to determine the optimal number of clusters for a K-means clustering analysis.
The silhouette method is considered more robust and reliable than the elbow method because it takes into account both cohesion and separation between clusters. However, the elbow method may be easier to interpret and use, especially for small to medium sized data sets. It is often recommended to use both methods together to obtain a more reliable estimate of the optimal number of clusters.<br>
After testing, the elbow method gives us 2 clusters as the optimal number. However, knowing the objective of this study, it has no meaning or usefulness. The silhouette method returns us between 6-8 optimal clusters.

Standardization of Latitude and Longitude Data for Sampling :

In [74]:
scaler = StandardScaler()
num = ['Lat','Lon']
X = scaler.fit_transform(df_clean[num])

This operation is carried out for two main reasons:


* Data standardization: <br>This helps to normalize the data and prevent certain features with greater amplitude from dominating others in the learning process
* Preparation of data for sampling: <br> We guarantee that the sample used for learning has the same transformation as the initial data, thus ensuring the consistency of the values.

Visualization of K-means Clusters:

In [75]:
kmeans_clustering = KMeans(n_clusters=7, max_iter=300, n_init=10, random_state=42)
kmeans_clustering.fit(X)
df_clean.loc[:, 'kmeans7_clusters'] = kmeans_clustering.labels_
df_clean.loc[:, 'kmeans7_clusters'] = kmeans_clustering.labels_ + 1
sample_ = df_clean.sample(n=10000, weights='kmeans7_clusters', random_state=42)
fig = px.scatter_mapbox(sample_, lat='Lat', lon='Lon', color='kmeans7_clusters',zoom=8, mapbox_style="carto-positron",title='Distribution of clusters from Kmeans')
cluster_centers = kmeans_clustering.cluster_centers_

#Normalisation inverse pour obtenir les coordonnées géographiques
original_centers = scaler.inverse_transform(cluster_centers)
fig.add_trace(
    px.scatter_mapbox(
        lat=original_centers[:, 0],
        lon=original_centers[:, 1],
        text=[f"Cluster {i+1} Center" for i in range(len(original_centers))],
        size=[20] * len(original_centers),
        color_discrete_sequence=["green"],
    ).data[0]
)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



This first approach allows us to have a first segmentation of the territory of interest such as JFK airport (cluster 3), Newark International Airport (cluster 7). Lower Manhattan (cluster 1) and Upper Manhattan (cluster 4), in which professional activities are concentrated. But also to the north (cluster 5) and south (cluster 6) of Manhattan with a cluster around the Bronx and another around Brooklyn, probably due to the mix of small businesses, technology startups, and creative industries, but also residential areas with commercial areas, schools, and hospitals.

### Let's go further, and see if we can find more precise subpopulations

The advantage of doing a Gaussian Mixture after a K-means is to be able to obtain a more flexible and precise representation of the clusters in the data, by using Gaussian distributions to model the clusters and by using the initial cluster centroids provided by the K-means to improve the convergence and accuracy of cluster estimation.

Visualization of Gaussian Mixture Clusters:

In [76]:
# Centroïdes de cluster initiaux fournis par le K-means pour initialiser le modèle GM
gm = GaussianMixture(n_components=7, covariance_type='full', random_state=42, init_params='kmeans')
gm.fit(X)
labels = gm.predict(X)
probs = gm.predict_proba(X)

df_clean.loc[:, 'gm_clusters'] = labels + 1
df_clean.loc[:, 'gm_probs'] = [max(p) for p in probs]

sample_GM = df_clean.sample(n=10000, weights='gm_clusters', random_state=42)
fig = px.scatter_mapbox(sample_GM, lat='Lat', lon='Lon', color='gm_clusters', zoom=10, mapbox_style="carto-positron", title='Distribution of clusters from Gaussian Mixture')
cluster_centers = gm.means_
original_centers = scaler.inverse_transform(cluster_centers)
fig.add_trace(
    px.scatter_mapbox(
        pd.DataFrame(original_centers, columns=['Lat', 'Lon']),
        lat='Lat',
        lon='Lon',
        text=[f"Cluster {i+1} Center" for i in range(len(original_centers))],
        size=[20] * len(original_centers),
        color_discrete_sequence=["green"],
    ).data[0]
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



*   We're beginning to gather more specific details, although there is still considerable noise.
*   Manhattan is now segmented into three primary zones.
*   We can pinpoint a small cluster around Laguardia Airport.



### 4. DBSCAN

DBSCAN is a density-based algorithm that can handle clusters of different densities and shapes, and it does not require the user to specify the number of clusters.

Visualization of DBSCAN Clusters without noises:

In [77]:
n_samples_initial = int(len(df_clean) * 0.05)
sample_x = df_clean.sample(n=n_samples_initial, weights='hour', random_state=42)
sample_X = scaler.transform(sample_x[num])
dbscan = DBSCAN(eps=0.1, metric='manhattan', min_samples=300)
sample_x['db_clusters'] = dbscan.fit_predict(sample_X) + 1
db_sample = sample_x.sample(n=10000, weights='db_clusters', random_state=42)
fig = px.scatter_mapbox(db_sample, lat='Lat', lon='Lon', color='db_clusters', zoom=9, mapbox_style="carto-positron")
fig.show()


What do we have there? <br>
We now have smaller and better defined clusters, and above all the noise has disappeared. <br>
9 hotspots are highlighted, for exemple :<br>


*   Newark International Airport (Cluster 3)
*   Laguardia Airport (Cluster 7 and 5)
*   JFK Airport (Cluster 2)
*   Hunterpoint Avenue (Queens, NY), attractive starting or ending point for many trips to Manhattan Island (Cluster 8).
*   North and downtown Brooklyn (Cluster 4 and 9)
*   Manhattan island (Cluster 1)



In [78]:
def hs(max_distance, min_pickups, data, hour, day):
    # Filter BY hour and day
    days = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
    data = data[(data['hour'] == hour) & (data['day'] == days[day])]

    coords = data[['Lat', 'Lon']].values
    x_coords = StandardScaler().fit_transform(coords)
    db = DBSCAN(eps=max_distance, min_samples=min_pickups, metric='manhattan').fit(x_coords)

    # Group cluster
    cluster_labels = db.labels_
    num_clusters = len(set(cluster_labels))
    clusters = [coords[cluster_labels == n] for n in range(num_clusters)]
    hot_spots = [(MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y, len(cluster)) for cluster in clusters if cluster.size > 0]
    lat, lon, num_members = zip(*hot_spots)

    # plotting
    df_hotspots = pd.DataFrame({
        'Lat': lat,
        'Lon': lon,
        'NumMembers': num_members,
        'LogNumMembers': np.log(num_members)
    })

    # Identifier zones de forte activité
    fig = px.scatter_mapbox(df_hotspots, lat='Lat', lon='Lon', color='LogNumMembers',zoom=10, mapbox_style="carto-positron", labels={'LogNumMembers': 'Intensity level'},hover_data={'Lat': False, 'Lon': False, 'NumMembers': True})

    fig.update_traces(marker=dict(size=10),
                      hovertemplate='<b>Latitude:</b> %{lat:.2f}<br>'
                                    '<b>Longitude:</b> %{lon:.2f}<br>'
                                    '<b>Trips:</b> %{customdata[0]}<br>'
                                    '<b>Intensity Level:</b> %{marker.color:.2f}')

    fig.show()


Test for a Friday, 4 A.M, with a max distance 0.1, and a minimum of 20 pickups, we have 6 spots in NY city meeting this description.

In [79]:
hs(max_distance=0.1, min_pickups=20, data=df_clean, hour=4, day='Friday')

What points of potential improvements and remarks:<br>
Do the dates of shows and sporting events have an important role? Weather? clients socio-economic? and demographic characteristics of the area,traffic and unfrastructure Data ... <br>
We also note that DBSCAN offers sharper clusters compared to other algorithms.