In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors

import warnings

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [4]:
# Mount Google Drive under /content/drive so the CSV files stored there can be read.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Folder on the mounted Drive that holds both input CSVs. Kept in a single
# constant so the location only has to be changed in one place.
DATA_DIR = "/content/drive/MyDrive/Fichiers/2.Scolarité/1. Jedha_Data_Science/CERTIF_PROJECTS/Src_uber-non git"

# Taxi-zone lookup table (LocationID, Borough, Zone columns).
zones = pd.read_csv(f"{DATA_DIR}/taxi-zone-lookup.csv")
# Raw Uber pickups for April 2014 (Date/Time, Lat, Lon, Base columns).
dataset_04 = pd.read_csv(f"{DATA_DIR}/uber-raw-data-apr14.csv")

In [6]:
# Preview the first rows of the zone lookup table.
zones.head()

Unnamed: 0,LocationID,Borough,Zone
0,1,EWR,Newark Airport
1,2,Queens,Jamaica Bay
2,3,Bronx,Allerton/Pelham Gardens
3,4,Manhattan,Alphabet City
4,5,Staten Island,Arden Heights


In [None]:
# Structure of the zone lookup table (column dtypes, non-null counts).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() would render a stray "None" after the report.
print("general info : ")
zones.info()
print()

# Summary statistics for every column, numeric or not (include='all').
print("Basics statistics: ")
data_zones_desc = zones.describe(include='all')
display(data_zones_desc)
print()

In [None]:
# Structure of the raw pickups table (column dtypes, non-null counts).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() would render a stray "None" after the report.
print("general info : ")
dataset_04.info()
print()

# Summary statistics for every column, numeric or not (include='all').
print("Basics statistics: ")
data_uber_desc = dataset_04.describe(include='all')
display(data_uber_desc)
print()

#### Process datetime and pick a one-minute sample (the busiest timestamp)

In [7]:
# Parse the "Date/Time" strings into pandas datetimes, overwriting the column in place.
# NOTE(review): no explicit `format=` is given, so pandas infers it — confirm the
# raw file's layout is parsed as intended.
dataset_04["Date/Time"] = pd.to_datetime(dataset_04["Date/Time"])

# Unused draft of calendar features. It reads a 'Date' column, whereas the column
# parsed above is named "Date/Time" — adapt before re-enabling.
# dataset_04['Year'] = pd.to_datetime(dataset_04['Date']).dt.year
# dataset_04['Month'] = pd.to_datetime(dataset_04['Date']).dt.month
# dataset_04['Day'] = pd.to_datetime(dataset_04['Date']).dt.day
# dataset_04['Week_day'] = pd.to_datetime(dataset_04['Date']).dt.dayofweek

In [8]:
# Count pickups per distinct timestamp, most frequent first, to find the busiest instant.
dataset_04["Date/Time"].value_counts(ascending=False)

Date/Time
2014-04-07 20:21:00    97
2014-04-07 20:22:00    87
2014-04-30 17:45:00    78
2014-04-30 18:43:00    70
2014-04-30 19:00:00    70
                       ..
2014-04-29 04:08:00     1
2014-04-27 09:26:00     1
2014-04-29 03:27:00     1
2014-04-29 03:16:00     1
2014-04-30 02:55:00     1
Name: count, Length: 41999, dtype: int64

In [9]:
# Keep only the pickups recorded at the most frequent timestamp
# (top entry of the value_counts output).
busiest_timestamp = "2014-04-07 20:21:00"
mask_1 = dataset_04["Date/Time"] == busiest_timestamp
dataset_sample = dataset_04[mask_1]

# Number of rows in the sample, then a preview of them.
print(len(dataset_sample))
display(dataset_sample.head())

97


Unnamed: 0,Date/Time,Lat,Lon,Base
8945,2014-04-07 20:21:00,40.7431,-74.0083,B02512
8946,2014-04-07 20:21:00,40.7066,-74.0084,B02512
8947,2014-04-07 20:21:00,40.7514,-73.9709,B02512
8948,2014-04-07 20:21:00,40.7178,-73.9941,B02512
8949,2014-04-07 20:21:00,40.7449,-74.0061,B02512


Visualising data on a map

In [None]:
# Plot the sampled pickups on an interactive map, one colour per value of the "Base" column.
fig = px.scatter_mapbox(dataset_sample, lat="Lat", lon="Lon", color="Base", mapbox_style="carto-positron")
fig.show()

### K-Means test

In [None]:
# Preprocessing: we keep only the standard-scaled Lat & Lon values, since the sample covers a single timestamp.
# For now we assume that "Base" does not carry meaningful information.

In [None]:
# Coordinates of the sample only. .copy() makes this an independent frame, so the
# cluster-label columns added to it later do not raise SettingWithCopyWarning
# (dataset_sample is itself a filtered view of dataset_04).
dataset_sample_kmeans = dataset_sample[["Lat","Lon"]].copy()

# Standardise both coordinates (zero mean, unit variance); the result is a NumPy
# array whose rows are in the same order as dataset_sample_kmeans.
scaler = StandardScaler()
sc_dataset_sample_kmeans = scaler.fit_transform(dataset_sample_kmeans)

In [None]:
# Elbow search on the scaled sample: fit K-Means for k = 2..11 and record the
# inertia (within-cluster sum of squares) of every fit.
k = list(range(2, 12))
wcss = []

for n_clusters in k:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=100)
    kmeans.fit(sc_dataset_sample_kmeans)
    wcss.append(kmeans.inertia_)
    print("WCSS for K={} --> {}".format(n_clusters, kmeans.inertia_))

In [None]:
# Elbow curve: WCSS against the number of clusters tested.
plt.figure(figsize=(10, 6))
plt.plot(k, wcss, 'bo-', markersize=8, linewidth=2)
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method for Determining Optimal k')
# Tick exactly the k values that were fitted; a fixed range(1, 20) would stretch
# the axis well past the data and squeeze the curve to the left.
plt.xticks(k)
plt.grid(True)
plt.show()

In [None]:
# Fit the K-Means model retained for the sample, with 6 clusters, on the scaled coordinates.
# NOTE(review): n_init is left at the library default here, whereas the elbow
# search used n_init=100 — confirm the difference is intended.
kmeans_test_1 = KMeans(n_clusters=6, random_state=0)
kmeans_test_1.fit(sc_dataset_sample_kmeans)

In [None]:
# Attach each ride's K-Means label as a new column. labels_ follows the row order
# of the scaled array, which was built from this same frame.
dataset_sample_kmeans["cluster_kmeans"] = kmeans_test_1.labels_

In [None]:
# Map of the sample coloured by K-Means cluster id, centred on the mean pickup position.
map_center = dict(
    lat=dataset_sample_kmeans['Lat'].mean(),
    lon=dataset_sample_kmeans['Lon'].mean(),
)

fig = px.scatter_mapbox(
    dataset_sample_kmeans,
    lat='Lat',
    lon='Lon',
    color='cluster_kmeans',
    mapbox_style="carto-positron",
    title='K-Means Clustering of Rides',
    height=800,
)

# Centre the view and remove the outer margins so the map fills the figure.
fig.update_layout(
    mapbox=dict(center=map_center),
    margin=dict(r=0, t=0, l=0, b=0),
)

fig.show()

### DBScan test

In [None]:
# We will take the standard scaled dataset sample used for KMeans test.

In [None]:
# Determining optimal EPS with a k-distance graph.
# The input is the standard-scaled sample, so plain euclidean distance is used
# (no conversion to radians / haversine here).

k = 2  # Set k to test value of min_samples
nbrs = NearestNeighbors(n_neighbors=k, metric='euclidean').fit(sc_dataset_sample_kmeans)
# The fitted points are queried against themselves, so column 0 is each point's
# zero distance to itself and column k-1 is the distance that encloses k points,
# the point itself included.
distances, indices = nbrs.kneighbors(sc_dataset_sample_kmeans)

# Sort the distances to the k-th nearest neighbor.
# np.sort returns a sorted copy: an in-place .sort() on the column view would
# reorder that column inside `distances` and desynchronise it from `indices`.
k_distances = np.sort(distances[:, k-1])

# Plotting: the "knee" of this curve is the usual candidate for eps.
plt.plot(k_distances)
plt.xlabel('Points sorted by distance to {}-th nearest neighbor'.format(k))
plt.ylabel('Distance to {}-th nearest neighbor'.format(k))
plt.title('k-distance Graph')
plt.show()

In [None]:
# Determining optimal min_samples for a fixed eps, scored with the silhouette coefficient.

eps = 0.2  # Set eps to test value of Epsilon
min_samples_range = range(2, 20)  # Example range for min_samples
best_score = -1
best_min_samples = 0  # stays 0 if no candidate yields at least two clusters

for min_samples in min_samples_range:
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean').fit(sc_dataset_sample_kmeans)
    labels = db.labels_
    # DBSCAN labels noise points -1. Noise is not a cluster, so it is excluded
    # from both the cluster count and the silhouette computation; otherwise
    # "one cluster + noise" would be scored as if it were a two-cluster solution.
    clustered = labels != -1
    n_found = len(set(labels[clustered]))
    # silhouette_score needs at least 2 clusters and fewer clusters than samples.
    if 2 <= n_found < clustered.sum():
        score = silhouette_score(sc_dataset_sample_kmeans[clustered], labels[clustered], metric='euclidean')
        if score > best_score:
            best_score = score
            best_min_samples = min_samples

print('Best silhouette score:', best_score)
print('Optimal min_samples:', best_min_samples)

In [None]:
# Fit DBSCAN on the scaled sample with eps=0.2 and min_samples=2, using euclidean
# distance and a brute-force neighbour search.
db_test_1 = DBSCAN(eps=0.2, min_samples=2, metric="euclidean", algorithm="brute")
db_test_1.fit(sc_dataset_sample_kmeans)

In [None]:
# Attach each ride's DBSCAN label as a new column (scikit-learn uses -1 for noise points).
dataset_sample_kmeans["cluster_dbscan"] = db_test_1.labels_

In [None]:
# Map of the sample coloured by DBSCAN cluster id, centred on the mean pickup position.
map_center = dict(
    lat=dataset_sample_kmeans['Lat'].mean(),
    lon=dataset_sample_kmeans['Lon'].mean(),
)

fig = px.scatter_mapbox(
    dataset_sample_kmeans,
    lat='Lat',
    lon='Lon',
    color='cluster_dbscan',
    mapbox_style="carto-positron",
    title='DBScan Clustering of Rides',
    height=800,
)

# Centre the view and remove the outer margins so the map fills the figure.
fig.update_layout(
    mapbox=dict(center=map_center),
    margin=dict(r=0, t=0, l=0, b=0),
)

fig.show()

### Full dataset clustering models

In [11]:
# Pre-processing of the full dataset

# Day of week of each pickup, as an integer (pandas convention: 0 = Monday ... 6 = Sunday).
# Note that this adds the column to dataset_04 itself.
dataset_04['Week_day'] = pd.to_datetime(dataset_04['Date/Time']).dt.dayofweek

# Working frame for per-weekday clustering: everything except "Base" and "Date/Time".
dataset_04_weekday = dataset_04.drop(columns = ["Base","Date/Time"])

In [None]:
#check
# Number of pickups per weekday (0 = Monday ... 6 = Sunday).
print(dataset_04_weekday['Week_day'].value_counts())
# Number of pickups per hour of day. The original line read a `dataset_04_hour`
# frame that is never created in this notebook (NameError), so the hour is
# derived directly from the "Date/Time" column instead.
print(pd.to_datetime(dataset_04['Date/Time']).dt.hour.rename('Hour').value_counts())

### Research of optimal parameters for KMeans

In [20]:
# Random 50_000-row subsample of the coordinates, used for the parameter searches
# below so they stay tractable. random_state pins the draw so that re-running
# the notebook scores the same points.
dataset_04_optimal_research = dataset_04[["Lat", "Lon"]].sample(n=50000, random_state=0)

In [21]:
# Display the sampled coordinates (pandas truncates the rendering of long frames).
dataset_04_optimal_research

Unnamed: 0,Lat,Lon
32228,40.7356,-74.0034
145744,40.7818,-73.9602
77788,40.7320,-73.9839
293734,40.7642,-73.9558
354799,40.7682,-73.9834
...,...,...
190983,40.7709,-73.8658
201222,40.7299,-73.9750
326192,40.7519,-73.9948
89005,40.7420,-73.9885


In [None]:
# Elbow search on the 50_000-row research sample (raw, unscaled Lat/Lon):
# fit K-Means for k = 2..11 and record the inertia of every fit.
k = list(range(2, 12))
wcss = []

for n_clusters in k:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=20)
    kmeans.fit(dataset_04_optimal_research)
    wcss.append(kmeans.inertia_)
    print("WCSS for K={} --> {}".format(n_clusters, kmeans.inertia_))

In [None]:
# Elbow curve: WCSS against the number of clusters tested.
plt.figure(figsize=(10, 6))
plt.plot(k, wcss, 'bo-', markersize=8, linewidth=2)
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method for Determining Optimal k')
# Tick exactly the k values that were fitted; a fixed range(1, 20) would stretch
# the axis well past the data and squeeze the curve to the left.
plt.xticks(k)
plt.grid(True)
plt.show()

In [None]:
# Compute the mean silhouette score of K-Means for k = 2..10 on the research sample.
k = list(range(2, 11))
sil = []

for n_clusters in k:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=100)
    kmeans.fit(dataset_04_optimal_research)
    # Labels of the training points, obtained by re-predicting them with the fitted model.
    cluster_labels = kmeans.predict(dataset_04_optimal_research)
    sil.append(silhouette_score(dataset_04_optimal_research, cluster_labels))
    print("Silhouette score for K={} is {}".format(n_clusters, sil[-1]))

### Research of optimal parameters for DBScan

In [None]:
# Convert latitude and longitude from degrees to radians for the haversine metric.
# The column order stays (Lat, Lon).
data_radians = np.radians(dataset_04_optimal_research)

In [None]:
# Determining optimal EPS with a k-distance graph (haversine distance on radians).

k = 1000  # Set k to test value of min_samples
nbrs = NearestNeighbors(n_neighbors=k, metric='haversine').fit(data_radians)
# The fitted points are queried against themselves, so column 0 is each point's
# zero distance to itself and column k-1 is the distance that encloses k points,
# the point itself included.
distances, indices = nbrs.kneighbors(data_radians)

# Sort the distances to the k-th nearest neighbor.
# np.sort returns a sorted copy: an in-place .sort() on the column view would
# reorder that column inside `distances` and desynchronise it from `indices`.
k_distances = np.sort(distances[:, k-1])

# Plotting: the "knee" of this curve is the usual candidate for eps.
# Distances are angles in radians, since the inputs are in radians.
plt.plot(k_distances)
plt.xlabel('Points sorted by distance to {}-th nearest neighbor'.format(k))
plt.ylabel('Distance to {}-th nearest neighbor'.format(k))
plt.title('k-distance Graph')
plt.show()

In [None]:
# Determining optimal min_samples for a fixed eps, scored with the silhouette coefficient.

epsilon = 0.001  # Set eps to test value of Epsilon (radians, as the haversine inputs are in radians)
min_samples_range = range(500, 5500, 500)  # Example range for min_samples
best_score = -1
best_min_samples = 0  # stays 0 if no candidate yields at least two clusters

for min_samples in min_samples_range:
    db = DBSCAN(eps=epsilon, min_samples=min_samples, metric='haversine').fit(data_radians)
    labels = db.labels_
    # DBSCAN labels noise points -1. Noise is not a cluster, so it is excluded
    # from both the cluster count and the silhouette computation; otherwise
    # "one cluster + noise" would be scored as if it were a two-cluster solution.
    clustered = labels != -1
    n_found = len(set(labels[clustered]))
    # silhouette_score needs at least 2 clusters and fewer clusters than samples.
    if 2 <= n_found < clustered.sum():
        score = silhouette_score(data_radians[clustered], labels[clustered], metric='haversine')
        if score > best_score:
            best_score = score
            best_min_samples = min_samples

print('Best silhouette score:', best_score)
print('Optimal min_samples:', best_min_samples)

In [None]:
# DBScan: test without preprocessing the Lat/Lon data, using the haversine distance.

# Training models and visualizing

## KMeans

In [None]:
# Fit one K-Means model per weekday and give every (day, cluster) pair a distinct id.
clustered_data = []
n_clusters = 7

# Iterate through each day of the week
for day in range(7):
    # .copy() detaches the day's rows from dataset_04_weekday, so adding the
    # 'Cluster' column below does not raise SettingWithCopyWarning.
    filtered_data = dataset_04_weekday[dataset_04_weekday['Week_day'] == day].copy()

    # Check if filtered_data is not empty to avoid errors
    if not filtered_data.empty:
        pickup_locations = filtered_data[['Lat', 'Lon']]

        # KMeans
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans.fit(pickup_locations)

        # Labels are 0..n_clusters-1 for each day; shifting by day * n_clusters
        # maps day 0 to 0..6, day 1 to 7..13, and so on, so ids never collide.
        filtered_data['Cluster'] = kmeans.labels_ + (day * n_clusters)

        clustered_data.append(filtered_data)

# Single frame holding every weekday's rows together with their cluster id.
clustered_data_kmeans = pd.concat(clustered_data)

In [13]:
# Check: preview the labelled rows and count the distinct cluster ids
# (7 weekdays x 7 clusters per day gives at most 49).
display(clustered_data_kmeans.head())
print(clustered_data_kmeans["Cluster"].nunique())

Unnamed: 0,Lat,Lon,Week_day,Cluster
7785,40.7205,-73.9939,0,5
7786,40.7407,-74.0077,0,5
7787,40.7591,-73.9892,0,0
7788,40.7419,-74.0034,0,5
7789,40.7419,-74.0034,0,5


49


In [14]:
def create_scattermapbox(day_data, day):
    """Build the marker trace showing one weekday's clustered pickups.

    Args:
        day_data: rows for a single weekday; the 'Lat', 'Lon' and 'Cluster'
            columns are read.
        day: weekday number, used only to name the trace ("Day <day>").

    Returns:
        A go.Scattermapbox trace with one marker per row, coloured by cluster
        id and showing that id on hover.
    """
    cluster_ids = day_data['Cluster']
    # Fixed-size markers coloured by cluster id, without a colour bar.
    marker_style = dict(size=9, color=cluster_ids, showscale=False)

    return go.Scattermapbox(
        lat=day_data['Lat'],
        lon=day_data['Lon'],
        mode='markers',
        marker=marker_style,
        text=cluster_ids,
        hoverinfo='text',
        name=f'Day {day}',
    )

# Create the figure
fig = go.Figure()

# Add one scattermapbox trace per weekday; only Monday (day 0) starts visible.
for day in range(7):
    day_data = clustered_data_kmeans[clustered_data_kmeans['Week_day'] == day]
    trace = create_scattermapbox(day_data, day)
    trace['visible'] = day == 0
    fig.add_trace(trace)

# Dropdown menu: one button per weekday. Selecting a button shows that day's
# trace (traces were added in weekday order) and hides the six others.
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_buttons = [
    {
        'label': label,
        'method': 'update',
        'args': [{'visible': [trace_idx == selected for trace_idx in range(7)]}],
    }
    for selected, label in enumerate(weekday_labels)
]

# Update layout for the map: dropdown, base map centred on the mean pickup
# position, and no outer margins.
fig.update_layout(
    updatemenus=[
        {
            'buttons': weekday_buttons,
            'direction': 'down',
            'showactive': True,
        }
    ],
    mapbox=dict(
        style='carto-positron',
        center=dict(
            lat=clustered_data_kmeans['Lat'].mean(),
            lon=clustered_data_kmeans['Lon'].mean(),
        ),
        zoom=10,
    ),
    margin=dict(r=0, t=0, l=0, b=0),
)

# Show the map
fig.show()

Output hidden; open in https://colab.research.google.com to view.

### Density mapbox ??

In [16]:
def create_densitymapbox(day_data, day):
    """Build the density (heat-map) trace of one weekday's pickups.

    Every pickup contributes with the same weight, so the layer shows where
    rides concentrate. The cluster id is deliberately not passed as `z`:
    `z` is the per-point weight, and cluster ids are arbitrary labels, so
    using them would weight points by label value rather than by ride density.

    Args:
        day_data: rows for a single weekday; the 'Lat', 'Lon' and 'Cluster'
            columns are read.
        day: weekday number, used only to name the trace ("Day <day>").

    Returns:
        A go.Densitymapbox trace with one equally-weighted point per row,
        showing the row's cluster id on hover.
    """
    return go.Densitymapbox(
        lat=day_data['Lat'],
        lon=day_data['Lon'],
        radius=10,
        opacity=0.7,
        text=day_data['Cluster'],
        hoverinfo='text',
        name=f'Day {day}'
    )

# Create the figure
fig = go.Figure()

# Add one densitymapbox trace per weekday; only Monday (day 0) starts visible.
for day in range(7):
    day_data = clustered_data_kmeans[clustered_data_kmeans['Week_day'] == day]
    trace = create_densitymapbox(day_data, day)
    trace['visible'] = day == 0
    fig.add_trace(trace)

# Dropdown menu: one button per weekday. Selecting a button shows that day's
# trace (traces were added in weekday order) and hides the six others.
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_buttons = [
    {
        'label': label,
        'method': 'update',
        'args': [{'visible': [trace_idx == selected for trace_idx in range(7)]}],
    }
    for selected, label in enumerate(weekday_labels)
]

# Update layout for the map: dropdown, base map centred on the mean pickup
# position, and no outer margins.
fig.update_layout(
    updatemenus=[
        {
            'buttons': weekday_buttons,
            'direction': 'down',
            'showactive': True,
        }
    ],
    mapbox=dict(
        style='carto-positron',
        center=dict(
            lat=clustered_data_kmeans['Lat'].mean(),
            lon=clustered_data_kmeans['Lon'].mean(),
        ),
        zoom=10,
    ),
    margin=dict(r=0, t=0, l=0, b=0),
)

# Show the map
fig.show()

Output hidden; open in https://colab.research.google.com to view.

## DBScan

In [None]:
# Sampling for computational purposes: DBSCAN is run on 50_000 random rows.
# random_state pins the draw so that re-running the notebook clusters the same rides.
dataset_04_prep_dbscan = dataset_04_weekday.sample(n=50_000, random_state=0)
# The haversine metric expects (lat, lon) in radians. The conversion overwrites
# the Lat/Lon columns of the sampled frame, so they are no longer in degrees
# from here on (apply np.degrees before plotting them on a map).
dataset_04_prep_dbscan[["Lat","Lon"]] = np.radians(dataset_04_prep_dbscan[["Lat","Lon"]])

In [None]:
# Fit one DBSCAN model per weekday and give every (day, cluster) pair a distinct id.
clustered_data = []
epsilon = 0.001  # neighbourhood radius, in radians (haversine on radian inputs)
min_s = 50
next_cluster_id = 0  # first id not yet used by a previous day's clusters


# Iterate through each day of the week
for day in range(7):
  # .copy() detaches the day's rows from dataset_04_prep_dbscan, so adding the
  # 'Cluster' column below does not raise SettingWithCopyWarning.
  filtered_data = dataset_04_prep_dbscan[dataset_04_prep_dbscan['Week_day'] == day].copy()

  # Check if filtered_data is not empty to avoid errors
  if not filtered_data.empty:
    pickup_locations = filtered_data[['Lat', 'Lon']]

    # DBSCAN clustering
    db = DBSCAN(eps=epsilon, min_samples=min_s, metric='haversine')  # Adjust parameters
    db.fit(pickup_locations)

    # Add cluster labels to the dataframe (considering day).
    # DBSCAN numbers a day's clusters 0..n-1 and marks noise as -1. Noise stays
    # -1 on every day, and real clusters are shifted by the number of clusters
    # already assigned. The previous offset, day * len(set(labels)), depended
    # on each day's own label count, so ids from different days could collide
    # and noise was turned into valid-looking ids.
    labels = db.labels_
    is_noise = labels == -1
    filtered_data['Cluster'] = np.where(is_noise, -1, labels + next_cluster_id)
    next_cluster_id += len(set(labels[~is_noise]))

    clustered_data.append(filtered_data)

# Single frame holding every weekday's rows together with their cluster id.
clustered_data_dbscan = pd.concat(clustered_data)