# Imports

In [None]:
import plotly.express as px
import pandas as pd
from sklearn.cluster import DBSCAN, KMeans
from shapely.geometry import Point
import geopandas as gpd
import numpy as np
# Maps English weekday names to the integers 1 (Monday) through 7 (Sunday);
# read_df uses it to recode the "Day_Name" column.
dictionary = {"Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, "Saturday": 6, "Sunday": 7}

In [None]:
def read_df(m = 10000, filepath = "..\\data\\data_refactored\\uber-raw-data-14.csv"):
    """Load the pickup CSV and thin it to roughly ``m`` evenly spaced rows.

    Args:
        m: Approximate number of rows to keep. Every ``len(df) // m``-th row
            is retained, so the result may hold somewhat more than ``m`` rows.
        filepath: Path of the CSV file to read.

    Returns:
        The down-sampled DataFrame with the "Day_Name" column recoded through
        the module-level ``dictionary`` (weekday name -> number).
    """
    df = pd.read_csv(filepath)

    # A file with fewer than ``m`` rows yields a step of 0, which ``iloc``
    # rejects ("slice step cannot be zero"); clamp to 1 to keep every row.
    step = max(len(df) // m, 1)
    df = df.iloc[::step, :]

    return df.replace({"Day_Name": dictionary})
# Load the sampled dataset once; the visualize_* functions use it as the
# default value of their ``df`` parameter.
main_df = read_df()

# Function definitions

In [None]:
# DBSCAN
def get_centroid_df(df, eps=0.005, min_samples=5, max_points=3400):
    """Cluster pickup coordinates with DBSCAN and return one row per cluster.

    NOTE: a later cell defines a KMeans-based function with the same name;
    whichever of the two cells was executed last is the one callers get.

    Args:
        df: DataFrame with "Lat" and "Lon" columns.
        eps: DBSCAN neighbourhood radius, in the units of the coordinates.
        min_samples: DBSCAN minimum neighbourhood size for a core point.
        max_points: Only the first ``max_points`` rows of ``df`` are clustered.

    Returns:
        DataFrame indexed by cluster label with the columns "Lat", "Lon"
        (cluster centroid) and "cluster_size" (number of points in the
        cluster). Points DBSCAN labels as noise (-1) are not reported as a
        cluster. The frame is empty when there is no input or no cluster.
    """
    columns = ['Lat', 'Lon', 'cluster_size']
    empty_result = pd.DataFrame(columns=columns, index=pd.Index([], name="cluster"))

    sample = df[["Lat", "Lon"]].iloc[:max_points]
    if sample.empty:
        return empty_result

    # Build the geometry only for the rows that are kept, in one vectorised
    # call instead of a row-wise apply over the whole input frame.
    points_gdf = gpd.GeoDataFrame(
        sample.copy(),
        geometry=gpd.points_from_xy(sample["Lon"], sample["Lat"]),
    )

    # Cluster on (x, y) = (Lon, Lat) pairs.
    points = sample[["Lon", "Lat"]].to_numpy()
    points_gdf["cluster"] = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(points)

    # DBSCAN marks noise with the label -1; those points belong to no cluster,
    # so they must not be dissolved into a (misleading) centroid.
    points_gdf = points_gdf[points_gdf["cluster"] != -1]
    if points_gdf.empty:
        return empty_result

    # Centroid of the merged geometry of every cluster.
    centroids = points_gdf.dissolve(by="cluster").centroid

    result = pd.DataFrame({"Lat": centroids.y, "Lon": centroids.x})
    # Both sides are indexed by the cluster label, so this aligns per cluster.
    result["cluster_size"] = points_gdf.groupby("cluster").size()

    return result[columns]

In [None]:
# KMEANS
def get_centroid_df(df, n_clusters=10, max_points=3400):
    """Cluster pickup coordinates with KMeans and return one row per cluster.

    NOTE: this definition shares its name with the DBSCAN-based function in
    an earlier cell and replaces it once this cell has been executed.

    Args:
        df: DataFrame with "Lat" and "Lon" columns.
        n_clusters: Requested number of clusters; lowered to the number of
            available points when fewer points than clusters are present.
        max_points: Only the first ``max_points`` rows of ``df`` are clustered.

    Returns:
        DataFrame indexed by cluster label with the columns "Lat", "Lon"
        (cluster centroid) and "cluster_size" (number of points in the
        cluster). The frame is empty when ``df`` has no rows.
    """
    columns = ['Lat', 'Lon', 'cluster_size']

    sample = df[["Lat", "Lon"]].iloc[:max_points]
    if sample.empty:
        return pd.DataFrame(columns=columns, index=pd.Index([], name="cluster"))

    # Build the geometry only for the rows that are kept, in one vectorised
    # call instead of a row-wise apply over the whole input frame.
    points_gdf = gpd.GeoDataFrame(
        sample.copy(),
        geometry=gpd.points_from_xy(sample["Lon"], sample["Lat"]),
    )

    # Cluster on (x, y) = (Lon, Lat) pairs.
    points = sample[["Lon", "Lat"]].to_numpy()

    # KMeans raises when asked for more clusters than there are samples.
    effective_clusters = min(n_clusters, len(points))
    kmeans = KMeans(n_clusters=effective_clusters, init='k-means++', n_init=10)
    points_gdf["cluster"] = kmeans.fit_predict(points)

    # Centroid of the merged geometry of every cluster.
    centroids = points_gdf.dissolve(by="cluster").centroid

    result = pd.DataFrame({"Lat": centroids.y, "Lon": centroids.x})
    # Both sides are indexed by the cluster label, so this aligns per cluster.
    result["cluster_size"] = points_gdf.groupby("cluster").size()

    return result[columns]

In [None]:
def visualize_month(df=main_df, months:list[int]=None, latit = "Lat", longi = "Lon"):
    """Plot pickup cluster centroids on a map, one colour per month.

    Args:
        df: Pickup records with a "Month" column plus the latitude and
            longitude columns. Defaults to the ``main_df`` object that existed
            when this function was defined.
        months: Month values to keep; ``None`` or an empty list keeps every
            month present in ``df``.
        latit: Name of the latitude column in ``df``.
        longi: Name of the longitude column in ``df``.

    Returns:
        None. The figure is shown as a side effect; nothing is drawn when no
        rows remain after filtering.
    """
    # get_centroid_df reads and returns fixed "Lat"/"Lon" columns, so custom
    # column names are normalised up front. ``rename`` returns a new frame,
    # which also keeps the caller's data untouched.
    df = df.rename(columns={latit: "Lat", longi: "Lon"})
    if months:
        df = df[df['Month'].isin(months)]
    if df.empty:
        return

    # Sorted so the legend order is deterministic.
    centroids_gdfs = [
        get_centroid_df(df[df['Month'] == month]).assign(Month=month)
        for month in sorted(df["Month"].unique())
    ]
    if not centroids_gdfs:
        return

    centroids_gdfs = pd.concat(centroids_gdfs, axis = 0)

    # String labels make plotly treat the month as a discrete colour category.
    centroids_gdfs['Month'] = centroids_gdfs['Month'].astype(str)
    fig = px.scatter_mapbox(centroids_gdfs, lat="Lat", lon="Lon", color="Month", size="cluster_size",
                      color_discrete_sequence=px.colors.qualitative.Dark24,
                      size_max=50, zoom=10, mapbox_style="carto-positron")

    fig.show()

In [None]:
def visualize_week_day(df=main_df, days:list[int]=None, latit = "Lat", longi = "Lon"):
    """Plot pickup cluster centroids on a map, one colour per day of the week.

    Args:
        df: Pickup records with a "Day_Name" column holding the numeric codes
            from ``dictionary`` plus the latitude and longitude columns.
            Defaults to the ``main_df`` object that existed when this function
            was defined.
        days: Day codes to keep; ``None`` or an empty list keeps every day.
        latit: Name of the latitude column in ``df``.
        longi: Name of the longitude column in ``df``.

    Returns:
        None. The figure is shown as a side effect; nothing is drawn when no
        rows remain after filtering.
    """
    # get_centroid_df reads and returns fixed "Lat"/"Lon" columns, so custom
    # column names are normalised up front. ``rename`` returns a new frame,
    # which also keeps the caller's data untouched.
    df = df.rename(columns={latit: "Lat", longi: "Lon"})
    if days:
        df = df[df['Day_Name'].isin(days)]
    if df.empty:
        return

    centroids_gdfs = []
    for day_name in dictionary.values():
        day_df = df[df['Day_Name'].eq(day_name)]
        # Days removed by the filter (or absent from the data) have no points
        # to cluster; passing an empty frame on would make clustering fail.
        if day_df.empty:
            continue
        centroids_gdfs.append(get_centroid_df(day_df).assign(Day_Name=day_name))

    if not centroids_gdfs:
        return

    centroids_gdfs = pd.concat(centroids_gdfs, axis = 0)

    # String labels make plotly treat the day as a discrete colour category.
    centroids_gdfs['Day_Name'] = centroids_gdfs['Day_Name'].astype(str)
    fig = px.scatter_mapbox(centroids_gdfs, lat="Lat", lon="Lon", color="Day_Name", size="cluster_size",
                     color_discrete_sequence=px.colors.qualitative.Dark24,
                      size_max=50, zoom=10, mapbox_style="carto-positron")

    fig.show()

In [None]:
def visualize_by_time(df=main_df, hours:list[int]=None, dest_month=None, dest_day_name=None, latit = "Lat", longi = "Lon"):
    """Plot pickup cluster centroids on a map, one colour per hour bin.

    Args:
        df: Pickup records with "Minutes", "Month" and "Day_Name" columns plus
            the latitude and longitude columns. Defaults to the ``main_df``
            object that existed when this function was defined.
        hours: Hour-bin labels (0-23) to show; ``None`` or an empty sequence
            shows every bin that contains data.
        dest_month: If given, keep only rows whose "Month" equals this value.
        dest_day_name: If given, keep only rows whose "Day_Name" equals this
            value.
        latit: Name of the latitude column in ``df``.
        longi: Name of the longitude column in ``df``.

    Returns:
        None. The figure is shown as a side effect; nothing is drawn when no
        rows remain after filtering.
    """
    # get_centroid_df reads and returns fixed "Lat"/"Lon" columns, so custom
    # column names are normalised up front. ``rename`` returns a new frame,
    # which also keeps the caller's data untouched.
    df = df.rename(columns={latit: "Lat", longi: "Lon"})
    if dest_month:
        # The month filter has to look at "Month", not "Day_Name".
        df = df[df["Month"].eq(dest_month)]
    if dest_day_name:
        df = df[df["Day_Name"].eq(dest_day_name)]
    if df.empty:
        # pd.cut cannot bin an empty column.
        return

    # NOTE(review): bins=24 splits the *observed* min..max range of "Minutes"
    # into 24 equal parts, so the labels only match clock hours when the data
    # spans the whole day. Presumably "Minutes" is minutes since midnight -
    # confirm, then consider fixed 60-minute bin edges.
    df = df.assign(Hour=pd.cut(df['Minutes'], bins=24, labels=range(24)))

    # Key the groups by their hour label so ``hours`` selects by label rather
    # than by list position; observed=True leaves out bins without any rows.
    hour_groups = {hour: group for hour, group in df.groupby('Hour', observed=True)}
    selected_hours = hours if hours else list(hour_groups)

    centroids_gdfs = []
    for hour in selected_hours:
        hour_df = hour_groups.get(hour)
        if hour_df is None:
            # Requested bin holds no pickups; nothing to cluster.
            continue
        centroids_gdfs.append(get_centroid_df(hour_df).assign(Hour=hour))

    if not centroids_gdfs:
        return

    centroids_gdfs = pd.concat(centroids_gdfs, axis = 0)

    # String labels make plotly treat the hour as a discrete colour category.
    centroids_gdfs['Hour'] = centroids_gdfs['Hour'].astype(str)
    fig = px.scatter_mapbox(centroids_gdfs, lat="Lat", lon="Lon", color="Hour", size="cluster_size",
                            color_discrete_sequence=px.colors.qualitative.Dark24,
                            size_max=50, zoom=10, mapbox_style="carto-positron")

    fig.show()

# uber-raw-data-14.csv

## Pickups in each month

In [None]:
# No month filter: every month present in the sample is drawn on one map.
visualize_month()

In [None]:
# Draw a separate map for each of the months 6, 7 and 8.
for month in (6, 7, 8):
    visualize_month(months=[month])

## Pickups per day of the week

In [None]:
# No day filter: all days of the week are drawn on one map.
visualize_week_day()

## Pickups by time of day

In [None]:
# No hour, month or day filter: all hour bins are drawn on one map.
visualize_by_time()

In [None]:
# Restrict the map to the hour bins selected by range(7, 10), i.e. 7, 8 and 9.
visualize_by_time(hours=range(7, 10))

In [None]:
# Restrict the map to the hour bins selected by range(15, 18), i.e. 15, 16 and 17.
visualize_by_time(hours=list(range(15, 18)))