# User

________________________________________________________________________________
This is the only part of the code that needs to be modified by the user.
To view the graphs for your dataset, make sure to have you have it formatted as a **.csv** file. You need to click on the *'Files'* tab on the **left sidebar**, drag and drop it to upload it and then wait for it to upload itself. Then you will need to change **file_name = ''**, such that the name of the file you just uploades is inside the apostrophes. Afterwards, click on **top bar** on *'Runtime'*, and then *'Run all'*.
________________________________________________________________________________

In [67]:
file_name = ''
censor_flag = True

# Dev Only

Imports

In [45]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.cluster import OPTICS
from sklearn.metrics import silhouette_score
import plotly.graph_objects as go
import plotly.express as px
from itertools import product

Functions

In [51]:
'''
This function reads the .csv file, and prepares the dataframe for
further uses later.
'''
def read_file(file_path):
    # Load the CSV file
    df = pd.read_csv(file_path)

    # Convert the 'data Internare' column to datetime format
    df['data Internare'] = pd.to_datetime(df['data Internare'], format='%Y-%m-%d %H:%M:%S')

    # Define the beginning of the year
    start_of_year = pd.Timestamp('2013-01-01 00:00:00')

    # Rename the 'Lng' column to 'Long'
    df.rename(columns={'Lng': ' Long'}, inplace=True)

    # Calculate the difference in months from the start of the year
    df['month'] = (df['data Internare'] - start_of_year) / pd.Timedelta('30 days')

    return df

'''
This function applies the optics clustering, creating another column on the dataframe which signifies
which cluster each row is a part of.
It returns the changed dataframe and the distance matrix for the computed given spatial and temporal weights.
'''
def apply_optics_clustering(df, spatial_weight, temporal_weight, min_samples=5, xi=0.05, min_cluster_size=0.0005, debug=0):
    # Convert latitude and longitude to radians
    tdf = np.radians(df[['Lat', ' Long']])
    df_time = df['month']
    tdf = pd.concat([tdf, df_time], axis=1)

    # Calculate distance matrices
    distance_matrix_time = pairwise_distances(tdf, metric=lambda x, y: abs(x[2] - y[2]))
    df_spatial = tdf[['Lat', ' Long']]
    distance_matrix_haversine = pairwise_distances(df_spatial, metric='haversine')

    # Calculate weighted distance matrix
    distance_matrix = distance_matrix_time * temporal_weight + distance_matrix_haversine * spatial_weight

    # Apply OPTICS with precomputed distance matrix
    optics = OPTICS(min_samples=min_samples, xi=xi, min_cluster_size=min_cluster_size, metric='precomputed')
    optics.fit(distance_matrix)

    # Get cluster labels
    labels = optics.labels_

    # Add labels to the DataFrame
    df['label'] = labels
    df = df.sort_values(by='month')

    if debug == 1:
      df.head()

    return df, distance_matrix

'''
  This functions plots the whole graph with noise.
'''

def plot_clusters(df, censor = False):
    # Separate noise points (label = -1) and clusters
    noise_df = df[df['label'] == -1]
    cluster_df = df[df['label'] != -1]

    # Plot the clusters using Plotly
    fig = go.Figure()

    if censor == False:

      fig.add_trace(go.Scattermapbox(
          lat=noise_df['Lat'],
          lon=noise_df[' Long'],
          mode='markers',
          marker=go.scattermapbox.Marker(
              size=5,
              color='rgb(0, 0, 0)',
              opacity=0.7
          ),
          text=noise_df['data Internare'].dt.strftime('%Y-%m-%d %H:%M') + '<br>Noise',
      ))

      fig.add_trace(go.Scattermapbox(
          lat=cluster_df['Lat'],
          lon=cluster_df[' Long'],
          mode='markers',
          marker=go.scattermapbox.Marker(
              size=10,
              color=cluster_df['label'],
              colorscale='mygbm',
              opacity=0.7
          ),
          text=cluster_df['data Internare'].dt.strftime('%Y-%m-%d %H:%M') + '<br>' + cluster_df['label'].astype(str),
      ))

    if censor == True:

      fig.add_trace(go.Scattermapbox(
          lat=noise_df['Lat'],
          lon=noise_df[' Long'],
          mode='markers',
          marker=go.scattermapbox.Marker(
              size=5,
              color='rgb(0, 0, 0)',
              opacity=0.7
          ),
          hoverinfo='text',
          text=noise_df['data Internare'].dt.strftime('%Y-%m-%d') + '<br>Noise',
      ))

      fig.add_trace(go.Scattermapbox(
          lat=cluster_df['Lat'],
          lon=cluster_df[' Long'],
          mode='markers',
          marker=go.scattermapbox.Marker(
              size=10,
              color=cluster_df['label'],
              colorscale='mygbm',
              opacity=0.7
          ),
          hoverinfo='text',
          text=cluster_df['data Internare'].dt.strftime('%Y-%m-%d') + '<br>' + cluster_df['label'].astype(str),
      ))


    fig.update_layout(mapbox_style="carto-positron",
                      mapbox_zoom=7.5,
                      mapbox_center={"lat": 45.7407, "lon": 21.2196},
                      margin={"r": 0, "t": 0, "l": 0, "b": 0})

    fig.show()


'''
  This functions plots the graph, in a timeline manner with all different clusters representing a frame.
'''
def plot_cluster_slices(df, title='OPTICS Clusters with Haversine Distance', zoom=7.5, center_lat=45.7407, center_lon=21.2196, censor = False):

    df['censored date'] = pd.to_datetime(df['data Internare']).dt.strftime('%Y-%m-%d')
    cluster_df = df[df['label'] != -1]
    df = df.sort_values(by='label')

    if censor == False:
      hover_columns = {
        'Lat': True,          # Show latitude on hover
        ' Long': True,         # Show longitude on hover
        'label': True,        # Show cluster label on hover
        'data Internare': True,   # Show date on hover
      }

    if censor == True:
      hover_columns = {
        'Lat': False,          # Show latitude on hover
        ' Long': False,         # Show longitude on hover
        'label': True,        # Show cluster label on hover
        'censored date': True,   # Show date on hover
      }

    # Create the scatter mapbox plot
    fig = px.scatter_mapbox(
        cluster_df,
        lat='Lat',
        lon=' Long',  # Ensure 'Long' is used consistently
        color='label',
        title=title,
        animation_frame='label',
        animation_group='label',
        size_max=15,
        zoom=zoom,
        hover_data=hover_columns  # Add hover data
    )

    # Update traces with specific marker settings
    fig.update_traces(marker=dict(
        size=10,
        color=cluster_df['label'],
        colorscale='mygbm',
        opacity=0.7
    ))

    # Update layout with mapbox style and center coordinates
    fig.update_layout(mapbox_style="carto-positron",
                      mapbox_zoom=zoom,
                      mapbox_center={"lat": center_lat, "lon": center_lon},
                      margin={"r": 0, "t": 0, "l": 0, "b": 0})

    # Display the plot
    fig.show()

'''
  This functions plots the graph, in a timeline manner with each month representing a frame.
'''
def plot_cluster_by_month(df, title='OPTICS Clusters with Haversine Distance', zoom=7.5, center_lat=45.7407, center_lon=21.2196, censor = False):

    # Assuming df has a 'date' column from which you want to extract the month
  df['wholeMonth'] = pd.to_datetime(df['data Internare']).dt.strftime('%Y-%m')  # Extract year and month in YYYY-MM format
  df['censored date'] = pd.to_datetime(df['data Internare']).dt.strftime('%Y-%m-%d')
  df = df.sort_values(by='wholeMonth')

  # Filter the DataFrame to exclude label -1
  cluster_df = df[df['label'] != -1]

  # Create an empty DataFrame to store the data for each month
  frames = []

  # Loop through each unique month
  for currentMonth in cluster_df['wholeMonth'].unique():
    # Get the clusters that have any points in the current month
    clusters_in_month = cluster_df.loc[cluster_df['wholeMonth'] == currentMonth, 'label'].unique()

    # Select all points from these clusters
    points_in_clusters = cluster_df.loc[cluster_df['label'].isin(clusters_in_month)].copy()  # Use .copy() to avoid warnings

    # Assign the current month to all these points for the animation
    points_in_clusters.loc[:, 'wholeMonthFrame'] = currentMonth  # Use .loc[] to modify the DataFrame in place

    # Append to the frames list
    frames.append(points_in_clusters)

  # Concatenate all frames into a single DataFrame
  final_df = pd.concat(frames)

  # Define the columns to display on hover
  if censor == False:
    hover_columns = {
      'Lat': True,          # Show latitude on hover
      ' Long': True,         # Show longitude on hover
      'label': True,        # Show cluster label on hover
      'data Internare': True,   # Show date on hover
    }

  if censor == True:
    hover_columns = {
      'Lat': False,          # Show latitude on hover
      ' Long': False,         # Show longitude on hover
      'label': True,        # Show cluster label on hover
      'censored date': True,   # Show date on hover
    }

  # Create the scatter mapbox plot with wholewholeMonthFrame as the animation frame
  fig = px.scatter_mapbox(
      final_df,
      lat='Lat',
      lon=' Long',
      color='label',
      title=title,
      animation_frame='wholeMonthFrame',  # Use 'wholewholeMonthFrame' for animation frames
      size_max=15,
      zoom=zoom,
      hover_data=hover_columns  # Add hover data
  )

  # Update traces with specific marker settings
  fig.update_traces(marker=dict(
      size=10,
      colorscale='mygbm',
      opacity=0.7
  ))

  # Update layout with mapbox style and center coordinates
  fig.update_layout(
      mapbox_style="carto-positron",
      mapbox_zoom=zoom,
      mapbox_center={"lat": center_lat, "lon": center_lon},
      margin={"r": 0, "t": 0, "l": 0, "b": 0}
  )

  # Display the plot
  fig.show()


'''
  This function returns the silhouette score of a given distance matrix.
'''
    # Function to evaluate the clustering
def evaluate_clustering(labels, distance_matrix):
    if len(set(labels)) > 1:  # Ensure there is more than one cluster
        return silhouette_score(distance_matrix, labels, metric="precomputed")
    else:
        return -1  # Return -1 for cases where clustering is not meaningful

'''
  Function to perform grid search for optimal spatial and temporal weights.
'''
def grid_search_optimal_weights(df, min_samples=10, xi=0.05, min_cluster_size=0.0005, debug = 0):
    best_score = -1
    best_spatial_weight = None
    best_temporal_weight = None
    spatial_weights = np.linspace(0.1, 0.9, 9)

    for spatial_weight in spatial_weights:
        # Apply the clustering function
        temporal_weight = 1.0 - spatial_weight
        clustered_df, distance_matrix = apply_optics_clustering(df.copy(), spatial_weight, temporal_weight, min_samples, xi, min_cluster_size)

        # Evaluate the clustering
        score = evaluate_clustering(clustered_df['label'], distance_matrix)

        if debug == 1:
          print(f"Spatial Weight: {spatial_weight}, Temporal Weight: {temporal_weight}, Score: {score}")

        # Check if this is the best score
        if score > best_score:
            best_score = score
            best_spatial_weight = spatial_weight
            best_temporal_weight = temporal_weight

    if debug == 1:
      print(f"\nBest Spatial Weight: {best_spatial_weight}")
      print(f"Best Temporal Weight: {best_temporal_weight}")
      print(f"Best Silhouette Score: {best_score}")

    return best_spatial_weight, best_temporal_weight


Algorithm

In [47]:
health_df = read_file(file_name)
best_spatial_weight, best_temporal_weight = grid_search_optimal_weights(health_df, debug = 1)
apply_optics_clustering(health_df, best_spatial_weight, best_temporal_weight)

Spatial Weight: 0.1, Temporal Weight: 0.9, Score: -0.17675468872280933
Spatial Weight: 0.2, Temporal Weight: 0.8, Score: -0.17721723371468565
Spatial Weight: 0.30000000000000004, Temporal Weight: 0.7, Score: -0.18638414663445213
Spatial Weight: 0.4, Temporal Weight: 0.6, Score: -0.18415044826194651
Spatial Weight: 0.5, Temporal Weight: 0.5, Score: -0.18020704630756504
Spatial Weight: 0.6, Temporal Weight: 0.4, Score: -0.1842201463413697
Spatial Weight: 0.7000000000000001, Temporal Weight: 0.29999999999999993, Score: -0.18800000106679748
Spatial Weight: 0.8, Temporal Weight: 0.19999999999999996, Score: -0.19300096538227426
Spatial Weight: 0.9, Temporal Weight: 0.09999999999999998, Score: -0.20828582385621344

Best Spatial Weight: 0.1
Best Temporal Weight: 0.9
Best Silhouette Score: -0.17675468872280933


(           identifier          FO      data Internare        Lat       Long  \
 4930  xovoed7r86so3hg     72/2013 2013-01-04 11:08:39  45.719899  21.157768   
 4926  bwjm4pdcjaavwyo    125/2013 2013-01-07 09:53:15  45.969180  20.770985   
 4931  igaxx5z05gv0cem    173/2013 2013-01-08 08:54:07  45.837414  21.096789   
 4927  rpc34rc1hv38pqz    227/2013 2013-01-09 08:59:24  45.745880  21.198654   
 4928  wql88k4c7ffxam3    363/2013 2013-01-11 15:00:44  45.748872  21.208679   
 ...               ...         ...                 ...        ...        ...   
 627   4620toh5nz1u36v  22197/2023 2023-11-28 09:06:02  45.776878  21.220126   
 628   wtqnlets9cg3ojb  22201/2023 2023-11-28 09:12:16  45.546514  20.990175   
 629   qmima1gc07mer17  22205/2023 2023-11-28 09:17:26  45.731071  21.204905   
 630   liecg0a141ygz0i  22228/2023 2023-11-28 10:18:01  45.762213  21.024143   
 631   to14r14gai352is  22239/2023 2023-11-28 10:40:11  45.734127  21.237930   
 
            month  label  
 4930    0.

# Graphs

In [68]:
plot_clusters(health_df, censor = censor_flag)

In [69]:
plot_cluster_slices(health_df, censor = censor_flag)

In [70]:
plot_cluster_by_month(health_df, censor = censor_flag)