# Import & Initialization

In [None]:
import os
import time
from datetime import datetime
from dotenv import load_dotenv

import googlemaps
import gmplot

import pandas as pd

import osmnx as ox
import random

import numpy as np

import re
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz

from heapq import heappush, heappop


import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder,MinMaxScaler
from sklearn.cluster import DBSCAN
import numpy as np
from datetime import datetime, timedelta
import pytz
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

from io import StringIO
import uuid
import ast

In [None]:
# Load variables from a local .env file into the process environment, then
# build the Google Maps client from the key found there.
load_dotenv()
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY')
gmaps = googlemaps.Client(key=API_KEY)

# Generate Road Names (Riyadh)

## Create Road Names

In [None]:
# Fetch the drivable street network for the city and flatten every edge into
# a (road name, first-vertex string, last-vertex string) tuple.
city_name = 'Riyadh, Saudi Arabia'
G = ox.graph_from_place(city_name, network_type='drive')

# Only the edge table is requested; node data is not used here.
edges = ox.graph_to_gdfs(G, nodes=False, edges=True)

roads_with_coords = []

for _, edge in edges.iterrows():
    vertices = edge['geometry'].coords
    first_vertex = vertices[0]
    last_vertex = vertices[-1]

    # Each string is "<component 1>,<component 0>" of the vertex.
    # NOTE(review): presumably vertices are (lng, lat), making these
    # "lat,lng" strings — confirm against the geometry's CRS.
    origin = f"{first_vertex[1]},{first_vertex[0]}"
    destination = f"{last_vertex[1]},{last_vertex[0]}"

    # Falls back only when the column is absent or the value is None.
    if 'name' in edge and edge['name'] is not None:
        road_name = edge['name']
    else:
        road_name = "Unnamed road"

    roads_with_coords.append((road_name, origin, destination))

# Preview the first ten collected entries.
for road_name, origin, destination in roads_with_coords[:10]:
    print(f"Road: {road_name}, Origin: {origin}, Destination: {destination}")

In [None]:
def _name_is_present(name):
    """Return True when a road name (scalar, list or array) contains no null."""
    missing = pd.isnull(name)
    if isinstance(name, (list, np.ndarray)):
        # For list/array names, a single null element disqualifies the entry.
        return not missing.any()
    return not missing


# Road names (first tuple element) of every edge whose name has no null value.
list_graph_roads = [
    entry[0] for entry in roads_with_coords if _name_is_present(entry[0])
]


In [None]:
# Keep only names that are plain strings (list-valued and null names are
# skipped), then collapse them into a set of distinct names.
road_names = []
for name, _origin, _destination in roads_with_coords:
    if isinstance(name, str):
        road_names.append(name)

unique_road_names = set(road_names)

## Filter Road Names

In [None]:
def remove_duplicate_roads(road_names, threshold=80):
    """Drop road names that are near-duplicates of an already kept name.

    Names are visited in iteration order; a name is kept only if its
    ``fuzz.ratio`` similarity to every previously kept name is at most
    ``threshold``. The result therefore depends on the iteration order of
    ``road_names`` (arbitrary when a set is passed).

    Args:
        road_names: Iterable of road-name strings.
        threshold: Similarity score above which two names are treated as
            the same road. Defaults to 80, the previously hard-coded value.

    Returns:
        List of the kept road names, in the order they were encountered.
    """
    unique_roads = []
    for road in road_names:
        is_near_duplicate = any(
            fuzz.ratio(road, kept) > threshold for kept in unique_roads
        )
        if is_near_duplicate:
            continue
        # Progress tracker; the old label ("Original road count") did not
        # describe what was printed, which is the name of a kept road.
        print(f"Keeping road: {road}")
        unique_roads.append(road)

    return unique_roads

# Collapse near-duplicate spellings among the distinct road names.
# unique_road_names is a set, so the order in which names are compared
# (and therefore which spelling survives) is not fixed.
cleaned_roads = remove_duplicate_roads(unique_road_names)

In [None]:
def get_road_start_end(gmaps_client, road_name, city='Riyadh'):
    """Geocode a road and return the corners of its bounding box.

    The query sent to the geocoder is "<road_name>, <city>". The first
    result's ``bounds`` are used: the north-east corner is returned as the
    start point and the south-west corner as the end point.

    Args:
        gmaps_client: Object exposing ``geocode(query)``.
        road_name: Name of the road to look up.
        city: City appended to the query.

    Returns:
        ``((lat, lng), (lat, lng))`` for (start, end), or ``(None, None)``
        when there is no result, the result has no bounds, or the lookup
        raised (the error is printed).
    """
    try:
        results = gmaps_client.geocode(f"{road_name}, {city}")

        if not results or 'bounds' not in results[0]['geometry']:
            return None, None

        bounds = results[0]['geometry']['bounds']
        northeast = bounds['northeast']
        southwest = bounds['southwest']
        start = (northeast['lat'], northeast['lng'])
        end = (southwest['lat'], southwest['lng'])
        return start, end
    except Exception as e:
        print(f"Error getting start/end points for {road_name}: {e}")
        return None, None

In [None]:
def get_traffic_data(gmaps_client, road_name, city='Riyadh'):
    """Return current driving time and distance along a named road.

    The road's start/end points come from ``get_road_start_end``; a driving
    route between them is then requested with ``departure_time="now"``.

    Args:
        gmaps_client: Object exposing ``geocode`` and ``directions``.
        road_name: Name of the road to measure.
        city: City passed through to the geocoding lookup.

    Returns:
        ``(duration_in_traffic_minutes, distance_km)`` taken from the first
        leg of the first route, or ``(None, None)`` when the end points are
        unavailable, no route is returned, or the request raised (the error
        is printed).
    """
    start_coords, end_coords = get_road_start_end(gmaps_client, road_name, city)

    if start_coords is None or end_coords is None:
        print(f"Could not retrieve start/end points for {road_name}.")
        return None, None

    origin = "{},{}".format(*start_coords)
    destination = "{},{}".format(*end_coords)

    try:
        routes = gmaps_client.directions(
            origin,
            destination,
            mode="driving",
            departure_time="now",  # Real-time traffic data
            traffic_model="best_guess",
        )

        if not routes:
            return None, None

        first_leg = routes[0]['legs'][0]
        # API values are divided by 60 and 1000 to get minutes / kilometres.
        minutes_in_traffic = first_leg['duration_in_traffic']['value'] / 60
        kilometres = first_leg['distance']['value'] / 1000
        return minutes_in_traffic, kilometres
    except Exception as e:
        print(f"Error fetching traffic data for {road_name}: {e}")
        return None, None

# Bounded heap holding the 200 roads with the LOWEST distance/time ratio.
# Entries are keyed on the negated ratio, so the heap root is always the
# stored road with the highest ratio — the one to evict first.
top_200_heap = []

for road in cleaned_roads:
    traffic_time, road_distance = get_traffic_data(gmaps, road)

    # Guard: need usable (truthy) measurements and a road longer than 5 km.
    eligible = traffic_time and road_distance and road_distance > 5
    if not eligible:
        print(f"Skipped Road: {road} (Distance < 5 km or invalid data)")
        continue

    # Kilometres per minute of current driving time.
    ratio = road_distance / traffic_time

    # Tracker output before deciding whether the road enters the heap.
    print(f"Eligible Road: {road}, Distance: {road_distance} km, Time: {traffic_time} mins, Ratio: {ratio}")

    entry = (-ratio, road, road_distance, traffic_time)

    if len(top_200_heap) < 200:
        # Still filling up: every eligible road is admitted.
        heappush(top_200_heap, entry)
        print(f"Included Road: {road}, Distance: {road_distance} km, Time: {traffic_time} mins, Ratio: {ratio}")
    elif -ratio > top_200_heap[0][0]:
        # Full, and this road's ratio is below the current highest: swap it in.
        heappop(top_200_heap)
        heappush(top_200_heap, entry)
        print(f"Replaced Road: {road}, Distance: {road_distance} km, Time: {traffic_time} mins, Ratio: {ratio}")

In [None]:
# Turn the heap entries into a table, restore the sign of the ratio and
# persist the result for the next stage.
heap_columns = ['Negative Ratio', 'Road Name', 'Distance (km)', 'Time (mins)']
df = pd.DataFrame(top_200_heap, columns=heap_columns)

# pop() removes the helper column; the positive ratio is appended last.
df['Ratio (km/min)'] = -df.pop('Negative Ratio')

df.to_csv('filtered_road_names.csv', index=False)

print(f"Data saved to filtered_road_names.csv")

## Add Start & End Points to road names

In [None]:
def _lookup_endpoints(name):
    """Geocode one road name and return its (start, end) pair as a Series."""
    return pd.Series(get_road_start_end(gmaps, name, city='Riyadh'))


# Re-read the filtered roads, attach geocoded start/end points as two new
# columns, and overwrite the file in place.
df = pd.read_csv('filtered_road_names.csv')
endpoints = df['Road Name'].apply(_lookup_endpoints)
df[['Start_point', 'End_point']] = endpoints
df.to_csv('filtered_road_names.csv', index=False)

# Generate Data

In [None]:
# Load the filtered road list (with its Start_point / End_point columns)
# from disk.
df = pd.read_csv('filtered_road_names.csv')

In [None]:
# Build a fresh Google Maps client and reload the filtered road list.
# NOTE(review): both objects already exist from earlier cells; presumably
# repeated so this section can be run without re-running the road-name
# generation above — confirm.
gmaps = googlemaps.Client(key=API_KEY)
df = pd.read_csv('filtered_road_names.csv')


def determine_traffic_color(delay, duration):
    """Classify congestion by comparing a delay with the baseline duration.

    ``delay`` and ``duration`` must be in the same unit. The label is picked
    from the first band whose upper bound (a fraction of ``duration``) is
    strictly greater than ``delay``:

        < 5%   -> 'Blue'
        < 20%  -> 'Yellow'
        < 50%  -> 'Orange'
        < 100% -> 'Red'
        else   -> 'Dark Red'
    """
    bands = (
        (0.05, 'Blue'),
        (0.20, 'Yellow'),
        (0.50, 'Orange'),
        (1.00, 'Red'),
    )
    for fraction, label in bands:
        if delay < fraction * duration:
            return label
    return 'Dark Red'

def get_traffic_data_from_dataframe(gmaps_client, start_lat, start_lng, end_lat, end_lng, road_name, city='Riyadh'):
    """Request a driving route between two points and summarise its traffic.

    Args:
        gmaps_client: Object exposing ``directions(...)``.
        start_lat, start_lng: Coordinates of the route origin.
        end_lat, end_lng: Coordinates of the route destination.
        road_name: Label copied into the result and used in error output.
        city: Accepted for signature compatibility; not used by the body.

    Returns:
        A dict with keys ``road_name``, ``distance_km``,
        ``duration_in_traffic_min``, ``speed_kmh``, ``delay_min``,
        ``traffic_condition``, ``timestamp``, ``Start_point`` and
        ``End_point``; or ``None`` when no route is returned or anything in
        the request/processing raised (the error is printed).
    """
    origin = f"{start_lat},{start_lng}"
    destination = f"{end_lat},{end_lng}"

    try:
        directions_result = gmaps_client.directions(
            origin,
            destination,
            mode="driving",
            departure_time="now",
            traffic_model="best_guess",
        )

        if not directions_result:
            return None

        # Only the first leg of the first route is considered.
        leg = directions_result[0]['legs'][0]

        traffic_seconds = leg['duration_in_traffic']['value']
        duration_in_traffic_min = traffic_seconds / 60
        distance_km = leg['distance']['value'] / 1000
        speed_kmh = distance_km / (duration_in_traffic_min / 60)

        # Delay is the extra time in traffic over the baseline duration.
        baseline_seconds = leg['duration']['value']
        delay_min = (traffic_seconds - baseline_seconds) / 60
        traffic_condition = determine_traffic_color(delay_min, baseline_seconds / 60)

        # Naive local time of the machine running the collection.
        timestamp = datetime.now().isoformat()

        summary = {
            'road_name': road_name,
            'distance_km': distance_km,
            'duration_in_traffic_min': duration_in_traffic_min,
            'speed_kmh': speed_kmh,
            'delay_min': delay_min,
            'traffic_condition': traffic_condition,
            'timestamp': timestamp,
            'Start_point': (start_lat, start_lng),
            'End_point': (end_lat, end_lng),
        }
        return summary
    except Exception as e:
        print(f"Error fetching traffic data for {road_name}: {e}")
        return None

def generate_data_from_dataframe(csv_file, gmaps_client):
    """Collect one traffic snapshot for every road listed in a CSV file.

    The CSV must provide the columns ``Road Name``, ``Start_point`` and
    ``End_point``; the two point columns hold the textual form of a
    ``(lat, lng)`` tuple. Each usable row is sent to
    ``get_traffic_data_from_dataframe`` and the successful results are
    written to a new, uniquely named ``traffic_data_<time>_<uuid>.csv`` in
    the current working directory.

    Args:
        csv_file: Path of the road list to read.
        gmaps_client: Client forwarded to ``get_traffic_data_from_dataframe``.

    Returns:
        None. Output is the CSV file plus progress messages on stdout.
    """
    df = pd.read_csv(csv_file)

    # Empty point cells load as NaN, which ast.literal_eval rejects with a
    # ValueError; skip those rows instead of aborting the whole collection.
    usable = df.dropna(subset=['Start_point', 'End_point'])
    skipped = len(df) - len(usable)
    if skipped:
        print(f"Skipped {skipped} road(s) without start/end points.")

    traffic_data_list = []
    rows = zip(usable['Road Name'], usable['Start_point'], usable['End_point'])
    for road_name, start_text, end_text in rows:
        # literal_eval only parses Python literals; it does not execute code.
        start_lat, start_lng = ast.literal_eval(start_text)
        end_lat, end_lng = ast.literal_eval(end_text)

        traffic_data = get_traffic_data_from_dataframe(
            gmaps_client, start_lat, start_lng, end_lat, end_lng, road_name
        )

        # Failed lookups come back as None and are left out.
        if traffic_data:
            traffic_data_list.append(traffic_data)

    if not traffic_data_list:
        print("No traffic data retrieved; nothing was saved.")
        return

    traffic_df = pd.DataFrame(traffic_data_list)

    # Timestamp plus UUID keeps repeated runs from overwriting each other.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    unique_filename = f"traffic_data_{timestamp}_{uuid.uuid4()}.csv"

    traffic_df.to_csv(unique_filename, index=False)

    print(f"Traffic data saved as '{unique_filename}'.")

# Take one traffic snapshot of every road in the filtered list; each run
# writes its own uniquely named traffic_data_*.csv file.
generate_data_from_dataframe('filtered_road_names.csv', gmaps)

# Select Important Roads

In [None]:
# Load every traffic snapshot CSV from the data directory into one frame.
directory = '/content/DS'

# generate_data_from_dataframe writes the point columns as 'Start_point' /
# 'End_point', while the analysis cells below select 'start_point' /
# 'end_point'. Normalise to the lowercase spelling on load so both agree.
column_aliases = {'Start_point': 'start_point', 'End_point': 'end_point'}

files = os.listdir(directory)

# Sorted so the row order of the combined frame does not depend on the
# arbitrary order in which the OS lists the directory.
csv_files = sorted(f for f in files if f.endswith('.csv'))

if not csv_files:
    # pd.concat on an empty list fails with "No objects to concatenate",
    # which hides the real cause.
    raise FileNotFoundError(f"No CSV files found in {directory}")

dfs = []
for csv_file in csv_files:
    file_path = os.path.join(directory, csv_file)
    # rename() skips aliases that are absent, so files already using the
    # lowercase names pass through unchanged.
    dfs.append(pd.read_csv(file_path).rename(columns=column_aliases))

df = pd.concat(dfs, ignore_index=True)

In [None]:
# Parse the timestamps once and shift them forward by three hours.
# NOTE(review): presumably converts UTC capture times to Riyadh local time
# (UTC+3) — confirm against the machine that collected the data.
df['timestamp'] = pd.to_datetime(df['timestamp']) + timedelta(hours=3)

# Keep routes between 10 and 40 km (inclusive). The explicit copy makes the
# column assignments below write to an independent frame rather than to a
# slice of the loaded data, which would raise SettingWithCopyWarning.
df = df[(df['distance_km'] >= 10) & (df['distance_km'] <= 40)].copy()

# Calendar parts of the shifted timestamp.
df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.day
df['month'] = df['timestamp'].dt.month

# Feature engineering:
#   congestion_ratio = minutes of delay per kilometre
#   distance_ratio   = minutes of travel (in traffic) per kilometre
df['congestion_ratio'] = df['delay_min'] / df['distance_km']
df['distance_ratio'] = df['duration_in_traffic_min'] / df['distance_km']

# Drop every observation taken on the 22nd of any month.
# NOTE(review): the reason for excluding day 22 is not recorded here — confirm.
df = df[df['day'] != 22]

# Column subset describing congestion per observation.
congestion_info = df[['road_name', 'distance_km', 'delay_min', 'congestion_ratio','distance_ratio', 'hour','day','month', 'traffic_condition', 'timestamp', 'start_point', 'end_point']]



In [None]:
import pandas as pd


def _most_frequent(values):
    """Return the first of the most frequent values in a group."""
    return values.mode()[0]


point_keys = ['start_point', 'end_point']
route_keys = ['road_name'] + point_keys
mean_columns = {'congestion_ratio': 'mean', 'distance_ratio': 'mean'}

# Average both ratios per (road_name, start_point, end_point) combination.
aggregation_result_3 = df.groupby(route_keys).agg(mean_columns).reset_index()

# Most frequent traffic_condition per (start_point, end_point) pair; note the
# road name is not part of this grouping.
traffic_condition_mode = (
    df.groupby(point_keys)['traffic_condition'].agg(_most_frequent).reset_index()
)

# Attach the modal traffic condition to the averaged ratios via the point pair.
aggregation_result_3 = aggregation_result_3.merge(traffic_condition_mode, on=point_keys)

# Alias only: both names refer to the same DataFrame object.
ag_df_3 = aggregation_result_3


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

# Features used for clustering: the two per-route mean ratios.
features = aggregation_result_3[['congestion_ratio', 'distance_ratio']]

# The clustering step of this notebook fits K-Means on MinMax-scaled
# features, and K-Means is sensitive to feature scale. Choose k on the same
# scaled representation so the silhouette curve describes the model that is
# actually fitted.
features_scaled = MinMaxScaler().fit_transform(features)

# Silhouette score is only defined for 2 <= k <= n_samples - 1, so cap the
# sweep (at most 10 clusters) by the number of aggregated routes.
silhouette_scores = []
max_k = min(10, len(features) - 1)
K = range(2, max_k + 1)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(features_scaled)
    score = silhouette_score(features_scaled, labels)
    silhouette_scores.append(score)

# Plot the silhouette score for each candidate k; higher is better.
plt.figure(figsize=(8, 5))
plt.plot(K, silhouette_scores, 'bx-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Method for Optimal k')
plt.show()


In [None]:
# Features for clustering: the two per-route mean ratios.
features = ag_df_3[['congestion_ratio', 'distance_ratio']]

# Rescale both features with MinMaxScaler so neither dominates the
# distance computation used by K-Means.
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# Fit K-Means with 2 clusters (fixed seed) and store each route's label.
# ag_df_3 is an alias of aggregation_result_3, so the new 'cluster' column
# appears on both names.
kmeans = KMeans(n_clusters=2, random_state=42)
ag_df_3['cluster'] = kmeans.fit_predict(features_scaled)

# Show the identifying columns next to the assigned cluster label.
ag_df_3[['road_name', 'start_point', 'end_point', 'cluster']]

In [None]:
import plotly.express as px

# Plotly Express treats a numeric `color` column as continuous and then
# ignores color_discrete_sequence. Plot from a copy whose cluster labels are
# strings so each cluster gets its own discrete colour; ag_df_3 itself keeps
# integer labels for the filtering cells that follow.
plot_df = ag_df_3.assign(cluster=ag_df_3['cluster'].astype(str))

# Fix the category order so the colour assigned to each cluster does not
# depend on row order.
cluster_order = sorted(plot_df['cluster'].unique())

fig = px.scatter(
    plot_df,
    x='distance_ratio',
    y='congestion_ratio',
    color='cluster',
    hover_data=['road_name', 'start_point', 'end_point'],
    title='Clusters Based on Congestion Ratio and Distance Ratio',
    color_discrete_sequence=['blue', 'red'],  # One colour per cluster
    category_orders={'cluster': cluster_order},
)

# Axis labels and a compact figure. The legend stays hidden, matching the
# original layout (which suppressed the colour bar); the cluster label is
# still shown on hover.
fig.update_layout(
    xaxis_title='Distance Ratio',
    yaxis_title='Congestion Ratio',
    width=600,  # Set figure width
    height=400,  # Set figure height
    showlegend=False,
)

# Show the interactive plot
fig.show()

In [None]:
# Lift the row-display limit, then show every route assigned to cluster 0.
pd.set_option('display.max_rows', None)
in_cluster_0 = ag_df_3['cluster'] == 0
cluster_data_3_0 = ag_df_3.loc[in_cluster_0]
display(cluster_data_3_0)

In [None]:
# Lift the row-display limit, then show every route assigned to cluster 1.
pd.set_option('display.max_rows', None)
in_cluster_1 = ag_df_3['cluster'] == 1
cluster_data_3_1 = ag_df_3.loc[in_cluster_1]
display(cluster_data_3_1)