In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import KernelDensity, BallTree
from sklearn.preprocessing import StandardScaler
from scipy.spatial import ConvexHull
from matplotlib.ticker import ScalarFormatter
import os
from colorama import Fore, Style

# File loading


In [2]:
columns = [
    "vehicleId", 
    "lat", 
    "lng", 
    "dateStored", 
    "velocity",
    "odometer", 
    "engineVoltage", 
    "dateStoredHuman", 
    "dateOnlyStoredHuman",    
    "timeOnly",
    "orientation", 
    "seconds_diff", 
    "acceleration",
    "isProblem"
]


input_dir   = "../../DataSets/API_Responses/Vehicle_Data/"
filename    = "all_vehicle_responses.csv"

In [3]:
%matplotlib tk

In [4]:
def merge_csv_file(input_dir, filename, columns):
    input_file = os.path.join(input_dir, filename)

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"File '{filename}' not found in directory '{input_dir}'")

    try:
        # Read the CSV while allowing missing columns
        df = pd.read_csv(input_file, usecols=lambda x: x.strip() in columns, encoding='utf-8')
    except Exception as e:
        raise ValueError(f"Error reading '{input_file}': {e}")

    return df



merged_df = merge_csv_file(input_dir, filename, columns)
print(merged_df.head())


   vehicleId        lat        lng     dateStored  velocity  odometer  \
0          1  37.510833  22.385710  1717682537000       0.0       0.0   
1          1  37.510603  22.385977  1717682540000       0.0       0.0   
2          1  37.510640  22.385927  1717682545000       6.0       0.0   
3          1  37.510750  22.385907  1717682551000       7.0       0.0   
4          1  37.510877  22.385698  1717682557000      26.0       0.0   

   engineVoltage      dateStoredHuman dateOnlyStoredHuman  timeOnly  \
0           0.28  2024-06-06 17:02:17          2024-06-06  17:02:17   
1           0.28  2024-06-06 17:02:20          2024-06-06  17:02:20   
2           0.28  2024-06-06 17:02:25          2024-06-06  17:02:25   
3           0.28  2024-06-06 17:02:31          2024-06-06  17:02:31   
4           0.28  2024-06-06 17:02:37          2024-06-06  17:02:37   

  orientation  seconds_diff  acceleration  isProblem  
0   Southeast           NaN      0.000000          0  
1   Northwest           

Set **Bounding Box** only for **Τρίπολη**

In [5]:
latMin = 37.49764419371479
latMax = 37.56244081620044
lngMin = 22.344992459074458
lngMax = 22.521463853839485


query_filter = 'lat >= ' +str(latMin)+' & lat <= ' + str(latMax) + ' & lng >= ' +str(lngMin)+ ' & lng <= '+str(lngMax)
veh_data_tripoli = merged_df.query( query_filter ).copy(True)
merged_df = veh_data_tripoli

# Data Overview

In [6]:
df = merged_df
df_danger = df[df['isProblem'] == 1]
# df_danger = df[df['vehicleId'] == 15]


sns.set_theme(style="ticks")
fig, ax = plt.subplots()
#sns.jointplot(x=df_danger['lng'], y=df_danger['lat'], kind="hex", color="#4CB391", ax=ax)
ax.hexbin(x=df_danger['lng'], y=df_danger['lat'])
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

plt.gca().xaxis.set_major_formatter(ScalarFormatter())
plt.gca().yaxis.set_major_formatter(ScalarFormatter())
plt.ticklabel_format(style='plain', axis='both')  # Disable scientific notation


ax.set_title('Density of problem points on spatial coordinates')

Text(0.5, 1.0, 'Density of problem points on spatial coordinates')

### Init DF15 (VehicleId == 15)

In [7]:
# df15 = df[df["vehicleId"] == 15]
# df15 = df15.head(500)
# df15_problem = df15[df15['isProblem'] == 1]
# plt.plot(df15.index, df15['acceleration'])
# plt.title('Acceleration vs Index')
# plt.ylabel('Acceleration')
# plt.xlabel('Index')
# plt.scatter(df15_problem.index, df15_problem['acceleration'], color='red')

# len(df15)

In [8]:
df_danger[['lng', 'lat']].describe()

Unnamed: 0,lng,lat
count,1999.0,1999.0
mean,22.37831,37.515229
std,0.007049,0.006226
min,22.363152,37.497893
25%,22.372455,37.51079
50%,22.376493,37.513002
75%,22.385418,37.519261
max,22.415382,37.53314


# Clustering

In [9]:
# #### MOCK DATA #####
#
# data = {
#     'lng': np.random.uniform(-180, 180, 200),
#     'lat': np.random.uniform(-90, 90, 200)
# }
# df = pd.DataFrame(data)
# df_danger = df

In [10]:
# Extracting the coordinates
coords = df_danger[['lng', 'lat']].values

# Standardizing the data for better clustering performance
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)

# Applying DBSCAN
dbscan = DBSCAN(eps=0.02, min_samples=4)  # Adjust eps as needed
clusters = dbscan.fit_predict(coords_scaled)

df_danger.loc[:, 'cluster'] = clusters  # Adding cluster labels to DataFrame


df_danger_cluster = df_danger[df_danger['cluster'] > -1]


# %matplotlib inline
# Plotting the results
plt.figure(figsize=(10, 6))
plt.scatter(df_danger_cluster['lng'], df_danger_cluster['lat'], c=df_danger_cluster['cluster'], cmap='tab10', edgecolors='k', alpha=0.7)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('DBSCAN Clustering of Geospatial Data')
plt.colorbar(label='Cluster')
plt.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_danger.loc[:, 'cluster'] = clusters  # Adding cluster labels to DataFrame


In [11]:
df_danger.columns

Index(['vehicleId', 'lat', 'lng', 'dateStored', 'velocity', 'odometer',
       'engineVoltage', 'dateStoredHuman', 'dateOnlyStoredHuman', 'timeOnly',
       'orientation', 'seconds_diff', 'acceleration', 'isProblem', 'cluster'],
      dtype='object')

In [12]:
df_danger.describe()

Unnamed: 0,vehicleId,lat,lng,dateStored,velocity,odometer,engineVoltage,seconds_diff,acceleration,isProblem,cluster
count,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0
mean,7.643322,37.515229,22.37831,1731511000000.0,14.552276,0.0,4.853224,3.528764,-1.071677,1.0,8.171086
std,4.178733,0.006226,0.007049,7869791000.0,15.016579,0.0,0.545615,3.110904,1.126465,0.0,16.481213
min,1.0,37.497893,22.363152,1717683000000.0,0.0,0.0,0.0,1.0,-14.722222,1.0,-1.0
25%,7.0,37.51079,22.372455,1728074000000.0,6.0,0.0,4.621,2.0,-1.111111,1.0,-1.0
50%,7.0,37.513002,22.376493,1730461000000.0,10.0,0.0,4.853,3.0,-0.763889,1.0,-1.0
75%,9.0,37.519261,22.385418,1738578000000.0,18.0,0.0,5.229,5.0,-0.555556,1.0,11.0
max,20.0,37.53314,22.415382,1743156000000.0,123.0,0.0,5.551,50.0,-0.505051,1.0,55.0


## Showing convex hulls

In [13]:
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
import numpy as np

def plot_convex_hulls(df, clusters, normal_df_points):
    unique_clusters = set(clusters)
    colors = plt.cm.get_cmap("tab10", len(unique_clusters))  # Set of distinct colors for clusters

    fig, ax = plt.subplots()  # Create figure and axis objects

    # Plot points first for colorbar
    for cluster in unique_clusters:
        if cluster == -1:
            continue  # Skip noise points
        cluster_points = df[df['cluster'] == cluster][['lng', 'lat']].values
        ax.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster}', c=[colors(cluster)], s=10)

    ax.scatter(normal_df_points['lng'], normal_df_points['lat'], c='gray', alpha=0.5)

    # Plot Convex Hulls
    for cluster in unique_clusters:
        if cluster == -1:
            continue  # Skip noise points
        cluster_points = df[df['cluster'] == cluster][['lng', 'lat']].values
        if len(cluster_points) >= 3:  # Convex hull requires at least 3 points
            hull = ConvexHull(cluster_points)
            hull_points = np.append(hull.vertices, hull.vertices[0])  # Close the loop
            ax.plot(cluster_points[hull_points, 0], cluster_points[hull_points, 1], 'r-')

    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title('DBSCAN Clustering of Geospatial Data with Convex Hulls')

    # Create colorbar using scatter points
    cb = fig.colorbar(plt.cm.ScalarMappable(cmap="tab10", norm=plt.Normalize(vmin=min(unique_clusters), vmax=max(unique_clusters))),
                      ax=ax, label='Cluster')

    plt.show()

plot_convex_hulls(df_danger_cluster, clusters, df[df['isProblem'] == 0])


  colors = plt.cm.get_cmap("tab10", len(unique_clusters))  # Set of distinct colors for clusters


In [14]:
def get_bbox_of_clusters(df, clusters):
    cluster_bboxes = {}

    # Iterate over unique clusters (excluding -1 for noise)
    unique_clusters = sorted(set(clusters) - {-1})  # Exclude noise points (-1)

    for cluster in unique_clusters:
        # Filter the points of the current cluster
        cluster_points = df[df['cluster'] == cluster][['lng', 'lat']]
        
        # Get the minimum and maximum lng and lat for the bounding box
        min_lng = cluster_points['lng'].min()
        max_lng = cluster_points['lng'].max()
        min_lat = cluster_points['lat'].min()
        max_lat = cluster_points['lat'].max()

        # Store the bounding box for the current cluster
        cluster_bboxes[cluster] = {
            'min_lng': min_lng,
            'max_lng': max_lng,
            'min_lat': min_lat,
            'max_lat': max_lat
        }

    return cluster_bboxes

cluster_bboxes = get_bbox_of_clusters(df_danger_cluster, clusters)

# Display the bounding boxes for each cluster
for cluster, bbox in cluster_bboxes.items():
    print(f"Cluster {cluster}: {bbox}")


Cluster 0: {'min_lng': 22.3853133, 'max_lng': 22.3874133, 'min_lat': 37.5102916, 'max_lat': 37.5114683}
Cluster 1: {'min_lng': 22.3841633, 'max_lng': 22.3843916, 'min_lat': 37.510425, 'max_lat': 37.5105433}
Cluster 2: {'min_lng': 22.3848866, 'max_lng': 22.3851333, 'min_lat': 37.5107683, 'max_lat': 37.5109899}
Cluster 3: {'min_lng': 22.3844933, 'max_lng': 22.3848033, 'min_lat': 37.510815, 'max_lat': 37.5110133}
Cluster 4: {'min_lng': 22.3854, 'max_lng': 22.3854449, 'min_lat': 37.5114433, 'max_lat': 37.5117283}
Cluster 5: {'min_lng': 22.3832566, 'max_lng': 22.3834216, 'min_lat': 37.5120583, 'max_lat': 37.512245}
Cluster 6: {'min_lng': 22.3759916, 'max_lng': 22.3763466, 'min_lat': 37.512855, 'max_lat': 37.5131816}
Cluster 7: {'min_lng': 22.3749533, 'max_lng': 22.3750599, 'min_lat': 37.509825, 'max_lat': 37.509945}
Cluster 8: {'min_lng': 22.3840833, 'max_lng': 22.3842783, 'min_lat': 37.5138016, 'max_lat': 37.5138783}
Cluster 9: {'min_lng': 22.372835, 'max_lng': 22.3731166, 'min_lat': 37.52

In [15]:
import pandas as pd
import numpy as np
from scipy.spatial import ConvexHull
from shapely.geometry import Polygon

def get_largest_cluster_bounding_box(df, cluster_column='cluster', coord_columns=['lng', 'lat']):
    # Ensure required columns exist
    if not all(col in df.columns for col in [cluster_column] + coord_columns):
        raise ValueError(f"DataFrame must contain columns: {cluster_column}, {coord_columns}")

    largest_area = 0
    largest_cluster = None
    largest_hull_points = None

    # Iterate through clusters
    for cluster, cluster_df in df.groupby(cluster_column):
        print(f"Processing Cluster: {cluster}, Points: {len(cluster_df)}")  # Debug print

        if cluster == -1 or len(cluster_df) < 3:  # Skip noise and small clusters
            print(f"\033[31mSkipping Cluster {cluster} (Noise or too few points)\033[0m")
            continue

        cluster_points = cluster_df[coord_columns].values

        try:
            hull = ConvexHull(cluster_points)
            hull_points = cluster_points[hull.vertices]
            hull_area = hull.volume  # For 2D, 'volume' is the area

            print(f"✅ Cluster {cluster}: Convex Hull computed, Area: {hull_area}")  # Debug print

            if hull_area > largest_area:
                largest_area = hull_area
                largest_cluster = cluster
                largest_hull_points = hull_points
        except Exception as e:
            print(f"\033[31m❌ Convex Hull Failed for Cluster {cluster}: {e}\033[0m")
            continue

    if largest_cluster is None:
        print("\033[31mNo valid clusters found.\033[0m")
        return None  

    # Create a bounding box using shapely
    polygon = Polygon(largest_hull_points)
    min_lng, min_lat, max_lng, max_lat = polygon.bounds

    return {
        'largest_cluster': largest_cluster,
        'bounding_box': {
            'min_lng': min_lng,
            'min_lat': min_lat,
            'max_lng': max_lng,
            'max_lat': max_lat
        }
    }

# Example usage:
result = get_largest_cluster_bounding_box(df_danger)
if result:
    print(f"Largest Cluster: {result['largest_cluster']}")
    print(f"Bounding Box: {result['bounding_box']}")
else:
    print("No valid clusters found.")


Processing Cluster: -1, Points: 1099
[31mSkipping Cluster -1 (Noise or too few points)[0m
Processing Cluster: 0, Points: 325
✅ Cluster 0: Convex Hull computed, Area: 1.2971677499916861e-06
Processing Cluster: 1, Points: 4
✅ Cluster 1: Convex Hull computed, Area: 1.380394500025382e-08
Processing Cluster: 2, Points: 6
✅ Cluster 2: Convex Hull computed, Area: 2.6830109999728087e-08
Processing Cluster: 3, Points: 6
✅ Cluster 3: Convex Hull computed, Area: 2.68868899994919e-08
Processing Cluster: 4, Points: 5
✅ Cluster 4: Convex Hull computed, Area: 7.657529999450587e-09
Processing Cluster: 5, Points: 4
✅ Cluster 5: Convex Hull computed, Area: 1.4697500000180946e-08
Processing Cluster: 6, Points: 15
✅ Cluster 6: Convex Hull computed, Area: 6.910550000022119e-08
Processing Cluster: 7, Points: 4
✅ Cluster 7: Convex Hull computed, Area: 6.337944999887063e-09
Processing Cluster: 8, Points: 5
✅ Cluster 8: Convex Hull computed, Area: 5.731250000101202e-09
Processing Cluster: 9, Points: 10
✅ Clu

## Plot Orientations with Convex Hulls

### Define Trips **every 3 seconds**

In [17]:
print(df_danger.columns)

Index(['vehicleId', 'lat', 'lng', 'dateStored', 'velocity', 'odometer',
       'engineVoltage', 'dateStoredHuman', 'dateOnlyStoredHuman', 'timeOnly',
       'orientation', 'seconds_diff', 'acceleration', 'isProblem', 'cluster',
       'trip_id'],
      dtype='object')


#### **Δεδομένου ότι η Powerfleet είπε ότι μία από τις προυποθέσεις είναι καθε 3 seconds, έβαλα 6 seconds για να καλυψω το χρονο αποστολής έως εγγραφής στη Data Base**

In [None]:
import pandas as pd

# Create a copy of df_danger
danger_orient = df_danger.copy()

# Ensure 'dateStoredHuman' is in datetime format
danger_orient['dateStoredHuman'] = pd.to_datetime(danger_orient['dateStoredHuman'])

# Sort data by vehicleId and dateStoredHuman
danger_orient = danger_orient.sort_values(by=['vehicleId', 'dateStoredHuman'])

# Compute time differences between consecutive rows within each vehicle in seconds
danger_orient['seconds_diff'] = danger_orient.groupby('vehicleId')['dateStoredHuman'].diff().dt.total_seconds()

# Print seconds_diff for debugging
print(danger_orient[['vehicleId', 'dateStoredHuman', 'seconds_diff']])

# Assign trip_id based on a gap of 3 seconds
df_danger['trip_id'] = df_danger.groupby('vehicleId', group_keys=False)['seconds_diff'].apply(lambda x: (x >= 6).cumsum()).reset_index(drop=True)

# Fill NaN trip IDs (first row of each vehicle) with 0
danger_orient.loc[:, 'trip_id'] = danger_orient['trip_id'].fillna(0).astype(int)

       vehicleId     dateStoredHuman  seconds_diff
6              1 2024-06-06 17:02:47           NaN
19             1 2024-06-06 19:05:54        7387.0
23             1 2024-06-06 19:11:40         346.0
28             1 2024-06-07 14:15:45       68645.0
37             1 2024-06-07 14:17:39         114.0
...          ...                 ...           ...
27006         20 2025-03-06 11:59:28           4.0
27010         20 2025-03-06 12:57:34        3486.0
27013         20 2025-03-06 12:57:48          14.0
27025         20 2025-03-06 13:03:45         357.0
27029         20 2025-03-06 13:09:02         317.0

[1999 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_danger['trip_id'] = df_danger.groupby('vehicleId', group_keys=False)['seconds_diff'].apply(lambda x: (x >= 6).cumsum()).reset_index(drop=True)


# TODO:: NA ΥΠΟΛΟΓΙΣΩ CONVEX HULLS ΓΙΑ ΤΑ ORIENTATIONS (ΠΡΕΠΕΙ ΝΑ ΕΙΝΑΙ ΕΝΤΟΣ 6 secs???)

In [None]:
import numpy as np
import matplotlib.pyplot as plt# Collect coords into list
import requests

import json
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = """
[out:json];
area["ISO3166-1"="DE"][admin_level=2];
(node["amenity"="biergarten"](area);way["amenity"="biergarten"](area);rel["amenity"="biergarten"](area);
);
out center;
"""
response = requests.get(overpass_url,
                        params={'data': overpass_query})
data = response.json()

coords = []
for element in data['elements']:
    if element['type'] == 'node':
        lon = element['lon']
        lat = element['lat']
        coords.append((lon, lat))
    elif 'center' in element:
        lon = element['center']['lon']
        lat = element['center']['lat']
        coords.append((lon, lat))# Convert coordinates into numpy array


X = np.array(coords)

plt.plot(X[:, 0], X[:, 1], 'o')
plt.title('Biergarten in Germany')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.axis('equal')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# Define bounding box (min_lon, min_lat, max_lon, max_lat)
bbox = (-122.523, 37.704, -122.354, 37.833)  # Example: San Francisco

# Example coordinate points (replace with real data)
coords = [
    (-122.45, 37.75),
    (-122.40, 37.78),
    (-122.48, 37.73)
]

# Convert to numpy array
X = np.array(coords)

# Create a figure with Cartopy
fig, ax = plt.subplots(figsize=(6, 4), subplot_kw={"projection": ccrs.PlateCarree()})

# Set the map extent to the bounding box
ax.set_extent([bbox[0], bbox[2], bbox[1], bbox[3]], crs=ccrs.PlateCarree())

# Add roads and features
ax.add_feature(cfeature.LAND, color="lightgray")
ax.add_feature(cfeature.OCEAN, color="lightblue")
ax.add_feature(cfeature.BORDERS, linestyle=":")
ax.add_feature(cfeature.COASTLINE)
ax.add_feature(cfeature.LAKES, color="blue", alpha=0.3)
ax.add_feature(cfeature.RIVERS, color="blue", alpha=0.3)

# Plot points
ax.scatter(X[:, 0], X[:, 1], color="red", marker="o", label="Points")

# Labels and title
ax.set_title("Custom Location Map")
ax.legend()

# Save the output
output_filename = "local_map.png"
plt.savefig(output_filename, dpi=300)
plt.show()

print(f"Map with points saved as {output_filename}")
