In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import KernelDensity, BallTree
from sklearn.preprocessing import StandardScaler
from scipy.spatial import ConvexHull
from matplotlib.ticker import ScalarFormatter
import os



# File loading


In [2]:
columns = [
    "vehicleId", 
    "lat", 
    "lng", 
    "dateStored", 
    "velocity",
    "odometer", 
    "engineVoltage", 
    "dateStoredHuman", 
    "dateOnlyStoredHuman",    
    "timeOnly",
    "orientation", 
    "seconds_diff", 
    "acceleration",
    "isProblem"
]


input_dir   = '../../DataSets/API_Responses/Vehicle_Data/'
filename    = "all_vehicle_responses.csv"

In [3]:
%matplotlib tk

In [4]:
def merge_csv_file(input_dir, filename, columns):
    input_file = os.path.join(input_dir, filename)

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"File '{filename}' not found in directory '{input_dir}'")

    try:
        # Read the CSV while allowing missing columns
        df = pd.read_csv(input_file, usecols=lambda x: x.strip() in columns, encoding='utf-8')
    except Exception as e:
        raise ValueError(f"Error reading '{input_file}': {e}")

    return df



merged_df = merge_csv_file(input_dir, filename, columns)
print(merged_df.head())


   vehicleId        lat        lng           dateStored  velocity  odometer  \
0          1  38.034458  23.748177  2024-05-22 12:57:05       0.0       0.0   
1          1  38.034598  23.748143  2024-05-22 13:04:41       6.0       0.0   
2          1  38.034633  23.748132  2024-05-22 13:05:05       0.0       0.0   
3          1  38.034667  23.748163  2024-05-22 13:08:53       0.0       0.0   
4          1  38.034713  23.748122  2024-05-22 13:15:24       8.0       0.0   

   engineVoltage      dateStoredHuman dateOnlyStoredHuman  timeOnly  \
0            0.0  2024-05-22 15:57:05          2024-05-22  15:57:05   
1            0.0  2024-05-22 16:04:41          2024-05-22  16:04:41   
2            0.0  2024-05-22 16:05:05          2024-05-22  16:05:05   
3            0.0  2024-05-22 16:08:53          2024-05-22  16:08:53   
4            0.0  2024-05-22 16:15:24          2024-05-22  16:15:24   

  orientation  seconds_diff  acceleration  isProblem  
0   Northwest           NaN      0.000000  

Set **Bounding Box** only for **Τρίπολη**

In [23]:
latMin = 37.49764419371479
latMax = 37.56244081620044
lngMin = 22.344992459074458
lngMax = 22.521463853839485


query_filter = 'lat >= ' +str(latMin)+' & lat <= ' + str(latMax) + ' & lng >= ' +str(lngMin)+ ' & lng <= '+str(lngMax)
veh_data_tripoli = merged_df.query( query_filter ).copy(True)
merged_df = veh_data_tripoli

# Data Overview

In [24]:
merged_df.describe

<bound method NDFrame.describe of        vehicleId        lat        lng           dateStored  velocity  \
89             1  37.510833  22.385710  2024-06-06 14:02:17       0.0   
90             1  37.510603  22.385977  2024-06-06 14:02:20       0.0   
91             1  37.510640  22.385927  2024-06-06 14:02:25       6.0   
92             1  37.510750  22.385907  2024-06-06 14:02:31       7.0   
93             1  37.510877  22.385698  2024-06-06 14:02:37      26.0   
...          ...        ...        ...                  ...       ...   
48956         19  37.510782  22.386343  2024-07-31 13:47:23       6.0   
48957         19  37.510762  22.386272  2024-07-31 13:48:38       0.0   
48958         19  37.510742  22.386345  2024-07-31 13:50:20       8.0   
48959         19  37.510752  22.386367  2024-07-31 13:50:29       6.0   
48960         19  37.510773  22.386430  2024-07-31 13:51:29       0.0   

       odometer  engineVoltage      dateStoredHuman dateOnlyStoredHuman  \
89          0.

In [25]:
df = merged_df
df_danger = df[df['isProblem'] == 1]
df_danger = df[df['vehicleId'] == 15]


sns.set_theme(style="ticks")
fig, ax = plt.subplots()
#sns.jointplot(x=df_danger['lng'], y=df_danger['lat'], kind="hex", color="#4CB391", ax=ax)
ax.hexbin(x=df_danger['lng'], y=df_danger['lat'])
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

plt.gca().xaxis.set_major_formatter(ScalarFormatter())
plt.gca().yaxis.set_major_formatter(ScalarFormatter())
plt.ticklabel_format(style='plain', axis='both')  # Disable scientific notation


ax.set_title('Density of problem points on spatial coordinates')

Text(0.5, 1.0, 'Density of problem points on spatial coordinates')

In [26]:
df_danger.describe

<bound method NDFrame.describe of        vehicleId        lat        lng           dateStored  velocity  \
18647         15  37.510750  22.386272  2024-10-14 23:15:27       0.0   
18648         15  37.510823  22.385855  2024-10-14 23:15:28      16.0   
18649         15  37.510872  22.385613  2024-10-14 23:15:39      14.0   
18650         15  37.510858  22.385625  2024-10-14 23:15:48      13.0   
18651         15  37.510848  22.385382  2024-10-14 23:15:53      24.0   
...          ...        ...        ...                  ...       ...   
20159         15  37.523477  22.376433  2024-11-07 19:45:19      33.0   
20160         15  37.524508  22.376960  2024-11-07 19:45:28      51.0   
20161         15  37.525552  22.378802  2024-11-07 19:45:50      14.0   
20162         15  37.525507  22.378780  2024-11-07 19:45:52      10.0   
20163         15  37.525475  22.378782  2024-11-07 19:46:02       0.0   

       odometer  engineVoltage      dateStoredHuman dateOnlyStoredHuman  \
18647       0.

In [27]:
df15 = df[df["vehicleId"] == 15]
df15 = df15.head(500)
df15_problem = df15[df15['isProblem'] == 1]
plt.plot(df15.index, df15['acceleration'])
plt.title('Acceleration vs Index')
plt.ylabel('Acceleration')
plt.xlabel('Index')
plt.scatter(df15_problem.index, df15_problem['acceleration'], color='red')

len(df15)

500

In [14]:
df_danger[['lng', 'lat']].describe()

Unnamed: 0,lng,lat
count,29976.0,29976.0
mean,23.655388,38.013484
std,0.342107,0.124708
min,22.360795,37.506782
25%,23.7599,38.046742
50%,23.760832,38.05068
75%,23.761495,38.050805
max,23.79282,38.094858


# Clustering

In [6]:
# #### MOCK DATA #####
#
# data = {
#     'lng': np.random.uniform(-180, 180, 200),
#     'lat': np.random.uniform(-90, 90, 200)
# }
# df = pd.DataFrame(data)
# df_danger = df

In [28]:
# Extracting the coordinates
coords = df_danger[['lng', 'lat']].values

# Standardizing the data for better clustering performance
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)

# Applying DBSCAN
dbscan = DBSCAN(eps=0.02, min_samples=4)  # Adjust eps as needed
clusters = dbscan.fit_predict(coords_scaled)

df_danger.loc[:, 'cluster'] = clusters  # Adding cluster labels to DataFrame


df_danger_cluster = df_danger[df_danger['cluster'] > -1]


# %matplotlib inline
# Plotting the results
plt.figure(figsize=(10, 6))
plt.scatter(df_danger_cluster['lng'], df_danger_cluster['lat'], c=df_danger_cluster['cluster'], cmap='tab10', edgecolors='k', alpha=0.7)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('DBSCAN Clustering of Geospatial Data')
plt.colorbar(label='Cluster')
plt.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_danger.loc[:, 'cluster'] = clusters  # Adding cluster labels to DataFrame


In [16]:
df_danger.columns

Index(['vehicleId', 'lat', 'lng', 'dateStored', 'velocity', 'odometer',
       'engineVoltage', 'dateStoredHuman', 'dateOnlyStoredHuman', 'timeOnly',
       'orientation', 'seconds_diff', 'acceleration', 'isProblem', 'cluster'],
      dtype='object')

## Showing convex hulls

In [29]:
def plot_convex_hulls(df, clusters, normal_df_points):
    # %matplotlib qt

    unique_clusters = set(clusters)
    colors = plt.cm.get_cmap("tab10", len(unique_clusters))  # Set of distinct colors for clusters

    # Plot points first for colorbar
    for cluster in unique_clusters:
        if cluster == -1:
            continue  # Skip noise points
        cluster_points = df[df['cluster'] == cluster][['lng', 'lat']].values
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster}', c=[colors(cluster)], s=10)

    plt.scatter(normal_df_points['lng'], normal_df_points['lat'], c='gray', alpha=0.5)

    # Plot Convex Hulls
    for cluster in unique_clusters:
        if cluster == -1:
            continue  # Skip noise points
        cluster_points = df[df['cluster'] == cluster][['lng', 'lat']].values
        if len(cluster_points) >= 3:  # Convex hull requires at least 3 points
            hull = ConvexHull(cluster_points)
            hull_points = np.append(hull.vertices, hull.vertices[0])  # Close the loop
            plt.plot(cluster_points[hull_points, 0], cluster_points[hull_points, 1], 'r-')

    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('DBSCAN Clustering of Geospatial Data with Convex Hulls')

    # Create colorbar using scatter points
    plt.colorbar(plt.cm.ScalarMappable(cmap="tab10", norm=plt.Normalize(vmin=min(unique_clusters), vmax=max(unique_clusters))),
                 label='Cluster')

    #plt.legend()
    plt.show()


plot_convex_hulls(df_danger_cluster, clusters, df[df['isProblem'] == 0])


  colors = plt.cm.get_cmap("tab10", len(unique_clusters))  # Set of distinct colors for clusters


ValueError: Unable to determine Axes to steal space for Colorbar. Either provide the *cax* argument to use as the Axes for the Colorbar, provide the *ax* argument to steal space from it, or add *mappable* to an Axes.

In [18]:
import pandas as pd
import numpy as np
from scipy.spatial import ConvexHull
from shapely.geometry import Polygon

def get_largest_cluster_bounding_box(df, cluster_column='cluster', coord_columns=['lng', 'lat']):
    # Get unique clusters
    unique_clusters = df[cluster_column].unique()

    largest_area = 0
    largest_cluster = None
    largest_hull_points = None

    # Iterate through clusters to find the one with the largest area
    for cluster in unique_clusters:
        if cluster == -1:  # Skip noise points
            continue

        # Get the points for the current cluster
        cluster_points = df[df[cluster_column] == cluster][coord_columns].values

        # Compute the convex hull
        if len(cluster_points) >= 3:  # Convex hull requires at least 3 points
            hull = ConvexHull(cluster_points)
            hull_points = cluster_points[hull.vertices]
            hull_area = hull.volume  # For 2D, 'volume' is the area of the convex hull

            if hull_area > largest_area:
                largest_area = hull_area
                largest_cluster = cluster
                largest_hull_points = hull_points

    # If no clusters found or all were noise, return None
    if largest_cluster is None:
        return None

    # Create a Polygon object from the convex hull
    polygon = Polygon(largest_hull_points)

    # Get the bounding box (min and max lat, lng)
    min_lng, min_lat, max_lng, max_lat = polygon.bounds

    return {
        'largest_cluster': largest_cluster,
        'bounding_box': {
            'min_lng': min_lng,
            'min_lat': min_lat,
            'max_lng': max_lng,
            'max_lat': max_lat
        }
    }

# Example usage:
# Assuming df_danger is the DataFrame with the 'cluster' column, 'lng', and 'lat'
result = get_largest_cluster_bounding_box(df_danger)
if result:
    print(f"Largest Cluster: {result['largest_cluster']}")
    print(f"Bounding Box: {result['bounding_box']}")
else:
    print("No valid clusters found.")


Largest Cluster: 0
Bounding Box: {'min_lng': 23.7323533, 'min_lat': 38.0112216, 'max_lng': 23.7928033, 'max_lat': 38.0534233}


In [19]:
import numpy as np
import matplotlib.pyplot as plt# Collect coords into list
import requests

import json
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = """
[out:json];
area["ISO3166-1"="DE"][admin_level=2];
(node["amenity"="biergarten"](area);
 way["amenity"="biergarten"](area);
 rel["amenity"="biergarten"](area);
);
out center;
"""
response = requests.get(overpass_url,
                        params={'data': overpass_query})
data = response.json()

coords = []
for element in data['elements']:
    if element['type'] == 'node':
        lon = element['lon']
        lat = element['lat']
        coords.append((lon, lat))
    elif 'center' in element:
        lon = element['center']['lon']
        lat = element['center']['lat']
        coords.append((lon, lat))# Convert coordinates into numpy array


X = np.array(coords)

plt.plot(X[:, 0], X[:, 1], 'o')
plt.title('Biergarten in Germany')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.axis('equal')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# Define bounding box (min_lon, min_lat, max_lon, max_lat)
bbox = (-122.523, 37.704, -122.354, 37.833)  # Example: San Francisco

# Example coordinate points (replace with real data)
coords = [
    (-122.45, 37.75),
    (-122.40, 37.78),
    (-122.48, 37.73)
]

# Convert to numpy array
X = np.array(coords)

# Create a figure with Cartopy
fig, ax = plt.subplots(figsize=(6, 4), subplot_kw={"projection": ccrs.PlateCarree()})

# Set the map extent to the bounding box
ax.set_extent([bbox[0], bbox[2], bbox[1], bbox[3]], crs=ccrs.PlateCarree())

# Add roads and features
ax.add_feature(cfeature.LAND, color="lightgray")
ax.add_feature(cfeature.OCEAN, color="lightblue")
ax.add_feature(cfeature.BORDERS, linestyle=":")
ax.add_feature(cfeature.COASTLINE)
ax.add_feature(cfeature.LAKES, color="blue", alpha=0.3)
ax.add_feature(cfeature.RIVERS, color="blue", alpha=0.3)

# Plot points
ax.scatter(X[:, 0], X[:, 1], color="red", marker="o", label="Points")

# Labels and title
ax.set_title("Custom Location Map")
ax.legend()

# Save the output
output_filename = "local_map.png"
plt.savefig(output_filename, dpi=300)
plt.show()

print(f"Map with points saved as {output_filename}")
