# Imports and Settings

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import KernelDensity, BallTree
from sklearn.preprocessing import StandardScaler
from scipy.spatial import ConvexHull
from matplotlib.ticker import ScalarFormatter
import os
from colorama import Fore, Style
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
from shapely.geometry import Polygon
import json


# File loading


In [2]:
columns = [
    "vehicleId", 
    "lat", 
    "lng", 
    "dateStored", 
    "velocity",
    "odometer", 
    "engineVoltage", 
    "dateStoredHuman", 
    "dateOnlyStoredHuman",    
    "timeOnly",
    "bearing",
    "orientation", 
    "seconds_diff", 
    "acceleration",
    "isProblem",
    "trip_id"
]


input_dir   = "../../DataSets/API_Responses/Vehicle_Data/"
filename    = "all_vehicle_responses.csv"

## Enable matloblib UI backend

In [3]:
%matplotlib tk

## Save plots file

In [4]:
PLOT_FOLDER_PATH = "./Plots/"


In [5]:
def merge_csv_file(input_dir, filename, columns):
    input_file = os.path.join(input_dir, filename)

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"File '{filename}' not found in directory '{input_dir}'")

    try:
        # Read the CSV while allowing missing columns
        df = pd.read_csv(input_file, usecols=lambda x: x.strip() in columns, encoding='utf-8')
    except Exception as e:
        raise ValueError(f"Error reading '{input_file}': {e}")

    return df



merged_df = merge_csv_file(input_dir, filename, columns)
print(merged_df.head())


   vehicleId        lat        lng     dateStored  velocity  odometer  \
0          1  37.510833  22.385710  1717682537000       0.0       0.0   
1          1  37.510603  22.385977  1717682540000       0.0       0.0   
2          1  37.510640  22.385927  1717682545000       6.0       0.0   
3          1  37.510750  22.385907  1717682551000       7.0       0.0   
4          1  37.510877  22.385698  1717682557000      26.0       0.0   

   engineVoltage      dateStoredHuman dateOnlyStoredHuman  timeOnly  \
0           0.28  2024-06-06 17:02:17          2024-06-06  17:02:17   
1           0.28  2024-06-06 17:02:20          2024-06-06  17:02:20   
2           0.28  2024-06-06 17:02:25          2024-06-06  17:02:25   
3           0.28  2024-06-06 17:02:31          2024-06-06  17:02:31   
4           0.28  2024-06-06 17:02:37          2024-06-06  17:02:37   

      bearing orientation  seconds_diff  trip_id  acceleration  isProblem  
0  137.402376   Southeast           NaN       12      0.00

Set **Bounding Box** only for **Τρίπολη**

In [6]:
latMin = 37.49764419371479
latMax = 37.56244081620044
lngMin = 22.344992459074458
lngMax = 22.521463853839485


query_filter = 'lat >= ' +str(latMin)+' & lat <= ' + str(latMax) + ' & lng >= ' +str(lngMin)+ ' & lng <= '+str(lngMax)
veh_data_tripoli = merged_df.query( query_filter ).copy(True)
merged_df = veh_data_tripoli

# Data Overview

In [7]:
df = merged_df
df_danger = df[df['isProblem'] == 1]
# df_danger = df[df['vehicleId'] == 15]


sns.set_theme(style="ticks")
fig, ax = plt.subplots()
#sns.jointplot(x=df_danger['lng'], y=df_danger['lat'], kind="hex", color="#4CB391", ax=ax)
ax.hexbin(x=df_danger['lng'], y=df_danger['lat'])
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

plt.gca().xaxis.set_major_formatter(ScalarFormatter())
plt.gca().yaxis.set_major_formatter(ScalarFormatter())
plt.ticklabel_format(style='plain', axis='both')  # Disable scientific notation


ax.set_title('Density of problem points on spatial coordinates')

Text(0.5, 1.0, 'Density of problem points on spatial coordinates')

### Init DF15 (VehicleId == 15)

In [8]:
# df15 = df[df["vehicleId"] == 15]
# df15 = df15.head(500)
# df15_problem = df15[df15['isProblem'] == 1]
# plt.plot(df15.index, df15['acceleration'])
# plt.title('Acceleration vs Index')
# plt.ylabel('Acceleration')
# plt.xlabel('Index')
# plt.scatter(df15_problem.index, df15_problem['acceleration'], color='red')

# len(df15)

In [9]:
df_danger[['lng', 'lat']].describe()

Unnamed: 0,lng,lat
count,2004.0,2004.0
mean,22.378294,37.515228
std,0.007051,0.006222
min,22.363152,37.497893
25%,22.372447,37.51079
50%,22.376483,37.513006
75%,22.385412,37.519258
max,22.415382,37.53314


# Clustering

In [10]:
# #### MOCK DATA #####
#
# data = {
#     'lng': np.random.uniform(-180, 180, 200),
#     'lat': np.random.uniform(-90, 90, 200)
# }
# df = pd.DataFrame(data)
# df_danger = df

In [11]:
# Extracting the coordinates
coords = df_danger[['lng', 'lat']].values

# Standardizing the data for better clustering performance
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)

# Applying DBSCAN
dbscan = DBSCAN(eps=0.02, min_samples=4)  # Adjust eps as needed
clusters = dbscan.fit_predict(coords_scaled)

df_danger.loc[:, 'cluster'] = clusters  # Adding cluster labels to DataFrame


df_danger_cluster = df_danger[df_danger['cluster'] > -1]


# %matplotlib inline
# Plotting the results
plt.figure(figsize=(10, 6))
plt.scatter(df_danger_cluster['lng'], df_danger_cluster['lat'], c=df_danger_cluster['cluster'], cmap='tab10', edgecolors='k', alpha=0.7)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('DBSCAN Clustering of Geospatial Data')
plt.colorbar(label='Cluster')

# Save the plot
plt.savefig(PLOT_FOLDER_PATH, bbox_inches='tight')
print(f"Plot saved to {PLOT_FOLDER_PATH}")

plt.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_danger.loc[:, 'cluster'] = clusters  # Adding cluster labels to DataFrame


Plot saved to ./Plots/


In [12]:
df_danger.columns

Index(['vehicleId', 'lat', 'lng', 'dateStored', 'velocity', 'odometer',
       'engineVoltage', 'dateStoredHuman', 'dateOnlyStoredHuman', 'timeOnly',
       'bearing', 'orientation', 'seconds_diff', 'trip_id', 'acceleration',
       'isProblem', 'cluster'],
      dtype='object')

In [13]:
df_danger.describe()

Unnamed: 0,vehicleId,lat,lng,dateStored,velocity,odometer,engineVoltage,bearing,seconds_diff,trip_id,acceleration,isProblem,cluster
count,2004.0,2004.0,2004.0,2004.0,2004.0,2004.0,2004.0,2004.0,2004.0,2004.0,2004.0,2004.0,2004.0
mean,7.641717,37.515228,22.378294,1731541000000.0,14.522455,0.0,4.853391,183.38795,3.538423,127.83483,-1.071486,1.0,8.376248
std,4.173637,0.006222,0.007051,7882518000.0,15.013025,0.0,0.545188,106.711803,3.119683,139.924229,1.125086,0.0,16.810956
min,1.0,37.497893,22.363152,1717683000000.0,0.0,0.0,0.0,0.0,1.0,0.0,-14.722222,1.0,-1.0
25%,7.0,37.51079,22.372447,1728074000000.0,6.0,0.0,4.621,94.893813,2.0,18.0,-1.111111,1.0,-1.0
50%,7.0,37.513006,22.376483,1730461000000.0,10.0,0.0,4.853,188.333335,3.0,70.0,-0.763889,1.0,-1.0
75%,9.0,37.519258,22.385412,1738756000000.0,18.0,0.0,5.2245,277.714093,5.0,202.25,-0.555556,1.0,11.0
max,20.0,37.53314,22.415382,1743590000000.0,123.0,0.0,5.551,359.405846,50.0,498.0,-0.505051,1.0,56.0


## Showing convex hulls

In [14]:
def plot_convex_hulls(df, clusters, normal_df_points):
    unique_clusters = set(clusters)
    colors = plt.cm.get_cmap("tab10", len(unique_clusters))  # Set of distinct colors for clusters

    fig, ax = plt.subplots()  # Create figure and axis objects

    # Plot points first for colorbar
    for cluster in unique_clusters:
        if cluster == -1:
            continue  # Skip noise points
        cluster_points = df[df['cluster'] == cluster][['lng', 'lat']].values
        ax.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster}', c=[colors(cluster)], s=10)

    ax.scatter(normal_df_points['lng'], normal_df_points['lat'], c='gray', alpha=0.5)

    # Plot Convex Hulls
    for cluster in unique_clusters:
        if cluster == -1:
            continue  # Skip noise points
        cluster_points = df[df['cluster'] == cluster][['lng', 'lat']].values
        if len(cluster_points) >= 3:  # Convex hull requires at least 3 points
            hull = ConvexHull(cluster_points)
            hull_points = np.append(hull.vertices, hull.vertices[0])  # Close the loop
            ax.plot(cluster_points[hull_points, 0], cluster_points[hull_points, 1], 'r-')

    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title('DBSCAN Clustering of Geospatial Data with Convex Hulls')

    # Create colorbar using scatter points
    cb = fig.colorbar(plt.cm.ScalarMappable(cmap="tab10", norm=plt.Normalize(vmin=min(unique_clusters), vmax=max(unique_clusters))),
                      ax=ax, label='Cluster')

    # Save the plot
    plt.savefig(PLOT_FOLDER_PATH, bbox_inches='tight')
    print(f"Plot saved to {PLOT_FOLDER_PATH}")

    plt.show()

plot_convex_hulls(df_danger_cluster, clusters, df[df['isProblem'] == 0])


  colors = plt.cm.get_cmap("tab10", len(unique_clusters))  # Set of distinct colors for clusters


Plot saved to ./Plots/


## Get specific **Cluster's BBOX**

In [15]:
def get_bbox_of_clusters(df, clusters):
    cluster_bboxes = {}

    # Iterate over unique clusters (excluding -1 for noise)
    unique_clusters = sorted(set(clusters) - {-1})  # Exclude noise points (-1)

    for cluster in unique_clusters:
        # Filter the points of the current cluster
        cluster_points = df[df['cluster'] == cluster][['lng', 'lat']]
        
        # Get the minimum and maximum lng and lat for the bounding box
        min_lng = cluster_points['lng'].min()
        max_lng = cluster_points['lng'].max()
        min_lat = cluster_points['lat'].min()
        max_lat = cluster_points['lat'].max()

        # Store the bounding box for the current cluster
        cluster_bboxes[cluster] = {
            'min_lng': min_lng,
            'max_lng': max_lng,
            'min_lat': min_lat,
            'max_lat': max_lat
        }

    return cluster_bboxes

cluster_bboxes = get_bbox_of_clusters(df_danger_cluster, clusters)

# Display the bounding boxes for each cluster
for cluster, bbox in cluster_bboxes.items():
    print(f"Cluster {cluster}: {bbox}")


Cluster 0: {'min_lng': 22.3853133, 'max_lng': 22.3874133, 'min_lat': 37.5102916, 'max_lat': 37.5114683}
Cluster 1: {'min_lng': 22.3841633, 'max_lng': 22.3843916, 'min_lat': 37.510425, 'max_lat': 37.5105433}
Cluster 2: {'min_lng': 22.3848866, 'max_lng': 22.3851333, 'min_lat': 37.5107683, 'max_lat': 37.5109899}
Cluster 3: {'min_lng': 22.3844933, 'max_lng': 22.3848033, 'min_lat': 37.510815, 'max_lat': 37.5110133}
Cluster 4: {'min_lng': 22.3854, 'max_lng': 22.3854449, 'min_lat': 37.5114433, 'max_lat': 37.5117283}
Cluster 5: {'min_lng': 22.3832566, 'max_lng': 22.3834216, 'min_lat': 37.5120583, 'max_lat': 37.512245}
Cluster 6: {'min_lng': 22.3759916, 'max_lng': 22.3763466, 'min_lat': 37.512855, 'max_lat': 37.5131816}
Cluster 7: {'min_lng': 22.3749533, 'max_lng': 22.3750599, 'min_lat': 37.509825, 'max_lat': 37.509945}
Cluster 8: {'min_lng': 22.3840833, 'max_lng': 22.3842783, 'min_lat': 37.5138016, 'max_lat': 37.5138783}
Cluster 9: {'min_lng': 22.372835, 'max_lng': 22.3731166, 'min_lat': 37.52

## Plot Orientations with Convex Hulls

### Define Trips **every 3 seconds**

#### **Δεδομένου ότι η Powerfleet είπε ότι μία από τις προυποθέσεις είναι καθε 3 seconds, έβαλα 6 seconds για να καλυψω το χρονο αποστολής έως εγγραφής στη Data Base**

In [16]:
import pandas as pd

# Create a copy of df_danger
danger_orient = df_danger.copy()

# Ensure 'dateStoredHuman' is in datetime format
danger_orient['dateStoredHuman'] = pd.to_datetime(danger_orient['dateStoredHuman'])

# Sort data by vehicleId and dateStoredHuman
danger_orient = danger_orient.sort_values(by=['vehicleId', 'dateStoredHuman'])

# Assign trip_id based on a gap of 6 seconds
df_danger['trip_id'] = df_danger.groupby('vehicleId', group_keys=False)['seconds_diff'].apply(lambda x: (x >= 6).cumsum()).reset_index(drop=True)

# Fill NaN trip IDs (first row of each vehicle) with 0
danger_orient.loc[:, 'trip_id'] = danger_orient['trip_id'].fillna(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_danger['trip_id'] = df_danger.groupby('vehicleId', group_keys=False)['seconds_diff'].apply(lambda x: (x >= 6).cumsum()).reset_index(drop=True)


# TODO:: NA ΥΠΟΛΟΓΙΣΩ CONVEX HULLS ΓΙΑ ΤΑ ORIENTATIONS (ΠΡΕΠΕΙ ΝΑ ΕΙΝΑΙ ΕΝΤΟΣ 6 secs???)

## Prepare the DF

In [17]:
# *Get specific columns 
_ = merged_df.copy()
bearings_df = _[['vehicleId', 'lat', 'lng', 'bearing', 'orientation', 'seconds_diff', 'trip_id']]
print(bearings_df)

# Filter for vehicleId == 1
df_vehicle1 = df[df['vehicleId'] == 1]

# Count occurrences of each trip_id
trip_counts = df_vehicle1['trip_id'].value_counts()

# Get the trip_id with the highest count
most_frequent_trip_id = trip_counts.idxmax()

# Display the result
print(f"The trip_id with the most rows for vehicleId 1 is: {most_frequent_trip_id}")


       vehicleId        lat        lng     bearing orientation  seconds_diff  \
0              1  37.510833  22.385710  137.402376   Southeast           NaN   
1              1  37.510603  22.385977  312.778670   Northwest           3.0   
2              1  37.510640  22.385927  351.785725       North           5.0   
3              1  37.510750  22.385907  307.481149   Northwest           6.0   
4              1  37.510877  22.385698  318.388767   Northwest           6.0   
...          ...        ...        ...         ...         ...           ...   
27223         20  37.531460  22.369768  231.663210   Southwest           4.0   
27224         20  37.531275  22.369473  235.207818   Southwest           2.0   
27225         20  37.531122  22.369195  278.389323        West           4.0   
27226         20  37.531148  22.368967  294.596339   Northwest           2.0   
27227         20  37.531243  22.368705  135.769171   Southeast           3.0   

       trip_id  
0           12  
1    

### Get trip_id's rows

In [18]:
# Get all rows corresponding to the most frequent trip_id for vehicleId 1
df_most_frequent_trip = df[(df['vehicleId'] == 1) & (df['trip_id'] == most_frequent_trip_id)]

# Display the first few rows
print(df_most_frequent_trip.head())


     vehicleId        lat        lng     dateStored  velocity  odometer  \
857          1  37.510790  22.386127  1717841444000      18.0       0.0   
858          1  37.510803  22.385997  1717841468000       7.0       0.0   
859          1  37.510780  22.386223  1717841474000      17.0       0.0   
860          1  37.510742  22.386365  1717841477000      11.0       0.0   
861          1  37.510697  22.386493  1717841488000       9.0       0.0   

     engineVoltage      dateStoredHuman dateOnlyStoredHuman  timeOnly  \
857          5.413  2024-06-08 13:10:44          2024-06-08  13:10:44   
858          5.413  2024-06-08 13:11:08          2024-06-08  13:11:08   
859          5.413  2024-06-08 13:11:14          2024-06-08  13:11:14   
860          5.413  2024-06-08 13:11:17          2024-06-08  13:11:17   
861          5.420  2024-06-08 13:11:28          2024-06-08  13:11:28   

        bearing orientation  seconds_diff  trip_id  acceleration  isProblem  
857  277.349174        West     

## Calculate Bearing diff

In [19]:
def calculate_bearing_difference(merged_df):
    """
    Calculates the difference between consecutive bearing values in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): A DataFrame containing a 'bearing' column.
        
    Returns:
        pd.Series: A Series containing differences between consecutive bearings.
    """
    if 'bearing' not in df.columns or df.empty:
        return pd.Series([])
    
    df['bearing_diff'] = df['bearing'].diff().abs()
    return df

merged_df = calculate_bearing_difference(merged_df)


## Plot std deviation

### Filter for **Convex Hull BBOXes**

In [20]:
for cluster, bbox in cluster_bboxes.items():
    print(f"Cluster {cluster}: {bbox}")


Cluster 0: {'min_lng': 22.3853133, 'max_lng': 22.3874133, 'min_lat': 37.5102916, 'max_lat': 37.5114683}
Cluster 1: {'min_lng': 22.3841633, 'max_lng': 22.3843916, 'min_lat': 37.510425, 'max_lat': 37.5105433}
Cluster 2: {'min_lng': 22.3848866, 'max_lng': 22.3851333, 'min_lat': 37.5107683, 'max_lat': 37.5109899}
Cluster 3: {'min_lng': 22.3844933, 'max_lng': 22.3848033, 'min_lat': 37.510815, 'max_lat': 37.5110133}
Cluster 4: {'min_lng': 22.3854, 'max_lng': 22.3854449, 'min_lat': 37.5114433, 'max_lat': 37.5117283}
Cluster 5: {'min_lng': 22.3832566, 'max_lng': 22.3834216, 'min_lat': 37.5120583, 'max_lat': 37.512245}
Cluster 6: {'min_lng': 22.3759916, 'max_lng': 22.3763466, 'min_lat': 37.512855, 'max_lat': 37.5131816}
Cluster 7: {'min_lng': 22.3749533, 'max_lng': 22.3750599, 'min_lat': 37.509825, 'max_lat': 37.509945}
Cluster 8: {'min_lng': 22.3840833, 'max_lng': 22.3842783, 'min_lat': 37.5138016, 'max_lat': 37.5138783}
Cluster 9: {'min_lng': 22.372835, 'max_lng': 22.3731166, 'min_lat': 37.52

In [21]:
# Calculate the standard deviation of 'bearing_diff'
std_dev = merged_df['bearing_diff'].std()

# Define a threshold for high standard deviation
high_std_threshold = 1.5 * std_dev  # Adjust factor as needed

# Identify high standard deviation values
high_std_points = merged_df[merged_df['bearing_diff'] > high_std_threshold]

# Extract BBOX (bounding box) coordinates from last_value
min_x, min_y, max_x, max_y = last_value  # Assuming last_value holds BBOX as [min_x, min_y, max_x, max_y]




NameError: name 'last_value' is not defined

### 1️⃣ Hexbin Plot (Density Heatmap)

In [None]:
plt.figure(figsize=(8, 5))
plt.hexbin(merged_df['bearing_diff'], merged_df['bearing'], gridsize=30, cmap='Blues', mincnt=1)

# Highlight high standard deviation points
plt.scatter(high_std_points['bearing_diff'], high_std_points['bearing'], color='red', label='High Std Dev', alpha=0.8)

# Highlight the standard deviation as a vertical line
plt.axvline(x=std_dev, color='green', linestyle='--', label=f'Standard Deviation ({std_dev:.2f})')

# Bounding Box (if applicable)
plt.axvspan(min_x, max_x, color='gray', alpha=0.2, label='Bounding Box')

# Labels and title
plt.xlabel('Bearing Difference')
plt.ylabel('Bearing')
plt.title('Hexbin Plot: High Std Dev Bearings vs Bearing Difference')
plt.colorbar(label='Density')
plt.legend()
plt.grid(True)

# Save the plot
plt.savefig(PLOT_FOLDER_PATH, bbox_inches='tight')
print(f"Plot saved to {PLOT_FOLDER_PATH}")

plt.show()


### 2️⃣ Violin Plot (Distribution)

In [None]:
import seaborn as sns

plt.figure(figsize=(8, 5))
sns.violinplot(x=merged_df['bearing_diff'], inner="quartile", color="lightblue")

# Mark the standard deviation
plt.axvline(x=std_dev, color='red', linestyle='--', label=f'Standard Deviation ({std_dev:.2f})')

plt.xlabel('Bearing Difference')
plt.title('Violin Plot: Distribution of Bearing Difference')
plt.legend()

# Save the plot
plt.savefig(PLOT_FOLDER_PATH, bbox_inches='tight')
print(f"Plot saved to {PLOT_FOLDER_PATH}")

plt.show()


### 3️⃣ Boxplot (Detect Outliers)

In [None]:
plt.figure(figsize=(6, 5))
sns.boxplot(x=merged_df['bearing_diff'], color='lightblue')

# Mark the standard deviation
plt.axvline(x=std_dev, color='red', linestyle='--', label=f'Standard Deviation ({std_dev:.2f})')

plt.xlabel('Bearing Difference')
plt.title('Boxplot: Bearing Difference Outliers')
plt.legend()

# Save the plot
plt.savefig(PLOT_FOLDER_PATH, bbox_inches='tight')
print(f"Plot saved to {PLOT_FOLDER_PATH}")

plt.show()


### 4️⃣ KDE Plot (Smooth Distribution Curve)



In [None]:
plt.figure(figsize=(8, 5))
sns.kdeplot(merged_df['bearing_diff'], fill=True, color="blue", alpha=0.3)

# Mark standard deviation
plt.axvline(x=std_dev, color='red', linestyle='--', label=f'Standard Deviation ({std_dev:.2f})')

plt.xlabel('Bearing Difference')
plt.title('KDE Plot: Bearing Difference Distribution')
plt.legend()

# Save the plot
plt.savefig(PLOT_FOLDER_PATH, bbox_inches='tight')
print(f"Plot saved to {PLOT_FOLDER_PATH}")

plt.show()


# Save DF to csv

In [None]:
all_vehicles_data_path = "../../DataSets/API_Responses/Vehicle_Data/all_vehicle_responses.csv"
merged_df.to_csv(all_vehicles_data_path, index=False)
print(Fore.GREEN + f"DataFrame stored to {all_vehicles_data_path}" + Style.RESET_ALL)