In [1]:
import os
import glob
import pandas as pd
import geopandas as gpd
import gpxpy
from sklearn.cluster import KMeans


In [2]:
# Routenpunkte (Komoot-Strecke) laden
route_points = gpd.read_file('./RouteWithDistance_new.gpkg')
# route_points.plot()

In [3]:
route_points

Unnamed: 0,distance_from_start_km,geometry
0,0.000,POINT (570449.85 5515440.104)
1,0.005,POINT (570449.853 5515445.104)
2,0.010,POINT (570449.857 5515450.104)
3,0.015,POINT (570449.861 5515455.104)
4,0.020,POINT (570449.865 5515460.104)
...,...,...
6665,33.325,POINT (570451.424 5515463.707)
6666,33.330,POINT (570451.362 5515458.708)
6667,33.335,POINT (570451.299 5515453.708)
6668,33.340,POINT (570451.236 5515448.708)


In [59]:
pd.options.mode.chained_assignment = None  # default='warn'


# Iteration über jede Datei im Verzeichnis
for file in measurement_files:
    # Particle-Messpunkte laden
    particle_gdf = gpd.read_file(file)
    filename = os.path.basename(file)
    # Dateiname ohne Erweiterung als Schlüssel verwenden
    key = os.path.splitext(filename)[0]
    particle_data[key] = particle_gdf 
    # print(f"Loaded GeoDataFrame for {key}. Total number of measurement points: {len(particle_gdf)}")

    # Überprüfen des Formats der 'zeitstempel'-Spalte und Extrahieren der letzten 15 Zeichen
    if 'zeitstempel' in particle_gdf.columns:
        # Konvertieren der 'zeitstempel'-Spalte in Strings
        particle_gdf['zeitstempel_str'] = particle_gdf['zeitstempel'].astype(str)
        # Extrahieren der letzten 15 Zeichen und Erstellen einer neuen Spalte 'time'
        particle_gdf['time'] = particle_gdf['zeitstempel_str'].str[-15:]
        # Wenn die Länge der 'time'-Spalte 15 Zeichen beträgt, die letzten 7 Zeichen entfernen
        particle_gdf['time'] = particle_gdf['time'].apply(lambda x: x[:-7] if len(x) == 15 else x)
        # Entfernen der ':' und Leerzeichen und Konvertierung in Integer, falls möglich
        particle_gdf['time'] = particle_gdf['time'].str.replace(':', '').str.replace(' ', '')
        particle_gdf['time_int'] = pd.to_numeric(particle_gdf['time'], errors='coerce', downcast='integer')

        # Hinzufügen der neuen Spalten
        particle_gdf['first_or_second_time'] = None
        particle_gdf['route_distance_point'] = None
        particle_gdf['distance'] = None
        
        # Iteration über jede Zeile im aktuellen GeoDataFrame particle_gdf
        for index, row in particle_gdf.iterrows():
            # Create a buffer around the measurement point
            buffer_route = 30
            # Hier würde die Verwendung von route_points korrekt eingefügt werden, nachdem es geladen wurde
            route_points_within_buffer = route_points[route_points['geometry'].within(row.geometry.buffer(buffer_route))]
            # print(f"Number of route points within buffer for index {index}: {len(route_points_within_buffer)}")
            
            # If there are no route points within the buffer, continue with the next measurement point
            if route_points_within_buffer.empty:
                continue
            else:
                # Find other measurement points closeby
                buffer_measurement = 50
                measurement_points_within_buffer = particle_gdf[particle_gdf['geometry'].within(row.geometry.buffer(buffer_measurement))]
                # print(f"Number of measurement points within buffer for index {index}: {len(measurement_points_within_buffer)}")
                
                # Calculate the distance of the route points to the measurement point and the distance of other measurement points to the measurement point
                route_points_within_buffer['distance'] = route_points_within_buffer.distance(row.geometry)
                measurement_points_within_buffer['distance'] = measurement_points_within_buffer.distance(row.geometry)
                
                # Calculate the diff of the max and the min value of the measurement points in the time_int col
                max_diff = measurement_points_within_buffer['time_int'].max() - measurement_points_within_buffer['time_int'].min()
                # print(f"Max time difference for index {index}: {max_diff}")
                
                # If there are route points that have a higher difference than 0.1 in the Route_distance column and there is a higher time_diff than 10 min within the closeby measurement points
                if (route_points_within_buffer['distance'].diff().abs() > 0.1).any() and max_diff > 1000:
                    # Divide the route points in two groups according to the Route_distance column
                    kmeans_route = KMeans(n_clusters=2, random_state=0)
                    kmeans_route.fit(route_points_within_buffer[['distance']])
                    route_points_within_buffer['group'] = kmeans_route.labels_
                    # Calculate the mean of the Route_distance column for each group
                    route_points_within_buffer['mean'] = route_points_within_buffer.groupby('group')['distance'].transform('mean')
                    # Put 'first_time' in the group with the lower mean and 'second_time' in the group with the higher mean
                    route_points_within_buffer['group_order'] = route_points_within_buffer['mean'].apply(lambda x: 'first_time' if x == route_points_within_buffer['mean'].min() else 'second_time')
                    # print(f"Route points group order for index {index}:")
                    # print(route_points_within_buffer[['group', 'group_order']])
                    # print(f"Condition met for index {index}")


                    # Divide the closeby measurement points also
                    kmeans_measurements = KMeans(n_clusters=2, random_state=0)
                    kmeans_measurements.fit(measurement_points_within_buffer[['time_int']])
                    measurement_points_within_buffer['group'] = kmeans_measurements.labels_
                    # Calculate the mean of the time_int column for each group
                    measurement_points_within_buffer['mean'] = measurement_points_within_buffer.groupby('group')['time_int'].transform('mean')
                    # Put 'first_time' in the group with the lower mean and 'second_time' in the group with the higher mean
                    measurement_points_within_buffer['group_order'] = measurement_points_within_buffer['mean'].apply(lambda x: 'first_time' if x == measurement_points_within_buffer['mean'].min() else 'second_time')
                    print(f"Measurement points group order for index {index}:")
                    print(measurement_points_within_buffer[['group', 'group_order']])
                    
                    # Find out if the current measurement point has a time_int value that is closer to the higher_mean or the lower_mean using the absoute difference
                    if abs(row['time_int'] - higher_mean) < abs(row['time_int'] - lower_mean):
                        particle_gdf.loc[index, 'first_or_second_time'] = 'second_time'
                    else:
                        particle_gdf.loc[index, 'first_or_second_time'] = 'first_time'
                        
                    # Find the route point with the lowest distance value that has the same group_order as the measurement point
                    route_point = route_points_within_buffer[route_points_within_buffer['group_order'] == particle_gdf.loc[index, 'first_or_second_time']].nsmallest(1, 'distance')
                    # Put the Route_distance value of the route point in the measurement point route_distance_point column
                    particle_gdf.loc[index, 'route_distance_point'] = route_point['distance'].iloc[0]
                    
                    # Put the distance value of the route point in the measurement point distance column
                    particle_gdf.loc[index, 'distance'] = route_point['distance'].iloc[0]
                    
                else:
                    # Find the route point with the lowest distance value right away
                    route_point = route_points_within_buffer.nsmallest(1, 'distance')
                    # Put the Route_distance value of the route point in the measurement point route_distance_point column
                    particle_gdf.loc[index, 'route_distance_point'] = route_point['distance'].iloc[0]
                    # Put the distance value of the route point in the measurement point distance column
                    particle_gdf.loc[index, 'distance'] = route_point['distance'].iloc[0]
                    
    else:
        print(f"'zeitstempel' Spalte nicht gefunden in {key}")
        
    # Jetzt außerhalb der Schleife die Zeilen entfernen
for key, gdf in particle_data.items():
    # Drop the rows with None in route_distance_point
    gdf = gdf.dropna(subset=['route_distance_point'])
    particle_data[key] = gdf  # Update the dataframe in particle_data dictionary

    # print(f"Processed GeoDataFrame: {key}. Total rows: {len(gdf)}")
    print(gdf[['first_or_second_time', 'route_distance_point', 'distance']].head())

  first_or_second_time route_distance_point   distance
3                 None            24.065789  24.065789
4                 None            26.652717  26.652717
5                 None            26.841418  26.841418
6                 None            26.906956  26.906956
8                 None            26.170387  26.170387
  first_or_second_time route_distance_point  distance
0                 None             1.643155  1.643155
1                 None             1.597016  1.597016
2                 None             0.945835  0.945835
3                 None             1.202663  1.202663
4                 None             3.584043  3.584043
  first_or_second_time route_distance_point   distance
1                 None            28.517493  28.517493
4                 None            28.492508  28.492508
5                 None            25.024639  25.024639
6                 None             18.01194   18.01194
7                 None            12.280217  12.280217
  first_or_secon

In [None]:
# import geopandas as gpd
# from sklearn.cluster import KMeans
# import pandas as pd

# def remove_last_seven_if_length_15(s):
#     if len(s) == 15:
#         return s[:-7]
#     return s

# # Don't show warnings
# pd.options.mode.chained_assignment = None  # default='warn'

# # Set wd
# folder = "C:/EAGLE/SoSe_24/Urban_FieldMeasurements/Data_Collection_FirstWeek"

# # File list
# measurement_files = [
#     f"{folder}/Aufnahme_11062024_VM/Data_1/humid_DHT.gpkg",
#     f"{folder}/Aufnahme_11062024_VM/Data_1/Temp_DHT.gpkg",
#     f"{folder}/Aufnahme_11062024_VM/Data_1/particle.gpkg",
#     f"{folder}/Aufnahme_11062024_Abend/Data_0/humid_DHT.gpkg",
#     f"{folder}/Aufnahme_11062024_Abend/Data_0/Temp_DHT.gpkg",
#     f"{folder}/Aufnahme_11062024_Abend/Data_0/particle.gpkg",
#     f"{folder}/Aufnahme_12062024_Mittag/Data_0/humid_DHT.gpkg",
#     f"{folder}/Aufnahme_12062024_Mittag/Data_0/Temp_DHT.gpkg",
#     f"{folder}/Aufnahme_12062024_Mittag/Data_0/particle.gpkg",
#     f"{folder}/Aufnahme_12062024_Daemmerung/Data_4/humid_DHT.gpkg",
#     f"{folder}/Aufnahme_12062024_Daemmerung/Data_4/Temp_DHT.gpkg",
#     f"{folder}/Aufnahme_12062024_Daemmerung/Data_4/particleBack.gpkg",
#     f"{folder}/Aufnahme_12062024_Daemmerung/Data_4/particleBottom.gpkg",
#     f"{folder}/Aufnahme_12062024_Daemmerung/Data_4/particleFront.gpkg",
#     f"{folder}/Aufnahme_13062024_Morgen/Data_0/humid_DHT.gpkg",
#     f"{folder}/Aufnahme_13062024_Morgen/Data_0/Temp_DHT.gpkg",
#     f"{folder}/Aufnahme_13062024_Morgen/Data_0/particleBack.gpkg",
#     f"{folder}/Aufnahme_13062024_Morgen/Data_0/particleBottom.gpkg",
#     f"{folder}/Aufnahme_13062024_Morgen/Data_0/particleFront.gpkg",
#     f"{folder}/Aufnahme_13062024_nachmittag/Data_0/humid_DHT.gpkg",
#     f"{folder}/Aufnahme_13062024_nachmittag/Data_0/Temp_DHT.gpkg",
#     f"{folder}/Aufnahme_13062024_nachmittag/Data_0/particleBack.gpkg",
#     f"{folder}/Aufnahme_13062024_nachmittag/Data_0/particleBottom.gpkg",
#     f"{folder}/Aufnahme_13062024_nachmittag/Data_0/particleFront.gpkg"
#     ]

# # Load the route points gpkg (already in 32632 CRS)
# path_to_route_points = "C:/EAGLE/SoSe_24/Urban_FieldMeasurements/GIT/cycling_wue/Route_Noise_sensors_gpkg/Route_FINAL_points_5m.gpkg"
# route_points = gpd.read_file(path_to_route_points)

# for file in measurement_files:
#     # Print the file name
#     print(file)
#     measurement_points_UTM = gpd.read_file(file)

#     # Get the last 15 characters of the zeitstempel column and store it in a time column
#     measurement_points_UTM['time'] = measurement_points_UTM['zeitstempel'].str[11:]
#     # Remove rows that have None in the time column
#     measurement_points_UTM = measurement_points_UTM.dropna(subset=['time'])
#     # If the time column has a length of 15, remove the last 7 characters
#     measurement_points_UTM['time'] = measurement_points_UTM['time'].apply(remove_last_seven_if_length_15)
#     # Remove the : in the time column and store it as an integer
#     measurement_points_UTM['time_int'] = measurement_points_UTM['time'].str.replace(':', '')
#     measurement_points_UTM['time_int'] = measurement_points_UTM['time_int'].astype(int)

#     # New cols
#     measurement_points_UTM['first_or_second_time'] = None
#     measurement_points_UTM['route_distance_point'] = None
#     measurement_points_UTM['distance'] = None
    
#     for index, row in measurement_points_UTM.iterrows():
#         # Print current index and total number of measurement points
#         print(f"Index: {index} of {len(measurement_points_UTM)}")

#         # Create a buffer around the measurement point
#         buffer_route = 30
#         route_points_within_buffer = route_points[route_points['geometry'].within(row.geometry.buffer(buffer_route))]

#         # If there are no route points within the buffer, continue with the next measurement point
#         if route_points_within_buffer.empty:
#             continue
#         else:
#             # Find other measurement points closeby
#             buffer_measurement = 50
#             measurement_points_within_buffer = measurement_points_UTM[measurement_points_UTM['geometry'].within(row.geometry.buffer(buffer_measurement))]

#             # Calculate the distance of the route points to the measurement point and the distance of other measurement points to the measurement point
#             route_points_within_buffer['distance'] = route_points_within_buffer.distance(row.geometry)
#             measurement_points_within_buffer['distance'] = measurement_points_within_buffer.distance(row.geometry)

#             # Calculate the diff of the max and the min value of the measurement points in the time_int col
#             max_diff = measurement_points_within_buffer['time_int'].max() - measurement_points_within_buffer['time_int'].min()

            # If there are route points that have a higher difference than 0.1 in the Route_distance column and there is a higher time_diff than 10 min within the closeby measurement points
            if (route_points_within_buffer['Route_distance'].diff().abs() > 0.1).any() and max_diff > 1000:
                # Divide the route points in two groups according to the Route_distance column
                kmeans_route = KMeans(n_clusters=2, random_state=0)
                kmeans_route.fit(route_points_within_buffer[['Route_distance']])
                route_points_within_buffer['group'] = kmeans_route.labels_
                # Calculate the mean of the Route_distance column for each group
                route_points_within_buffer['mean'] = route_points_within_buffer.groupby('group')['Route_distance'].transform('mean')
                # Put 'first_time' in the group with the lower mean and 'second_time' in the group with the higher mean
                route_points_within_buffer['group_order'] = route_points_within_buffer['mean'].apply(lambda x: 'first_time' if x == route_points_within_buffer['mean'].min() else 'second_time')

                # Divide the closeby measurement points also
                kmeans_measurements = KMeans(n_clusters=2, random_state=0)
                kmeans_measurements.fit(measurement_points_within_buffer[['time_int']])
                measurement_points_within_buffer['group'] = kmeans_measurements.labels_
                # Calculate the mean of the time_int column for each group
                measurement_points_within_buffer['mean'] = measurement_points_within_buffer.groupby('group')['time_int'].transform('mean')
                # Store the unique values of the mean column in a list
                higher_mean = measurement_points_within_buffer['mean'].max()
                lower_mean = measurement_points_within_buffer['mean'].min()
                # Put 'first_time' in the group with the lower mean and 'second_time' in the group with the higher mean
                measurement_points_within_buffer['group_order'] = measurement_points_within_buffer['mean'].apply(lambda x: 'first_time' if x == measurement_points_within_buffer['mean'].min() else 'second_time')
                # Find out if the current measurement point has a time_int value that is closer to the higher_mean or the lower_mean using the absoute difference
                if abs(row['time_int'] - higher_mean) < abs(row['time_int'] - lower_mean):
                    measurement_points_UTM.loc[index, 'first_or_second_time'] = 'second_time'
                else:
                    measurement_points_UTM.loc[index, 'first_or_second_time'] = 'first_time'

                # Find the route point with the lowest distance value that has the same group_order as the measurement point
                route_point = route_points_within_buffer[
                    route_points_within_buffer['group_order'] == measurement_points_UTM.loc[
                        index, 'first_or_second_time']].nsmallest(1, 'distance')
                # Put the Route_distance value of the route point in the measurement point route_distance_point column
                measurement_points_UTM.loc[index, 'route_distance_point'] = route_point['Route_distance'].iloc[0]
                # Put the distance value of the route point in the measurement point distance column
                measurement_points_UTM.loc[index, 'distance'] = route_point['distance'].iloc[0]


            else:
                # Find the route point with the lowest distance value right away
                route_point = route_points_within_buffer.nsmallest(1, 'distance')
                # Put the Route_distance value of the route point in the measurement point route_distance_point column
                measurement_points_UTM.loc[index, 'route_distance_point'] = route_point['Route_distance'].iloc[0]
                # Put the distance value of the route point in the measurement point distance column
                measurement_points_UTM.loc[index, 'distance'] = route_point['distance'].iloc[0]


    # Drop the rows with None in route_distance_point
    measurement_points_UTM = measurement_points_UTM.dropna(subset=['route_distance_point'])

    # Define the cols route_distance_point and distance as float
    measurement_points_UTM['route_distance_point'] = measurement_points_UTM['route_distance_point'].astype(float)
    measurement_points_UTM['distance'] = measurement_points_UTM['distance'].astype(float)

    # Delete rows with route_distance_point smaller 0.08 and bigger 33.4?
    measurement_points_UTM_1 = measurement_points_UTM[(measurement_points_UTM['route_distance_point'] > 0.08) & (measurement_points_UTM['route_distance_point'] < 33.4)]
    # Print out the difference in lenght between the original and the new DataFrame
    print(len(measurement_points_UTM) - len(measurement_points_UTM_1))

    # Delete rows with a distance higher than 50?
    measurement_points_UTM_2 = measurement_points_UTM_1[measurement_points_UTM['distance'] < 50]
    # Print out the difference in lenght between the original and the new DataFrame
    print(len(measurement_points_UTM_1) - len(measurement_points_UTM_2))

    # Save the measurement points as a gpkg
    save_path = f"{file[:-5]}_RouteDist.gpkg"
    save_path_1 = f"{file[:-5]}_RouteDist_clipStartEnd.gpkg"
    save_path_2 = f"{file[:-5]}_RouteDist_clipStartEnd_clipDistance50.gpkg"
    measurement_points_UTM.to_file(save_path, driver="GPKG")
    measurement_points_UTM_1.to_file(save_path_1, driver="GPKG")
    measurement_points_UTM_2.to_file(save_path_2, driver="GPKG")
