In [1]:
import numpy as np
import pandas as pd

In [3]:
# Directory holding the final GPS exports (the path points at a GVFS-mounted SMB share)
gps_path = '/run/user/1036/gvfs/smb-share:server=ait-pdfs.win.dtu.dk,share=department/Man/Public/4233-81647-eMOTIONAL-Cities/5 Data/ECDTU/Xing/GPS/Final/'

# Read the raw segment points and preview the first rows
gps_data = pd.read_csv(
    gps_path + 'segments.csv',
    delimiter=',',
)
gps_data.head(n=5)

Unnamed: 0,HHID,INDIVID,Client Identifier,Access Code,Interval ID,Sequence No.,Latitude,Longitude
0,MMM242,MMM24201,,ECDTU,628692,1,55.68615,12.53335
1,MMM242,MMM24201,,ECDTU,628692,1,55.68595,12.53373
2,MMM242,MMM24201,,ECDTU,628692,1,55.68568,12.53419
3,MMM242,MMM24201,,ECDTU,628692,1,55.68551,12.5346
4,MMM242,MMM24201,,ECDTU,628692,1,55.68545,12.5354


In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two geographic points (Haversine formula).

    Works element-wise, so scalars, NumPy arrays and pandas Series are all accepted
    as long as the four arguments broadcast/align with each other.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float or array-like
        Distance in kilometres on a sphere of radius 6371 km. NaN inputs
        propagate to NaN outputs.
    """
    R = 6371.0  # Earth's mean radius in km

    # Trigonometric functions expect radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Coordinate differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2

    # Mathematically 0 <= a <= 1, but rounding can push it marginally outside that
    # range (e.g. near-antipodal points), which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))

    return R * c  # Distance in km

def process_gps_data(csv_path, output_path, max_gap_km=0.5):
    """
    Drop every Interval ID that contains a large jump between consecutive points.

    The CSV at ``csv_path`` is read, the Haversine distance from each point to the
    previous row sharing its Interval ID is computed, and *all* rows of any
    Interval ID with at least one such distance above ``max_gap_km`` are removed.
    The surviving rows are written to ``output_path`` as a ';'-separated CSV that
    contains only the 'Interval ID', 'Latitude' and 'Longitude' columns.

    Parameters
    ----------
    csv_path : str
        Path of the comma-separated input file. It must contain the columns
        'Interval ID', 'Latitude' and 'Longitude'.
    output_path : str
        Path of the CSV file to write.
    max_gap_km : float, default 0.5
        Largest allowed distance (km) between consecutive points of one
        Interval ID. The default of 0.5 km (500 m) matches the original rule.

    Returns
    -------
    None
        Missing columns, an export failure, or an empty result are reported
        with ``print`` rather than raised.
    """
    df = pd.read_csv(csv_path, sep=',', engine='python')  # Adjust delimiter if needed

    # Check required columns
    required_columns = {'Interval ID', 'Latitude', 'Longitude'}
    if not required_columns.issubset(df.columns):
        print(f"Error: Required columns {required_columns} were not found in the CSV file.")
        return

    # Previous point within the same Interval ID, for all intervals in a single
    # vectorized pass (the first point of each interval gets NaN). This replaces a
    # per-interval boolean-mask loop that rescanned the whole frame for every ID.
    previous = df.groupby('Interval ID', sort=False)[['Latitude', 'Longitude']].shift()

    # Distance from each point to the previous one; NaN where there is no predecessor
    df['distance_km'] = haversine(
        previous['Latitude'], previous['Longitude'], df['Latitude'], df['Longitude']
    )

    # Interval IDs with at least one step longer than the threshold
    # (NaN > threshold is False, so first points never flag an interval)
    intervals_with_error = df.loc[df['distance_km'] > max_gap_km, 'Interval ID'].unique()

    # Remove all points from those Interval IDs and keep only the necessary columns
    keep_rows = ~df['Interval ID'].isin(intervals_with_error)
    df_filtered = df.loc[keep_rows, ['Interval ID', 'Latitude', 'Longitude']]

    if df_filtered.empty:
        print("All Interval IDs were removed. No data left to export.")
        return

    try:
        df_filtered.to_csv(output_path, sep=';', index=False)
        print(f"Filtered data exported to {output_path}")
    except Exception as e:
        # Best-effort export: report the problem instead of aborting the notebook run
        print(f"Error exporting CSV file: {e}")


# Source and destination files for the cleaning step (edit to match your setup)
input_path = gps_path + "segments.csv"
output_path = "/home/s232713/data/clean_points.csv"

# Filter the segments file and write the surviving points to disk
process_gps_data(csv_path=input_path, output_path=output_path)

In [None]:
# Size of the raw table loaded earlier in the notebook
print('Number of GPS data points:', gps_data.shape[0])

# Reload the filtered export (written with ';' as separator) and compare sizes
cleaned_data = pd.read_csv(
    '/home/s232713/data/clean_points.csv',
    delimiter=';',
)
print(cleaned_data.head(n=5))
print('Number of cleaned GPS data points:', cleaned_data.shape[0])


Number of GPS data points: 2532153
Number of cleaned GPS data points: 1923328


In [2]:

# Load the previously exported, ';'-separated cleaned points straight from disk
cleaned_data = pd.read_csv(
    '/home/s232713/data/clean_points.csv',
    delimiter=';',
)
print(cleaned_data.head(n=5))
print('Number of cleaned GPS data points:', cleaned_data.shape[0])

   Interval ID  Latitude  Longitude
0       628692  55.68615   12.53335
1       628692  55.68595   12.53373
2       628692  55.68568   12.53419
3       628692  55.68551   12.53460
4       628692  55.68545   12.53540
Number of cleaned GPS data points: 1923328
