In [1]:
!pip install haversine -q

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from datetime import timedelta
from haversine import haversine, Unit

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# INPUT_FILE / OUTPUT_FILE paths under MyDrive resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Both the input CSV and the derived feature CSV live in the same Drive folder.
_EDULIFT_DIR = "/content/drive/MyDrive/Edulift"

# Point-level input read by the notebook.
INPUT_FILE = _EDULIFT_DIR + "/eVED_171101_week_matched.csv"
# Per-trip feature table written by the notebook.
OUTPUT_FILE = _EDULIFT_DIR + "/eVED_features.csv"

In [5]:
def cluster_od(df, eps=0.005, min_samples=5):
    """Cluster trip origins and destinations into shared location groups.

    The first and last map-matched point of every trip are pooled and
    clustered together with DBSCAN, so an origin and a destination that fall
    in the same place receive the same label.

    Parameters
    ----------
    df : pandas.DataFrame
        Point-level data with ``Trip``, ``matched_lat`` and ``matched_lon``
        columns. If a ``timestamp`` column is present, rows are ordered by it
        within each trip before the endpoints are picked.
    eps : float, default 0.005
        DBSCAN neighbourhood radius, expressed in raw degrees because the
        clustering runs with the default Euclidean metric on (lat, lon).
        0.005 degrees is roughly 0.55 km north-south; the east-west reach is
        shorter by a factor of cos(latitude), so "~500 m" is only approximate.
    min_samples : int, default 5
        DBSCAN ``min_samples`` (points needed to form a dense region).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(starts, ends)``, each indexed by ``Trip`` with columns
        ``matched_lat``, ``matched_lon`` and ``od_cluster``. DBSCAN labels
        noise points as ``-1``. Trips with no valid matched coordinate are
        absent from both frames.
    """
    coord_cols = ["matched_lat", "matched_lon"]

    # Remove unmatched points as whole rows. GroupBy.first()/last() skip NaN
    # column by column, so without this a trip's latitude and longitude could
    # be taken from two different rows.
    valid = df.dropna(subset=coord_cols)

    # Pick endpoints chronologically rather than by file order, in line with
    # how compute_trip_stats orders each trip.
    if "timestamp" in valid.columns:
        valid = valid.sort_values(["Trip", "timestamp"])

    by_trip = valid.groupby("Trip")[coord_cols]
    starts = by_trip.first()
    ends = by_trip.last()
    n_trips = len(starts)

    if n_trips == 0:
        # DBSCAN rejects an empty sample matrix; return empty labelled frames.
        labels = np.empty(0, dtype=int)
    else:
        # Origins occupy the first n_trips rows, destinations the rest.
        coords = np.vstack([starts.to_numpy(), ends.to_numpy()])
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(coords).labels_

    # assign() returns new frames instead of writing into a sliced result.
    starts = starts.assign(od_cluster=labels[:n_trips])
    ends = ends.assign(od_cluster=labels[n_trips:])

    return starts, ends

In [6]:
# Mean Earth radius in kilometres used for great-circle distances.
# NOTE(review): believed to equal the constant used by the `haversine`
# package, so results should agree with it up to float rounding - confirm.
_EARTH_RADIUS_KM = 6371.0088


def _path_length_km(lat_deg, lon_deg):
    """Return the summed haversine distance (km) between consecutive points."""
    lat = np.radians(np.asarray(lat_deg, dtype=float))
    lon = np.radians(np.asarray(lon_deg, dtype=float))
    if lat.size < 2:
        return 0.0
    half_dlat = np.diff(lat) / 2.0
    half_dlon = np.diff(lon) / 2.0
    a = np.sin(half_dlat) ** 2 + np.cos(lat[:-1]) * np.cos(lat[1:]) * np.sin(half_dlon) ** 2
    # Clip guards against rounding pushing `a` marginally outside [0, 1].
    segments = 2.0 * _EARTH_RADIUS_KM * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))
    return float(segments.sum())


def compute_trip_stats(df):
    """Compute per-trip timing, path length and average speed.

    Parameters
    ----------
    df : pandas.DataFrame
        Point-level data with ``Trip``, ``timestamp``, ``matched_lat`` and
        ``matched_lon`` columns. ``timestamp`` may be strings or datetimes;
        the first and last value of each trip are parsed with
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        One row per trip with columns ``Trip``, ``start_time``, ``end_time``,
        ``duration_min`` (minutes), ``distance_km`` (sum of great-circle
        segments between consecutive matched points) and ``avg_speed_kmh``
        (0 when the trip has zero duration).

    Notes
    -----
    Rows whose matched coordinates are missing are left out of the distance
    sum; otherwise a single NaN point would turn the whole trip's distance
    (and speed) into NaN. They still count towards the trip's time span.
    """
    # NOTE(review): trips are keyed on `Trip` alone; if trip ids can repeat
    # across vehicles (VehId), they would be merged here - verify.
    columns = ["Trip", "start_time", "end_time",
               "duration_min", "distance_km", "avg_speed_kmh"]
    trip_stats = []
    for trip, g in df.groupby("Trip"):
        # Stable sort keeps file order for points sharing a timestamp, so
        # ties cannot shuffle the path and inflate the distance.
        g = g.sort_values("timestamp", kind="stable")

        # Relative trip timing
        start_time = pd.to_datetime(g["timestamp"].iloc[0])
        end_time = pd.to_datetime(g["timestamp"].iloc[-1])
        duration = (end_time - start_time).total_seconds() / 60  # minutes

        # Path length over points that have both matched coordinates.
        fixes = g[["matched_lat", "matched_lon"]].dropna()
        dist = _path_length_km(fixes["matched_lat"].to_numpy(),
                               fixes["matched_lon"].to_numpy())

        avg_speed = dist / (duration / 60) if duration > 0 else 0.0
        trip_stats.append([trip, start_time, end_time, duration, dist, avg_speed])

    return pd.DataFrame(trip_stats, columns=columns)

In [7]:
# Load the point-level input CSV. No date parsing is requested, so the
# `timestamp` column is read as plain strings.
df = pd.read_csv(INPUT_FILE)

In [8]:
# Column dtypes and non-null counts; shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534559 entries, 0 to 534558
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   VehId        534559 non-null  float64
 1   Trip         534559 non-null  float64
 2   timestamp    534559 non-null  object 
 3   lat          534559 non-null  float64
 4   lon          534559 non-null  float64
 5   matched_lat  534550 non-null  float64
 6   matched_lon  534550 non-null  float64
 7   road_name    458535 non-null  object 
dtypes: float64(6), object(2)
memory usage: 32.6+ MB


In [9]:
# Preview the first rows of the point-level data.
df.head()

Unnamed: 0,VehId,Trip,timestamp,lat,lon,matched_lat,matched_lon,road_name
0,8.0,730.0,1970-01-01 00:00:00.000,42.26657,-83.707059,42.266567,-83.70711,Arlington Boulevard
1,8.0,730.0,1970-01-01 00:00:00.600,42.26657,-83.707059,42.266567,-83.70711,Arlington Boulevard
2,8.0,730.0,1970-01-01 00:00:02.100,42.26657,-83.707059,42.266567,-83.70711,Arlington Boulevard
3,8.0,730.0,1970-01-01 00:00:02.800,42.266959,-83.706973,42.26696,-83.70714,Arlington Boulevard
4,8.0,730.0,1970-01-01 00:00:03.200,42.266959,-83.706973,42.26696,-83.70714,Arlington Boulevard


In [10]:
# Label every trip's first and last matched point with a DBSCAN cluster id.
starts, ends = cluster_od(df)

In [11]:
# Origins: one row per Trip with its starting coordinates and cluster label.
starts.head()

Unnamed: 0_level_0,matched_lat,matched_lon,od_cluster
Trip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8.0,42.281343,-83.73572,0
24.0,42.256714,-83.695647,1
27.0,42.271846,-83.72794,0
32.0,42.284952,-83.801928,2
34.0,42.274646,-83.674135,3


In [12]:
# Destinations: one row per Trip with its final coordinates and cluster label.
ends.head()

Unnamed: 0_level_0,matched_lat,matched_lon,od_cluster
Trip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8.0,42.262099,-83.704433,22
24.0,42.281597,-83.753555,0
27.0,42.252823,-83.67477,8
32.0,42.290112,-83.79383,-1
34.0,42.282448,-83.745979,0


In [13]:
# Per-trip start/end time, duration (min), path length (km) and mean speed (km/h).
stats = compute_trip_stats(df)

In [14]:
# Preview the per-trip statistics.
stats.head()

Unnamed: 0,Trip,start_time,end_time,duration_min,distance_km,avg_speed_kmh
0,8.0,1970-01-01,1970-01-01 00:07:05.300,7.088333,4.204831,35.592264
1,24.0,1970-01-01,1970-01-01 00:10:23.500,10.391667,6.370776,36.783952
2,27.0,1970-01-01,1970-01-01 00:08:03.000,8.05,5.219052,38.899767
3,32.0,1970-01-01,1970-01-01 00:03:42.800,3.713333,1.917766,30.987236
4,34.0,1970-01-01,1970-01-01 00:13:00.400,13.006667,7.093905,32.724317


In [15]:
# Attach origin and destination coordinates/cluster labels to the per-trip
# stats. Both joins are left joins keyed on Trip so every row of `stats` is
# kept; an inner join on the second merge would silently drop any trip that
# has no entry in `ends`. Suffixes are applied up front so the column names
# do not depend on merge-time name collisions.
# Note: a trip missing from `starts`/`ends` gets NaN endpoint columns, which
# makes the corresponding od_cluster column float instead of int.
features = (
    stats
    .merge(starts.add_suffix("_start"), left_on="Trip", right_index=True, how="left")
    .merge(ends.add_suffix("_end"), left_on="Trip", right_index=True, how="left")
)

In [16]:
# Preview the merged per-trip feature table.
features.head()

Unnamed: 0,Trip,start_time,end_time,duration_min,distance_km,avg_speed_kmh,matched_lat_start,matched_lon_start,od_cluster_start,matched_lat_end,matched_lon_end,od_cluster_end
0,8.0,1970-01-01,1970-01-01 00:07:05.300,7.088333,4.204831,35.592264,42.281343,-83.73572,0,42.262099,-83.704433,22
1,24.0,1970-01-01,1970-01-01 00:10:23.500,10.391667,6.370776,36.783952,42.256714,-83.695647,1,42.281597,-83.753555,0
2,27.0,1970-01-01,1970-01-01 00:08:03.000,8.05,5.219052,38.899767,42.271846,-83.72794,0,42.252823,-83.67477,8
3,32.0,1970-01-01,1970-01-01 00:03:42.800,3.713333,1.917766,30.987236,42.284952,-83.801928,2,42.290112,-83.79383,-1
4,34.0,1970-01-01,1970-01-01 00:13:00.400,13.006667,7.093905,32.724317,42.274646,-83.674135,3,42.282448,-83.745979,0


In [17]:
# Write the feature table to OUTPUT_FILE; index=False leaves the DataFrame
# index out of the CSV.
features.to_csv(OUTPUT_FILE, index=False)

In [18]:
# Read the saved feature CSV back and preview it.
df_features = pd.read_csv(OUTPUT_FILE)
df_features.head()

Unnamed: 0,Trip,start_time,end_time,duration_min,distance_km,avg_speed_kmh,matched_lat_start,matched_lon_start,od_cluster_start,matched_lat_end,matched_lon_end,od_cluster_end
0,8.0,1970-01-01,1970-01-01 00:07:05.300,7.088333,4.204831,35.592264,42.281343,-83.73572,0,42.262099,-83.704433,22
1,24.0,1970-01-01,1970-01-01 00:10:23.500,10.391667,6.370776,36.783952,42.256714,-83.695647,1,42.281597,-83.753555,0
2,27.0,1970-01-01,1970-01-01 00:08:03.000,8.05,5.219052,38.899767,42.271846,-83.72794,0,42.252823,-83.67477,8
3,32.0,1970-01-01,1970-01-01 00:03:42.800,3.713333,1.917766,30.987236,42.284952,-83.801928,2,42.290112,-83.79383,-1
4,34.0,1970-01-01,1970-01-01 00:13:00.400,13.006667,7.093905,32.724317,42.274646,-83.674135,3,42.282448,-83.745979,0


In [19]:
# Distinct origin cluster labels present in the saved features.
df_features['od_cluster_start'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 12, 14, 15, 16,
       17, 18, 19, -1, 20, 21, 22, 23, 24, 25, 26, 27, 28, 48, 29, 30, 31,
       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47])

In [20]:
# Distinct destination cluster labels present in the saved features.
df_features['od_cluster_end'].unique()

array([22,  0,  8, -1, 32,  3, 17, 47,  2,  5,  4, 14, 48, 19, 40, 29, 11,
       35,  7,  1, 37, 25, 24, 27, 42, 12, 15, 31, 30,  6, 26, 33, 44, 45,
       36, 21, 18, 20, 13, 16, 43, 28, 39, 34, 46, 41,  9, 10, 23, 38])