# GeoLife Augmentation
1. Download car, bike, pedestrian maps
2. Generate trajectories:
Trajectories are generated according to 3 mobility profiles: car, walk, bike
    * Source and destination taken from the augmented medoids
    * GPS interpolation
    * Time delta interpolation
3. Build the GeoLife augmented data set


In [None]:
#imports cell

import pandas as pd
import matplotlib.pyplot as plt
import osmnx as ox 
from sklearn.neighbors import KDTree
import networkx as nx
import folium
import random
from geographiclib.geodesic import Geodesic
import math
import yaml
import skmob
from geopy.distance import geodesic


# Read the notebook settings from conf.yaml (parsed with PyYAML's FullLoader).
with open("conf.yaml") as config_file:
    conf = yaml.load(config_file, Loader=yaml.FullLoader)

# Input / output locations.
#base_path = conf["base_path"]
geolife_data_path = conf["geolife_data_path"]
out_path = conf["out_path"]

# Trajectory generation settings.
n_trajectory_user = conf["n_trajectory_user"]
interpolation_distance = conf["interpolation_distance"]

# Number of synthetic users per mobility profile.
num_cars = conf["n_cars"]
num_bikes = conf["n_bikes"]
num_pedestrians = conf["n_pedestrians"]

# Time window from which trajectory start times are drawn.
start_time = conf["start_time"]
end_time = conf["end_time"]


In [None]:
#in order to load the augmented medoids (medoids + random locations) only once
# Only the coordinate columns of the augmented medoids file are needed.
medoids_csv = out_path + "geolife_augmented_medoids.csv"
medoids = pd.read_csv(medoids_csv, usecols=["lat", "lon"])
print(medoids.head())

# Number of candidate endpoints; faker() samples source/destination from them.
n_medoids = medoids.shape[0]
print("We have a total of {:d} starting/arrival points".format(n_medoids))

## 1. Geo_life maps: drive, bike, walk

In [None]:
%%time
#get graphs from place and serialize them on disk

#get the graphs
# One street network per mobility profile, all requested for the same place
# query and the same geocoder result.
place_name = 'Beijing, China'
geocoder_result = 2
D = ox.graph_from_place(place_name, network_type='drive', which_result=geocoder_result)
B = ox.graph_from_place(place_name, network_type='bike', which_result=geocoder_result)
W = ox.graph_from_place(place_name, network_type='walk', which_result=geocoder_result)

In [None]:
#serialize them on disk
# Write each downloaded graph to its GraphML file under out_path.
graph_files = (
    (D, "drive_graph.graphml"),
    (B, "bike_graph.graphml"),
    (W, "walk_graph.graphml"),
)
for graph, file_name in graph_files:
    ox.save_graphml(graph, out_path + file_name)

print("Serialized graphs on disk")

In [None]:
#Load graphs from files if already downloaded

# Rebuild the three graphs from the GraphML files written by the previous cell.
drive_file = out_path + "drive_graph.graphml"
bike_file = out_path + "bike_graph.graphml"
walk_file = out_path + "walk_graph.graphml"

D = ox.load_graphml(drive_file)
B = ox.load_graphml(bike_file)
W = ox.load_graphml(walk_file)


## 2. Trajectory generation
We use the put_datetime helper function in order to put datetimes in the generated trajectories.

In [None]:
import random
import time

def str_time_prop(start, end, format, prop):
    """Return the time lying a fraction ``prop`` of the way from start to end.

    ``start`` and ``end`` are strings written in the strftime-style
    ``format``. Both are converted to local-time epoch seconds, the instant
    ``start + prop * (end - start)`` is computed (``prop`` = 0 gives
    ``start``, ``prop`` = 1 gives ``end``), and that instant is rendered back
    to a string using the same ``format``.
    """
    lower, upper = (
        time.mktime(time.strptime(stamp, format)) for stamp in (start, end)
    )
    picked = lower + prop * (upper - lower)
    return time.strftime(format, time.localtime(picked))


def random_date(start, end, prop):
    """Return the 'YYYY-MM-DD HH:MM:SS' time at fraction ``prop`` of [start, end].

    Thin wrapper around ``str_time_prop`` with the timestamp format fixed;
    ``start`` and ``end`` must be strings in that same format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    return str_time_prop(start, end, timestamp_format, prop)

#print(random_date("2008-06-01 00:00:00", "2008-08-31 23:59:00", random.random()))

def put_datetime(trajs_df, speed):
    """Assign a timestamp to every GPS point of the generated trajectories.

    Each (uid, tid) trajectory gets a random start drawn between the
    module-level ``start_time`` and ``end_time``. Every point is then stamped
    with that start plus the travel time accumulated at constant ``speed``
    over the geodesic distance between consecutive points. A start whose
    resulting time span overlaps a trajectory already placed for the same
    user is discarded and drawn again.

    Rows are addressed by running position, so ``trajs_df`` must be ordered
    by ascending uid and, within each user, by ascending tid (this is the
    order in which ``faker`` concatenates trajectories).

    Parameters
    ----------
    trajs_df : pandas.DataFrame
        Points with at least the columns ``lat``, ``lon``, ``uid``, ``tid``.
    speed : float
        Travel speed in metres per second.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``trajs_df`` with a ``date_time`` column holding one
        ``datetime.datetime`` per row.
    """
    import datetime
    from datetime import timedelta

    summary = "Put datetimes on {:d} rows, generated {:d} overlapping trajectories"

    traj_copy = trajs_df.copy(deep=True)
    traj_copy["date_time"] = start_time

    overlaps = 0
    # Position of the first row of the trajectory currently being stamped.
    index = 0

    if traj_copy.empty:
        print(summary.format(index, overlaps))
        return traj_copy

    # Resolve the column position by name instead of assuming it is column 4.
    date_col = traj_copy.columns.get_loc("date_time")

    for uid in range(traj_copy.uid.min(), traj_copy.uid.max() + 1):
        user = traj_copy[traj_copy["uid"] == uid]
        if user.empty:
            continue

        # Time spans already occupied by this user's trajectories.
        intervals = []

        tid = user.tid.min()
        max_tid = user.tid.max()

        while tid <= max_tid:
            traj = user[user["tid"] == tid]
            tid += 1

            t_len = len(traj)
            if t_len == 0:
                # Gap in the tid numbering: there are no rows to stamp, and
                # the running position must not advance.
                continue

            lats = traj["lat"].to_numpy()
            lons = traj["lon"].to_numpy()

            # Seconds elapsed since the start at each point. This does not
            # depend on the start date, so it is computed once per trajectory
            # rather than once per overlap retry.
            elapsed = [0.0]
            for i in range(1, t_len):
                dist = geodesic((lats[i - 1], lons[i - 1]), (lats[i], lons[i]))
                elapsed.append(elapsed[-1] + dist.meters / speed)

            # NOTE: the redraw loop has no retry limit; it only terminates
            # once a non-overlapping start is drawn.
            while True:
                date = random_date(start_time, end_time, random.random())
                start_dt = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                end_dt = start_dt + timedelta(seconds=elapsed[-1])

                # Sorting keeps the interval valid even if the elapsed time
                # is negative (negative speed).
                left, right = sorted((start_dt.timestamp(), end_dt.timestamp()))
                n_interval = pd.Interval(left, right)

                if not any(placed.overlaps(n_interval) for placed in intervals):
                    break
                overlaps += 1

            intervals.append(n_interval)

            for offset, seconds in enumerate(elapsed):
                traj_copy.iat[index + offset, date_col] = start_dt + timedelta(seconds=seconds)
            index += t_len

    print(summary.format(index, overlaps))
    return traj_copy

#### We define the interpolation and the faker functions to be called for each generation.

In [None]:
#interpolates the dataframe
def interpolator(gdf, uid):
    """Densify a route into GPS points spaced along WGS84 geodesics.

    Consecutive rows of ``gdf`` are joined by a geodesic line, and points are
    emitted every ``interpolation_distance`` metres along it (module-level
    setting), always including the end of the segment. A node shared by two
    consecutive segments is emitted once, so the result contains no repeated
    fix at the junctions.

    Parameters
    ----------
    gdf : pandas.DataFrame
        Route nodes in travel order, with latitude in column ``y`` and
        longitude in column ``x``.
    uid : int
        User id written on every generated row.

    Returns
    -------
    pandas.DataFrame
        Columns ``lat``, ``lon``, ``uid``, ``tid`` (``tid`` is set to 0; the
        caller assigns the real trajectory id). Empty when ``gdf`` has fewer
        than two rows.

    Raises
    ------
    ValueError
        If ``interpolation_distance`` is not positive.
    """
    cols = ["lat", "lon", "uid", "tid"]

    # Spacing between interpolated points, in metres.
    step = interpolation_distance
    if step <= 0:
        raise ValueError("interpolation_distance must be positive")

    geod = Geodesic.WGS84
    lats = gdf["y"].to_numpy()
    lons = gdf["x"].to_numpy()

    rows = []
    segments = zip(lats, lons, lats[1:], lons[1:])
    for seg_no, (lat_a, lon_a, lat_b, lon_b) in enumerate(segments):
        line = geod.InverseLine(lat_a, lon_a, lat_b, lon_b)
        # Number of steps needed to cover the segment at the given spacing.
        n_steps = int(math.ceil(line.s13 / step))

        # The start of every segment after the first coincides with the end
        # of the previous one, which has already been emitted.
        first_step = 0 if seg_no == 0 else 1
        for step_no in range(first_step, n_steps + 1):
            # Clamp the last step so it lands exactly on the segment end.
            s = min(step * step_no, line.s13)
            g = line.Position(s, Geodesic.STANDARD | Geodesic.LONG_UNROLL)
            rows.append([g["lat2"], g["lon2"], uid, 0])

    return pd.DataFrame(rows, columns=cols)
    

#faker function
def faker(profile, n_users):
    """Generate timestamped synthetic trajectories for one mobility profile.

    For each of ``n_users`` users, ``n_trajectory_user`` (module-level)
    source/destination pairs are sampled from ``medoids``, snapped to the
    nearest node of the profile's street graph, joined by the
    shortest path by edge ``length``, interpolated with ``interpolator`` and
    finally timestamped with ``put_datetime``.

    Pairs that yield no usable route are counted as errors and skipped, so a
    user may end up with fewer than ``n_trajectory_user`` trajectories. A
    pair is skipped when no path exists, or when both endpoints snap to the
    same node and the interpolated trajectory is empty.

    Side effect: the un-timestamped points are written to
    ``out_path + "tmp_fake_traj.csv"``.

    Parameters
    ----------
    profile : str
        ``"drive"``, ``"bike"`` or ``"walk"``. Selects the graph file, the
        first uid (1000 / 2000 / 3000) and the speed in m/s.
    n_users : int
        Number of synthetic users to generate.

    Returns
    -------
    pandas.DataFrame or None
        Output of ``put_datetime``; ``None`` when ``n_users`` is not
        positive, the profile is unknown, or no trajectory could be built.
    """
    print("Generating {:d} trajectories for {:d} users".format(n_trajectory_user,n_users))
    if n_users <= 0:
        print("No trjectories for: ",profile)
        return None

    # profile -> (graph file, first uid of the profile, speed in m/s)
    profiles = {
        "drive": ("drive_graph.graphml", 1000, 11.1),
        "bike": ("bike_graph.graphml", 2000, 4.3),
        "walk": ("walk_graph.graphml", 3000, 1.38),
    }
    if profile not in profiles:
        print("not a valid profile!")
        return None

    graph_file, fake_uid, speed = profiles[profile]
    G = ox.load_graphml(out_path + graph_file)
    print("Loaded {:s} graph from disk".format(profile))

    gdf_nodes, _ = ox.graph_to_gdfs(G)
    # Nearest-node lookup on the raw (lat, lon) coordinates.
    tree = KDTree(gdf_nodes[['y', 'x']], metric='euclidean')

    n_trajs = 0
    errs = 0
    trajs = []

    for _ in range(n_users):
        for _ in range(n_trajectory_user):
            # Two distinct endpoints among all the candidate locations.
            src, dst = random.sample(range(0, n_medoids), 2)
            med_a = (medoids.iloc[src].lat, medoids.iloc[src].lon)
            med_b = (medoids.iloc[dst].lat, medoids.iloc[dst].lon)

            # Snap both endpoints to their nearest graph node.
            med_a_idx = tree.query([med_a], k=1, return_distance=False)[0]
            med_b_idx = tree.query([med_b], k=1, return_distance=False)[0]
            closest_node_to_a = gdf_nodes.iloc[med_a_idx].index.values[0]
            closest_node_to_b = gdf_nodes.iloc[med_b_idx].index.values[0]

            try:
                path = nx.shortest_path(G,
                                        closest_node_to_a,
                                        closest_node_to_b,
                                        weight='length')
            except nx.NetworkXNoPath:
                # No route between the two nodes. Skip the pair instead of
                # falling through with a stale (or undefined) path.
                errs += 1
                continue

            traj = interpolator(gdf_nodes.loc[path], fake_uid)
            if traj.empty:
                # Both endpoints snapped to the same node: nothing to
                # travel, and an empty trajectory must not consume a tid.
                errs += 1
                continue

            # tids are numbered from 1 and only advance for kept trajectories,
            # so they stay contiguous, which put_datetime relies on.
            n_trajs += 1
            traj["tid"] = n_trajs
            trajs.append(traj)

            # To eyeball a route while debugging, plot it with
            # ox.plot_graph_route(G, path, ...).

        # on to another user!
        fake_uid += 1

    print("generated {:d} trajectories for {:d} users with a {:s} profile. {:d} errors generated"
          .format(n_trajs, n_users, profile, errs))

    if not trajs:
        # pd.concat raises on an empty list; nothing was generated.
        return None

    fake_trajs = pd.concat(trajs, ignore_index=True)
    fake_trajs.to_csv(out_path+"tmp_fake_traj.csv")

    print("now putting datetimes on generated trajectories")
    return put_datetime(fake_trajs, speed)



We test our faker function with a relatively small generation

In [None]:
%%time
# Smoke-test faker() with a single short walking trajectory.
# faker() and interpolator() read these two module-level settings, so they are
# overridden only for the duration of the call and then restored; otherwise
# the full generation below would silently run with the test values instead
# of the ones configured in conf.yaml.
_configured = (n_trajectory_user, interpolation_distance)
n_trajectory_user = 1
interpolation_distance = 10
try:
    test = faker("walk", 1)
finally:
    n_trajectory_user, interpolation_distance = _configured

### Generation of the trajectories

In [None]:
%%time

# Generate the synthetic users of every profile with the configured counts.
# faker() returns None for a profile whose count is not positive.
print("Now starting generation with {:d} cars, {:d} bikes and {:d} pedestrians"
     .format(num_cars, num_bikes, num_pedestrians))

cars = faker("drive", num_cars)
bikes = faker("bike", num_bikes)
# Use the configured count announced above rather than a hard-coded 1000.
pedestrians = faker("walk", num_pedestrians)

In [None]:
# Display the pedestrian trajectories returned by faker("walk", ...).
pedestrians

In [None]:
%%time
# Persist the generated pedestrian points under out_path; the car and bike
# exports are currently disabled.
#cars.to_csv(out_path+"augmented_cars.csv")
#bikes.to_csv(out_path+"augmented_bikes.csv")
pedestrians.to_csv(out_path+"geolife_augmented_pedestrians.csv")

In [None]:
# Inspect the rows of one synthetic pedestrian (walk uids start at 3000 in faker()).
pedestrians.loc[pedestrians["uid"] == 3001]

Let's see if the trajectory generation worked correctly

In [None]:
import skmob

# uid 1000 is the first "drive" user produced by faker(), so inspect the car
# trajectories. The previously used name `trajs` is a local variable of
# faker() and is not defined at notebook level in the visible cells.
# Note: `cars` is None when no car users were generated.
#print(cars)

tdf = skmob.TrajDataFrame(cars[(cars["uid"] == 1000)], longitude = "lon", datetime = "date_time")
print(tdf)


tdf.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Stamen Toner')

## 3. Build the GeoLife augmented data set

In [None]:
# load the original Geolife
# NOTE(review): col_names is not used by any visible cell.
col_names = ["lat", "lon", "uid", "tid", "date_time"] 

# Uncomment if needed
#pedestrians = pd.read_csv(out_path + "augmented_pedestrians.csv", parse_dates = True, infer_datetime_format = True,index_col = 0)
#bikes = pd.read_csv(data_path + "augmented_bikes.csv", parse_dates = True, infer_datetime_format = True,index_col = 0)
#cars = pd.read_csv(data_path + "augmented_cars.csv", parse_dates = True, infer_datetime_format = True,index_col = 0)

# Columns read from the original data set.
cols = ["date_time", "lat", "lon", "tid", "uid"]

# NOTE(review): parse_dates=True presumably only targets the index, so
# date_time is likely read as plain strings here - confirm against pandas docs.
df = pd.read_csv(geolife_data_path + "geo_life_full.csv", \
                 usecols = cols, parse_dates = True, infer_datetime_format = True)

# Shift the synthetic trajectory ids past the largest original tid so the two
# sets do not collide. This modifies `pedestrians` in place, so running the
# cell again shifts the ids a second time.
pedestrians["tid"] += df.tid.max()
# Stack synthetic and original points and write the augmented data set.
augmented = pd.concat([pedestrians, df], axis=0, ignore_index=True)
augmented.to_csv(out_path + "geolife_full_augmented.csv")


#restricting to beijing area
#df = df[(df['lat'].between(39.54, 40.3)) & (df['lon'].between(115.75, 117.13))]

#restricting to june - august 2008
#start_time = "2008-06-01 00:00:00"
#end_time = "2008-08-31 23:59:00"

#original = (df[(df.date_time > start_time) & (df.date_time < end_time)]).copy()

#print(original.head())
#print(original.tid.max())

#augmented["date_time"] = pd.to_datetime(augmented["date_time"], format = "%Y-%m-%d %H:%M:%S.%f")

In [None]:
# NOTE(review): `original` is only assigned in a commented-out line of the
# previous cell, so this raises NameError unless it is defined elsewhere.
# The previous cell also already shifted pedestrians["tid"] by df.tid.max();
# confirm whether this cell is still needed.
pedestrians["tid"] += original.tid.max()
#bikes["tid"] += (original.tid.max() + pedestrians.tid.max())
#cars["tid"] += (original.tid.max() + pedestrians.tid.max() + bikes.tid.max())

In [None]:
# NOTE(review): depends on `original`, which no visible cell assigns (its only
# assignment is commented out). This writes to the same
# geolife_full_augmented.csv path as the earlier build cell - confirm which
# of the two is meant to produce the final file.
#augmented = pd.concat([pedestrians, bikes, cars, original], axis=0, ignore_index=True)
augmented = pd.concat([pedestrians, original], axis=0, ignore_index=True)
augmented.to_csv(out_path + "geolife_full_augmented.csv")

## Preliminary analysis of the GeoLife augmented data set

In [None]:
import skmob
# Reload the augmented data set from disk.
# NOTE(review): the file was written by to_csv without index=False, so an
# extra unnamed index column is presumably read back here - confirm.
augmented = pd.read_csv(out_path + "geolife_full_augmented.csv")

In [None]:
# Number of distinct synthetic pedestrian users (walk uids start at 3000).
# Printed explicitly because a notebook cell only displays the value of its
# last expression, so a bare len(...) on the first line is never shown.
print(augmented.loc[augmented["uid"] >= 3000, "uid"].nunique())
augmented.head()

In [None]:
# Wrap the augmented table in a scikit-mobility TrajDataFrame, naming the
# columns that hold coordinates, timestamp, user id and trajectory id.
tdf = skmob.TrajDataFrame(augmented, latitude="lat", longitude="lon", datetime="date_time",user_id="uid",trajectory_id="tid")

In [None]:
# Select the pedestrian trajectories (uid >= 3000) and list the first 58
# distinct trajectory ids. The first walk user has uid 3000, so the bound is
# inclusive.
tdf_sele = tdf[tdf["uid"] >= 3000]
tdf_sele["tid"].unique()[:58]

In [None]:
# Plot a single pedestrian trajectory on a map.
# NOTE(review): 55948 is a hard-coded tid - confirm it exists in the current
# data (tids depend on the largest tid of the original data set).
s = tdf_sele[tdf_sele["tid"] == 55948]
m = s.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Stamen Toner')
m

In [None]:
# Flows
from skmob.tessellation import tilers
# Squared tessellation over the "Beijing, China" base shape, requested with meters=15000.
tessellation = tilers.tiler.get("squared", base_shape="Beijing, China", meters=15000)

In [None]:
# Convert the selected pedestrian trajectories into a flow data frame over the
# tessellation, keeping self loops.
fdf = tdf_sele.to_flowdataframe(tessellation=tessellation, self_loops=True)

In [None]:
# Draw the flows, then overlay the tessellation on the same map object.
# NOTE(review): the popup asks for a 'population' feature - confirm the
# tessellation actually carries that column.
m = fdf.plot_flows(flow_color='red',flow_weight=10)
fdf.plot_tessellation(popup_features=['tile_ID', 'population'],map_osm=m)
m