# Stop Place Detection based on GeoLife
1. Detect and cluster the stop places
2. Compute each cluster's medoid
3. Extend the medoids with extra aggregation points; together these represent the origins and destinations of the synthetic trajectories

In [None]:
import os
import random as rand

import folium
import numpy
import pandas as pd
import sklearn
import sklearn.metrics
import skmob
import yaml
from skmob.preprocessing import clustering
from skmob.preprocessing import detection
from skmob.preprocessing import filtering



# Read every tunable of this notebook from conf.yaml in the working directory.
with open("conf.yaml") as conf_file:
    conf = yaml.load(conf_file, Loader=yaml.FullLoader)

# Input / output locations (`data_path` is not referenced by the cells below).
geolife_data_path = conf["geolife_data_path"]
data_path = conf["data_path"]
out_path = conf["out_path"]

# Parameters forwarded to skmob's stop detection.
stop_radius_factor = conf["stop_radius_factor"]
minutes_for_a_stop = conf["minutes_for_a_stop"]
spatial_radius_km = conf["spatial_radius_km"]

# Time window of interest and number of extra random points added to the medoids.
start_time, end_time = conf["start_time"], conf["end_time"]
extended_medoids = conf["extended_medoids"]

# Bounding box used both to filter the GPS points and to draw the random points.
beijing_lat_min, beijing_lat_max = conf["beijing_lat_min"], conf["beijing_lat_max"]
beijing_lon_min, beijing_lon_max = conf["beijing_lon_min"], conf["beijing_lon_max"]

# Clustering radius and maximum speed (km/h) used by the noise filter.
# NOTE(review): `min_cluster_stop_sample` is loaded here but no cell below reads it.
min_cluster_stop_sample = conf["min_cluster_stop_sample"]
cluster_radius_km = conf["cluster_radius_km"]
speed_filter = conf["speed_filter"]


In [None]:
# Load the GeoLife points, reading only the columns used downstream.
cols = ["date_time", "lat", "lon", "uid", "tid"]
geolife_csv = os.path.join(geolife_data_path, "geo_life_full.csv")
df = pd.read_csv(geolife_csv, usecols=cols, parse_dates=["date_time"])

# Keep the points that fall inside the configured Beijing bounding box.
inside_lat = df["lat"].between(beijing_lat_min, beijing_lat_max)
inside_lon = df["lon"].between(beijing_lon_min, beijing_lon_max)
df = df[inside_lat & inside_lon]

# Keep the points strictly between start_time and end_time.
after_start = df["date_time"] > start_time
before_end = df["date_time"] < end_time
df = df[after_start & before_end]

# Wrap the points in a TrajDataFrame, telling skmob which column plays which role.
tdf = skmob.TrajDataFrame(
    df,
    latitude="lat",
    longitude="lon",
    datetime="date_time",
    user_id="uid",
    trajectory_id="tid",
)

# Drop noisy points, using speed_filter as the maximum plausible speed in km/h.
ftdf = filtering.filter(tdf, max_speed_kmh=speed_filter)
ftdf.head()

## 1. Stop Places and clustering
Convert to a TrajDataFrame and set the uid column to "0" in order to calculate and cluster the stops of all the trajectories together, independently of the user who recorded each of them

In [None]:
# Detect stops on the speed-filtered trajectories.
# The previous version passed the unfiltered `tdf`, so the noise filter that
# produces `ftdf` had no effect on the detected stops.
# leaving_time=True adds the time at which each stop was left.
stdf = detection.stops(
    ftdf,
    stop_radius_factor=stop_radius_factor,
    minutes_for_a_stop=minutes_for_a_stop,
    spatial_radius_km=spatial_radius_km,
    leaving_time=True,
)
stdf.head()

Now we show the stopping points on a folium map

In [None]:
# Plot the detected stops on a map centred on Beijing and save it as HTML.
# NOTE(review): recent folium releases may no longer accept the built-in
# "Stamen Toner" tile name — confirm against the installed version.
stopping_points = folium.Map(location=[39.9042, 116.4074], tiles="Stamen Toner")

stdf.plot_stops(stopping_points)

# os.path.join keeps the file inside out_path whether or not the configured
# value ends with a path separator (plain string concatenation did not).
stopping_points.save(os.path.join(out_path, "geolife_stop_places.html"))

# Uncomment to display the map inline:
# stopping_points

Clustering

In [None]:
# Cluster the stopping points into locations.
#
# skmob clusters the stops of each `uid` separately and numbers the clusters
# per user, while the cells below treat `cluster` as one global id. Pooling all
# stops under a single uid makes the labels global, as the section text intends.
# NOTE(review): `min_cluster_stop_sample` is loaded from the config but is not
# passed to the clustering call — confirm whether it should be.
pooled_stdf = stdf.copy()
pooled_stdf["uid"] = 0
cstdf = clustering.cluster(pooled_stdf, cluster_radius_km=cluster_radius_km)

# Plot the clustered stops on a map centred on Beijing and save it as HTML.
clustered_stops = folium.Map(location=[39.9042, 116.4074], tiles="Stamen Toner")
cstdf.plot_stops(clustered_stops, max_users=200)

# os.path.join keeps the file inside out_path whether or not the configured
# value ends with a path separator.
clustered_stops.save(os.path.join(out_path, "geolife_clustered_stop_places.html"))

In [None]:
# Worked example on a single cluster (label 2): find its medoid.
in_cluster = cstdf["cluster"] == 2
cluster_i = cstdf[in_cluster].copy().reset_index()

# Distance from every stop of the cluster to every other stop of the cluster.
coordinates = cluster_i[["lat", "lng"]]
pairwise = sklearn.metrics.pairwise_distances(coordinates, metric="euclidean")

# The medoid is the stop whose summed distance to all the others is smallest.
total_distance = pairwise.sum(axis=0)
medoid = numpy.argmin(total_distance)
cluster_i.loc[[medoid]]

## 2. Compute the medoids
Now we calculate the medoid for each cluster and we put each of them in a dataframe, then we serialize it on disk for further usage.
The medoid of a cluster is the point with the minimum total distance to all the other points of the same cluster:
- compute the pairwise distances among all the cluster's points
- sum the distances for each point
- pick the point with the lowest aggregated distance

[Medoid on Stack Overflow](https://stackoverflow.com/questions/38017194/calculating-medoid-of-a-cluster-python)


In [None]:
# Compute one medoid per cluster, save them to CSV and show them on a map.
#
# The one-row frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every iteration.
medoid_rows = []
for cluster_id in cstdf.cluster.unique():
    # reset_index() keeps the original row label in an "index" column and gives
    # the cluster a 0..n-1 index, so argmin's position can be used with iloc.
    cluster_i = cstdf[cstdf["cluster"] == cluster_id].reset_index()
    # NOTE(review): distances are Euclidean on raw lat/lng degrees, not
    # kilometres — confirm this approximation is acceptable for the medoid.
    pairwise = sklearn.metrics.pairwise_distances(cluster_i[["lat", "lng"]], metric="euclidean")
    # Position of the stop with the smallest summed distance to all the others.
    medoid_pos = numpy.argmin(pairwise.sum(axis=0))
    medoid_rows.append(cluster_i.iloc[[medoid_pos]])

# Restore the original row labels as the index.
medoids = pd.concat(medoid_rows).set_index("index")
# os.path.join keeps the file inside out_path whether or not the configured
# value ends with a path separator.
medoids.to_csv(os.path.join(out_path, "geolife_medoids.csv"))

map_locations = folium.Map(location=[39.9042, 116.4074], tiles="Stamen Toner")

# One marker per medoid, labelled with the id of the cluster it represents.
for _, medoid_row in medoids.iterrows():
    folium.Marker(
        (medoid_row["lat"], medoid_row["lng"]),
        popup="cluster:" + str(medoid_row["cluster"]),
    ).add_to(map_locations)

# Overlay the clustered stops so the medoids can be checked against them.
cstdf.plot_stops(map_locations, max_users=200)

map_locations

## 3. Extend the locations with extra locations 
In order to get more trajectories in the broader Beijing area, we try to pick random points to "augment" our medoids

In [None]:
# Draw `extended_medoids` random (lat, lon) points, uniformly distributed
# inside the Beijing bounding box.
#
# A dedicated generator makes the draw reproducible when an optional
# `random_seed` key is present in conf.yaml; without that key the generator is
# seeded from system entropy, so the points differ on every run as before.
rng = rand.Random(conf.get("random_seed"))
rd_pts = [
    (
        rng.uniform(beijing_lat_min, beijing_lat_max),
        rng.uniform(beijing_lon_min, beijing_lon_max),
    )
    for _ in range(extended_medoids)
]

We visualize medoids AND random points on a folium map

In [None]:
# Add one marker per random point to the map that already shows the medoids.
for lat_lon in rd_pts:
    marker = folium.Marker(lat_lon)
    marker.add_to(map_locations)

map_locations

In [None]:
# Random points as a two-column frame, with a quick look at the first rows.
points_df = pd.DataFrame(data=rd_pts, columns=["lat", "lon"])
print(points_df.head())

# Strip the stop metadata from the medoids and align the longitude column name
# with the random points.
columns_to_discard = ["index", "datetime", "leaving_datetime", "uid", "cluster"]
meds_reset = (
    medoids.reset_index()
    .drop(columns=columns_to_discard)
    .rename(columns={"lng": "lon"})
)

# Random points first, then medoids, renumbered from 0, and saved to CSV.
augmented_medoids = pd.concat([points_df, meds_reset], ignore_index=True)
augmented_csv = os.path.join(out_path, "geolife_augmented_medoids.csv")
augmented_medoids.to_csv(augmented_csv)
