In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import geopy
from geopy.distance import geodesic
import datetime
import folium
from tqdm import tqdm

### Data representation
$\{p^u_{i} \mid i=1,2,3,\dots,n;\ u=1,2,3,\dots,U\}$

* $p^u_{i}=(x_{i},y_{i},t_{i1},t_{i2})$
* $x_{i},y_{i}$: geographical coordinates
* $t_{i1},t_{i2}$: start and end time

In [2]:
# Load the preprocessed stop points (despite its name, DATA_DIR is the path of a single CSV file).
DATA_DIR = './data/pre_stop_points.csv'
stop_df = pd.read_csv(DATA_DIR)
# Rename `activity_class` to `activity`; pop + assign also moves the column to the end.
stop_df['activity'] = stop_df.pop('activity_class')
stop_df.head()

Unnamed: 0,uid,id,lat,lng,start_time,end_time,activity
0,9497,250460,37.554742,127.026641,2019-11-07 10:45:58,2019-11-07 16:00:49,0.0
1,9497,240029,37.564022,127.03558,2019-11-06 23:27:51,2019-11-07 10:45:58,3.0
2,9497,238990,37.560267,127.033065,2019-11-06 20:22:16,2019-11-06 23:25:41,3.0
3,9497,234786,37.561929,127.038133,2019-11-06 17:15:56,2019-11-06 18:13:46,3.0
4,9497,226233,37.561171,127.037129,2019-11-06 07:49:15,2019-11-06 08:29:32,3.0


### Data quantization
$q_{i}$ (the quantized version of $p_{i}$) $=\{c_{i},\mathcal{S}_{i},a_{i}\}$

* $(x_{i},y_{i})\mapsto c_{i}$
* $(t_{i1},t_{i2})\mapsto \mathcal{S}_{i}$
* $a_{i}$: activity class

#### Spatial cell

* The location $(x_{i},y_{i})$ is mapped to a cell $c_{i}$

In [3]:
def df_to_gdf(df, x, y):
    """Convert a DataFrame with coordinate columns into a GeoDataFrame of points.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; the caller's object is left untouched.
    x, y : str
        Names of the columns supplying each point's x and y coordinate
        (the notebook passes ``x='lng'``, ``y='lat'``).

    Returns
    -------
    geopandas.GeoDataFrame
        ``df`` without the ``x`` and ``y`` columns, with a geometry of
        ``Point((x, y))`` per row and no CRS set.
    """
    # One shapely Point per row, built as (x, y) -- i.e. (longitude, latitude) here.
    point_geometries = [Point(pair) for pair in zip(df[x], df[y])]
    attributes = df.drop(columns=[x, y])
    return gpd.GeoDataFrame(attributes, crs=None, geometry=point_geometries)

In [4]:
# Build point geometries from the coordinate columns (x = longitude, y = latitude).
stop_gdf = df_to_gdf(stop_df, x='lng', y='lat')
stop_gdf.head()

Unnamed: 0,uid,id,start_time,end_time,activity,geometry
0,9497,250460,2019-11-07 10:45:58,2019-11-07 16:00:49,0.0,POINT (127.026641 37.55474155)
1,9497,240029,2019-11-06 23:27:51,2019-11-07 10:45:58,3.0,POINT (127.0355804 37.56402230000001)
2,9497,238990,2019-11-06 20:22:16,2019-11-06 23:25:41,3.0,POINT (127.0330645 37.5602669)
3,9497,234786,2019-11-06 17:15:56,2019-11-06 18:13:46,3.0,POINT (127.0381327 37.5619293)
4,9497,226233,2019-11-06 07:49:15,2019-11-06 08:29:32,3.0,POINT (127.03712865 37.56117105)


In [5]:
def get_radius(point, radius):
    """Express a distance in metres as a latitude offset at `point`.

    Parameters
    ----------
    point : shapely Point
        Location whose first coordinate pair is read as (longitude, latitude).
    radius : float
        Distance in metres.

    Returns
    -------
    float
        Latitude difference between `point` and the location reached by
        travelling `radius` metres along bearing 0 from it.
    """
    x, y = point.coords[0]
    # geopy.Point takes latitude first, the reverse of the (x, y) order above.
    start = geopy.Point(y, x)
    reached = geodesic(meters=radius).destination(start, 0)
    return reached.latitude - y

In [6]:
def create_circular_cell(points, radius):
    """Build one circular cell around every point.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        Points with an ``id`` column and a point geometry.
    radius : float
        Cell radius in metres; converted per point by ``get_radius``.

    Returns
    -------
    geopandas.GeoDataFrame
        Columns ``cell_id`` (the ``id`` of the centre point) and ``geometry``
        (the buffered polygon), one row per input point.

    NOTE(review): the buffer distance is a latitude offset applied in
    coordinate units on both axes, so on lon/lat data the east-west extent
    in metres will not equal `radius` -- confirm this is acceptable.
    """
    def circle_around(center):
        # buffer() works in coordinate units, hence the metre conversion.
        return center.buffer(get_radius(center, radius))

    records = [
        [points.id[label], circle_around(points.geometry[label])]
        for label in points.index
    ]
    return gpd.GeoDataFrame(records, columns=['cell_id', 'geometry'])

#### Parameter settings
* radius: [100, 200, 300, 400, 500] meters

In [7]:
# Cell radius in metres; build one circular cell around every stop point.
RADIUS = 500
cell_df = create_circular_cell(stop_gdf, RADIUS)
cell_df.head()

Unnamed: 0,cell_id,geometry
0,250460,"POLYGON ((127.0311459870436 37.55474155, 127.0..."
1,240029,"POLYGON ((127.040085379944 37.56402230000001, ..."
2,238990,"POLYGON ((127.0375694828169 37.5602669, 127.03..."
3,234786,"POLYGON ((127.0426376815452 37.5619293, 127.04..."
4,226233,"POLYGON ((127.0416336321252 37.56117105, 127.0..."


In [8]:
def interior_point(points, polygon):
    """Return the rows of `points` whose geometry intersects `polygon`.

    The spatial index first narrows the search to rows matching the polygon's
    bounding box; only those candidates get the exact intersection test.
    """
    bbox_hits = list(points.sindex.intersection(polygon.bounds))
    candidates = points.iloc[bbox_hits]
    touches_polygon = candidates.intersects(polygon)
    return candidates[touches_polygon]

In [9]:
def dynamic_cell_mapping(points, cells, pois=True):
    """Assign every point to each cell that contains it.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        Candidate points. An ``id`` column is required when ``pois`` is false.
    cells : geopandas.GeoDataFrame
        Cells with ``cell_id`` and ``geometry`` columns, one polygon per row.
    pois : bool, default True
        When false, the point whose ``id`` equals a cell's ``cell_id`` is
        left out of that cell (used when the cells were built around the
        same points that are being mapped).

    Returns
    -------
    geopandas.GeoDataFrame
        One row per (cell, contained point) pair: the point's columns plus
        ``cell_id`` and ``cell_geometry``, with a fresh 0..n-1 index.
        Empty when no cell is given.
    """
    matches = []

    for i in tqdm(cells.index):
        cell_id = cells.cell_id[i]
        cell_geometry = cells.geometry[i]
        members = interior_point(points, cell_geometry)

        if not pois:
            # Compare against the `cells` argument; the previous code read a
            # notebook-level `cell_df`, which broke for any other cell table.
            members = members[members.id != cell_id]

        # Copy before adding columns so nothing is written into a slice of `points`.
        members = members.copy()
        members['cell_id'] = cell_id
        members['cell_geometry'] = cell_geometry
        matches.append(members)

    if not matches:
        return gpd.GeoDataFrame()
    # A single concat replaces DataFrame.append inside the loop, which copied
    # the accumulated frame on every iteration and no longer exists in pandas 2.0.
    return pd.concat(matches, ignore_index=True)

In [10]:
# Map stop points to the cells containing them; pois=False leaves each cell's own seed point out.
circular_cell_df = dynamic_cell_mapping(stop_gdf, cell_df, pois=False)
circular_cell_df.head()

100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:31<00:00, 27.16it/s]


Unnamed: 0,uid,id,start_time,end_time,activity,geometry,cell_id,cell_geometry
0,9497,189180,2019-11-03 21:03:04,2019-11-04 08:28:38,3.0,POINT (127.02713945 37.55424545),250460,"POLYGON ((127.0311459870436 37.55474155, 127.0..."
1,9497,63156,2019-10-27 03:20:51,2019-10-27 11:37:09,0.0,POINT (127.02710955 37.554328375),250460,"POLYGON ((127.0311459870436 37.55474155, 127.0..."
2,9497,179285,2019-11-03 03:36:00,2019-11-03 13:25:06,3.0,POINT (127.02729345 37.554355725),250460,"POLYGON ((127.0311459870436 37.55474155, 127.0..."
3,9497,219718,2019-11-05 22:24:08,2019-11-05 23:29:57,3.0,POINT (127.0270443 37.5544848),250460,"POLYGON ((127.0311459870436 37.55474155, 127.0..."
4,9497,84956,2019-10-28 13:32:31,2019-10-28 17:09:05,0.0,POINT (127.0271086 37.5545072),250460,"POLYGON ((127.0311459870436 37.55474155, 127.0..."


### Feature extraction
- *Activity Frequency*<br/>
$\displaystyle Pr(a_{i}=l \mid b_{i}):=\frac{\sum^{N}_{j=1}\delta_{a_{j},l}\cdot\delta_{b_{j},b_{i}}}
{\sum^{L}_{l'=1}\sum^{N}_{j=1}\delta_{a_{j},l'}\cdot\delta_{b_{j},b_{i}}}$

In [13]:
def spatial_frequency(cells, activity_classes=4):
    """Compute the per-cell relative frequency of each activity class.

    Parameters
    ----------
    cells : pandas.DataFrame
        One row per (cell, point) pair with ``cell_id`` and ``activity`` columns.
    activity_classes : int, default 4
        Number of classes; labels are looked up as ``0 .. activity_classes - 1``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``cell_id`` (in order of first appearance) with
        columns ``id`` and ``spatial_1 .. spatial_<activity_classes>``.
        ``spatial_k`` is the share of the cell's rows whose activity is
        ``k - 1``. The denominator is every row of the cell, so rows with a
        missing or out-of-range activity lower the shares rather than being
        ignored.
    """
    labels = range(activity_classes)
    # Derive the headers from the class count: a fixed four-name list made any
    # other `activity_classes` fail when the DataFrame was built.
    column_names = ['id'] + [f'spatial_{label + 1}' for label in labels]

    rows = []
    # Group once instead of re-filtering the whole table for every cell id;
    # sort=False keeps the cells in order of first appearance.
    for cell_id, activities in cells['activity'].groupby(cells['cell_id'], sort=False):
        total = len(activities)
        # value_counts skips missing activities; a plain dict makes the
        # int-label lookup work for float-typed labels (0 matches 0.0).
        counts = activities.value_counts().to_dict()
        rows.append([cell_id] + [counts.get(label, 0) / total for label in labels])

    return pd.DataFrame(rows, columns=column_names)

In [14]:
# Number of distinct activity labels (a missing label, if present, counts as one)
# and the uniform probability used to fill cells that have no observations.
activity_classes = stop_df['activity'].nunique(dropna=False)
uniform_prob = 1 / activity_classes

In [15]:
# Attach the per-cell frequencies to every stop id; stops whose cell holds no
# other stop point have no row to join and receive the uniform prior.
spatial_frequency_df = (
    stop_df[['id']]
    .merge(spatial_frequency(circular_cell_df), how='left', on='id')
    .fillna(uniform_prob)
)
spatial_frequency_df.head()

Unnamed: 0,id,spatial_1,spatial_2,spatial_3,spatial_4
0,250460,0.4375,0.0,0.25,0.3125
1,240029,0.119403,0.119403,0.208955,0.552239
2,238990,0.0,0.32,0.2,0.48
3,234786,0.092308,0.107692,0.246154,0.553846
4,226233,0.085714,0.1,0.242857,0.571429


In [16]:
# Load the POIs and expose their class under the `activity` name that spatial_frequency reads.
poi_df = pd.read_csv('./data/POIs_mapping.csv')
poi_gdf = df_to_gdf(poi_df, x='lng', y='lat')
poi_gdf['activity'] = poi_gdf['activity_class']

In [17]:
# Assign POIs to the stop-point cells; with the default pois=True no point is dropped from its cell.
circular_cell_poi_df = dynamic_cell_mapping(poi_gdf, cell_df)

100%|████████████████████████████████████████████████████████████████████████████████| 907/907 [00:51<00:00, 13.96it/s]


In [18]:
# Reuse the frequency computation on the POI mapping, then relabel the
# feature columns from spatial_* to contextual_*.
contextual_frequency_df = spatial_frequency(circular_cell_poi_df).rename(
    columns={f'spatial_{k}': f'contextual_{k}' for k in range(1, 5)})

In [19]:
# Left-join the POI-based features onto the stop-based ones (the shared column
# is the join key); stops with no matching POI row get the uniform prior.
activity_frequency_df = (
    spatial_frequency_df
    .merge(contextual_frequency_df, how='left')
    .fillna(uniform_prob)
)
activity_frequency_df.head()

Unnamed: 0,id,spatial_1,spatial_2,spatial_3,spatial_4,contextual_1,contextual_2,contextual_3,contextual_4
0,250460,0.4375,0.0,0.25,0.3125,0.095238,0.0,0.230159,0.674603
1,240029,0.119403,0.119403,0.208955,0.552239,0.04888,0.0,0.158859,0.792261
2,238990,0.0,0.32,0.2,0.48,0.027957,0.0,0.219355,0.752688
3,234786,0.092308,0.107692,0.246154,0.553846,0.044681,0.0,0.151064,0.804255
4,226233,0.085714,0.1,0.242857,0.571429,0.040153,0.0,0.164436,0.795411


In [20]:
# The feature table must have as many rows as stop_df; a left merge adds rows if the right side repeats an id.
assert len(stop_df) == len(activity_frequency_df)

### Save

In [None]:
# activity_frequency_df.to_csv(f'./data/parameter_settings/pre_circular_{RADIUS}_act_freq.csv', index=False)