In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import geopy
from geopy import distance
from geopy.distance import geodesic
import datetime
import folium
from tqdm import tqdm

### Data representation
$\begin{align}\{p^u_{i}|i=1,2,3,...,n ,\end{align}$
and
$\begin{align}u=1,2,3,...,U\}\end{align}$
<ul>
    <li>$\begin{align}p^u_{i}=(x_{i},y_{i},t_{i1},t_{i2})\end{align}$</li>
    <li>$\begin{align}x_{i},y_{i}:\end{align}$geographical coordinates</li>
    <li>$\begin{align}t_{i1},t_{i2}:\end{align}$start and end time</li>
</ul>

In [2]:
# Load the pre-processed stop points and expose the label column under the
# shorter name `activity`, which the rest of the notebook uses.
DATA_DIR = './data/pre_stop_points.csv'  # NOTE: despite the name this is a file path, not a directory
stop_df = pd.read_csv(DATA_DIR)
stop_df['activity'] = stop_df['activity_class']  # copy of the label column, appended as the last column
stop_df = stop_df.drop(['activity_class'], axis=1)  # drop the original so only `activity` remains
stop_df.head()

Unnamed: 0,uid,id,lat,lng,start_time,end_time,activity
0,9497,250460,37.554742,127.026641,2019-11-07 10:45:58,2019-11-07 16:00:49,0.0
1,9497,240029,37.564022,127.03558,2019-11-06 23:27:51,2019-11-07 10:45:58,3.0
2,9497,238990,37.560267,127.033065,2019-11-06 20:22:16,2019-11-06 23:25:41,3.0
3,9497,234786,37.561929,127.038133,2019-11-06 17:15:56,2019-11-06 18:13:46,3.0
4,9497,226233,37.561171,127.037129,2019-11-06 07:49:15,2019-11-06 08:29:32,3.0


### Data quantization
$\begin{align}q_{i}(\end{align}$quantized version of$\begin{align}p_{i})=\{c_{i},S_{i},a_{i}\}\end{align}$
<ul>
    <li>$\begin{align}(x_{i},y_{i})→c_{i}\end{align}$</li>
    <li>$\begin{align}(t_{i1},t_{i2})→S_{i}\end{align}$</li>
    <li>$\begin{align}a_{i}:\end{align}$activity classes</li>
</ul>

#### Spatial cell
<ul>
    <li>The location $\begin{align}(x_{i},y_{i})→\end{align}$a cell $\begin{align}c_{i}\end{align}$
    </li>
</ul>

In [3]:
def df_to_gdf(df, x, y):
    """Turn a table with two coordinate columns into a GeoDataFrame of points.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    x, y : str
        Names of the columns holding the x and y coordinate of each point
        (the callers in this notebook pass longitude as ``x`` and latitude
        as ``y``).

    Returns
    -------
    geopandas.GeoDataFrame
        ``df`` without the ``x`` / ``y`` columns, plus a ``geometry`` column
        of shapely ``Point`` objects built from ``(x, y)``. No CRS is set.
    """
    points = list(map(Point, zip(df[x], df[y])))
    attributes = df.drop(columns=[x, y])
    return gpd.GeoDataFrame(attributes, crs=None, geometry=points)

In [4]:
# Build point geometries from the stop coordinates (x = `lng`, y = `lat`).
stop_gdf = df_to_gdf(stop_df, x='lng', y='lat')
stop_gdf.head()

Unnamed: 0,uid,id,start_time,end_time,activity,geometry
0,9497,250460,2019-11-07 10:45:58,2019-11-07 16:00:49,0.0,POINT (127.026641 37.55474155)
1,9497,240029,2019-11-06 23:27:51,2019-11-07 10:45:58,3.0,POINT (127.0355804 37.56402230000001)
2,9497,238990,2019-11-06 20:22:16,2019-11-06 23:25:41,3.0,POINT (127.0330645 37.5602669)
3,9497,234786,2019-11-06 17:15:56,2019-11-06 18:13:46,3.0,POINT (127.0381327 37.5619293)
4,9497,226233,2019-11-06 07:49:15,2019-11-06 08:29:32,3.0,POINT (127.03712865 37.56117105)


In [5]:
def get_rectangle_size(polygon, size):
    """Return the polygon's width and height in meters, rounded up to multiples of ``size``.

    The width is the geodesic length of the edge between the first and second
    exterior vertices; the height is the length of the edge between the second
    and third. Vertices are read as ``(x, y)`` and swapped to ``(y, x)`` before
    being handed to geopy (i.e. ``y`` is used as latitude, ``x`` as longitude).

    Parameters
    ----------
    polygon : shapely.geometry.Polygon
        Polygon whose first three exterior vertices span its width and height
        (presumably an axis-aligned rectangle such as an envelope -- verify
        against the caller).
    size : float
        Cell edge length in meters.

    Returns
    -------
    tuple of float
        ``(width, height)`` in meters, each padded up to the next multiple of
        ``size``. A side that already is an exact multiple is returned
        unchanged (previously it was padded by one full extra ``size``).
    """
    first, second, third = list(polygon.exterior.coords)[:3]

    # geopy expects (latitude, longitude), shapely stores (x, y).
    width = distance.distance((first[1], first[0]), (second[1], second[0])).meters
    height = distance.distance((second[1], second[0]), (third[1], third[0])).meters

    # Round up to a whole number of cells; skip the padding when the side is
    # already an exact multiple so no superfluous row/column is created.
    width_remainder = width % size
    if width_remainder:
        width += size - width_remainder

    height_remainder = height % size
    if height_remainder:
        height += size - height_remainder

    return width, height

In [6]:
def create_rectangle(polygon, size):
    """Build a rectangle anchored at the polygon's first vertex with padded sides.

    The first exterior vertex of ``polygon`` is used as the ``(xmin, ymin)``
    corner. The opposite corner is found by travelling the padded width
    (from ``get_rectangle_size``) at bearing 90 and the padded height at
    bearing 0 from that corner, along the geodesic.

    Parameters
    ----------
    polygon : shapely.geometry.Polygon
        Source rectangle in ``(x=longitude, y=latitude)`` coordinates.
    size : float
        Cell edge length in meters.

    Returns
    -------
    shapely.geometry.Polygon
        Rectangle with vertices ordered (xmin, ymin), (xmax, ymin),
        (xmax, ymax), (xmin, ymax).
    """
    width, height = get_rectangle_size(polygon, size)

    anchor = polygon.exterior.coords[0]
    west, south = anchor[0], anchor[1]
    south_west = geopy.Point(south, west)

    east = geodesic(meters=width).destination(south_west, 90).longitude
    north = geodesic(meters=height).destination(south_west, 0).latitude

    return Polygon([(west, south), (east, south), (east, north), (west, north)])

In [7]:
def get_cell_row_column(polygon, size):
    """Return how many rows and columns of ``size``-meter cells cover ``polygon``.

    Parameters
    ----------
    polygon : shapely.geometry.Polygon
        Rectangle to be tiled.
    size : float
        Cell edge length in meters.

    Returns
    -------
    tuple of int
        ``(rows, cols)``: padded height / ``size`` and padded width / ``size``.
    """
    # Measure once; each call performs two geodesic distance computations.
    width, height = get_rectangle_size(polygon, size)

    # The padded sides are multiples of `size` only up to floating-point
    # error, so the quotient can land just below an integer. Round instead of
    # truncating to avoid silently losing a row/column (or returning 0).
    rows = int(round(height / size))
    cols = int(round(width / size))
    return rows, cols

In [8]:
def get_cell_width_height(polygon, size):
    """Return the width and height of one grid cell in coordinate units.

    The padded rectangle from ``create_rectangle`` is divided evenly by the
    row / column counts that ``get_cell_row_column`` reports for that
    rectangle.

    Parameters
    ----------
    polygon : shapely.geometry.Polygon
        Area to be tiled.
    size : float
        Cell edge length in meters.

    Returns
    -------
    tuple of float
        ``(width, height)`` of a single cell, in the units of the polygon's
        coordinates.
    """
    padded = create_rectangle(polygon, size)
    rows, cols = get_cell_row_column(padded, size)
    west, south, east, north = padded.bounds
    return (east - west) / cols, (north - south) / rows

In [9]:
def get_cell_origin(polygon, size):
    """Return the x / y extent of the grid's top-left cell.

    Parameters
    ----------
    polygon : shapely.geometry.Polygon
        Area to be tiled.
    size : float
        Cell edge length in meters.

    Returns
    -------
    tuple of float
        ``(x_left, x_right, y_top, y_bottom)`` of the first cell: it starts at
        the padded rectangle's minimum x and maximum y and spans one cell
        width to the right and one cell height downwards.
    """
    padded = create_rectangle(polygon, size)
    cell_width, cell_height = get_cell_width_height(polygon, size)
    west, _, _, north = padded.bounds
    return west, west + cell_width, north, north - cell_height

In [10]:
def create_cell(polygon, size):
    """Tile ``polygon`` with a regular grid of rectangular cells.

    Cells are generated column by column starting at the minimum x; within a
    column they run from the top (maximum y) downwards. ``cell_id`` is the
    position of the cell in that generation order.

    Parameters
    ----------
    polygon : shapely.geometry.Polygon
        Rectangle to be tiled.
    size : float
        Cell edge length in meters.

    Returns
    -------
    geopandas.GeoDataFrame
        Columns ``cell_id`` and ``geometry`` (one polygon per cell).
    """
    n_rows, n_cols = get_cell_row_column(polygon, size)
    cell_width, cell_height = get_cell_width_height(polygon, size)
    x_left, x_right, y_top_start, y_bottom_start = get_cell_origin(polygon, size)

    grid = []
    for _col in range(n_cols):
        # Every column restarts at the top edge of the grid.
        y_top, y_bottom = y_top_start, y_bottom_start

        for _row in range(n_rows):
            corners = [
                (x_left, y_top),
                (x_right, y_top),
                (x_right, y_bottom),
                (x_left, y_bottom),
            ]
            grid.append(Polygon(corners))
            y_top -= cell_height
            y_bottom -= cell_height

        x_left += cell_width
        x_right += cell_width

    cell_df = gpd.GeoDataFrame({'geometry': grid})
    return cell_df.reset_index().rename(columns={'index': 'cell_id'})

#### Parameter settings
* `RECT_SIZE` (edge length of a grid cell): one of [200, 400, 600, 800, 1000] meters

In [11]:
# Cell edge length in meters (passed as `size` to create_cell).
RECT_SIZE = 200
# NOTE: the country outlines are fetched over the network on every run.
countries = gpd.read_file('https://cosmosnotebooksdata.blob.core.windows.net/notebookdata/countries.json') # Load country/region information for mapping
# First geometry whose `sovereignt` attribute is 'South Korea'.
base_polygon = countries[countries.sovereignt == 'South Korea'].geometry.iloc[0]
# Axis-aligned bounding rectangle of the country outline; the grid is laid over it.
hull = base_polygon.convex_hull.envelope

cell_df = create_cell(hull, size=RECT_SIZE)
cell_df.head()

Unnamed: 0,cell_id,geometry
0,0,"POLYGON ((126.1173979025323 38.61286683631251,..."
1,1,"POLYGON ((126.1173979025323 38.61106529153794,..."
2,2,"POLYGON ((126.1173979025323 38.60926374676338,..."
3,3,"POLYGON ((126.1173979025323 38.60746220198881,..."
4,4,"POLYGON ((126.1173979025323 38.60566065721424,..."


In [12]:
def interior_point(points, polygon):
    """Return the rows of ``points`` whose geometry intersects ``polygon``.

    A two-stage filter: the spatial index first narrows the candidates to
    geometries whose bounding box overlaps the polygon's bounding box, then
    an exact ``intersects`` test is applied to those candidates only.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        Geometries to search.
    polygon : shapely.geometry.Polygon
        Query area.

    Returns
    -------
    geopandas.GeoDataFrame
        Subset of ``points`` (original index preserved) intersecting
        ``polygon``; geometries on the polygon boundary are included.
    """
    candidate_positions = list(points.sindex.intersection(polygon.bounds))
    candidates = points.iloc[candidate_positions]
    return candidates[candidates.intersects(polygon)]

In [13]:
def fixed_cell_mapping(points, cells):
    """Assign every point to the grid cell(s) it falls into.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        Point geometries to be mapped.
    cells : DataFrame-like
        Must provide ``cell_id`` and ``geometry`` columns (one polygon per row).

    Returns
    -------
    DataFrame
        The matching rows of ``points`` with two extra columns, ``cell_id``
        and ``cell_geometry``, and a fresh 0..n-1 index. Rows appear in the
        order the cells are iterated. Points that match no cell are absent;
        a point matching several cells (as decided by ``interior_point``)
        appears once per cell. An empty ``pandas.DataFrame`` is returned when
        nothing matches.
    """
    matched_parts = []

    for i in tqdm(cells.index):
        cell_geometry = cells.geometry[i]
        # Query once per cell (the lookup is the expensive part of the loop).
        matches = interior_point(points, cell_geometry)
        if len(matches) == 0:
            continue

        # `matches` is a slice of `points`; copy before adding columns so the
        # assignment neither warns nor risks touching the caller's frame.
        matches = matches.copy()
        matches['cell_id'] = cells.cell_id[i]
        matches['cell_geometry'] = cell_geometry
        matched_parts.append(matches)

    if not matched_parts:
        return pd.DataFrame()

    # Concatenate once: growing a frame inside the loop copies it on every
    # iteration, and DataFrame.append no longer exists in pandas >= 2.0.
    return pd.concat(matched_parts, ignore_index=True)

In [14]:
# Map every stop point to its grid cell. The loop visits every cell of the
# grid, so this is by far the slowest cell of the notebook (see the progress
# bar in the output below).
rectangle_cell_df = fixed_cell_mapping(stop_gdf, cell_df)
rectangle_cell_df.head()

100%|█████████████████████████████████████████████████████████████████████| 3610563/3610563 [1:22:19<00:00, 730.90it/s]


Unnamed: 0,uid,id,start_time,end_time,activity,geometry,cell_id,cell_geometry
0,4633,1567228,2019-12-14 12:02:36,2019-12-14 12:37:39,1.0,POINT (126.6534031 37.4123842),577044,"POLYGON ((126.6522164061735 37.41303801645195,..."
1,1907,886789,2019-11-27 11:30:16,2019-11-27 14:21:26,1.0,POINT (126.720842725 37.62142425),649561,"POLYGON ((126.7196122338681 37.62201721030154,..."
2,287,274850,2019-11-08 20:01:56,2019-11-08 22:38:33,3.0,POINT (126.72203385 37.7268774),651845,"POLYGON ((126.721786292826 37.72830835200089, ..."
3,864,1238929,2019-12-03 17:23:49,2019-12-03 18:00:23,3.0,POINT (126.8036999 37.5639939),738627,"POLYGON ((126.802226474268 37.56436777751544, ..."
4,8466,950934,2019-11-28 09:57:04,2019-11-28 11:58:36,2.0,POINT (126.811826125 37.57215055),747994,"POLYGON ((126.8109227100996 37.57337550138827,..."


In [15]:
def choropleth_map(cell_df):
    """Draw a choropleth of the number of points per grid cell.

    Parameters
    ----------
    cell_df : DataFrame
        One row per point, with ``cell_id`` and ``cell_geometry`` columns
        (the output of ``fixed_cell_mapping``).

    Returns
    -------
    folium.Map
        Map centred on ``[37.5666103, 126.9783882]`` with one polygon per
        distinct cell, coloured by its point count.

    Notes
    -----
    Unlike the earlier version, this function does not alter the global
    ``warnings`` filter; it uses ``folium.Choropleth`` rather than the
    deprecated ``Map.choropleth`` method.
    """
    import json

    # One GeoJSON feature per distinct cell. Each feature is tagged with the
    # cell_id of the very row it was built from, so the pairing does not
    # depend on how any other table happens to be sorted.
    sub_cell_df = cell_df.drop_duplicates(subset=['cell_id'])
    geo_json = json.loads(gpd.GeoSeries(sub_cell_df.cell_geometry).to_json())
    for feature, cell_id in zip(geo_json['features'], sub_cell_df['cell_id']):
        feature['properties'].update({'cell_id': int(cell_id)})

    # Number of points per cell.
    count_df = cell_df.groupby('cell_id').size().reset_index(name='point')

    osm_map = folium.Map(location=[37.5666103, 126.9783882], zoom_start=13)
    folium.Choropleth(
        geo_data=geo_json,
        data=count_df,
        columns=['cell_id', 'point'],
        fill_color='YlGn',
        key_on='properties.cell_id',
        highlight=True,  # was misspelled `highligth`
        fill_opacity=0.75,
        line_opacity=0.5,
        legend_name='# of stop points'
    ).add_to(osm_map)
    return osm_map

In [16]:
# Render the number of stop points per occupied cell.
choropleth_map(rectangle_cell_df)

### Feature extraction
<ul>
    <li><i>Activity Frequency</i></li>
</ul>
$\begin{align}Pr(a_{i}=l|b_{i}):=\frac{\sum^{N}_{j=1}\delta_{a_{j},l}\cdot\delta_{b_{j},b_{i}}}
{\sum^{L}_{l'=1}\sum^{N}_{j=1}\delta_{a_{j},l'}\cdot\delta_{b_{j},b_{i}}}\end{align}$

In [17]:
def spatial_frequency(cells, activity_classes=4):
    """Compute, per cell, the share of rows belonging to each activity class.

    Parameters
    ----------
    cells : pandas.DataFrame
        One row per point with at least ``cell_id`` and ``activity`` columns.
        Activity classes are expected to be encoded as ``0 .. activity_classes - 1``.
    activity_classes : int, default 4
        Number of activity classes; also determines how many ``spatial_k``
        columns are produced.

    Returns
    -------
    pandas.DataFrame
        Columns ``cell_id, spatial_1, ..., spatial_<activity_classes>``, one
        row per distinct ``cell_id`` in order of first appearance.
        ``spatial_k`` is the number of rows of the cell with
        ``activity == k - 1`` divided by the total number of rows of the cell.
    """
    # Derive the column names from the parameter so that any number of
    # classes works (they used to be hard-coded for exactly four).
    column_names = ['cell_id'] + [f'spatial_{k + 1}' for k in range(activity_classes)]

    records = []
    # A single grouped pass instead of re-filtering the whole frame per cell;
    # sort=False keeps the cells in order of first appearance.
    for cell_id, cell in cells.groupby('cell_id', sort=False):
        total = len(cell)
        shares = [
            (cell['activity'] == activity_type).sum() / total
            for activity_type in range(activity_classes)
        ]
        records.append([cell_id] + shares)

    return pd.DataFrame(records, columns=column_names)

In [18]:
# Number of distinct activity labels in the stop points.
# NOTE(review): `unique()` also counts NaN as a value -- confirm the label
# column has no missing entries, otherwise this is one too many.
activity_classes = len(stop_df.activity.unique())

In [19]:
# Per-cell activity shares of the stop points themselves ...
spatial_frequency_df = spatial_frequency(rectangle_cell_df)
# ... attached to every stop point (the only shared column, and hence the merge key, is `cell_id`).
spatial_frequency_df = pd.merge(rectangle_cell_df, spatial_frequency_df, how='left')
spatial_frequency_df = spatial_frequency_df[['uid', 'id', 'spatial_1', 'spatial_2', 'spatial_3', 'spatial_4']]
spatial_frequency_df.head()

Unnamed: 0,uid,id,spatial_1,spatial_2,spatial_3,spatial_4
0,4633,1567228,0.0,1.0,0.0,0.0
1,1907,886789,0.0,1.0,0.0,0.0
2,287,274850,0.0,0.0,0.0,1.0
3,864,1238929,0.0,0.0,0.0,1.0
4,8466,950934,0.0,0.0,1.0,0.0


In [20]:
# Load the POIs and give them the same `activity` column name as the stop
# points (here `activity_class` is kept alongside the copy).
poi_df = pd.read_csv('./data/POIs_mapping.csv')
poi_gdf = df_to_gdf(poi_df, x='lng', y='lat')
poi_gdf['activity'] = poi_gdf['activity_class']

In [21]:
# Keep one row per cell that contains at least one stop point, and rename the
# polygon column to `geometry`, the name fixed_cell_mapping reads from `cells`.
map_cell_df = rectangle_cell_df.drop_duplicates(subset=['cell_id'])[['cell_id', 'cell_geometry']]
map_cell_df = map_cell_df.rename(columns={'cell_geometry': 'geometry'})

In [22]:
# Map the POIs onto the occupied cells only (POIs outside those cells are dropped).
rectangle_cell_poi_df = fixed_cell_mapping(poi_gdf, map_cell_df)

100%|███████████████████████████████████████████████████████████████████████████████| 340/340 [00:03<00:00, 109.29it/s]


In [23]:
# Same per-cell share computation as for the stop points, applied to the POIs;
# the columns are renamed so both feature groups can live in one table.
contextual_frequency_df = spatial_frequency(rectangle_cell_poi_df)
contextual_frequency_df = contextual_frequency_df.rename(
    columns={'spatial_1': 'contextual_1',
             'spatial_2': 'contextual_2',
             'spatial_3': 'contextual_3',
             'spatial_4': 'contextual_4'})

In [24]:
# Attach the POI-based shares to every stop point via `cell_id`. Cells without
# any POI have no row in contextual_frequency_df, so the left merge yields NaN
# there; those are replaced by a uniform distribution over the classes.
uniform_prob = 1.0 / activity_classes
contextual_frequency_df = pd.merge(rectangle_cell_df, contextual_frequency_df, how='left')
contextual_frequency_df = contextual_frequency_df[['id', 'contextual_1', 'contextual_2', 'contextual_3', 'contextual_4']]
contextual_frequency_df = contextual_frequency_df.fillna(uniform_prob)
contextual_frequency_df.head()

Unnamed: 0,id,contextual_1,contextual_2,contextual_3,contextual_4
0,1567228,0.0,0.0,0.0,1.0
1,886789,0.0,0.0,0.27907,0.72093
2,274850,0.25,0.25,0.25,0.25
3,1238929,0.0,0.0,0.016129,0.983871
4,950934,0.5,0.0,0.5,0.0


In [25]:
# Combine both frequency feature groups (the only shared column, and hence the merge key, is `id`).
activity_frequency_df = pd.merge(spatial_frequency_df, contextual_frequency_df)
activity_frequency_df.head()

Unnamed: 0,uid,id,spatial_1,spatial_2,spatial_3,spatial_4,contextual_1,contextual_2,contextual_3,contextual_4
0,4633,1567228,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1907,886789,0.0,1.0,0.0,0.0,0.0,0.0,0.27907,0.72093
2,287,274850,0.0,0.0,0.0,1.0,0.25,0.25,0.25,0.25
3,864,1238929,0.0,0.0,0.0,1.0,0.0,0.0,0.016129,0.983871
4,8466,950934,0.0,0.0,1.0,0.0,0.5,0.0,0.5,0.0


In [26]:
# Sanity check: the feature table must have as many rows as there are stop points.
assert len(stop_df) == len(activity_frequency_df)

* Distance-based empirical probability  

$\begin{align}Pr(a_{i}=l|\mathcal{X}_{l}(c_{i})):=\phi(d(p_{i},\mathcal{X}_{l}(c_{i})))\end{align}$  

and  

$\begin{align}Pr(a_{i}=l|\mathcal{A}_{l}(c_{i})):=\phi (d(p_{i},\mathcal{A}_{l}(c_{i})))\end{align}$

In [27]:
def distance_based_probability(point_i, point_j, activity_classes):
    """Score each activity class by the distance to its nearest neighbour.

    For every class ``l`` in ``0 .. activity_classes - 1`` the nearest
    geometry of ``point_j`` with ``activity == l`` is located and its distance
    ``d`` to ``point_i`` is mapped to ``1 / (1 + d**2)``.

    Parameters
    ----------
    point_i : pandas.Series
        Row of the query point; must provide ``id`` and ``geometry``.
    point_j : geopandas.GeoDataFrame
        Candidate neighbours with ``activity`` and ``geometry`` columns.
    activity_classes : int
        Number of activity classes.

    Returns
    -------
    list
        ``[point_i.id, score_0, ..., score_{activity_classes - 1}]``; the
        score is ``0.0`` for a class without any candidate.

    Notes
    -----
    ``d`` is the planar distance in the units of the coordinates.
    NOTE(review): with longitude/latitude data that means degrees, so scores
    within one small cell are all very close to 1 -- confirm this is intended.
    """
    norm_dist_list = [point_i.id]

    for activity_type in range(activity_classes):
        candidates = point_j[point_j['activity'] == activity_type]

        if len(candidates) == 0:
            norm_dist_list.append(0.0)
            continue

        # Distance to the nearest candidate. A direct minimum is used instead
        # of building a spatial index for a single lookup: it is cheaper here
        # and does not depend on the rtree-specific `sindex.nearest(bounds, n)`
        # call signature.
        dist = float(candidates.geometry.distance(point_i.geometry).min())
        norm_dist = (1 + (dist ** 2)) ** -1
        norm_dist_list.append(norm_dist)

    return norm_dist_list

In [28]:
# Historical Neighbor Activity Confidence
def historical_neighbor(cell, activity_classes=4):
    """Distance-based class scores of each point w.r.t. the other points of its cell.

    Parameters
    ----------
    cell : geopandas.GeoDataFrame
        All points of one cell, with a unique index and ``id``, ``activity``
        and ``geometry`` columns.
    activity_classes : int, default 4
        Number of activity classes; also determines the number of
        ``historical_k (min)`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, historical_1 (min), ..., historical_<activity_classes> (min)``,
        one row per point of ``cell`` (empty, but with these columns, when
        ``cell`` has no rows).
    """
    column_names = ['id'] + [f'historical_{k + 1} (min)' for k in range(activity_classes)]

    # Collect plain rows and build the frame once instead of appending to a
    # DataFrame inside the loop (quadratic, and removed in pandas >= 2.0).
    records = []
    for i in cell.index:
        point_i = cell.loc[i]
        point_j = cell[cell.index != i]  # every other point of the same cell
        records.append(distance_based_probability(point_i, point_j, activity_classes))

    return pd.DataFrame(records, columns=column_names)

In [29]:
# Contextual Neighbor Activity Confidence
def contextual_neighbor(cell, pois, activity_classes=4):
    """Distance-based class scores of each point w.r.t. the POIs of its cell.

    Parameters
    ----------
    cell : geopandas.GeoDataFrame
        Points of one cell, with ``id``, ``cell_id`` and ``geometry`` columns.
    pois : geopandas.GeoDataFrame
        POIs with ``cell_id``, ``activity`` and ``geometry`` columns.
    activity_classes : int, default 4
        Number of activity classes; also determines the number of
        ``contextual_k (min)`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, contextual_1 (min), ..., contextual_<activity_classes> (min)``,
        one row per point of ``cell``. All scores are ``0.0`` for a point
        whose cell holds no POI.
    """
    column_names = ['id'] + [f'contextual_{k + 1} (min)' for k in range(activity_classes)]

    # Collect plain rows and build the frame once instead of appending to a
    # DataFrame inside the loop (quadratic, and removed in pandas >= 2.0).
    records = []
    for i in cell.index:
        point_i = cell.loc[i]
        point_j = pois[pois.cell_id == point_i.cell_id]

        if len(point_j) != 0:
            dist_probs = distance_based_probability(point_i, point_j, activity_classes)
        else:
            dist_probs = [point_i.id] + [0.0] * activity_classes
        records.append(dist_probs)

    return pd.DataFrame(records, columns=column_names)

In [30]:
# Compute both neighbour-confidence feature groups cell by cell. The per-cell
# results are gathered in lists and concatenated once at the end: appending
# to a DataFrame inside the loop copies it on every iteration, and
# DataFrame.append no longer exists in pandas >= 2.0.
historical_parts = []
contextual_parts = []

for cell_id in tqdm(rectangle_cell_df.cell_id.unique()):
    cell_i = rectangle_cell_df[rectangle_cell_df.cell_id == cell_id]

    historical_dist_probs = historical_neighbor(cell_i)
    contextual_dist_probs = contextual_neighbor(cell_i, rectangle_cell_poi_df)

    historical_parts.append(historical_dist_probs)
    contextual_parts.append(contextual_dist_probs)

# pd.concat refuses an empty list, so fall back to an empty frame in that case.
historical_dist_probs_df = pd.concat(historical_parts) if historical_parts else pd.DataFrame()
contextual_dist_probs_df = pd.concat(contextual_parts) if contextual_parts else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████| 340/340 [00:13<00:00, 25.00it/s]


In [31]:
# Combine both confidence feature groups (the only shared column, and hence the merge key, is `id`).
dist_probs_df = pd.merge(historical_dist_probs_df, contextual_dist_probs_df)
dist_probs_df.head()

Unnamed: 0,id,historical_1 (min),historical_2 (min),historical_3 (min),historical_4 (min),contextual_1 (min),contextual_2 (min),contextual_3 (min),contextual_4 (min)
0,1567228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999999
1,886789,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,274850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1238929,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,950934,0.0,0.0,0.0,0.0,0.999999,0.0,1.0,0.0


In [32]:
# Sanity check: the confidence table must have as many rows as there are stop points.
assert len(stop_df) == len(dist_probs_df)

### Feature vector
$\begin{align}x=[Temporal\ Activity\ Frequency \in \mathbb{R}^{1 \times L},\end{align}$<br/>
$\begin{align}\qquad Spatial\ Activity\ Frequency \in \mathbb{R}^{1 \times L},\end{align}$<br/>
$\begin{align}\qquad Contextual\ Activity\ Frequency \in \mathbb{R}^{1 \times L},\end{align}$<br/>
$\begin{align}\qquad Historical\ Activity\ Confidence \in \mathbb{R}^{1 \times L},\end{align}$<br/>
$\begin{align}\qquad Contextual\ Activity\ Confidence \in \mathbb{R}^{1 \times L}]^{T} \in \mathbb{R}^{5L} \end{align}$

In [33]:
# Join the frequency and confidence features per stop point, then attach the
# ground-truth label (the second merge uses the shared columns `uid` and `id`).
feature_vector_df = pd.merge(activity_frequency_df, dist_probs_df, on='id')
feature_vector_df = pd.merge(feature_vector_df, stop_df[['uid', 'id', 'activity']]) # label
feature_vector_df.head()

Unnamed: 0,uid,id,spatial_1,spatial_2,spatial_3,spatial_4,contextual_1,contextual_2,contextual_3,contextual_4,historical_1 (min),historical_2 (min),historical_3 (min),historical_4 (min),contextual_1 (min),contextual_2 (min),contextual_3 (min),contextual_4 (min),activity
0,4633,1567228,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999999,1.0
1,1907,886789,0.0,1.0,0.0,0.0,0.0,0.0,0.27907,0.72093,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,287,274850,0.0,0.0,0.0,1.0,0.25,0.25,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,864,1238929,0.0,0.0,0.0,1.0,0.0,0.0,0.016129,0.983871,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0
4,8466,950934,0.0,0.0,1.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.999999,0.0,1.0,0.0,2.0


In [34]:
# Sanity check: the final feature table must have as many rows as there are stop points.
assert len(stop_df) == len(feature_vector_df)

### Save

In [35]:
# feature_vector_df.to_csv(f'./data/parameter_settings/pre_rect_{RECT_SIZE}.csv', index=False)

In [36]:
# count_df = rectangle_cell_df.groupby('cell_id').count()
# count_df = count_df.reset_index()[['cell_id', 'id']]
# count_df = count_df[count_df.id == 1]

# train_ids = rectangle_cell_df[rectangle_cell_df.cell_id.isin(count_df.cell_id.tolist())]
# train_ids = train_ids[['id']]

In [37]:
# train_ids.to_csv(f'./data/parameter_settings/pre_rect_{RECT_SIZE}_train_ids.csv', index=False)