In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from datetime import timedelta

### Dataset

In [2]:
def df_to_gdf(df, x, y):
    """Turn a DataFrame with two coordinate columns into a GeoDataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    x, y : str
        Names of the columns holding the x and y coordinates. Each row
        becomes ``Point((df[x], df[y]))``.

    Returns
    -------
    geopandas.GeoDataFrame
        The remaining columns of ``df`` (``x`` and ``y`` are dropped) plus a
        point geometry, with no CRS attached (``crs=None``).
    """
    # One point per row, built in (x, y) order.
    points = list(map(Point, zip(df[x], df[y])))
    attributes = df.drop(columns=[x, y])
    return gpd.GeoDataFrame(attributes, crs=None, geometry=points)

In [3]:
DATA_DIR = './data'

# Stop points: parse both interval bounds as datetimes, then turn the
# lng/lat columns into point geometries (x = lng, y = lat).
stop_df = pd.read_csv(DATA_DIR + '/pre_stop_points.csv')
stop_df['start_time'] = pd.to_datetime(stop_df['start_time'])
stop_df['end_time'] = pd.to_datetime(stop_df['end_time'])
stop_gdf = df_to_gdf(stop_df, x='lng', y='lat')

# Points of interest, converted the same way.
poi_df = pd.read_csv(DATA_DIR + '/POIs_mapping.csv')
poi_gdf = df_to_gdf(poi_df, x='lng', y='lat')

### Personal preference

#### Node potential
$
\begin{align}
\phi_{i}(y_{i} = v_{k}) \leftarrow P(v_{k}.l|r_{i}.l) \propto \exp \bigg\{-\frac{D(r_{i}.l, v_{k}.l)}{2\sigma^{2}}\bigg\}
\end{align}
$

In [4]:
def get_nearest_points(k, base_point, candidate_points):
    """Select the candidates that the spatial index reports as nearest.

    Parameters
    ----------
    k : int
        Number of results requested from the spatial index.
    base_point : GeoDataFrame
        Only the first row's geometry is used; its ``bounds`` are the query.
    candidate_points : GeoDataFrame
        Points to search. Its ``sindex`` is queried and the returned
        positions are used with ``.iloc``.

    Returns
    -------
    GeoDataFrame
        Rows of ``candidate_points`` at the positions yielded by the index.

    Notes
    -----
    NOTE(review): ``sindex.nearest(bounds, k)`` assumes the rtree-style
    signature; confirm it against the installed geopandas version.
    """
    query_bounds = base_point.geometry.iloc[0].bounds
    spatial_index = candidate_points.sindex  # r-tree
    positions = [pos for pos in spatial_index.nearest(query_bounds, k)]
    return candidate_points.iloc[positions]

In [5]:
def dist(origin, departure):
    """Geodesic distance in metres between two point geometries.

    Both arguments must expose ``coords`` whose first entry is an (x, y)
    pair. The pair is swapped to (y, x) before being handed to geopy's
    ``geodesic``; with x = longitude and y = latitude that is the
    (latitude, longitude) order.
    """
    from geopy.distance import geodesic

    first_xy = origin.coords[0]
    second_xy = departure.coords[0]
    separation = geodesic(
        (first_xy[1], first_xy[0]),
        (second_xy[1], second_xy[0]),
    )
    return separation.meters

In [6]:
def node_potential(record, venue, sigma=1e4):
    """Node potential of assigning ``venue`` to ``record``.

    Implements ``exp(-D(record, venue) / (2 * sigma**2))`` where ``D`` is the
    value returned by ``dist`` (geodesic metres).

    Parameters
    ----------
    record, venue : point geometries
        Passed unchanged to ``dist``.
    sigma : float, default 1e4
        Decay scale, in the same units as ``dist``. The earlier expression
        ``dist / 2*sigma**2`` with ``sigma = 1e-4`` multiplied by sigma
        squared instead of dividing by ``2 * sigma**2``. The default of 1e4
        reproduces those values up to rounding, so existing results are
        unchanged.
        NOTE(review): choose a value that reflects the expected positioning
        error.

    Returns
    -------
    float
        Unnormalised potential in (0, 1]; 1 when the distance is zero.
    """
    # Parenthesise the denominator: `d / 2 * s**2` parses as `(d / 2) * s**2`.
    potential_value = np.exp(-dist(record, venue) / (2 * sigma ** 2))
    return potential_value

#### Pairwise potential
$\begin{align}
\psi_{ij}(y_{i}, y_{j})=
    \begin{cases}
    1 & \text{if } y_{i} = y_{j}\\
    e^{-\alpha} & \text{if } y_{i}.c = y_{j}.c \wedge y_{i} \neq y_{j}\\
    e^{-\beta} & \text{if } y_{i}.c \neq y_{j}.c
    \end{cases}
\end{align}$

In [7]:
# α=0.1
# β=0.2
def pairwise_potential(nearest_points, edge_df, alpha=0.1, beta=0.2, same_venue_dist=50):
    """Pairwise potentials between candidate labels and neighbouring records.

    For every row of ``nearest_points`` (outer loop) and every row of
    ``edge_df`` (inner loop) one value is produced:

    * ``1.0``          -- same activity class and ``geo_distance`` below
                          ``same_venue_dist`` (stands in for "same venue"),
    * ``exp(-alpha)``  -- same activity class otherwise,
    * ``exp(-beta)``   -- different activity class.

    Parameters
    ----------
    nearest_points : DataFrame with an ``activity_class`` column
    edge_df : DataFrame with ``activity_class`` and ``geo_distance`` columns
    alpha, beta : float
        Penalties for "same class, different venue" and "different class".
    same_venue_dist : float, default 50
        Threshold on ``geo_distance`` (previously hard-coded).

    Returns
    -------
    list of float
        ``len(nearest_points) * len(edge_df)`` values; empty when either
        input has no rows.
    """
    same_class_value = np.exp(-alpha)
    diff_class_value = np.exp(-beta)

    # Iterate by value rather than by index label, so a non-unique or
    # non-default index cannot turn a scalar lookup into a Series.
    edge_classes = list(edge_df['activity_class'])
    edge_distances = list(edge_df['geo_distance'])

    potential_value = []
    for label_class in nearest_points['activity_class']:
        for edge_class, edge_distance in zip(edge_classes, edge_distances):
            if label_class == edge_class:
                if edge_distance < same_venue_dist:
                    potential_value.append(1.0)
                else:
                    potential_value.append(same_class_value)
            else:
                potential_value.append(diff_class_value)
    return potential_value

#### Edge list
$\begin{align}
\mathcal{N} = \mathcal{N}_{T} \cup \mathcal{N}_{D},
\end{align}$  
$\begin{align}
\mathcal{N}_{D} = \{<i,j>:D(r_{i},r_{j}) < \xi_{D}\},
\end{align}$  
$\begin{align}
\mathcal{N}_{T} = \{<i,j>:T(r_{i},r_{j}) < \xi_{T}\},
\end{align}$  

#### Pairwise Markov Random Field
$\begin{align}
P(Y) = \frac{1}{Z} \prod_{i} \phi_{i}(y_{i}) \prod_{<i,j> \in \mathcal{N}} \psi_{ij}(y_{i}, y_{j})
\end{align}$  
  
$\begin{align}
E(Y) = \sum_{i} -\log\phi_{i}(y_{i}) + \sum_{<i,j> \in \mathcal{N}} -\log\psi_{ij}(y_{i}, y_{j})
\end{align}$

In [8]:
def pairwise_MRF(node, edges, label, normalize=False):
    """Energy of assigning ``label`` to ``node`` given its neighbourhood.

    The energy is the negative log of the node potential (first geometry of
    ``node`` against first geometry of ``label``) plus the sum of negative
    logs of the pairwise potentials between ``label`` and ``edges``.

    Parameters
    ----------
    node : GeoDataFrame
        The record being labelled; only its first geometry is read.
    edges : DataFrame
        Neighbouring records, forwarded to ``pairwise_potential``.
    label : GeoDataFrame
        Candidate venue; its first geometry is read and the frame is
        forwarded to ``pairwise_potential``.
    normalize : bool, default False
        When true, return ``1 / (1 + energy**2)`` instead of the raw energy.

    Returns
    -------
    float
    """
    record_geometry = node.geometry.iloc[0]
    venue_geometry = label.geometry.iloc[0]

    unary_term = -np.log(node_potential(record_geometry, venue_geometry))
    edge_terms = -np.log(pairwise_potential(label, edges))
    total = unary_term + sum(edge_terms)

    if not normalize:
        return total
    return (1 + (total ** 2)) ** -1

#### Energy value

In [None]:
from tqdm import tqdm

# For every stop point, compute the minimum MRF energy over the 5 nearest
# POIs of each activity class (classes 0..3 -> columns energy_1..energy_4).
energy_columns = ['id', 'energy_1', 'energy_2', 'energy_3', 'energy_4']

# Split the POIs per class once. Filtering inside the loops created a fresh
# frame for every (stop, class) pair, so its spatial index had to be built
# again each time; keeping the subsets alive lets it be reused.
candidates_by_class = {
    atv_class: poi_gdf[poi_gdf.activity_class == atv_class]
    for atv_class in range(4)
}

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. As a side effect the result gets a 0..n-1 index instead of the
# all-zero index the repeated appends produced.
energy_rows = []

for uid in tqdm(stop_gdf.uid.unique()):
    user_df = stop_gdf[stop_gdf.uid == uid]
    _user_df = user_df.reset_index(drop=True)

    for i in _user_df.index:
        base_point = _user_df.iloc[[i]]
        base_geometry = base_point.geometry.iloc[0]
        base_time = base_point.start_time.iloc[0]

        # Edge list of the current stop: the same user's other stops that are
        # within 100 m and whose start time differs by less than one hour
        # (difference taken modulo one day). It does not depend on the
        # activity class, so it is computed once per stop.
        geo_distance = [dist(base_geometry, geom) for geom in _user_df.geometry]
        timestamp = abs(base_time - _user_df.start_time) % timedelta(days=1)
        neighbours = _user_df.assign(timestamp=timestamp, geo_distance=geo_distance)
        edge_df = neighbours[
            (neighbours.timestamp < timedelta(hours=1))
            & (neighbours.geo_distance < 100)
            & (~neighbours.id.isin(base_point.id))
        ]

        energy_list = [base_point.id.iloc[0]]
        for atv_class, candidate_points in candidates_by_class.items():
            nearest_points = get_nearest_points(5, base_point, candidate_points)
            energy = [
                pairwise_MRF(base_point, edge_df,
                             label=nearest_points.iloc[[k]],
                             normalize=False)
                for k in range(len(nearest_points))
            ]
            # Lowest energy among the candidates; NaN when the class has no
            # candidate POIs at all.
            energy_list.append(pd.Series(energy, dtype=float).min())
        energy_rows.append(energy_list)

energy_df = pd.DataFrame(energy_rows, columns=energy_columns)

 45%|████████████████████████████████████▊                                             | 22/49 [06:24<07:56, 17.65s/it]