# Spatial

In this notebook we train a FastText model to create POI embeddings, and then use spatial information (the embeddings of nearby POIs) to enhance these representations.

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os

from gensim.models import FastText
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Load train and test sets.

In [None]:
# Category names used as classification targets, one binary problem per entry.
labels = [
    'Active Life',
    'Arts & Entertainment',
    'Automotive',
    'Beauty & Spas',
    'Education',
    'Event Planning & Services',
    'Financial Services',
    'Food',
    'Health & Medical',
    'Home Services',
    'Hotels & Travel',
    'Local Flavor',
    'Local Services',
    'Mass Media',
    'Nightlife',
    'Pets',
    'Professional Services',
    'Public Services & Government',
    'Real Estate',
    'Religious Organizations',
    'Restaurants',
    'Shopping',
]

# na_filter=False disables missing-value detection, so empty fields are read
# as '' rather than NaN.
train_df, test_df = (
    pd.read_csv(csv_path, na_filter=False)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

Train FastText model on train sequences.

In [None]:
emb_size = 300
epochs = 100
ft_model_fname = f'ft_{emb_size}_{epochs}.model'
models_dir = 'ft_models'
ft_model_path = os.path.join(models_dir, ft_model_fname)

# Whitespace-tokenize every sequence; each token list is one training sentence.
train_sequences = train_df['sequence'].apply(lambda x: x.split())
test_sequences = test_df['sequence'].apply(lambda x: x.split())

# Make sure the cache directory exists so that a fresh checkout (no
# 'ft_models' folder yet) does not fail before training even starts.
os.makedirs(models_dir, exist_ok=True)

if os.path.isfile(ft_model_path):
    # Reuse the cached model trained with the same (emb_size, epochs) setting.
    ft_model = FastText.load(ft_model_path)
else:
    # gensim 4.0 renamed `size` to `vector_size`; accept either API version.
    try:
        ft_model = FastText(vector_size=emb_size)
    except TypeError:
        ft_model = FastText(size=emb_size)
    # The corpus is passed positionally because its keyword name differs
    # between gensim versions (`sentences` vs. `corpus_iterable`).
    ft_model.build_vocab(train_sequences)
    ft_model.train(train_sequences, total_examples=len(train_sequences), epochs=epochs)
    ft_model.save(ft_model_path)

Create initial POI embeddings, without spatial information.

In [None]:
# A POI's individual embedding is the mean of the FastText vectors of its tokens.
train_indiv_embs = np.stack(
    [ft_model.wv[tokens].mean(axis=0) for tokens in train_sequences]
)
test_indiv_embs = np.stack(
    [ft_model.wv[tokens].mean(axis=0) for tokens in test_sequences]
)

Project the coordinates from WGS 84 (EPSG:4326) to Web Mercator (EPSG:3857), then compute a fixed-radius buffer around each POI (radius given in projected units, nominally meters).

In [None]:
distance = 500
src_crs = 'epsg:4326'
tgt_crs = 'epsg:3857'


def _buffered_points(df, radius, from_crs, to_crs):
    """Build a GeoDataFrame of POI points with a circular buffer column.

    Points are created from the 'longitude' / 'latitude' columns of `df` in
    `from_crs`, reprojected to `to_crs`, and a 'buffer' column holding the
    `radius`-sized buffer of every point is added.
    """
    points = gpd.points_from_xy(df['longitude'], df['latitude'])
    gdf = gpd.GeoDataFrame(geometry=points, crs=from_crs).to_crs(to_crs)
    gdf['buffer'] = gdf.buffer(radius)
    return gdf


# NOTE(review): EPSG:3857 (Web Mercator) units only approximate meters; the
# on-the-ground radius shrinks with latitude. Confirm this is acceptable.
train_gdf = _buffered_points(train_df, distance, src_crs, tgt_crs)
test_gdf = _buffered_points(test_df, distance, src_crs, tgt_crs)

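Each POI gets a new spatial embedding by averaging the individual embeddings of the training POIs that fall inside its buffer (a training POI is never its own neighbor). POIs without any neighbor keep an all-zero spatial embedding.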

In [None]:
def _neighbor_mean_embeddings(buffers, points, point_embs, exclude_self=False):
    """Average the embeddings of the points that fall inside each buffer.

    Parameters
    ----------
    buffers : iterable of geometries
        One buffer polygon per query POI.
    points : GeoSeries
        Candidate neighbor points; row i corresponds to `point_embs[i]`.
    point_embs : np.ndarray of shape (len(points), emb_dim)
        Individual embeddings of the candidate points.
    exclude_self : bool
        Set to True when `buffers` and `points` describe the same POIs in the
        same order, so that the i-th POI is not counted as its own neighbor.

    Returns
    -------
    np.ndarray of shape (len(buffers), emb_dim)
        Mean neighbor embedding per buffer; rows without neighbors stay zero.
    """
    spatial_embs = np.zeros((len(buffers), point_embs.shape[1]))
    for pos, buffer in enumerate(buffers):
        # np.array(...) copies, so the mask can be edited without touching
        # (possibly read-only) pandas-owned memory.
        mask = np.array(points.intersects(buffer), dtype=bool)
        if exclude_self:
            # Masking one position replaces copying the whole frame with
            # `drop(idx)` on every iteration.
            mask[pos] = False
        if mask.any():
            spatial_embs[pos] = point_embs[mask].mean(axis=0)
    return spatial_embs


# Neighbors are always drawn from the training set, for train and test alike.
train_spatial_embs = _neighbor_mean_embeddings(
    train_gdf['buffer'], train_gdf['geometry'], train_indiv_embs, exclude_self=True)
test_spatial_embs = _neighbor_mean_embeddings(
    test_gdf['buffer'], train_gdf['geometry'], train_indiv_embs)

In [None]:
# Final feature vector = [individual embedding | spatial embedding].
train_features = np.hstack([train_indiv_embs, train_spatial_embs])
test_features = np.hstack([test_indiv_embs, test_spatial_embs])


def _one_hot_categories(categories, expected_labels):
    """One-hot encode a ', '-separated category column.

    `str.get_dummies` only creates columns for categories that actually occur
    in `categories`, so a label that is absent from one split would be missing
    from the result. Every name in `expected_labels` that is not present is
    therefore added as an all-zero column; existing columns are kept as they are.
    """
    dummies = categories.str.get_dummies(sep=', ')
    missing = [name for name in expected_labels if name not in dummies.columns]
    return dummies.reindex(columns=list(dummies.columns) + missing, fill_value=0)


train_labels = _one_hot_categories(train_df['categories'], labels)
test_labels = _one_hot_categories(test_df['categories'], labels)

Classification via Logistic Regression.

In [None]:
# The 'sag' solver shuffles the data stochastically; a fixed seed makes the
# reported scores reproducible across runs.
seed = 42

# Column j holds the binary predictions for labels[j].
test_preds = np.zeros((len(test_df), len(labels)))
scores = []

# One independent binary classifier per label (one-vs-rest).
for label_idx, label_name in enumerate(labels):
    train_target = train_labels[label_name]
    test_target = test_labels[label_name]

    clf = LogisticRegression(solver='sag', random_state=seed)
    clf.fit(train_features, train_target)
    preds = clf.predict(test_features)
    test_preds[:, label_idx] = preds

    score = accuracy_score(test_target, preds)
    scores.append(score)
    print('Test score for class {} is {:.4f}'.format(label_name, score))

print('Mean test score is {:.4f}'.format(np.mean(scores)))