# Imports

In [1]:
# Reload edited modules automatically before each cell runs, so changes to
# imported code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

from geopy import distance

In [3]:
def get_nearest_y(X, y):
    """
    For every point in X, find the closest point in y.

    Distances are obtained from geopy's ``distance.distance`` and expressed in
    meters. Every point of X is compared against every point of y, so the
    amount of work grows with len(X) * len(y).

    input:
        X - iterable of (lat, lng) tuples to search from
        y - iterable of (lat, lng) tuples to search among; it is walked once
            per point of X and must not be empty
    return:
        distance_to_nearest - list holding, per point of X, the distance in
                              meters to its closest point of y
        index_of_nearest - list holding, per point of X, the position of that
                           closest point within y (the first one on ties)
    """
    nearest_distances, nearest_positions = [], []

    for origin in X:
        # Distance from this origin to every candidate, kept in y's order so
        # that argmin yields a position usable against y.
        meters_to_each = np.array(
            [distance.distance(origin, candidate).meters for candidate in y]
        )
        nearest_distances.append(meters_to_each.min())
        nearest_positions.append(meters_to_each.argmin())

    return nearest_distances, nearest_positions

def get_nearest_distances(df_X, df_y):
    """
    Find the nearest row of df_y for every row of df_X.

    input:
        df_X - DataFrame with a 'coord' column holding (lat, lng) tuples
        df_y - DataFrame with 'lat' and 'lng' columns; it is left unmodified
    return:
        distance_result - per row of df_X, the distance in meters to the
                          nearest row of df_y
        index_result - per row of df_X, the position of that nearest row
                       within df_y (suitable for df_y.iloc)
    """
    # Pair up the coordinates locally instead of writing a 'coord' column into
    # df_y, so the caller's frame is not changed as a side effect.
    coords_y = list(zip(df_y['lat'], df_y['lng']))
    distance_result, index_result = get_nearest_y(df_X['coord'], coords_y)
    return distance_result, index_result

# Data Engineering for Auxiliary Data

In [4]:
# Root folder of the raw input data. The auxiliary facility CSVs live in a
# sub-folder of it, so that path is derived rather than repeating the root.
data_dir = Path("./raw_data/")
aux_data_dir = data_dir / "auxiliary-data"

In [5]:
def generate_auxiliary_data(df):
    """
    Build nearest-facility features for every row of df.

    For each facility type the matching CSV is read from ``aux_data_dir`` and
    ``get_nearest_distances`` is used to locate the nearest facility per row.

    input:
        df - DataFrame with 'lat' and 'lng' columns
    return:
        df_aux - DataFrame sharing df's index, with these columns in order:
            coord - the (lat, lng) tuple of the row
            dist_meters_to_nearest_mrt, nearest_mrt_code, nearest_mrt_line
            dist_meters_to_nearest_<facility>, nearest_<facility>_index for
            mall, commercial_centre, primary_school and secondary_school,
            where the index is the row position inside that facility's CSV
    """
    # Keep only the index of the input dataset and its coordinates, zipped
    # into (lat, lng) tuples.
    df_aux = pd.DataFrame(
        {'coord': list(zip(df['lat'], df['lng']))},
        index=df.index,
    )

    # MRT: besides the distance, carry over the nearest station's code and line.
    df_sg_mrt_stations = pd.read_csv(Path(aux_data_dir, "sg-mrt-stations.csv"))
    distance_mrt, index_mrt = get_nearest_distances(df_aux, df_sg_mrt_stations)
    nearest_mrt = df_sg_mrt_stations.iloc[index_mrt]
    df_aux['dist_meters_to_nearest_mrt'] = distance_mrt
    df_aux['nearest_mrt_code'] = list(nearest_mrt['code'])
    df_aux['nearest_mrt_line'] = list(nearest_mrt['line'])

    # The remaining facility types all follow one recipe: the distance to the
    # nearest facility plus its row position in the CSV.
    # NOTE(review): "commerical" is the file name exactly as this code has
    # always read it; presumably it matches the file on disk - confirm before
    # "fixing" the spelling.
    indexed_facilities = (
        ("sg-shopping-malls.csv", "mall"),
        ("sg-commerical-centres.csv", "commercial_centre"),
        ("sg-primary-schools.csv", "primary_school"),
        ("sg-secondary-schools.csv", "secondary_school"),
    )
    for file_name, label in indexed_facilities:
        df_facility = pd.read_csv(Path(aux_data_dir, file_name))
        distances, indices = get_nearest_distances(df_aux, df_facility)
        df_aux[f'dist_meters_to_nearest_{label}'] = distances
        df_aux[f'nearest_{label}_index'] = indices

    return df_aux

## Generate the auxiliary features for nearest community facilities

This takes around 8 seconds per 100 records.

For the entire training and test set, it takes around 40 minutes (_there is probably room to optimize this, since every record is compared against every facility_).

In [6]:
# Load the training set and derive its nearest-facility features.
df_train = pd.read_csv(data_dir / "train.csv")
df_train_aux = generate_auxiliary_data(df_train)
df_train_aux

In [7]:
# Load the test set and derive its nearest-facility features.
df_test = pd.read_csv(data_dir / "test.csv")
df_test_aux = generate_auxiliary_data(df_test)
df_test_aux

Unnamed: 0,coord,dist_meters_to_nearest_mrt,nearest_mrt_code,nearest_mrt_line,dist_meters_to_nearest_mall,nearest_mall_index,dist_meters_to_nearest_commercial_centre,nearest_commercial_centre_index,dist_meters_to_nearest_primary_school,nearest_primary_school_index,dist_meters_to_nearest_secondary_school,nearest_secondary_school_index
0,"(1.3443339, 103.8786904)",261.672594,cc12,cc,985.874655,110,1044.920854,31,341.056419,88,343.615356,71
1,"(1.3802812, 103.9438781)",1001.264290,ew1,ew,281.936655,86,1435.511147,15,1078.101901,93,1396.610341,55
2,"(1.2946685, 103.8500737)",251.350375,cc2,cc,212.685809,7,157.232207,37,343.739518,22,562.723759,109
3,"(1.3731198, 103.74609393885731)",1367.323437,ns4,ns,501.765017,133,4504.563672,1,702.141804,34,1062.566045,122
4,"(1.34146785, 103.8490475)",273.515978,ns18,ns,1009.165929,21,985.918261,32,418.716281,79,360.715437,9
...,...,...,...,...,...,...,...,...,...,...,...,...
95,"(1.2932171, 103.810024069313)",463.354051,ew19,ew,743.777973,155,895.843229,25,393.561414,124,367.028510,99
96,"(1.4438513, 103.81992431665668)",575.096828,ns11,ns,477.840248,101,2842.560449,19,192.821052,133,414.030196,110
97,"(1.2761249, 103.8532534)",151.822813,ce2,ce,527.330227,27,1185.430709,0,1487.816706,20,2116.877490,89
98,"(1.35355965, 103.92872486684811)",1399.803015,dt31,dt,1298.838354,65,1832.620942,3,915.592782,142,418.502201,116


### Save the results

In [8]:
# Persist next to the raw inputs, reusing data_dir so reads and writes share
# one root folder. The DataFrame index is written as the first CSV column.
df_train_aux.to_csv(data_dir / "train_auxiliary_data.csv")
df_test_aux.to_csv(data_dir / "test_auxiliary_data.csv")