# Data Engineering for Auxiliary Data

In [1]:
# Load the autoreload extension and set it to mode 2, so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

from geopy import distance

In [3]:
def get_nearest_y(X, y):
    """
    For every point in X, find the closest point in y.

    Distances are computed with geopy's `distance.distance` and read back
    through its `.meters` attribute.

    input:
        X - an iterable of tuples representing (lat, lng)
        y - an iterable of tuples representing (lat, lng); it is materialized
            once, so a one-shot iterable (e.g. a generator) is also accepted
    return:
        distance_to_nearest - list with, for each point of X, the distance to
                              the closest y (in meters)
        index_of_nearest - list with, for each point of X, the positional index
                           of the closest y (the first one wins on ties)
    raises:
        ValueError - if X has at least one point but y is empty
    """
    # Snapshot y once: the inner scan is repeated for every point of X, so a
    # generator would otherwise be exhausted after the first point.
    candidates = list(y)

    distance_to_nearest = []
    index_of_nearest = []

    for point_x in X:
        if not candidates:
            # Without this guard numpy fails with an opaque
            # "zero-size array to reduction operation" error.
            raise ValueError("y must contain at least one (lat, lng) point")

        distance_to_y = np.array(
            [distance.distance(point_x, point_y).meters for point_y in candidates]
        )

        # One argmin pass gives the index; the minimum is read back from it
        # instead of scanning the array a second time with np.min.
        min_index = np.argmin(distance_to_y)

        distance_to_nearest.append(distance_to_y[min_index])
        index_of_nearest.append(min_index)

    return distance_to_nearest, index_of_nearest

def get_nearest_distances(df_X, df_y):
    """
    Find, for every row of df_X, the nearest row of df_y.

    input:
        df_X - DataFrame with a 'coord' column holding (lat, lng) tuples
        df_y - DataFrame with 'lat' and 'lng' columns; it is NOT modified
    return:
        distance_result - per row of df_X, the distance to the nearest row of
                          df_y, as returned by get_nearest_y
        index_result - per row of df_X, the positional index of that nearest
                       row in df_y
    """
    # Build the (lat, lng) tuples locally rather than writing a 'coord' column
    # onto df_y, so the caller's DataFrame is left untouched.
    coords_y = list(zip(df_y['lat'], df_y['lng']))
    distance_result, index_result = get_nearest_y(df_X['coord'], coords_y)
    return distance_result, index_result

In [4]:
# Folder holding the raw input CSVs (train.csv and test.csv are read from here).
data_dir = Path("./raw_data/")
# Folder holding the auxiliary facility CSVs (MRT stations, malls, commercial centres, schools).
aux_data_dir = Path("./raw_data/auxiliary-data/")

In [5]:
def generate_auxiliary_data(df, aux_dir=None):
    """
    Build the nearest-facility features for every row of df.

    input:
        df - DataFrame with 'lat' and 'lng' columns
        aux_dir - folder containing the auxiliary CSV files; defaults to the
                  notebook-level `aux_data_dir`
    return:
        df_aux - DataFrame sharing df's index, with the columns (in order):
                 'coord' - the (lat, lng) tuple of the row
                 'dist_meters_to_nearest_mrt', 'nearest_mrt_code', 'nearest_mrt_line'
                 and, for each of mall / commercial_centre / primary_school /
                 secondary_school:
                 'dist_meters_to_nearest_<facility>', 'nearest_<facility>_index'
                 (the positional row index into that facility's CSV)
    """
    if aux_dir is None:
        aux_dir = aux_data_dir

    # Zip the lat and lng into tuples; keep only the index of the input and the coordinates
    coord = pd.Series(list(zip(df["lat"], df["lng"])), index=df.index, dtype=object)
    df_aux = pd.DataFrame({"coord": coord})

    # One entry per facility type:
    #   (CSV file name, label used in the output column names,
    #    facility columns to copy over -- empty means "store the row index instead")
    # NOTE: "sg-commerical-centres.csv" is spelled exactly as the notebook's
    # recorded run loaded it; do not "correct" it without renaming the file too.
    facilities = [
        ("sg-mrt-stations.csv", "mrt", ("code", "line")),
        ("sg-shopping-malls.csv", "mall", ()),
        ("sg-commerical-centres.csv", "commercial_centre", ()),
        ("sg-primary-schools.csv", "primary_school", ()),
        ("sg-secondary-schools.csv", "secondary_school", ()),
    ]

    for file_name, label, attributes in facilities:
        df_facility = pd.read_csv(Path(aux_dir, file_name))
        distances, indices = get_nearest_distances(df_aux, df_facility)

        df_aux[f"dist_meters_to_nearest_{label}"] = distances
        if attributes:
            # Look the nearest rows up once, then copy each requested attribute.
            nearest_rows = df_facility.iloc[indices]
            for attribute in attributes:
                df_aux[f"nearest_{label}_{attribute}"] = list(nearest_rows[attribute])
        else:
            df_aux[f"nearest_{label}_index"] = indices

    return df_aux

## Generate the auxiliary features for nearest community facilities

This takes around 8 seconds per 100 records.

For the entire training and test sets combined, this takes around 40 minutes (_there is probably some room to optimize this_).

In [6]:
# Load the training records, derive their nearest-facility features, and display the result.
df_train = pd.read_csv(data_dir / "train.csv")
df_train_aux = generate_auxiliary_data(df_train)
df_train_aux

Unnamed: 0,coord,dist_meters_to_nearest_mrt,nearest_mrt_code,nearest_mrt_line,dist_meters_to_nearest_mall,nearest_mall_index,dist_meters_to_nearest_commercial_centre,nearest_commercial_centre_index,dist_meters_to_nearest_primary_school,nearest_primary_school_index,dist_meters_to_nearest_secondary_school,nearest_secondary_school_index
0,"(1.41439935, 103.83719568172816)",573.566928,ns14,ns,620.066152,103,3342.136748,2,274.527938,99,182.562453,88
1,"(1.3725968000000002, 103.87562460126242)",1728.895028,ne13,ne,551.705560,113,2388.871680,31,122.819531,130,290.331371,15
2,"(1.2987726, 103.895798)",1315.256431,cc8,cc,822.517804,72,2159.421533,30,888.509938,147,891.530935,32
3,"(1.3123637, 103.80327091227252)",723.885330,cc20,cc,907.845081,19,1605.948865,28,1086.151844,97,1102.475585,100
4,"(1.2739587, 103.84363531310518)",370.021700,ew15,ew,428.960960,51,1864.412456,0,464.744201,20,1542.648854,89
...,...,...,...,...,...,...,...,...,...,...,...,...
20249,"(1.3859375, 103.8344656879244)",150.007415,te5,te,2008.342040,94,3165.626279,18,791.022417,5,1029.960484,97
20250,"(1.3159484, 103.8575892)",442.631218,ne8,ne,532.359497,11,1642.455781,29,657.243186,67,997.602605,13
20251,"(1.3159611, 103.836848)",422.130692,dt11,dt,765.085896,58,855.544220,29,366.210022,6,430.393277,4
20252,"(1.4407533, 103.8066711)",632.423139,ns10,ns,602.359111,89,2155.912402,19,230.110808,62,707.623180,0


In [7]:
# Load the test records, derive their nearest-facility features, and display the result.
df_test = pd.read_csv(data_dir / "test.csv")
df_test_aux = generate_auxiliary_data(df_test)
df_test_aux

Unnamed: 0,coord,dist_meters_to_nearest_mrt,nearest_mrt_code,nearest_mrt_line,dist_meters_to_nearest_mall,nearest_mall_index,dist_meters_to_nearest_commercial_centre,nearest_commercial_centre_index,dist_meters_to_nearest_primary_school,nearest_primary_school_index,dist_meters_to_nearest_secondary_school,nearest_secondary_school_index
0,"(1.3443339, 103.8786904)",261.672594,cc12,cc,985.874655,110,1044.920854,31,341.056419,88,343.615356,71
1,"(1.3802812, 103.9438781)",1001.264290,ew1,ew,281.936655,86,1435.511147,15,1078.101901,93,1396.610341,55
2,"(1.2946685, 103.8500737)",251.350375,cc2,cc,212.685809,7,157.232207,37,343.739518,22,562.723759,109
3,"(1.3731198, 103.74609393885731)",1367.323437,ns4,ns,501.765017,133,4504.563672,1,702.141804,34,1062.566045,122
4,"(1.34146785, 103.8490475)",273.515978,ns18,ns,1009.165929,21,985.918261,32,418.716281,79,360.715437,9
...,...,...,...,...,...,...,...,...,...,...,...,...
6995,"(1.24953425, 103.84428115577713)",2992.831027,ew15,ew,2885.355551,142,4272.197274,0,2946.737990,20,3861.603350,29
6996,"(1.3324923, 103.8000035)",389.150989,dt7,dt,965.822466,156,3081.254261,28,743.925796,126,322.085215,79
6997,"(1.29410615, 103.83673469533848)",729.462991,ns23,ns,532.653630,17,1527.283142,37,1444.051119,3,866.888397,89
6998,"(1.30373625, 103.91143827379192)",1920.516380,ew6,ew,729.699795,76,2555.480675,30,147.133722,148,749.160547,25


### Save the results

In [8]:
# Write both feature tables next to the raw data. The target folder comes from
# `data_dir` (same location as before) rather than a second hard-coded
# "raw_data" literal, so moving the data folder only needs one change.
df_train_aux.to_csv(Path(data_dir, "train_auxiliary_data.csv"))
df_test_aux.to_csv(Path(data_dir, "test_auxiliary_data.csv"))