# Extracting features from additional datasets and merging with the main dataset


In [1]:
import pandas as pd
import scipy.spatial as sp
from numpy import inf
import numpy as np
from itertools import takewhile

In [3]:
# Main Airbnb listings table; the additional datasets are matched against its rows
df = pd.read_csv(filepath_or_buffer="data_air/AB_data_clean.csv")

In [2]:
#Custom function to get list of elements from the additional datasets, for each entry in the main dataset
def k_neighbours(main_dataframe_row, second_dataframe, k, radius_meters, tree=None):
    """Return the entries of an additional dataset that lie near one main-dataset row.

    Parameters
    ----------
    main_dataframe_row : mapping-like (e.g. a pandas Series or dict)
        Row of the main dataset; 'latitude' and 'longitude' are read with ``.get``.
    second_dataframe : pandas.DataFrame
        Additional dataset with 'latitude' and 'longitude' columns. It is only used
        to build the lookup tree when ``tree`` is not supplied.
    k : int
        Maximum number of neighbours to return.
    radius_meters : float
        Search radius in metres, converted with 1 m ~ 0.000009 degree.
    tree : scipy.spatial.cKDTree, optional
        Prebuilt tree over ``second_dataframe[['latitude', 'longitude']]``. Building
        the tree is the expensive step, so callers that query many rows against the
        same dataset should build it once and pass it in.

    Returns
    -------
    (list of int, list of float)
        Positional row numbers into the second dataset (nearest first) and the
        matching distances in metres, rounded to one decimal.

    Notes
    -----
    Distances are Euclidean in raw (latitude, longitude) degree space, so the metre
    conversion is approximate: away from the equator a degree of longitude covers
    less ground than a degree of latitude.
    """
    if tree is None:
        if second_dataframe.shape[0] == 0:
            # Nothing to search; avoid building/querying an empty tree.
            return [], []
        # https://gis.stackexchange.com/questions/382899/getting-scipy-ckdtree-to-return-everything-with-a-given-meter-radius
        tree = sp.cKDTree(second_dataframe[['latitude', 'longitude']])

    # Only the k nearest hits are ever returned, so never ask the tree for more.
    n_neighbours = min(int(k), tree.n)
    if n_neighbours < 1:
        return [], []

    #  0.1 m approximately equals to 0.0000009 degree.
    upper_bound_degrees = 10 * 0.0000009 * radius_meters
    point = [main_dataframe_row.get('latitude'), main_dataframe_row.get('longitude')]
    dist, idx = tree.query(point, n_neighbours, distance_upper_bound=upper_bound_degrees)

    # query() squeezes its output to scalars when a single neighbour is requested.
    dist = np.atleast_1d(dist)
    idx = np.atleast_1d(idx)

    # Missing neighbours (fewer than requested inside the radius) come back as inf.
    within_radius = dist != inf
    idx = idx[within_radius]
    dist = np.round(dist[within_radius] / 0.000009, 1)
    return idx.tolist(), dist.tolist()

We now have three additional datasets, each with a ``latitude`` and ``longitude`` column for each data entry. To convert them into information that can be used by the main dataset, we did the following for each dataset: we used cKDTree from scipy to build a spatial lookup object, then queried it with the location of each Airbnb listing and obtained a list with information on all the trees/rats/touristic places within a radius of 2.5km of that listing. With some extra processing, we obtained three counts per listing: the number of trees/rats/tourist places within 0.5, 1 and 2.5km.

## Extract features from Trees dataset

In [4]:
# Trees dataset; its 'latitude'/'longitude' columns feed k_neighbours
df_trees = pd.read_csv(filepath_or_buffer="data_trees/trees_data_clean.csv")

<font color='darkred'>Note:</font> The cell below is disabled to avoid overwriting files, and because it takes 2+ hours to run

In [None]:
#DO NOT EXECUTE
#df_test = df
#distances_trees = df_test.apply(lambda x: pd.Series(k_neighbours(x, df_trees, round(df_trees.shape[0]), 2500), index=['df_trees index_2500m', 'trees distance_2500m']), axis=1)
#distances_trees['number_of_trees_2500m'] = distances_trees['trees distance_2500m'].apply(lambda x: len(x))

#distances_trees['number_of_trees_1000m'] = distances_trees['trees distance_2500m'].apply(lambda x: len(list(takewhile(lambda y: y<1000, x))))
#distances_trees['number_of_trees_500m'] = distances_trees['trees distance_2500m'].apply(lambda x: len(list(takewhile(lambda y: y<500, x))))

#distances_trees.drop(['df_trees index_2500m', 'trees distance_2500m'], axis = 1).to_csv('data_trees/trees_distances_simple.csv')

In [20]:
# Read the tree counts saved by the disabled cell above instead of recomputing them
distances_trees = pd.read_csv("data_trees/trees_distances_simple.csv").loc[
    :, ['number_of_trees_2500m', 'number_of_trees_1000m', 'number_of_trees_500m']
]
display(distances_trees.head(2))
len(distances_trees)

Unnamed: 0,number_of_trees_2500m,number_of_trees_1000m,number_of_trees_500m
0,5491,1037,321
1,3611,296,46


48713

## Extract features from Rats dataset

In [9]:
# Rats dataset; its 'latitude'/'longitude' columns feed k_neighbours
df_rats = pd.read_csv(filepath_or_buffer="data_rats/rats_data_clean.csv")

<font color='darkred'>Note:</font> The cell below is disabled to avoid overwriting files

In [None]:
#DO NOT EXECUTE
#df_test = df
#distances_rats = df_test.apply(lambda x: pd.Series(k_neighbours(x, df_rats, round(df_rats.shape[0]), 2500), index=['df_rats index_2500m', 'rats distance_2500m']), axis=1)
#distances_rats['number_of_rats_2500m'] = distances_rats['rats distance_2500m'].apply(lambda x: len(x))

#distances_rats['number_of_rats_1000m'] = distances_rats['rats distance_2500m'].apply(lambda x: len(list(takewhile(lambda y: y<1000, x))))
#distances_rats['number_of_rats_500m'] = distances_rats['rats distance_2500m'].apply(lambda x: len(list(takewhile(lambda y: y<500, x))))

#distances_rats.drop(['df_rats index_2500m', 'rats distance_2500m'], axis = 1).to_csv('data_rats/rats_distances_simple.csv')

In [22]:
# Read the rat counts saved by the disabled cell above instead of recomputing them
distances_rats = pd.read_csv("data_rats/rats_distances_simple.csv").loc[
    :, ['number_of_rats_2500m', 'number_of_rats_1000m', 'number_of_rats_500m']
]
display(distances_rats.head(2))
len(distances_rats)

Unnamed: 0,number_of_rats_2500m,number_of_rats_1000m,number_of_rats_500m
0,456,57,8
1,448,54,5


48713

## Extract features from Tourist Places dataset

In [14]:
# Tourist places dataset; its 'latitude'/'longitude' columns feed k_neighbours
df_places = pd.read_csv(filepath_or_buffer="data_places/places_data_clean.csv")

<font color='darkred'>Note:</font> The cell below is disabled to avoid overwriting files

In [None]:
#DO NOT EXECUTE
#df_test = df
#distances_places = df_test.apply(lambda x: pd.Series(k_neighbours(x, df_places, round(df_places.shape[0]), 2500), index=['df_places index_2500m', 'places distance_2500m']), axis=1)
#distances_places['number_of_places_2500m'] = distances_places['places distance_2500m'].apply(lambda x: len(x))

#distances_places['number_of_places_1000m'] = distances_places['places distance_2500m'].apply(lambda x: len(list(takewhile(lambda y: y<1000, x))))
#distances_places['number_of_places_500m'] = distances_places['places distance_2500m'].apply(lambda x: len(list(takewhile(lambda y: y<500, x))))

#distances_places.drop(['df_places index_2500m', 'places distance_2500m'], axis = 1).to_csv('data_places/places_distances_simple.csv')

In [24]:
# Read the tourist-place counts saved by the disabled cell above instead of recomputing them
distances_places = pd.read_csv("data_places/places_distances_simple.csv").loc[
    :, ['number_of_places_2500m', 'number_of_places_1000m', 'number_of_places_500m']
]
display(distances_places.head(2))
len(distances_places)

Unnamed: 0,number_of_places_2500m,number_of_places_1000m,number_of_places_500m
0,8,0,0
1,75,29,4


48713

## Merging

Then we merged all the new features into a new dataset, and afterwards we merged them with the main dataset. 

In [28]:
# Combine the per-listing counts from the three additional datasets, then attach them
# to the Airbnb listings. Both joins align rows on the DataFrame index.
distances_join = distances_places.join([distances_rats, distances_trees], how = 'inner')
df_join = df.join(distances_join, how = 'inner')
display(df_join.head(2))
print("Length of AirBnB dataframe: ", df.shape[0])
# Report the joined frame itself (not distances_join) so that any rows dropped by the
# inner join show up as a difference between the two printed lengths.
print("Length after joining with distances(trees, rats and places) dataset: ", df_join.shape[0])

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,room_type_Shared room,number_of_places_2500m,number_of_places_1000m,number_of_places_500m,number_of_rats_2500m,number_of_rats_1000m,number_of_rats_500m,number_of_trees_2500m,number_of_trees_1000m,number_of_trees_500m
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,...,0.0,8,0,0,456,57,8,5491,1037,321
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,...,0.0,75,29,4,448,54,5,3611,296,46


Length of AirBnB dataframe:  48713
Length after joining with distances(trees, rats and places) dataset:  48713


<font color='darkred'>Note:</font> The cell below is disabled to avoid overwriting files

In [29]:
#df_join.to_csv("joined_data.csv")