In [1]:
# 1. Imports:
import pandas as pd
import requests
from shapely.geometry import Point
import geopandas as gpd

In [2]:
# 2. Acquisition & Wrangling:

In [3]:
# 2.a) Read & transform csv

In [4]:
PATH = '../data/bicimad_stations.csv'
GEO = 'geometry.coordinates'

In [5]:
def read_csv(file_path):
    df_read_csv = pd.read_csv(PATH, sep='\t', index_col=0)
    return df_read_csv

In [6]:
read_csv_output=read_csv(PATH)

In [7]:
def string_to_coordinates(string):
    list_coordinates = [float(element) for element in string.replace("[", "").replace("]", "").replace(" ","").split(",")]
    return list_coordinates

In [8]:
def normalize_csv(df, orig_column, new_col1, new_col2):
    df[orig_column] = df[orig_column].apply(lambda row: string_to_coordinates(row))
    df[new_col1] = df[orig_column].apply(lambda row: row[0])
    df[new_col2] = df[orig_column].apply(lambda row: row[1])
    return df

In [9]:
df_bicimad=normalize_csv(read_csv_output, GEO, 'bm_longitude', 'bm_latitude')
df_bicimad.head()

Unnamed: 0,id,name,light,number,address,activate,no_available,total_bases,dock_bikes,free_bases,reservations_count,geometry.type,geometry.coordinates,bm_longitude,bm_latitude
0,1,1a - Puerta del Sol A,3,1a,Puerta del Sol nº 1,1,1,30,0,0,0,Point,"[-3.7018341, 40.4172137]",-3.701834,40.417214
1,2,1b - Puerta del Sol B,3,1b,Puerta del Sol nº 1,1,1,30,0,0,0,Point,"[-3.701602938060457, 40.41731271011562]",-3.701603,40.417313
2,3,2 - Miguel Moya,3,2,Calle Miguel Moya nº 1,1,1,24,0,0,0,Point,"[-3.7058415, 40.4205886]",-3.705842,40.420589
3,4,3 - Plaza Conde Suchil,2,3,Plaza del Conde del Valle de Súchil nº 3,1,0,18,9,9,0,Point,"[-3.7069171, 40.4302937]",-3.706917,40.430294
4,5,4 - Malasaña,1,4,Calle Manuela Malasaña nº 5,1,0,24,23,1,0,Point,"[-3.7025875, 40.4285524]",-3.702587,40.428552


In [10]:
# 2.b) Get data from API REST and transform

In [11]:
API_ENDPOINT="https://datos.madrid.es/egob"
DATASET="/catalogo/300356-0-monumentos-ciudad-madrid.json"
COLUMNS_2FIX = ["address", "location", "organization"] 

In [12]:
def get_dataset(api_endpoint, dataset):
    response = requests.get(api_endpoint+dataset)
    json_data = response.json() 
    df_dataset = pd.DataFrame(json_data["@graph"]) # the data that we need is stored in key "@graph" within json
    return df_dataset

In [13]:
get_dataset_output = get_dataset(API_ENDPOINT, DATASET)

In [18]:
def normalize_dataset(df):
    df = df.dropna() # 1 drop nulls
    for column in COLUMNS_2FIX: # 2 normalize dictionaries within columns
        df = pd.concat([df.drop(columns = [column]), df[column].apply(lambda x: pd.Series(x))], axis = 1)
    return df

In [21]:
df_monuments = normalize_dataset(get_dataset_output)
#df_monuments.info()

In [22]:
# 2.c) Build main df as cross join of bicimad stations and places of interest

In [23]:
def merge(df1, df2):
    df_merged = pd.merge(df1, df2, how="cross")
    return df_merged

In [24]:
full_dataset = merge(df_monuments, df_bicimad)

In [25]:
df_sample = full_dataset.head(792)

In [26]:
# 3. Analysis:

In [27]:
# 3.a) Calculate the distance to each bicimad station

In [28]:
def to_mercator(lat, long):
    # transform latitude/longitude data in degrees to pseudo-mercator coordinates in metres
    c = gpd.GeoSeries([Point(lat, long)], crs=4326)
    c = c.to_crs(3857)
    return c

def distance_meters(lat_start, long_start, lat_finish, long_finish):
    # return the distance in metres between to latitude/longitude pair points in degrees 
    # (e.g.: Start Point -> 40.4400607 / -3.6425358 End Point -> 40.4234825 / -3.6292625)
    start = to_mercator(lat_start, long_start)
    finish = to_mercator(lat_finish, long_finish)
    return start.distance(finish)

In [55]:
def add_distance_col(df, col_name, col_type, lat_start, long_start, lat_finish, long_finish):
    df[col_name] = df.apply(lambda row: distance_meters(row[lat_start], row[long_start],
                                                                          row[lat_finish], row[long_finish]), axis = 1)
    return df

In [56]:
df_sample = add_distance_col(df_sample, "distance", "int64", "latitude", "longitude", "bm_latitude", "bm_longitude")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df.apply(lambda row: distance_meters(row[lat_start], row[long_start],


In [82]:
df_sample.head(235)

Unnamed: 0,@id,id_x,title,relation,references,district,locality,postal-code,street-address,area,...,no_available,total_bases,dock_bikes,free_bases,reservations_count,geometry.type,geometry.coordinates,bm_longitude,bm_latitude,distance
0,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,1,30,0,0,0,Point,"[-3.7018341, 40.4172137]",-3.701834,40.417214,12152.077352
1,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,1,30,0,0,0,Point,"[-3.701602938060457, 40.41731271011562]",-3.701603,40.417313,12124.059117
2,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,1,24,0,0,0,Point,"[-3.7058415, 40.4205886]",-3.705842,40.420589,12404.627842
3,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,0,18,9,9,0,Point,"[-3.7069171, 40.4302937]",-3.706917,40.430294,12135.695695
4,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,0,24,23,1,0,Point,"[-3.7025875, 40.4285524]",-3.702587,40.428552,11742.130030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,0,24,17,7,0,Point,"[-3.7080429728445563, 40.39440607959559]",-3.708043,40.394406,14014.735591
231,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,0,24,19,4,0,Point,"[-3.64835857929032, 40.43715089959262]",-3.648359,40.437151,5841.282012
232,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,0,24,13,10,0,Point,"[-3.648958444298144, 40.44370542379883]",-3.648958,40.443705,5559.887276
233,https://datos.madrid.es/egob/catalogo/tipo/mon...,409634,A las víctimas del Holocausto,https://patrimonioypaisaje.madrid.es/sites/v/i...,https://patrimonioypaisaje.madrid.es/FrameWork...,{'@id': 'https://datos.madrid.es/egob/kos/Prov...,MADRID,,JARDIN TRES CULTURAS,,...,0,24,6,17,0,Point,"[-3.6557656204708775, 40.43965673033275]",-3.655766,40.439657,6438.264535


In [None]:
# 3.b) Group by place of interest:

In [115]:
def group_by(df, col_group, col_agg):
    df_grouped = df.groupby([col_group]).agg(min_value = (col_agg, 'min'), idxmin = (col_agg, 'idxmin')).reset_index()
    df_grouped['idxmin'] = df_grouped['idxmin'].map(df['id_y'])
    df_grouped = df_grouped.rename(columns = {'idxmin': 'id_y', 'min_value': col_agg})
    return df_grouped


In [116]:
group_by_output = group_by(df_sample, "id_x", "distance")
group_by_output.head()

Unnamed: 0,id_x,distance,id_y
0,409268,94.201295,99
1,409512,2633.107767,257
2,409634,5559.887276,238
