In [1]:
from moredata.datasets import get_path
import pandas as pd
import warnings

# Suppress every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Load the bundled Airbnb Berlin sample and keep only the listings that have
# both coordinates; a row missing either latitude or longitude is dropped.
df = pd.read_csv(get_path("airbnb-berlin-main")).dropna(
    subset=["latitude", "longitude"]
)

# Dump the filtered listings as a JSON array of records; the JsonData example
# further down reads this same file back.
df.to_json("./data/airbnb-berlin.json", orient="records")



In [2]:
# Positional column names for the listings frame: the list must contain exactly
# one entry per existing column, in the existing column order.
# NOTE(review): "iid"/"nname" are presumably chosen so they do not clash with
# the "id"/"name" columns that show up in the enriched output - confirm.
listing_columns = [
    "iid",
    "nname",
    "neighbourhood",
    "neighbourhood_cleansed",
    "room_type",
    "price",
    "latitude",
    "longitude",
]
df.columns = listing_columns

## Using JsonData

In [3]:
import moredata

# Inputs for the JSON-based enrichment: the listings dump and the places file.
listings_json = "./data/airbnb-berlin.json"
places_files = ["./data/tourism.csv.gz"]

# Wrap the JSON dump so moredata can iterate over it with parse_document.
data = moredata.models.JsonData(
    data_file=listings_json,
    parser=moredata.parser.parse_document,
)

# Connector configured with radius=50 and geometry_intersected=True.
# NOTE(review): radius is presumably in metres - confirm against moredata docs.
osm_enricher = moredata.enricher.osm.OSMPlacesConnector(
    files=places_files,
    radius=50,
    geometry_intersected=True,
)

data_enriched = osm_enricher.enrich(data)

# Persist the enriched records.
# NOTE(review): the third positional argument (100000) is presumably a
# batch/chunk size for the writer - confirm.
moredata.utils.write_json_generator_to_json(
    "./data/airbnb-berlin-enriched", data_enriched, 100000
)

## Using GeoPandas

In [4]:
import moredata
import geopandas

gdf = geopandas.GeoDataFrame(
    df, geometry=geopandas.points_from_xy(df.longitude, df.latitude)
)

data = moredata.models.GeopandasData.from_geodataframe(gdf)

osm_enricher = moredata.enricher.osm.OSMPlacesConnector(
    files=["./data/tourism.csv.gz"], radius=50, geometry_intersected=True
)

data_enriched = osm_enricher.enrich(data)

In [5]:
data_enriched = data_enriched.data

In [6]:
# Listings for which an intersected geometry was recorded, restricted to the
# original listing columns plus that geometry.
left_cols = list(gdf.columns)
cols = [*left_cols, "geometry_intersected"]
has_intersection = data_enriched["geometry_intersected"].notna()
data_enriched.loc[has_intersection, cols]

Unnamed: 0,iid,nname,neighbourhood,neighbourhood_cleansed,room_type,price,latitude,longitude,geometry,geometry_intersected
35,54952,Riverfront Panorama,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),POINT (13.38828 52.52270)
35,54952,Riverfront Panorama,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),POINT (13.38826 52.52279)
35,54952,Riverfront Panorama,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),POINT (13.38777 52.52249)
52,82893,Apartment 2 Mitte,,Brunnenstr. Süd,Entire home/apt,$75.00,52.53200,13.38285,POINT (1489772.047 6896895.447),"POLYGON ((13.38156 52.53280, 13.38136 52.53273..."
52,82893,Apartment 2 Mitte,,Brunnenstr. Süd,Entire home/apt,$75.00,52.53200,13.38285,POINT (1489772.047 6896895.447),POINT (13.38241 52.53200)
...,...,...,...,...,...,...,...,...,...,...
19781,48355629,Modern & cozy room for 4 near to subway central,,Schöneberg-Nord,Private room,$53.00,52.49410,13.35946,POINT (1487168.284 6889962.909),POINT (13.35929 52.49416)
19781,48355629,Modern & cozy room for 4 near to subway central,,Schöneberg-Nord,Private room,$53.00,52.49410,13.35946,POINT (1487168.284 6889962.909),POINT (13.35925 52.49417)
19798,48398660,Loft Studio in Berlin Kreuzberg *Dachgeschoss*,,Südliche Friedrichstadt,Entire home/apt,$49.00,52.50096,13.38969,POINT (1490533.473 6891217.274),POINT (13.38973 52.50104)
19848,48579325,New Exclusiv Appartment with Balcony downtown,,Alexanderplatz,Entire home/apt,$80.00,52.51145,13.41549,POINT (1493405.516 6893135.769),POINT (13.41569 52.51133)


In [7]:
# Listings without any intersected geometry (all columns kept).
no_intersection = data_enriched["geometry_intersected"].isna()
data_enriched.loc[no_intersection]

Unnamed: 0,iid,nname,neighbourhood,neighbourhood_cleansed,room_type,price,latitude,longitude,geometry,index_right,...,site,photography,not:historic,not:name,cemetery,wheelchair:source,electrified,gauge,railway,geometry_intersected
0,1944,bright & airy Pberg/Mitte 3 months or more,"Berlin, Germany",Brunnenstr. Nord,Private room,$20.00,52.54425,13.39749,POINT (1491401.765 6899137.455),,...,,,,,,,,,,
1,2015,Berlin-Mitte Value! Quiet courtyard/very central,"Berlin, Germany",Brunnenstr. Süd,Entire home/apt,$59.00,52.53454,13.40256,POINT (1491966.155 6897360.269),,...,,,,,,,,,,
2,3176,Fabulous Flat in great Location,"Berlin, Germany",Prenzlauer Berg Südwest,Entire home/apt,$90.00,52.53500,13.41758,POINT (1493638.173 6897444.452),,...,,,,,,,,,,
3,3309,BerlinSpot Schöneberg near KaDeWe,"Berlin, Germany",Schöneberg-Nord,Private room,$29.00,52.49885,13.34906,POINT (1486010.562 6890831.435),,...,,,,,,,,,,
4,6883,Stylish East Side Loft in Center with AC & 2 b...,"Berlin, Germany",Frankfurter Allee Süd FK,Entire home/apt,$79.00,52.51171,13.45477,POINT (1497778.145 6893183.326),,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19854,48597066,Rollstuhlgeeignete Wohnung für zwei in Hönow M...,,Hellersdorf-Nord,Entire home/apt,$22.00,52.54192,13.60749,POINT (1514778.858 6898710.967),,...,,,,,,,,,,
19855,48599795,Lichtdurchflutete Wohnung,,Tempelhof,Entire home/apt,$80.00,52.45901,13.41783,POINT (1493666.003 6883549.691),,...,,,,,,,,,,
19856,48600069,Bright Twin Capacity 1 At Mitte,,Alexanderplatz,Private room,"$1,840.00",52.51105,13.40680,POINT (1492438.149 6893062.606),,...,,,,,,,,,,
19857,48602039,One Room Apartment,,Frankfurter Allee Süd FK,Entire home/apt,$25.00,52.49805,13.46587,POINT (1499013.791 6890685.150),,...,,,,,,,,,,


In [8]:
# Display the full enriched frame: rows with and without an intersected geometry.
data_enriched

Unnamed: 0,iid,nname,neighbourhood,neighbourhood_cleansed,room_type,price,latitude,longitude,geometry,index_right,...,site,photography,not:historic,not:name,cemetery,wheelchair:source,electrified,gauge,railway,geometry_intersected
0,1944,bright & airy Pberg/Mitte 3 months or more,"Berlin, Germany",Brunnenstr. Nord,Private room,$20.00,52.54425,13.39749,POINT (1491401.765 6899137.455),,...,,,,,,,,,,
1,2015,Berlin-Mitte Value! Quiet courtyard/very central,"Berlin, Germany",Brunnenstr. Süd,Entire home/apt,$59.00,52.53454,13.40256,POINT (1491966.155 6897360.269),,...,,,,,,,,,,
2,3176,Fabulous Flat in great Location,"Berlin, Germany",Prenzlauer Berg Südwest,Entire home/apt,$90.00,52.53500,13.41758,POINT (1493638.173 6897444.452),,...,,,,,,,,,,
3,3309,BerlinSpot Schöneberg near KaDeWe,"Berlin, Germany",Schöneberg-Nord,Private room,$29.00,52.49885,13.34906,POINT (1486010.562 6890831.435),,...,,,,,,,,,,
4,6883,Stylish East Side Loft in Center with AC & 2 b...,"Berlin, Germany",Frankfurter Allee Süd FK,Entire home/apt,$79.00,52.51171,13.45477,POINT (1497778.145 6893183.326),,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19854,48597066,Rollstuhlgeeignete Wohnung für zwei in Hönow M...,,Hellersdorf-Nord,Entire home/apt,$22.00,52.54192,13.60749,POINT (1514778.858 6898710.967),,...,,,,,,,,,,
19855,48599795,Lichtdurchflutete Wohnung,,Tempelhof,Entire home/apt,$80.00,52.45901,13.41783,POINT (1493666.003 6883549.691),,...,,,,,,,,,,
19856,48600069,Bright Twin Capacity 1 At Mitte,,Alexanderplatz,Private room,"$1,840.00",52.51105,13.40680,POINT (1492438.149 6893062.606),,...,,,,,,,,,,
19857,48602039,One Room Apartment,,Frankfurter Allee Süd FK,Entire home/apt,$25.00,52.49805,13.46587,POINT (1499013.791 6890685.150),,...,,,,,,,,,,


In [9]:
# Every column of the enriched frame, as a plain Python list.
list(data_enriched.columns)

['iid',
 'nname',
 'neighbourhood',
 'neighbourhood_cleansed',
 'room_type',
 'price',
 'latitude',
 'longitude',
 'geometry',
 'index_right',
 'Unnamed: 0',
 'type',
 'id',
 'tags',
 'value',
 'key',
 'bicycle',
 'direction_northeast',
 'direction_south',
 'direction_southwest',
 'information',
 'tourism',
 'architect',
 'artwork_type',
 'heritage',
 'heritage:operator',
 'historic',
 'inscription',
 'lda:criteria',
 'loc_name',
 'material',
 'name',
 'ref:lda',
 'start_date',
 'wheelchair',
 'wikidata',
 'wikimedia_commons',
 'wikipedia',
 'addr:city',
 'addr:country',
 'addr:housenumber',
 'addr:postcode',
 'addr:street',
 'addr:suburb',
 'amenity',
 'shelter_type',
 'website',
 'brand',
 'brand:wikidata',
 'brand:wikipedia',
 'stars',
 'contact:email',
 'contact:phone',
 'contact:website',
 'operator',
 'toilets:wheelchair',
 'old_name',
 'rooms',
 'email',
 'fax',
 'image',
 'name:en',
 'name:fr',
 'name:ru',
 'opening_hours',
 'phone',
 'internet_access',
 'internet_access:fee',


In [10]:
# Skip the first two listing columns (iid, nname) and add the "key"/"value"
# columns next to the intersected geometry.
left_cols = list(gdf.columns)[2:]
cols = [*left_cols, "key", "value", "geometry_intersected"]
# Keep only rows whose index_right is set.
# NOTE(review): index_right is presumably the index of the joined place row.
has_match = data_enriched["index_right"].notna()
data_enriched.loc[has_match, cols]

Unnamed: 0,neighbourhood,neighbourhood_cleansed,room_type,price,latitude,longitude,geometry,key,value,geometry_intersected
35,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),tourism,,POINT (13.38828 52.52270)
35,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),tourism,,POINT (13.38826 52.52279)
35,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),tourism,,POINT (13.38777 52.52249)
52,,Brunnenstr. Süd,Entire home/apt,$75.00,52.53200,13.38285,POINT (1489772.047 6896895.447),tourism,,"POLYGON ((13.38156 52.53280, 13.38136 52.53273..."
52,,Brunnenstr. Süd,Entire home/apt,$75.00,52.53200,13.38285,POINT (1489772.047 6896895.447),,,POINT (13.38241 52.53200)
...,...,...,...,...,...,...,...,...,...,...
19781,,Schöneberg-Nord,Private room,$53.00,52.49410,13.35946,POINT (1487168.284 6889962.909),,,POINT (13.35929 52.49416)
19781,,Schöneberg-Nord,Private room,$53.00,52.49410,13.35946,POINT (1487168.284 6889962.909),,,POINT (13.35925 52.49417)
19798,,Südliche Friedrichstadt,Entire home/apt,$49.00,52.50096,13.38969,POINT (1490533.473 6891217.274),tourism,,POINT (13.38973 52.50104)
19848,,Alexanderplatz,Entire home/apt,$80.00,52.51145,13.41549,POINT (1493405.516 6893135.769),tourism,,POINT (13.41569 52.51133)


## Using Dask-GeoPandas

In [11]:
import dask
from distributed import Client, LocalCluster

# Export MALLOC_TRIM_THRESHOLD_=0 into the worker processes started by the nanny.
# NOTE(review): presumably so workers hand freed memory back to the OS
# promptly (see the Dask worker-memory docs) - confirm the key is still
# honoured by the installed distributed version.
dask.config.set({"distributed.nanny.environ.MALLOC_TRIM_THRESHOLD_": 0})

# Local cluster: 4 worker processes x 2 threads, each capped at 2GB.
cluster_options = {
    "n_workers": 4,
    "threads_per_worker": 2,
    "memory_limit": "2GB",
}
cluster = LocalCluster(**cluster_options)
client = Client(cluster)

# NOTE(review): none of the visible cells closes `client`/`cluster`.
# Last expression: render the client summary widget.
client

2023-05-16 11:52:50,791 - distributed.diskutils - INFO - Found stale lock file and directory '/home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-6713t43b', purging
2023-05-16 11:52:50,791 - distributed.diskutils - INFO - Found stale lock file and directory '/home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-4h4k06oo', purging
2023-05-16 11:52:50,791 - distributed.diskutils - INFO - Found stale lock file and directory '/home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-lgplyhii', purging
2023-05-16 11:52:50,792 - distributed.diskutils - INFO - Found stale lock file and directory '/home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-0cx3tfb2', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:36321/status,

0,1
Dashboard: http://127.0.0.1:36321/status,Workers: 4
Total threads: 8,Total memory: 7.45 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:34911,Workers: 4
Dashboard: http://127.0.0.1:36321/status,Total threads: 8
Started: Just now,Total memory: 7.45 GiB

0,1
Comm: tcp://127.0.0.1:35415,Total threads: 2
Dashboard: http://127.0.0.1:43109/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:35025,
Local directory: /home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-_re0ixld,Local directory: /home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-_re0ixld

0,1
Comm: tcp://127.0.0.1:44505,Total threads: 2
Dashboard: http://127.0.0.1:34235/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:46485,
Local directory: /home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-iqa6z2vb,Local directory: /home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-iqa6z2vb

0,1
Comm: tcp://127.0.0.1:36499,Total threads: 2
Dashboard: http://127.0.0.1:42361/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:41957,
Local directory: /home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-wqi0r78s,Local directory: /home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-wqi0r78s

0,1
Comm: tcp://127.0.0.1:40173,Total threads: 2
Dashboard: http://127.0.0.1:46021/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:39877,
Local directory: /home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-nt7c8_j5,Local directory: /home/gegen07/dev/open-source/more-data/examples/osm/dask-worker-space/worker-nt7c8_j5


In [12]:
import moredata
import geopandas


gdf = geopandas.GeoDataFrame(
    df, geometry=geopandas.points_from_xy(df.longitude, df.latitude)
)
data = moredata.models.DaskGeopandasData.from_geodataframe(gdf)

osm_enricher = moredata.enricher.osm.OSMPlacesConnector(
    files=["./data/tourism.csv.gz"], radius=50, geometry_intersected=True
)

data_enriched = osm_enricher.enrich(data)

In [13]:
data_enriched = data_enriched.data.compute()



In [14]:
# Skip the first two listing columns (iid, nname) and add the "key"/"value"
# columns next to the intersected geometry.
left_cols = list(gdf.columns)[2:]
cols = [*left_cols, "key", "value", "geometry_intersected"]
# Keep only rows whose index_right is set.
# NOTE(review): index_right is presumably the index of the joined place row.
has_match = data_enriched["index_right"].notna()
data_enriched.loc[has_match, cols]

Unnamed: 0,neighbourhood,neighbourhood_cleansed,room_type,price,latitude,longitude,geometry,key,value,geometry_intersected
35,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),tourism,,POINT (13.38828 52.52270)
35,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),tourism,,POINT (13.38826 52.52279)
35,"Berlin, Germany",Alexanderplatz,Private room,$45.00,52.52264,13.38810,POINT (1490356.475 6895182.792),tourism,,POINT (13.38777 52.52249)
52,,Brunnenstr. Süd,Entire home/apt,$75.00,52.53200,13.38285,POINT (1489772.047 6896895.447),tourism,,"POLYGON ((13.38156 52.53280, 13.38136 52.53273..."
54,,Brunnenstr. Süd,Entire home/apt,$79.00,52.53182,13.38230,POINT (1489710.822 6896862.508),tourism,,"POLYGON ((13.38156 52.53280, 13.38136 52.53273..."
...,...,...,...,...,...,...,...,...,...,...
19781,,Schöneberg-Nord,Private room,$53.00,52.49410,13.35946,POINT (1487168.284 6889962.909),,,POINT (13.35943 52.49412)
19781,,Schöneberg-Nord,Private room,$53.00,52.49410,13.35946,POINT (1487168.284 6889962.909),,,POINT (13.35945 52.49413)
19781,,Schöneberg-Nord,Private room,$53.00,52.49410,13.35946,POINT (1487168.284 6889962.909),,,POINT (13.35929 52.49416)
19781,,Schöneberg-Nord,Private room,$53.00,52.49410,13.35946,POINT (1487168.284 6889962.909),,,POINT (13.35925 52.49417)
