In [27]:
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from shapely.geometry import Point
import pickle
import time

# First define some functions

In [2]:
# Find hrefs in main search page

def generate_links(response):
    """Return the offer URLs found on a search-results page.

    Parameters
    ----------
    response : object with a ``text`` attribute holding the page HTML
        (here: the result of ``requests.get`` on the search URL).

    Returns
    -------
    list
        The ``href`` of the first ``<a>`` tag inside every element with the
        ``offer-item-details`` class, excluding the first three elements.
    """
    page = BeautifulSoup(response.text, 'html.parser')

    # The first three entries are promoted offers, so they are sliced away.
    regular_offers = page.find_all(class_='offer-item-details')[3:]

    return [entry.find('a').attrs['href'] for entry in regular_offers]

In [3]:
# Find data in single offer's page and return as a json

def process_offer_link(link, timeout=30):
    """Download a single offer page and return its advert data.

    Parameters
    ----------
    link : str
        URL of the offer page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that one
        unresponsive page cannot block the whole scraping loop.

    Returns
    -------
    dict
        The ``initialProps -> data -> advert`` part of the JSON embedded in
        the page element with id ``server-app-state``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain the ``server-app-state`` element.
    """
    # Get full html page; fail loudly instead of parsing an error page
    offer_response = requests.get(link, timeout=timeout)
    offer_response.raise_for_status()

    # Convert to soup
    offer_soup = BeautifulSoup(offer_response.text, 'html.parser')

    # The offer details are embedded in the page as a JSON document
    state_tag = offer_soup.find(id="server-app-state")
    if state_tag is None:
        raise ValueError("No 'server-app-state' element found in {}".format(link))

    apartament_data = json.loads(state_tag.text)['initialProps']['data']['advert']

    return apartament_data

In [4]:
# Extract relevant data from json

def collect_data_from_json(apartament_data):
    """Flatten one advert dictionary into a feature dict and a photo list.

    Parameters
    ----------
    apartament_data : dict
        Advert data as returned by ``process_offer_link``. The keys read here
        are ``price``, ``characteristics``, ``dateModified``, ``location``
        and ``photos``.

    Returns
    -------
    tuple
        ``(parsed_data, photos)`` where ``parsed_data`` maps column names to
        values and ``photos`` is the list of thumbnail links.
    """
    # Price goes first so that it becomes the first column later on
    parsed_data = {'Cena': apartament_data['price']['value']}

    # Every characteristic becomes its own entry, keyed by its label
    parsed_data.update(
        (entry['label'], entry['value_translated'])
        for entry in apartament_data['characteristics']
    )

    # Date of last edit
    parsed_data['Data ogłoszenia'] = apartament_data['dateModified']

    # Address
    location = apartament_data['location']
    parsed_data['Adres'] = location['address']

    # Geolocation: a point is stored only when the reported radius is 0,
    # otherwise the location is left as NaN.
    # The point is built as (latitude, longitude), i.e. x holds the latitude.
    coordinates = location['coordinates']
    latitude = coordinates['latitude']
    longitude = coordinates['longitude']
    radius = coordinates['radius']
    if radius == 0:
        parsed_data['Lokalizacja'] = Point(latitude, longitude)
    else:
        parsed_data['Lokalizacja'] = np.nan

    # Links to small photos
    photos = [photo['thumbnail'] for photo in apartament_data['photos']]

    return parsed_data, photos

# Now let's get scraping!
## `Skip these steps if you want to use the pickle files for apartment data`

In [24]:
# Search results page: flats for rent in Warsaw (path /wynajem/mieszkanie/warszawa/).
# Query parameters, URL-decoded:
#   search[filter_float_price:to]=3000, search[filter_enum_rooms_num][0]=2,
#   search[description]=1, search[city_id]=26, nrAdsPerPage=72
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# the notebook early instead of silently parsing an error page.
response = requests.get('https://www.otodom.pl/wynajem/mieszkanie/warszawa/?search%5Bfilter_float_price%3Ato%5D=3000&search%5Bfilter_enum_rooms_num%5D%5B0%5D=2&search%5Bdescription%5D=1&search%5Bcity_id%5D=26&nrAdsPerPage=72', timeout=30)
response.raise_for_status()

In [70]:
# Collect the links to the individual offers listed on the search page.
# (The former debugging loop over the links did nothing and was removed;
# evaluate `offer_links` in a cell to inspect them.)
offer_links = generate_links(response)

In [29]:
# Download the offers one at a time. After each download the loop pauses for
# 2 seconds so that the server is not overloaded.
apartaments_data = []
for i, link in enumerate(offer_links):
    offer_json = process_offer_link(link)
    apartaments_data.append(offer_json)
    print("Processed link number {}".format(i))
    time.sleep(2)

Processed link number 0
Processed link number 1
Processed link number 2
Processed link number 3
Processed link number 4
Processed link number 5
Processed link number 6
Processed link number 7
Processed link number 8
Processed link number 9
Processed link number 10
Processed link number 11
Processed link number 12
Processed link number 13
Processed link number 14
Processed link number 15
Processed link number 16
Processed link number 17
Processed link number 18
Processed link number 19
Processed link number 20
Processed link number 21
Processed link number 22
Processed link number 23
Processed link number 24
Processed link number 25
Processed link number 26
Processed link number 27
Processed link number 28
Processed link number 29
Processed link number 30
Processed link number 31
Processed link number 32
Processed link number 33
Processed link number 34
Processed link number 35
Processed link number 36
Processed link number 37
Processed link number 38
Processed link number 39
Processed 

In [30]:
# Persist the scraped offers and their links so the scraping step can be skipped later
pickle_targets = (
    ('apartaments_data.pkl', apartaments_data),
    ('offer_links.pkl', offer_links),
)
for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

# Load pickle files
## `Start here if you want to use pickle files for apartaments_data`

In [71]:
# Restore the scraped offers and their links from disk.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open('apartaments_data.pkl', 'rb') as data_file:
    apartaments_data = pickle.load(data_file)

with open('offer_links.pkl', 'rb') as links_file:
    offer_links = pickle.load(links_file)

In [72]:
# Turn every offer into a one-row DataFrame and keep its photo links aside.
# Offers may expose different characteristics, so each frame can have its own columns.
df_list = []
photos_list = []
for apartament_data in apartaments_data:
    parsed_data, photos = collect_data_from_json(apartament_data)
    photos_list.append(photos)

    # Keys become the column names, values become the single row
    single_offer = pd.DataFrame(
        [list(parsed_data.values())],
        columns=list(parsed_data.keys()),
    )
    df_list.append(single_offer)

In [73]:
# Concatenate all one-row DataFrames into a single table.
# sort=False keeps the columns in order of first appearance;
# ignore_index=True numbers the rows 0..n-1.
df = pd.concat(df_list, sort=False, ignore_index=True)
df.head()

Unnamed: 0,Cena,Czynsz - dodatkowo,Kaucja,Powierzchnia,Liczba pokoi,Rodzaj zabudowy,Piętro,Liczba pięter,Materiał budynku,Okna,Ogrzewanie,Rok budowy,Stan wykończenia,Dostępne od,Data ogłoszenia,Adres,Lokalizacja
0,2200,1 zł,2 200 zł,45 m²,2,blok,1,6.0,cegła,drewniane,miejskie,2001.0,do zamieszkania,2018-12-01,2019-12-23 23:56:40,"Warszawa, Ursus, Skorosze, Chełmońskiego 2",POINT (52.1923861 20.8999832)
1,2600,,2 600 zł,42 m²,2,apartamentowiec,6,13.0,cegła,plastikowe,miejskie,2017.0,,,2019-12-23 23:46:01,"Warszawa, Wola, Mirów, Pereca",POINT (52.2336971 20.9940979)
2,2600,1 zł,2 600 zł,40 m²,2,blok,6,11.0,wielka płyta,plastikowe,miejskie,1970.0,do zamieszkania,2019-12-31,2019-12-23 23:31:01,"Warszawa, Wola, ul. Sienna 66",POINT (52.2308245 20.9964303)
3,2100,500 zł,2 100 zł,33 m²,2,blok,3,8.0,,,miejskie,2019.0,do zamieszkania,,2019-12-23 23:27:56,"Warszawa, Praga-Północ, ul. Markowska",POINT (52.25187321237169 21.04533585291748)
4,2400,1 zł,2 400 zł,38 m²,2,blok,2,,pustak,plastikowe,miejskie,,do zamieszkania,2019-12-31,2019-12-23 23:15:37,"Warszawa, Targówek, ul. Poborzańska 39",POINT (52.29063619316574 21.03143987940674)


In [74]:
# Peek at the first thumbnail link collected for the first offer
photos_list[0][0]

'https://apollo-ireland.akamaized.net/v1/files/eyJmbiI6Ind5d3VhOHQ0ZGMwZTItQVBMIiwidyI6W3siZm4iOiJqMWozbzEzbTZiZ24xLUFQTCIsInMiOiIxNCIsInAiOiIxMCwtMTAiLCJhIjoiMCJ9XX0.AWatftNwFutPZlmSYXl5qhRcIlfQFwK9ZuqrdBPBnbU/image;s=184x138;q=80'

# Data collecting completed. Now start doing something smart

## First add the additional rent (`Czynsz - dodatkowo`) to the price (`Cena`) in order to get the real price (`Cena rzeczywista`)

In [75]:
def notNan(a):
    """Return the result of comparing ``a`` with itself.

    NaN is the value that is not equal to itself, so the comparison is
    false for NaN and true for ordinary values.
    """
    equals_itself = (a == a)
    return equals_itself

In [76]:
def real_price(row):
    """Return the total price of an offer as an integer.

    The total is the base price (``'Cena'``) plus the additional rent
    (``'Czynsz - dodatkowo'``) when the latter is present.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'Cena'`` and ``'Czynsz - dodatkowo'``.

    Returns
    -------
    int
        Base price plus the whole-number part of the additional rent. When
        the additional rent is NaN or contains no digits, only the base price
        is returned.
    """
    base_price = int(row['Cena'])
    extra_rent = row['Czynsz - dodatkowo']

    if not notNan(extra_rent):
        return base_price

    # Values look like '500 zł' or '2 200 zł'. Only the part before a decimal
    # comma is used; joining every digit would turn '450,50 zł' into 45050.
    whole_part = str(extra_rent).split(',', 1)[0]
    digits = ''.join(c for c in whole_part if c.isdigit())

    # Text without any digit carries no amount, so nothing is added.
    if not digits:
        return base_price

    return base_price + int(digits)

In [77]:
# Drop a previously computed column first so that this cell can be re-run safely
if 'Cena rzeczywista' in df.columns:
    df = df.drop(columns='Cena rzeczywista')

# Place the real price at column position 2
real_prices = df.apply(real_price, axis=1)
df.insert(2, "Cena rzeczywista", real_prices)
df.head()

Unnamed: 0,Cena,Czynsz - dodatkowo,Cena rzeczywista,Kaucja,Powierzchnia,Liczba pokoi,Rodzaj zabudowy,Piętro,Liczba pięter,Materiał budynku,Okna,Ogrzewanie,Rok budowy,Stan wykończenia,Dostępne od,Data ogłoszenia,Adres,Lokalizacja
0,2200,1 zł,2201,2 200 zł,45 m²,2,blok,1,6.0,cegła,drewniane,miejskie,2001.0,do zamieszkania,2018-12-01,2019-12-23 23:56:40,"Warszawa, Ursus, Skorosze, Chełmońskiego 2",POINT (52.1923861 20.8999832)
1,2600,,2600,2 600 zł,42 m²,2,apartamentowiec,6,13.0,cegła,plastikowe,miejskie,2017.0,,,2019-12-23 23:46:01,"Warszawa, Wola, Mirów, Pereca",POINT (52.2336971 20.9940979)
2,2600,1 zł,2601,2 600 zł,40 m²,2,blok,6,11.0,wielka płyta,plastikowe,miejskie,1970.0,do zamieszkania,2019-12-31,2019-12-23 23:31:01,"Warszawa, Wola, ul. Sienna 66",POINT (52.2308245 20.9964303)
3,2100,500 zł,2600,2 100 zł,33 m²,2,blok,3,8.0,,,miejskie,2019.0,do zamieszkania,,2019-12-23 23:27:56,"Warszawa, Praga-Północ, ul. Markowska",POINT (52.25187321237169 21.04533585291748)
4,2400,1 zł,2401,2 400 zł,38 m²,2,blok,2,,pustak,plastikowe,miejskie,,do zamieszkania,2019-12-31,2019-12-23 23:15:37,"Warszawa, Targówek, ul. Poborzańska 39",POINT (52.29063619316574 21.03143987940674)


# Import interactive maps module and geocoding library

In [78]:
import geopandas as gpd
from geopandas.tools import geocode
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

In [79]:
# Function for displaying the map
def embed_map(m, file_name):
    """Save the map to ``file_name`` and return an IFrame pointing at that file.

    Parameters
    ----------
    m : object with a ``save(path)`` method (here: a folium map)
    file_name : str
        Path of the HTML file to write.

    Returns
    -------
    IPython.display.IFrame
        Frame of full width and 500px height showing the saved file.
    """
    from IPython.display import IFrame

    m.save(file_name)
    frame = IFrame(file_name, width='100%', height='500px')
    return frame

In [80]:
# Function for encoding geolocation from name
def encode(name):
    """Geocode ``name`` with the Nominatim provider.

    Returns the ``geometry`` attribute of the geocoding result. If anything
    goes wrong, an error message is printed and ``None`` is returned.
    """
    try:
        geocoded = geocode(name, provider='nominatim')
        geometry = geocoded.geometry
    except Exception:
        # Best effort: report the failing name and let the caller skip it
        print("Error while encoding '{}'!".format(name))
        geometry = None
    return geometry

# Encode locations of metro stations
## `Or use pickle file for that`

In [81]:
# Download and parse the Wikipedia page whose table lists all metro stations in Warsaw
metro_list_url = 'https://pl.wikipedia.org/wiki/Lista_stacji_metra_w_Warszawie'
response = requests.get(metro_list_url)
soup = BeautifulSoup(response.text, 'html.parser')

In [82]:
# Load the station table from the pickle if it was already geocoded.
# NOTE: unpickling can execute arbitrary code - only load a metro.pkl you created yourself.
if os.path.isfile("metro.pkl"):
    with open('metro.pkl', 'rb') as f:
        metro = pickle.load(f)
else:
    # Geocode every station listed in the first table of the page.
    # Rows are collected in a plain list and the DataFrame is built once,
    # instead of growing it row by row with DataFrame.append.
    station_records = []

    # [1:-1] skips the first and the last row of the table
    for row in soup.find("table").find_all("tr")[1:-1]:
        name = ("Metro " + row.find_all("td")[1].text).replace('\n','')
        loc = encode(name + " Warsaw")
        if loc is not None:
            # Stored as Point(latitude, longitude), the same order used for the offers
            station_records.append({'Nazwa': name, 'Lokalizacja': Point(loc.y, loc.x)})

    metro = pd.DataFrame(station_records, columns=['Nazwa', 'Lokalizacja'])

    # Dump to pickle once, after all stations were processed, so that an
    # interrupted run cannot leave a partial table that is later loaded as
    # if it were complete. An empty result is not cached.
    if station_records:
        with open('metro.pkl', 'wb') as f:
            pickle.dump(metro, f)

# Plot locations of metro stations 
There are a few errors (`Metro Ratusz Arsenał` and `Metro Ursynów` weren't geocoded properly)

In [83]:
# Base map for the metro stations
m_1 = folium.Map(location=[52.2323,21.0000], tiles='cartodbpositron', zoom_start=11)

# One marker per station, labelled with the station name.
# Points hold the latitude in x and the longitude in y.
for station_name, station_point in zip(metro['Nazwa'], metro['Lokalizacja']):
    Marker((station_point.x, station_point.y), popup=station_name).add_to(m_1)

# Display the map
embed_map(m_1, 'm_1.html')

# Plot locations of houses

In [84]:
# Base map for the offers
m_2 = folium.Map(location=[52.2323,21.0000], tiles='cartodbpositron', zoom_start=11)

# One marker per offer with a known location, labelled with its row index
for idx, house_point in df['Lokalizacja'].items():
    if not notNan(house_point):
        # No exact location available for this offer
        continue
    Marker((house_point.x, house_point.y), popup=idx).add_to(m_2)

# Display the map
embed_map(m_2, 'm_2.html')

# Define functions for calculating distance on Earth

In [85]:
# Using geopy
import geopy.distance

def dist(a, b):
    """Return the geodesic distance between two points in kilometres.

    Both arguments must expose ``x`` and ``y`` attributes; they are passed to
    geopy as ``(x, y)`` pairs. In this notebook x holds the latitude and y the
    longitude.
    """
    start = (a.x, a.y)
    end = (b.x, b.y)
    return geopy.distance.geodesic(start, end).km

In [86]:
# Using geopandas
def dist_2(a, b, metric_epsg=2180):
    """Return the planar distance between two points in kilometres.

    Both points are re-projected from WGS84 (EPSG:4326) to a projected,
    metre-based coordinate system before the distance is measured.

    Parameters
    ----------
    a, b : objects with ``x`` and ``y`` attributes
        In this notebook x holds the latitude and y the longitude, so the
        coordinates are swapped to the (longitude, latitude) order when the
        points are rebuilt for re-projection.
    metric_epsg : int, optional
        EPSG code of the projected CRS. Defaults to 2180 (ETRS89 / Poland
        CS92), which is defined for Poland. The previously hard-coded 3310 is
        NAD83 / California Albers and badly distorts distances around Warsaw.

    Returns
    -------
    The result of ``GeoSeries.distance`` divided by 1000 (a one-element
    series, as before).
    """
    def _project(point):
        # Rebuild the point as (longitude, latitude) and project it to metres.
        # The {"init": ...} form is kept from the original code.
        wgs84 = gpd.GeoSeries(Point(point.y, point.x), crs=({"init":"epsg:4326"}))
        return wgs84.to_crs(epsg=metric_epsg)

    projected_a = _project(a)
    projected_b = _project(b)
    return projected_a.distance(projected_b) / 1000

# Find distance to nearest metro station for all valid houses

In [89]:
# Station names and locations do not change between offers, so they are
# extracted once instead of iterating over the `metro` frame for every offer.
station_names = list(metro['Nazwa'])
station_points = list(metro['Lokalizacja'])

# One tuple per offer with a known location:
# (offer link, real price, distance to the nearest station in km, station name)
output = []

for idx, house in df.iterrows():
    house_point = house['Lokalizacja']
    if not notNan(house_point):
        # Offers without an exact location cannot be measured
        continue

    distances_km = [dist(house_point, station_point) for station_point in station_points]
    nearest = int(np.argmin(distances_km))

    # NOTE(review): offer_links[idx] assumes `df` rows are in the same order
    # as `offer_links` - confirm if either of them is filtered or reordered.
    output.append((
        offer_links[idx],
        house['Cena rzeczywista'],
        round(distances_km[nearest], 2),
        station_names[nearest],
    ))

# Offers ordered by real price, cheapest first. The previous loop over the
# sorted list did nothing (its print was commented out); the sorted list is
# kept under a name instead, e.g. to print it:
#   for link, price, km, station in offers_by_price:
#       print("Oferta: {}\n, cena: {} zł, {} km od stacji {}\n".format(link, price, km, station))
offers_by_price = sorted(output, key=lambda offer: offer[1])