In [1]:
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from shapely.geometry import Point
import pickle
import time

# First, define some helper functions

In [2]:
# Find hrefs in main search page

def generate_links(response, n_promoted=3):
    """Extract offer URLs from a search-results page.

    Parameters
    ----------
    response : object with a ``text`` attribute
        HTTP response holding the HTML of the search-results page.
    n_promoted : int, optional
        Number of leading ``offer-item-details`` entries to skip. The default
        of 3 skips the promoted offers listed first.

    Returns
    -------
    list of str
        The ``href`` of the first ``<a>`` found in each remaining offer entry,
        in page order. Entries without an anchor or without an ``href`` are
        skipped instead of raising.
    """
    soup = BeautifulSoup(response.text, 'html.parser')

    offer_links = []
    for offer in soup.find_all(class_='offer-item-details')[n_promoted:]:
        anchor = offer.find('a')
        if anchor is None:
            # Entry without any link: nothing to collect
            continue
        href = anchor.attrs.get('href')
        if href:
            offer_links.append(href)

    return offer_links

In [3]:
# Find data in single offer's page and return as a json

def process_offer_link(link, timeout=30):
    """Download a single offer page and return its embedded advert data.

    Parameters
    ----------
    link : str
        URL of the offer page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on an unresponsive server.

    Returns
    -------
    The value stored under ``initialProps -> data -> advert`` in the JSON
    held by the page element with id ``server-app-state``.

    Raises
    ------
    ValueError
        If the page has no element with id ``server-app-state``.
    Also propagates whatever ``requests.get`` and
    ``Response.raise_for_status`` raise on connection or HTTP errors.
    """
    # Get full html page; fail fast on HTTP errors instead of parsing an error page
    offer_response = requests.get(link, timeout=timeout)
    offer_response.raise_for_status()

    # Convert to soup
    offer_soup = BeautifulSoup(offer_response.text, 'html.parser')

    # Find interesting data
    state_node = offer_soup.find(id="server-app-state")
    if state_node is None:
        raise ValueError("No 'server-app-state' element found in {}".format(link))

    apartament_data = json.loads(state_node.text)['initialProps']['data']['advert']

    return apartament_data

In [4]:
# Function for extracting relevant data from json

def collect_data_from_json(apartament_data):
    """Flatten one advert record into a feature dict plus its thumbnail URLs.

    Parameters
    ----------
    apartament_data : dict
        Advert record as returned by ``process_offer_link``.

    Returns
    -------
    parsed_data : dict
        In insertion order: ``'Cena'`` (price value), one entry per item of
        ``characteristics`` (label -> translated value), ``'Data ogłoszenia'``
        (``dateModified``), ``'Adres'``, then ``'φ'`` / ``'λ'`` (latitude /
        longitude).
    photos : list
        The ``thumbnail`` entry of every item in ``photos``.
    """
    # Price comes first so it ends up as the leading column
    parsed_data = {'Cena': apartament_data['price']['value']}

    # Basic infos: every characteristic becomes its own entry
    parsed_data.update(
        (factor['label'], factor['value_translated'])
        for factor in apartament_data['characteristics']
    )

    # Date of last edit
    parsed_data['Data ogłoszenia'] = apartament_data['dateModified']

    # Address
    location = apartament_data['location']
    parsed_data['Adres'] = location['address']

    # Geolocation: coordinates are kept only when the reported radius is 0,
    # otherwise both are NaN (presumably a non-zero radius marks an
    # approximate location -- confirm against the data source)
    coordinates = location['coordinates']
    latitude = coordinates['latitude']
    longitude = coordinates['longitude']
    if coordinates['radius'] == 0:
        parsed_data['φ'], parsed_data['λ'] = latitude, longitude
    else:
        parsed_data['φ'], parsed_data['λ'] = np.nan, np.nan

    # Links to small photos
    photos = [photo['thumbnail'] for photo in apartament_data['photos']]

    return parsed_data, photos

# Now let's get scraping!

In [5]:
# Load page with 72 results of apartaments with basic price < 3000 zł and 2 rooms

SEARCH_URL = 'https://www.otodom.pl/wynajem/mieszkanie/warszawa/?search%5Bfilter_float_price%3Ato%5D=3000&search%5Bfilter_enum_rooms_num%5D%5B0%5D=2&search%5Bdescription%5D=1&search%5Bcity_id%5D=26&nrAdsPerPage=72'

# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status stops here on an HTTP error instead of letting an error
# page be parsed further down.
response = requests.get(SEARCH_URL, timeout=30)
response.raise_for_status()

In [6]:
# Collect the offer links found on the search-results page
offer_links = generate_links(response)

# Show how many links were collected as a quick sanity check
len(offer_links)

In [8]:
# In order to not overload the server, wait between consecutive requests
REQUEST_DELAY_SECONDS = 2

apartaments_data = []
for i, link in enumerate(offer_links):
    apartaments_data.append(process_offer_link(link))
    print("Processed link number {}".format(i))
    # Nothing left to throttle once the last link has been processed
    if i < len(offer_links) - 1:
        time.sleep(REQUEST_DELAY_SECONDS)

Processed link number 0
Processed link number 1
Processed link number 2
Processed link number 3
Processed link number 4
Processed link number 5
Processed link number 6
Processed link number 7
Processed link number 8
Processed link number 9
Processed link number 10
Processed link number 11
Processed link number 12
Processed link number 13
Processed link number 14
Processed link number 15
Processed link number 16
Processed link number 17
Processed link number 18
Processed link number 19
Processed link number 20
Processed link number 21
Processed link number 22
Processed link number 23
Processed link number 24
Processed link number 25
Processed link number 26
Processed link number 27
Processed link number 28
Processed link number 29
Processed link number 30
Processed link number 31
Processed link number 32
Processed link number 33
Processed link number 34
Processed link number 35
Processed link number 36
Processed link number 37
Processed link number 38
Processed link number 39
Processed 

# Dump `apartaments_data` (json) and `offer_links` (list) to pickles for backups!

In [21]:
# Make sure the backup directory is there before writing into it
os.makedirs("pickles", exist_ok=True)

# Serialize each object into its own pickle file
for pickle_path, payload in (
    ('pickles/apartaments_data.pkl', apartaments_data),
    ('pickles/offer_links.pkl', offer_links),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

# Load pickle files for apartaments_data and offer_links

In [22]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code -- only load files written by this notebook
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


apartaments_data = _load_pickle('pickles/apartaments_data.pkl')
offer_links = _load_pickle('pickles/offer_links.pkl')

# Extract relevant features from json

In [23]:
photos_list, df_list = [], []
for apartament_data in apartaments_data:
    parsed_data, photos = collect_data_from_json(apartament_data)

    # Thumbnails are kept in a parallel list, one entry per offer
    photos_list.append(photos)

    # One single-row DataFrame per offer; column names are the parsed keys,
    # so offers with different sets of characteristics get different columns
    offer_frame = pd.DataFrame(
        [list(parsed_data.values())],
        columns=list(parsed_data),
    )
    df_list.append(offer_frame)

In [24]:
# Concatenate all single-row DataFrames into one table: columns are kept in
# order of first appearance (sort=False) and rows are renumbered from 0

df = pd.concat(df_list, sort=False, ignore_index=True)

In [25]:
# Add offer_links and photos_list (as columns) to the DataFrame.
# Dropping first keeps the cell re-runnable: columns left over from a previous
# run are replaced and the fresh ones always end up as the last two columns.

df = df.drop(columns=['Link', 'Zdjęcia'], errors='ignore')
df.insert(df.shape[1], "Link", offer_links)

# One string per offer holding only that offer's thumbnails. Passing a single
# string built from the whole photos_list would be broadcast as a scalar and
# put the thumbnails of every offer into each row.
photos_column = [str(photos).strip('[]') for photos in photos_list]
df.insert(df.shape[1], "Zdjęcia", photos_column)

# Save as .csv for future analysis

In [26]:
# Write the table to apartaments.csv; index=False leaves the row index out of the file
df.to_csv("apartaments.csv", index=False)

In [27]:
# Preview the first rows of the assembled table
df.head()

Unnamed: 0,Cena,Czynsz - dodatkowo,Kaucja,Powierzchnia,Liczba pokoi,Rodzaj zabudowy,Piętro,Liczba pięter,Materiał budynku,Okna,Ogrzewanie,Rok budowy,Stan wykończenia,Dostępne od,Data ogłoszenia,Adres,φ,λ,Link,Zdjęcia
0,2200,400 zł,2 600 zł,36 m²,2,blok,parter,4,cegła,drewniane,miejskie,2018.0,do zamieszkania,2019-12-15,2019-12-24 16:18:45,"Warszawa, Wilanów, Sarmacka 4 B",52.23614,21.00817,https://www.otodom.pl/oferta/bezposrednio-2-po...,'https://apollo-ireland.akamaized.net/v1/files...
1,2900,300 zł,2 900 zł,50 m²,2,apartamentowiec,> 10,14,cegła,drewniane,miejskie,2010.0,do zamieszkania,2019-12-02,2019-12-24 16:13:36,"Warszawa, Mokotów, Melody Park",52.169271,21.020154,https://www.otodom.pl/oferta/pulawska-dol-sluz...,'https://apollo-ireland.akamaized.net/v1/files...
2,2600,400 zł,3 000 zł,42 m²,2,kamienica,5,5,cegła,plastikowe,miejskie,1925.0,do zamieszkania,2019-12-16,2019-12-24 16:01:38,"Warszawa, Praga-Północ, Praga, Kaweczynska 16",52.20556,21.07452,https://www.otodom.pl/oferta/mieszkanie-nowocz...,'https://apollo-ireland.akamaized.net/v1/files...
3,2700,500 zł,3 000 zł,50 m²,2,apartamentowiec,1,8,cegła,plastikowe,miejskie,,do zamieszkania,2020-01-20,2019-12-24 15:36:46,"Warszawa, Mokotów, ul. Postępu 10",52.17801,20.996466,https://www.otodom.pl/oferta/postepu-10-nowe-k...,'https://apollo-ireland.akamaized.net/v1/files...
4,3000,200 zł,3 000 zł,45 m²,2,apartamentowiec,5,7,cegła,drewniane,miejskie,2016.0,do zamieszkania,2020-01-01,2019-12-24 14:50:54,"Warszawa, Mokotów, Dolny Mokotów, ul. Magazynowa",52.185539,21.004877,https://www.otodom.pl/oferta/magazynowa-45-m2-...,'https://apollo-ireland.akamaized.net/v1/files...
