In [1]:
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from shapely.geometry import Point
import pickle
import time
import datetime

# First define some functions

In [2]:
# Find hrefs in main search page

def generate_links(response, skip_promoted=3):
    """Collect the offer URLs listed on one search-results page.

    Parameters
    ----------
    response : object with a ``text`` attribute
        HTTP response of a search-results page; ``response.text`` is parsed
        as HTML.
    skip_promoted : int, optional
        Number of leading ``offer-item-details`` entries to ignore. Defaults
        to 3, the value the notebook uses to skip promoted offers.

    Returns
    -------
    list of str
        The ``href`` of the first ``<a>`` found in each remaining entry.
        Entries without an anchor, or whose anchor has no ``href``, are
        skipped instead of aborting the whole page.
    """
    soup = BeautifulSoup(response.text, 'html.parser')

    offer_links = []
    for offer in soup.find_all(class_='offer-item-details')[skip_promoted:]:
        anchor = offer.find('a')
        # find() returns None when the entry holds no <a>; guard before reading href
        href = anchor.get('href') if anchor is not None else None
        if href is not None:
            offer_links.append(href)

    return offer_links

In [3]:
# Find data in single offer's page and return as a json

def process_offer_link(link, timeout=30):
    """Download one offer page and return its embedded advert JSON.

    Parameters
    ----------
    link : str
        URL of a single offer page.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block the
        scraping loop indefinitely. Defaults to 30 seconds.

    Returns
    -------
    dict
        The object stored under ``['initialProps']['data']['advert']`` in the
        JSON held by the page element with ``id="server-app-state"``.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx HTTP status.
    ValueError
        If the page has no ``server-app-state`` element, or its content is
        not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    KeyError
        If the JSON lacks the expected ``initialProps``/``data``/``advert`` path.
    """
    # Get full html page; fail fast on error pages instead of parsing them
    offer_response = requests.get(link, timeout=timeout)
    offer_response.raise_for_status()

    # Convert to soup
    offer_soup = BeautifulSoup(offer_response.text, 'html.parser')

    # The advert data is embedded as JSON in a dedicated element
    state_tag = offer_soup.find(id="server-app-state")
    if state_tag is None:
        raise ValueError("No 'server-app-state' element found at {}".format(link))

    apartament_data = json.loads(state_tag.text)['initialProps']['data']['advert']

    return apartament_data

In [4]:
# Function for extracting relevant data from json

def collect_data_from_json(apartament_data):
    """Flatten one advert's JSON into a feature dict plus its thumbnail links.

    Parameters
    ----------
    apartament_data : dict
        Advert JSON. The keys read here are ``price.value``,
        ``characteristics`` (items with ``label``, ``value`` and
        ``value_translated``), ``location.coordinates`` (``latitude``,
        ``longitude``, ``radius``) and ``photos`` (items with ``thumbnail``).

    Returns
    -------
    tuple (dict, list)
        ``parsed_data`` holds 'Cena', one entry per characteristic label, and
        the coordinates under 'φ' / 'λ'. ``photos`` is the list of thumbnail
        values. The advert's modification date, address and URL are
        deliberately not collected because they are personal data.
    """
    # To inspect the raw JSON while developing:
    # print(json.dumps(apartament_data, indent=4, sort_keys=True))

    # Characteristics stored by their raw 'value'; every other label uses
    # the 'value_translated' text instead.
    raw_value_labels = ('Czynsz - dodatkowo', 'Kaucja', 'Powierzchnia')

    # Price comes first so it is the first key of the result
    parsed_data = {'Cena': apartament_data['price']['value']}

    # Basic characteristics, in the order the advert lists them
    for factor in apartament_data['characteristics']:
        value_key = 'value' if factor['label'] in raw_value_labels else 'value_translated'
        parsed_data[factor['label']] = factor[value_key]

    # Geolocation: coordinates are kept only when radius == 0
    # (presumably an exact location — TODO confirm); otherwise NaN is stored.
    coordinates = apartament_data['location']['coordinates']
    latitude = coordinates['latitude']
    longitude = coordinates['longitude']
    radius = coordinates['radius']
    if radius == 0:
        parsed_data['φ'] = latitude
        parsed_data['λ'] = longitude
    else:
        parsed_data['φ'] = np.nan
        parsed_data['λ'] = np.nan

    # Thumbnail links (personal data — returned separately, not put in parsed_data)
    photos = [photo['thumbnail'] for photo in apartament_data['photos']]

    return parsed_data, photos

# Now let's get scraping!

In [15]:
# Load N_PAGES search-result pages, requesting ADS_PER_PAGE offers on each

N_PAGES = 90
ADS_PER_PAGE = 72
PAGE_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever
SEARCH_URL = ('https://www.otodom.pl/sprzedaz/mieszkanie/warszawa/'
              '?search%5Bdescription%5D=1&search%5Bregion_id%5D=7&search%5Bcity_id%5D=26'
              '&nrAdsPerPage={per_page}&page={page}')

offer_links = []

for page in range(N_PAGES):
    try:
        response = requests.get(SEARCH_URL.format(per_page=ADS_PER_PAGE, page=page+1),
                                timeout=PAGE_TIMEOUT)
        # Do not try to extract links from an error page
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failed page and carry on with the next one
        print("Failed to load page number {}: {!r}".format(page+1, exc))
        continue

    offer_links += generate_links(response)
    print("Processed page number {}".format(page+1))

Processed page number 1
Processed page number 2
Processed page number 3
Processed page number 4
Processed page number 5
Processed page number 6
Processed page number 7
Processed page number 8
Processed page number 9
Processed page number 10
Processed page number 11
Processed page number 12
Processed page number 13
Processed page number 14
Processed page number 15
Processed page number 16
Processed page number 17
Processed page number 18
Processed page number 19
Processed page number 20
Processed page number 21
Processed page number 22
Processed page number 23
Processed page number 24
Processed page number 25
Processed page number 26
Processed page number 27
Processed page number 28
Processed page number 29
Processed page number 30
Processed page number 31
Processed page number 32
Processed page number 33
Processed page number 34
Processed page number 35
Processed page number 36
Processed page number 37
Processed page number 38
Processed page number 39
Processed page number 40
Processed

In [16]:
# To avoid overloading the server, wait 1 second after processing each link

MAX_LINKS = 3600  # upper bound on the number of offer pages to download

apartaments_data = []
for link_number, link in enumerate(offer_links[:MAX_LINKS], start=1):
    try:
        apartaments_data.append(process_offer_link(link))
        print("Processed link number {}".format(link_number))
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that interrupting
        # the kernel (KeyboardInterrupt) still stops this long-running loop.
        # The failure reason is shown instead of being discarded.
        print("Some error happened while processing link number {}: {!r}".format(link_number, exc))
    time.sleep(1)

Processed link number 1
Processed link number 2
Processed link number 3
Processed link number 4
Processed link number 5
Processed link number 6
Processed link number 7
Processed link number 8
Processed link number 9
Processed link number 10
Processed link number 11
Processed link number 12
Processed link number 13
Processed link number 14
Processed link number 15
Processed link number 16
Processed link number 17
Processed link number 18
Processed link number 19
Processed link number 20
Processed link number 21
Processed link number 22
Processed link number 23
Processed link number 24
Processed link number 25
Processed link number 26
Processed link number 27
Processed link number 28
Processed link number 29
Processed link number 30
Processed link number 31
Processed link number 32
Processed link number 33
Processed link number 34
Processed link number 35
Processed link number 36
Processed link number 37
Processed link number 38
Processed link number 39
Processed link number 40
Processed

Processed link number 321
Processed link number 322
Processed link number 323
Processed link number 324
Processed link number 325
Processed link number 326
Processed link number 327
Processed link number 328
Processed link number 329
Processed link number 330
Processed link number 331
Processed link number 332
Processed link number 333
Processed link number 334
Processed link number 335
Processed link number 336
Processed link number 337
Processed link number 338
Processed link number 339
Processed link number 340
Processed link number 341
Processed link number 342
Processed link number 343
Processed link number 344
Processed link number 345
Processed link number 346
Processed link number 347
Processed link number 348
Processed link number 349
Processed link number 350
Processed link number 351
Processed link number 352
Processed link number 353
Processed link number 354
Processed link number 355
Processed link number 356
Processed link number 357
Processed link number 358
Processed li

Processed link number 637
Processed link number 638
Processed link number 639
Processed link number 640
Processed link number 641
Processed link number 642
Processed link number 643
Processed link number 644
Processed link number 645
Processed link number 646
Processed link number 647
Processed link number 648
Processed link number 649
Processed link number 650
Processed link number 651
Processed link number 652
Processed link number 653
Processed link number 654
Processed link number 655
Processed link number 656
Processed link number 657
Processed link number 658
Processed link number 659
Processed link number 660
Processed link number 661
Processed link number 662
Processed link number 663
Processed link number 664
Processed link number 665
Processed link number 666
Processed link number 667
Processed link number 668
Processed link number 669
Processed link number 670
Processed link number 671
Processed link number 672
Processed link number 673
Processed link number 674
Processed li

Processed link number 952
Processed link number 953
Processed link number 954
Processed link number 955
Processed link number 956
Processed link number 957
Processed link number 958
Processed link number 959
Processed link number 960
Processed link number 961
Processed link number 962
Processed link number 963
Processed link number 964
Processed link number 965
Processed link number 966
Processed link number 967
Processed link number 968
Processed link number 969
Processed link number 970
Processed link number 971
Processed link number 972
Processed link number 973
Processed link number 974
Processed link number 975
Processed link number 976
Processed link number 977
Processed link number 978
Processed link number 979
Processed link number 980
Processed link number 981
Processed link number 982
Processed link number 983
Processed link number 984
Processed link number 985
Processed link number 986
Processed link number 987
Processed link number 988
Processed link number 989
Processed li

Processed link number 1258
Processed link number 1259
Processed link number 1260
Processed link number 1261
Processed link number 1262
Processed link number 1263
Processed link number 1264
Processed link number 1265
Processed link number 1266
Processed link number 1267
Processed link number 1268
Processed link number 1269
Processed link number 1270
Processed link number 1271
Processed link number 1272
Processed link number 1273
Processed link number 1274
Processed link number 1275
Processed link number 1276
Processed link number 1277
Processed link number 1278
Processed link number 1279
Processed link number 1280
Processed link number 1281
Processed link number 1282
Processed link number 1283
Processed link number 1284
Processed link number 1285
Processed link number 1286
Processed link number 1287
Processed link number 1288
Processed link number 1289
Processed link number 1290
Processed link number 1291
Processed link number 1292
Processed link number 1293
Processed link number 1294
P

Processed link number 1562
Processed link number 1563
Processed link number 1564
Processed link number 1565
Processed link number 1566
Processed link number 1567
Processed link number 1568
Processed link number 1569
Processed link number 1570
Processed link number 1571
Processed link number 1572
Processed link number 1573
Processed link number 1574
Processed link number 1575
Processed link number 1576
Processed link number 1577
Processed link number 1578
Processed link number 1579
Processed link number 1580
Processed link number 1581
Processed link number 1582
Processed link number 1583
Processed link number 1584
Processed link number 1585
Processed link number 1586
Processed link number 1587
Processed link number 1588
Processed link number 1589
Processed link number 1590
Processed link number 1591
Processed link number 1592
Processed link number 1593
Processed link number 1594
Processed link number 1595
Processed link number 1596
Processed link number 1597
Processed link number 1598
P

Processed link number 1866
Processed link number 1867
Processed link number 1868
Processed link number 1869
Processed link number 1870
Processed link number 1871
Processed link number 1872
Processed link number 1873
Processed link number 1874
Processed link number 1875
Processed link number 1876
Processed link number 1877
Processed link number 1878
Processed link number 1879
Processed link number 1880
Processed link number 1881
Processed link number 1882
Processed link number 1883
Processed link number 1884
Processed link number 1885
Processed link number 1886
Processed link number 1887
Processed link number 1888
Processed link number 1889
Processed link number 1890
Processed link number 1891
Processed link number 1892
Processed link number 1893
Processed link number 1894
Processed link number 1895
Processed link number 1896
Processed link number 1897
Processed link number 1898
Processed link number 1899
Processed link number 1900
Processed link number 1901
Processed link number 1902
P

Processed link number 2170
Processed link number 2171
Processed link number 2172
Processed link number 2173
Processed link number 2174
Processed link number 2175
Processed link number 2176
Processed link number 2177
Processed link number 2178
Processed link number 2179
Processed link number 2180
Processed link number 2181
Processed link number 2182
Processed link number 2183
Processed link number 2184
Processed link number 2185
Processed link number 2186
Processed link number 2187
Processed link number 2188
Processed link number 2189
Processed link number 2190
Processed link number 2191
Processed link number 2192
Processed link number 2193
Processed link number 2194
Processed link number 2195
Processed link number 2196
Processed link number 2197
Processed link number 2198
Processed link number 2199
Processed link number 2200
Processed link number 2201
Processed link number 2202
Processed link number 2203
Processed link number 2204
Processed link number 2205
Processed link number 2206
P

Processed link number 2474
Processed link number 2475
Processed link number 2476
Processed link number 2477
Processed link number 2478
Processed link number 2479
Processed link number 2480
Processed link number 2481
Processed link number 2482
Processed link number 2483
Processed link number 2484
Processed link number 2485
Processed link number 2486
Processed link number 2487
Processed link number 2488
Processed link number 2489
Processed link number 2490
Processed link number 2491
Processed link number 2492
Processed link number 2493
Processed link number 2494
Processed link number 2495
Processed link number 2496
Processed link number 2497
Processed link number 2498
Processed link number 2499
Processed link number 2500
Processed link number 2501
Processed link number 2502
Processed link number 2503
Processed link number 2504
Processed link number 2505
Processed link number 2506
Processed link number 2507
Processed link number 2508
Processed link number 2509
Processed link number 2510
P

Processed link number 2777
Processed link number 2778
Processed link number 2779
Processed link number 2780
Processed link number 2781
Processed link number 2782
Processed link number 2783
Processed link number 2784
Processed link number 2785
Processed link number 2786
Processed link number 2787
Processed link number 2788
Processed link number 2789
Processed link number 2790
Processed link number 2791
Processed link number 2792
Processed link number 2793
Processed link number 2794
Processed link number 2795
Processed link number 2796
Processed link number 2797
Processed link number 2798
Processed link number 2799
Processed link number 2800
Processed link number 2801
Processed link number 2802
Processed link number 2803
Processed link number 2804
Processed link number 2805
Processed link number 2806
Processed link number 2807
Processed link number 2808
Processed link number 2809
Processed link number 2810
Processed link number 2811
Processed link number 2812
Processed link number 2813
P

Processed link number 3081
Processed link number 3082
Processed link number 3083
Processed link number 3084
Processed link number 3085
Processed link number 3086
Processed link number 3087
Processed link number 3088
Processed link number 3089
Processed link number 3090
Processed link number 3091
Processed link number 3092
Processed link number 3093
Processed link number 3094
Processed link number 3095
Processed link number 3096
Processed link number 3097
Processed link number 3098
Processed link number 3099
Processed link number 3100
Processed link number 3101
Processed link number 3102
Processed link number 3103
Processed link number 3104
Processed link number 3105
Processed link number 3106
Processed link number 3107
Processed link number 3108
Processed link number 3109
Processed link number 3110
Processed link number 3111
Processed link number 3112
Processed link number 3113
Processed link number 3114
Processed link number 3115
Processed link number 3116
Processed link number 3117
P

Processed link number 3381
Processed link number 3382
Processed link number 3383
Processed link number 3384
Processed link number 3385
Processed link number 3386
Processed link number 3387
Processed link number 3388
Processed link number 3389
Processed link number 3390
Processed link number 3391
Processed link number 3392
Processed link number 3393
Processed link number 3394
Processed link number 3395
Processed link number 3396
Processed link number 3397
Processed link number 3398
Processed link number 3399
Processed link number 3400
Processed link number 3401
Processed link number 3402
Processed link number 3403
Processed link number 3404
Processed link number 3405
Processed link number 3406
Processed link number 3407
Processed link number 3408
Processed link number 3409
Processed link number 3410
Processed link number 3411
Processed link number 3412
Processed link number 3413
Processed link number 3414
Processed link number 3415
Processed link number 3416
Processed link number 3417
P

# Dump `apartaments_data` (json) and `offer_links` (list) to pickles for backups!

In [18]:
# Create the backup folder if it is not there yet (no-op when it already exists)
os.makedirs("pickles", exist_ok=True)

# Raw advert JSON objects collected by the scraping loop
with open('pickles/apartaments_data.pkl', mode='wb') as f:
    pickle.dump(apartaments_data, f)

# Offer URLs gathered from the search pages
with open('pickles/offer_links.pkl', mode='wb') as f:
    pickle.dump(offer_links, f)

# (OR) Load pickle files for `apartaments_data` and `offer_links`

In [19]:
# Restore the scraped objects from the backups written by the dump cell.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook produced itself.
with open('pickles/apartaments_data.pkl', 'rb') as f:
    # Rebinds `apartaments_data` (the raw advert JSON objects)
    apartaments_data = pickle.load(f) 
with open('pickles/offer_links.pkl', 'rb') as f:
    # Rebinds `offer_links` (the list of offer URLs)
    offer_links = pickle.load(f)

# Extract relevant features from json

In [20]:
# df_list gets one single-row DataFrame per advert; photos_list gets that
# advert's thumbnail list at the same position.
df_list = []
photos_list = []
for house_data in apartaments_data:

    parsed_data, photos = collect_data_from_json(house_data)
    photos_list.append(photos)

    # Feature names become the column labels, feature values the only row
    columns = [*parsed_data.keys()]
    rows = [*parsed_data.values()]
    single_row_frame = pd.DataFrame(data=[rows], columns=columns)
    df_list.append(single_row_frame)

In [21]:
# Concatenate the per-advert DataFrames into one table with a fresh 0..n-1 index

df = pd.concat(df_list, sort=False).reset_index(drop=True)

# 'Dostępne od' is personal data, so it must not reach the saved dataset.
# errors='ignore' stops the cell raising KeyError when none of the scraped
# adverts carried that characteristic (the column then never exists).
df = df.drop(columns='Dostępne od', errors='ignore')

In [22]:
# Add photos_list (as columns) to the DataFrame
# NOTE: the snippet below is disabled by wrapping it in a triple-quoted string,
# so nothing is added to `df` here. The string is the cell's last expression,
# which is why the notebook echoes it back as the cell output.

"""if 'Zdjęcia' in df:
    df = df.drop(columns = 'Zdjęcia')
df.insert(df.shape[1], "Zdjęcia", str(photos_list).strip('[]'))"""
# Skipped because the photo links are personal data

'if \'Zdjęcia\' in df:\n    df = df.drop(columns = \'Zdjęcia\')\ndf.insert(df.shape[1], "Zdjęcia", str(photos_list).strip(\'[]\'))'

# Remove duplicated advertisements

In [23]:
# Report the row count before and after removing exact duplicate rows,
# then keep only the unique rows in `df`.
rows_before = len(df)
df = df.drop_duplicates()
print(rows_before, len(df))

3593 3450


# Save as .csv for future analysis

In [24]:
# Write the de-duplicated table to apartaments.csv; index=False leaves the
# DataFrame index out of the file.
df.to_csv("apartaments.csv", index=False)

In [25]:
# Preview the first rows of the final table as a sanity check
df.head()

Unnamed: 0,Cena,Powierzchnia,Liczba pokoi,Rynek,Rodzaj zabudowy,Piętro,Liczba pięter,Materiał budynku,Okna,Rok budowy,Stan wykończenia,Czynsz,Forma własności,φ,λ,Ogrzewanie
0,850000,60.0,3,wtórny,kamienica,2,3,cegła,plastikowe,1950.0,do zamieszkania,541 zł,spółdzielcze wł. z KW,52.201025,21.027389,
1,850000,63.0,3,wtórny,kamienica,2,4,cegła,drewniane,,do zamieszkania,700 zł,pełna własność,52.230041,21.021545,miejskie
2,695000,69.5,3,wtórny,apartamentowiec,2,7,,plastikowe,,,635 zł,pełna własność,52.206864,20.954589,miejskie
3,670000,63.0,3,wtórny,kamienica,3,4,,plastikowe,1925.0,do zamieszkania,,pełna własność,,,miejskie
4,520000,55.0,4,wtórny,blok,3,4,,,,do remontu,,pełna własność,,,miejskie
