In [1]:
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from shapely.geometry import Point
import pickle
import time
import datetime

# First define some functions

In [4]:
# Find hrefs in main search page

def generate_links(response, n_promoted=3):
    """Extract the offer URLs listed on one search-results page.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The HTTP response of a search page; ``response.text`` is parsed as HTML.
    n_promoted : int, optional
        Number of leading ``offer-item-details`` entries to skip because they
        are promoted offers. Defaults to 3.

    Returns
    -------
    list of str
        The ``href`` of the first link that has one in each remaining offer
        entry. Entries without such a link are skipped.
    """
    soup = BeautifulSoup(response.text, 'html.parser')

    offer_links = []
    for offer in soup.find_all(class_='offer-item-details')[n_promoted:]:
        # href=True only matches anchors that carry an href, so a malformed
        # entry is skipped instead of raising and losing the whole page.
        anchor = offer.find('a', href=True)
        if anchor is not None:
            offer_links.append(anchor.attrs['href'])

    return offer_links

In [5]:
# Find data in single offer's page and return as a json

def process_offer_link(link, timeout=30):
    """Download one offer page and return the advert data embedded in it.

    Parameters
    ----------
    link : str
        URL of a single offer page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    The value stored under ``initialProps -> data -> advert`` in the JSON
    text of the page element whose id is ``server-app-state``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``server-app-state`` element, or its text is not
        valid JSON.
    KeyError
        If the embedded JSON lacks one of the expected keys.
    """
    # Without a timeout a single stalled connection would block the scrape forever.
    offer_response = requests.get(link, timeout=timeout)
    # Fail early on error pages instead of trying to parse them.
    offer_response.raise_for_status()

    offer_soup = BeautifulSoup(offer_response.text, 'html.parser')

    state_node = offer_soup.find(id="server-app-state")
    if state_node is None:
        raise ValueError("No 'server-app-state' element found at {}".format(link))

    app_state = json.loads(state_node.text)
    return app_state['initialProps']['data']['advert']

In [6]:
# Function for extracting relevant data from json

def collect_data_from_json(apartament_data):
    """Flatten one advert's JSON into a feature dict plus its photo thumbnails.

    The result is a tuple ``(parsed_data, photos)``:

    * ``parsed_data`` maps 'Cena' to the price value, every characteristic
      label to its value, and 'φ' / 'λ' to the latitude / longitude.
    * ``photos`` is the list of thumbnail links of the advert's photos.

    The advert's modification date, address and URL are deliberately not
    collected because they are treated as personal data.

    Tip: ``print(json.dumps(apartament_data, indent=4, sort_keys=True))``
    shows the full structure of the input.
    """
    # Characteristics stored with their raw 'value'; all the others use the
    # human-readable 'value_translated'.
    raw_value_labels = ('Czynsz - dodatkowo', 'Kaucja', 'Powierzchnia')

    # Price goes in first so it becomes the first column
    features = {'Cena': apartament_data['price']['value']}

    # Basic characteristics, one entry per label
    for item in apartament_data['characteristics']:
        label = item['label']
        source_key = 'value' if label in raw_value_labels else 'value_translated'
        features[label] = item[source_key]

    # Geolocation: coordinates are kept only when the reported radius is 0
    coords = apartament_data['location']['coordinates']
    lat = coords['latitude']
    lon = coords['longitude']
    if coords['radius'] == 0:
        features['φ'], features['λ'] = lat, lon
    else:
        features['φ'], features['λ'] = np.nan, np.nan

    # Thumbnail links (personal data, returned separately from the features)
    thumbnails = []
    for photo in apartament_data['photos']:
        thumbnails.append(photo['thumbnail'])

    return features, thumbnails

# Now let's get scraping!

In [30]:
# Collect the offer links from the first 90 search-result pages
# (the query asks for 72 offers per page)

SEARCH_URL = 'https://www.otodom.pl/wynajem/mieszkanie/warszawa/?search%5Bdescription%5D=1&search%5Bcity_id%5D=26&nrAdsPerPage=72'

offer_links = []

# The site's page numbers start at 1
for page_number in range(1, 91):
    response = requests.get(SEARCH_URL + '&page={}'.format(page_number))

    offer_links.extend(generate_links(response))
    print("Processed page number {}".format(page_number))

Processed page number 1
Processed page number 2
Processed page number 3
Processed page number 4
Processed page number 5
Processed page number 6
Processed page number 7
Processed page number 8
Processed page number 9
Processed page number 10
Processed page number 11
Processed page number 12
Processed page number 13
Processed page number 14
Processed page number 15
Processed page number 16
Processed page number 17
Processed page number 18
Processed page number 19
Processed page number 20
Processed page number 21
Processed page number 22
Processed page number 23
Processed page number 24
Processed page number 25
Processed page number 26
Processed page number 27
Processed page number 28
Processed page number 29
Processed page number 30
Processed page number 31
Processed page number 32
Processed page number 33
Processed page number 34
Processed page number 35
Processed page number 36
Processed page number 37
Processed page number 38
Processed page number 39
Processed page number 40
Processed

In [None]:
# Download every offer page. To avoid overloading the server, wait 1 second
# after processing each link.

apartaments_data = []
for i, link in enumerate(offer_links[:3600], start=1):
    try:
        apartaments_data.append(process_offer_link(link))
        print("Processed link number {}".format(i))
    except Exception as error:
        # `except Exception` rather than a bare `except:` so that interrupting
        # the kernel (KeyboardInterrupt) still stops this long-running loop.
        # The error is printed so failed links can be diagnosed.
        print("Some error happened while processing link number {}: {!r}".format(i, error))
    time.sleep(1)

# Dump `apartaments_data` (json) and `offer_links` (list) to pickles for backups!

In [43]:
# Make sure the backup directory exists before writing into it
os.makedirs("pickles", exist_ok=True)

# Raw advert JSON of every downloaded offer
with open('pickles/apartaments_data.pkl', 'wb') as data_file:
    pickle.dump(apartaments_data, data_file)

# Links of all offers found on the search pages
with open('pickles/offer_links.pkl', 'wb') as links_file:
    pickle.dump(offer_links, links_file)

# Load pickle files for apartaments_data and offer_links

In [2]:
# Restore the scraped data from the backups in pickles/.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open('pickles/apartaments_data.pkl', 'rb') as data_file:
    apartaments_data = pickle.load(data_file)

with open('pickles/offer_links.pkl', 'rb') as links_file:
    offer_links = pickle.load(links_file)

# Extract relevant features from json

In [7]:
# Build one single-row DataFrame per advert and keep the photo links separately
df_list = []
photos_list = []

for advert in apartaments_data:
    features, thumbnails = collect_data_from_json(advert)
    photos_list.append(thumbnails)

    # Column labels come from the dict keys, the single row from its values
    single_row = pd.DataFrame([list(features.values())], columns=list(features.keys()))
    df_list.append(single_row)

In [8]:
# Concatenate the single-row DataFrames into one table (columns left unsorted),
# renumber the rows from 0, and drop 'Dostępne od' because it is personal data.

df = (
    pd.concat(df_list, sort=False)
    .reset_index(drop=True)
    .drop(columns='Dostępne od')
)

In [9]:
# Add photos_list (as a column) to the DataFrame
# NOTE: the code below is wrapped in a string literal, so it is NOT executed;
# the cell only evaluates the string and displays it.

"""if 'Zdjęcia' in df:
    df = df.drop(columns = 'Zdjęcia')
df.insert(df.shape[1], "Zdjęcia", str(photos_list).strip('[]'))"""
# Skipped because the photo links are personal data

'if \'Zdjęcia\' in df:\n    df = df.drop(columns = \'Zdjęcia\')\ndf.insert(df.shape[1], "Zdjęcia", str(photos_list).strip(\'[]\'))'

# Remove duplicated advertisements

In [10]:
# Report the row count before and after de-duplication, then keep the unique rows
rows_before = len(df)
df = df.drop_duplicates()
print(rows_before, len(df))

3564 3525


# Compute the travel time in minutes to the city center (PKiN)

In [13]:
# The HERE API key is a secret: read it from the HERE_API_KEY environment
# variable and never commit the key itself to the notebook.
API_KEY = os.environ.get("HERE_API_KEY", "")
if not API_KEY:
    # Warn early: without a key every routing request is expected to be rejected.
    print("HERE_API_KEY is not set; travel-time requests will not be authorised")

In [56]:
def diff_time(dep, arr):
    """Return how many minutes lie between `dep` and `arr`.

    Both arguments are timestamps in the requests' format
    ``YYYY-MM-DDTHH:MM:SS``. The result is a float and is negative when
    `arr` is earlier than `dep`.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S'
    parse = datetime.datetime.strptime

    departure = parse(dep, timestamp_format)
    arrival = parse(arr, timestamp_format)

    elapsed = arrival - departure
    return elapsed.total_seconds() / 60

def count_minutes(row, destination='52.231731, 21.005144',
                  departure_time='2019-12-27T12:00:00', timeout=30):
    """Return the shortest transit time, in minutes, from an offer to `destination`.

    The duration of each connection returned by the routing API is measured
    from its departure ('Dep') time to its arrival ('Arr') time, and the
    smallest duration is returned.

    Parameters
    ----------
    row : mapping
        Must provide the offer's latitude under 'φ' and longitude under 'λ'.
    destination : str, optional
        'latitude, longitude' of the arrival point. Defaults to the city
        center (PKiN).
    departure_time : str, optional
        Requested time in ``YYYY-MM-DDTHH:MM:SS`` format. The default is an
        arbitrary fixed moment so that all offers are compared alike.
    timeout : float, optional
        Seconds to wait for the API before giving up. Defaults to 30.

    Returns
    -------
    float
        Minutes of the quickest connection, or NaN when the offer has no
        coordinates, the request fails, or the answer cannot be interpreted.
    """
    latitude, longitude = row['φ'], row['λ']

    # Offers without exact coordinates cannot be routed; skip the API call.
    if pd.isna(latitude) or pd.isna(longitude):
        return np.nan

    params = {
        'dep': '{}, {}'.format(latitude, longitude),
        'arr': destination,
        'time': departure_time,
        'routing': 'tt',
        'apiKey': API_KEY,
    }

    try:
        res = requests.get("https://transit.ls.hereapi.com/v3/route.json",
                           params=params, timeout=timeout)
    except requests.RequestException:
        # A network problem for one offer should not abort the whole run.
        return np.nan

    if res.status_code != 200:
        return np.nan

    try:
        connections = res.json()['Res']['Connections']['Connection']
        return min(diff_time(connection['Dep']['time'], connection['Arr']['time'])
                   for connection in connections)
    except Exception:
        # Unexpected or empty answer. `except Exception` (not a bare `except:`)
        # keeps kernel interrupts working.
        return np.nan

In [60]:
# Query the travel time for every offer, in DataFrame row order
travel_time = []

for row_label, offer_row in df.iterrows():
    minutes = count_minutes(offer_row)
    travel_time.append(minutes)
    print("Encoded row number", row_label)

Encoded row number 0
Encoded row number 1
Encoded row number 2
Encoded row number 3
Encoded row number 4
Encoded row number 5
Encoded row number 6
Encoded row number 7
Encoded row number 8
Encoded row number 9
Encoded row number 10
Encoded row number 11
Encoded row number 12
Encoded row number 13
Encoded row number 14
Encoded row number 15
Encoded row number 16
Encoded row number 17
Encoded row number 18
Encoded row number 19
Encoded row number 20
Encoded row number 21
Encoded row number 22
Encoded row number 23
Encoded row number 24
Encoded row number 25
Encoded row number 26
Encoded row number 27
Encoded row number 28
Encoded row number 29
Encoded row number 30
Encoded row number 31
Encoded row number 32
Encoded row number 33
Encoded row number 34
Encoded row number 35
Encoded row number 36
Encoded row number 37
Encoded row number 38
Encoded row number 39
Encoded row number 40
Encoded row number 41
Encoded row number 42
Encoded row number 43
Encoded row number 44
Encoded row number 4

Encoded row number 363
Encoded row number 364
Encoded row number 365
Encoded row number 366
Encoded row number 367
Encoded row number 368
Encoded row number 369
Encoded row number 370
Encoded row number 371
Encoded row number 372
Encoded row number 373
Encoded row number 374
Encoded row number 375
Encoded row number 376
Encoded row number 377
Encoded row number 378
Encoded row number 379
Encoded row number 380
Encoded row number 381
Encoded row number 382
Encoded row number 383
Encoded row number 384
Encoded row number 385
Encoded row number 386
Encoded row number 387
Encoded row number 388
Encoded row number 389
Encoded row number 390
Encoded row number 391
Encoded row number 392
Encoded row number 393
Encoded row number 394
Encoded row number 395
Encoded row number 396
Encoded row number 397
Encoded row number 398
Encoded row number 399
Encoded row number 400
Encoded row number 401
Encoded row number 402
Encoded row number 403
Encoded row number 404
Encoded row number 405
Encoded row

Encoded row number 722
Encoded row number 723
Encoded row number 724
Encoded row number 725
Encoded row number 726
Encoded row number 727
Encoded row number 728
Encoded row number 729
Encoded row number 730
Encoded row number 731
Encoded row number 732
Encoded row number 733
Encoded row number 734
Encoded row number 735
Encoded row number 736
Encoded row number 737
Encoded row number 738
Encoded row number 739
Encoded row number 740
Encoded row number 741
Encoded row number 742
Encoded row number 743
Encoded row number 744
Encoded row number 745
Encoded row number 746
Encoded row number 747
Encoded row number 748
Encoded row number 749
Encoded row number 750
Encoded row number 751
Encoded row number 752
Encoded row number 753
Encoded row number 754
Encoded row number 755
Encoded row number 756
Encoded row number 757
Encoded row number 758
Encoded row number 759
Encoded row number 760
Encoded row number 761
Encoded row number 762
Encoded row number 763
Encoded row number 764
Encoded row

Encoded row number 1081
Encoded row number 1082
Encoded row number 1083
Encoded row number 1084
Encoded row number 1085
Encoded row number 1086
Encoded row number 1087
Encoded row number 1088
Encoded row number 1089
Encoded row number 1090
Encoded row number 1091
Encoded row number 1092
Encoded row number 1093
Encoded row number 1094
Encoded row number 1095
Encoded row number 1096
Encoded row number 1097
Encoded row number 1098
Encoded row number 1099
Encoded row number 1100
Encoded row number 1101
Encoded row number 1102
Encoded row number 1103
Encoded row number 1104
Encoded row number 1105
Encoded row number 1106
Encoded row number 1107
Encoded row number 1108
Encoded row number 1109
Encoded row number 1110
Encoded row number 1111
Encoded row number 1112
Encoded row number 1113
Encoded row number 1114
Encoded row number 1115
Encoded row number 1116
Encoded row number 1117
Encoded row number 1118
Encoded row number 1119
Encoded row number 1120
Encoded row number 1121
Encoded row numb

Encoded row number 1424
Encoded row number 1425
Encoded row number 1426
Encoded row number 1427
Encoded row number 1428
Encoded row number 1429
Encoded row number 1430
Encoded row number 1431
Encoded row number 1432
Encoded row number 1433
Encoded row number 1434
Encoded row number 1435
Encoded row number 1436
Encoded row number 1437
Encoded row number 1438
Encoded row number 1439
Encoded row number 1440
Encoded row number 1441
Encoded row number 1442
Encoded row number 1443
Encoded row number 1444
Encoded row number 1445
Encoded row number 1446
Encoded row number 1447
Encoded row number 1448
Encoded row number 1449
Encoded row number 1450
Encoded row number 1451
Encoded row number 1452
Encoded row number 1453
Encoded row number 1454
Encoded row number 1455
Encoded row number 1456
Encoded row number 1457
Encoded row number 1458
Encoded row number 1459
Encoded row number 1460
Encoded row number 1461
Encoded row number 1462
Encoded row number 1463
Encoded row number 1464
Encoded row numb

Encoded row number 1767
Encoded row number 1768
Encoded row number 1769
Encoded row number 1770
Encoded row number 1772
Encoded row number 1773
Encoded row number 1774
Encoded row number 1775
Encoded row number 1776
Encoded row number 1777
Encoded row number 1778
Encoded row number 1779
Encoded row number 1780
Encoded row number 1781
Encoded row number 1782
Encoded row number 1783
Encoded row number 1784
Encoded row number 1785
Encoded row number 1786
Encoded row number 1787
Encoded row number 1788
Encoded row number 1789
Encoded row number 1790
Encoded row number 1791
Encoded row number 1792
Encoded row number 1793
Encoded row number 1794
Encoded row number 1795
Encoded row number 1796
Encoded row number 1797
Encoded row number 1798
Encoded row number 1799
Encoded row number 1800
Encoded row number 1801
Encoded row number 1802
Encoded row number 1803
Encoded row number 1804
Encoded row number 1805
Encoded row number 1806
Encoded row number 1807
Encoded row number 1808
Encoded row numb

Encoded row number 2116
Encoded row number 2117
Encoded row number 2118
Encoded row number 2119
Encoded row number 2120
Encoded row number 2121
Encoded row number 2122
Encoded row number 2123
Encoded row number 2124
Encoded row number 2125
Encoded row number 2126
Encoded row number 2127
Encoded row number 2128
Encoded row number 2129
Encoded row number 2130
Encoded row number 2131
Encoded row number 2132
Encoded row number 2133
Encoded row number 2134
Encoded row number 2135
Encoded row number 2136
Encoded row number 2137
Encoded row number 2138
Encoded row number 2139
Encoded row number 2140
Encoded row number 2141
Encoded row number 2143
Encoded row number 2144
Encoded row number 2145
Encoded row number 2146
Encoded row number 2147
Encoded row number 2148
Encoded row number 2149
Encoded row number 2150
Encoded row number 2151
Encoded row number 2152
Encoded row number 2153
Encoded row number 2154
Encoded row number 2155
Encoded row number 2156
Encoded row number 2157
Encoded row numb

Encoded row number 2461
Encoded row number 2462
Encoded row number 2463
Encoded row number 2464
Encoded row number 2465
Encoded row number 2466
Encoded row number 2467
Encoded row number 2468
Encoded row number 2469
Encoded row number 2470
Encoded row number 2471
Encoded row number 2472
Encoded row number 2473
Encoded row number 2474
Encoded row number 2475
Encoded row number 2476
Encoded row number 2477
Encoded row number 2478
Encoded row number 2479
Encoded row number 2480
Encoded row number 2481
Encoded row number 2483
Encoded row number 2484
Encoded row number 2485
Encoded row number 2486
Encoded row number 2487
Encoded row number 2488
Encoded row number 2489
Encoded row number 2490
Encoded row number 2491
Encoded row number 2492
Encoded row number 2493
Encoded row number 2494
Encoded row number 2495
Encoded row number 2496
Encoded row number 2497
Encoded row number 2498
Encoded row number 2499
Encoded row number 2500
Encoded row number 2501
Encoded row number 2502
Encoded row numb

Encoded row number 2812
Encoded row number 2813
Encoded row number 2814
Encoded row number 2815
Encoded row number 2816
Encoded row number 2817
Encoded row number 2818
Encoded row number 2819
Encoded row number 2820
Encoded row number 2821
Encoded row number 2822
Encoded row number 2823
Encoded row number 2824
Encoded row number 2825
Encoded row number 2826
Encoded row number 2827
Encoded row number 2828
Encoded row number 2829
Encoded row number 2830
Encoded row number 2831
Encoded row number 2832
Encoded row number 2833
Encoded row number 2834
Encoded row number 2835
Encoded row number 2836
Encoded row number 2837
Encoded row number 2838
Encoded row number 2839
Encoded row number 2840
Encoded row number 2841
Encoded row number 2842
Encoded row number 2843
Encoded row number 2844
Encoded row number 2845
Encoded row number 2846
Encoded row number 2847
Encoded row number 2848
Encoded row number 2849
Encoded row number 2850
Encoded row number 2851
Encoded row number 2852
Encoded row numb

Encoded row number 3158
Encoded row number 3159
Encoded row number 3160
Encoded row number 3161
Encoded row number 3162
Encoded row number 3163
Encoded row number 3164
Encoded row number 3165
Encoded row number 3166
Encoded row number 3167
Encoded row number 3168
Encoded row number 3169
Encoded row number 3170
Encoded row number 3171
Encoded row number 3172
Encoded row number 3173
Encoded row number 3174
Encoded row number 3175
Encoded row number 3176
Encoded row number 3177
Encoded row number 3178
Encoded row number 3179
Encoded row number 3180
Encoded row number 3181
Encoded row number 3182
Encoded row number 3183
Encoded row number 3184
Encoded row number 3185
Encoded row number 3186
Encoded row number 3187
Encoded row number 3188
Encoded row number 3189
Encoded row number 3190
Encoded row number 3191
Encoded row number 3192
Encoded row number 3193
Encoded row number 3194
Encoded row number 3195
Encoded row number 3197
Encoded row number 3198
Encoded row number 3199
Encoded row numb

Encoded row number 3503
Encoded row number 3504
Encoded row number 3505
Encoded row number 3506
Encoded row number 3507
Encoded row number 3508
Encoded row number 3509
Encoded row number 3510
Encoded row number 3511
Encoded row number 3512
Encoded row number 3513
Encoded row number 3514
Encoded row number 3515
Encoded row number 3516
Encoded row number 3517
Encoded row number 3518
Encoded row number 3519
Encoded row number 3520
Encoded row number 3521
Encoded row number 3522
Encoded row number 3523
Encoded row number 3524
Encoded row number 3525
Encoded row number 3526
Encoded row number 3527
Encoded row number 3528
Encoded row number 3529
Encoded row number 3530
Encoded row number 3531
Encoded row number 3532
Encoded row number 3533
Encoded row number 3534
Encoded row number 3535
Encoded row number 3536
Encoded row number 3537
Encoded row number 3538
Encoded row number 3539
Encoded row number 3540
Encoded row number 3541
Encoded row number 3542
Encoded row number 3543
Encoded row numb

Dump `travel_time` to pickle

In [64]:
# Make sure the backup directory exists before writing into it
os.makedirs("pickles", exist_ok=True)

# Back up the computed travel times
with open('pickles/travel_time.pkl', 'wb') as travel_file:
    pickle.dump(travel_time, travel_file)

Load `travel_time` from pickle

In [65]:
# Restore the travel times from the backup in pickles/.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open('pickles/travel_time.pkl', 'rb') as travel_file:
    travel_time = pickle.load(travel_file)

Add as a new column to initial DataFrame `df`

In [72]:
# (Re)create the travel-time column as the last column; dropping an existing
# one first lets this cell be run more than once.
TRAVEL_TIME_COLUMN = "Czas dojazdu do centrum"

if TRAVEL_TIME_COLUMN in df:
    df = df.drop(columns=TRAVEL_TIME_COLUMN)
df.insert(len(df.columns), TRAVEL_TIME_COLUMN, travel_time)

# Save as .csv for future analysis

In [69]:
# Write the DataFrame to apartaments.csv; index=False leaves the row index out of the file
df.to_csv("apartaments.csv", index=False)

In [73]:
# Display the first rows of the final DataFrame as a quick sanity check
df.head()

Unnamed: 0,Cena,Kaucja,Powierzchnia,Liczba pokoi,Rodzaj zabudowy,Piętro,Okna,Ogrzewanie,Stan wykończenia,φ,λ,Czynsz - dodatkowo,Liczba pięter,Materiał budynku,Rok budowy,Czas dojazdu do centrum
0,2400,2400.0,48,3,blok,2,plastikowe,miejskie,do zamieszkania,52.231618,21.066343,,,,,25.0
1,2400,3000.0,45,2,apartamentowiec,2,plastikowe,miejskie,do zamieszkania,52.289194,20.930776,500.0,5.0,cegła,2014.0,36.0
2,2600,,40,2,apartamentowiec,4,plastikowe,miejskie,do zamieszkania,52.199425,21.04463,,6.0,pustak,2018.0,31.0
3,2300,2500.0,54,2,blok,1,plastikowe,miejskie,do zamieszkania,52.287912,21.054128,400.0,5.0,,2007.0,46.0
4,2950,,54,3,blok,3,drewniane,miejskie,do zamieszkania,52.153523,21.082662,350.0,4.0,cegła,2016.0,47.0
