## Setup

In [1]:
import mechanicalsoup as ms
import pandas as pd
import numpy as np
import re

## Get Listings

In [2]:
# zip codes to search, as one comma-separated string (inserted as-is into the zipCodes query parameter)
zipcodes = '3450,3500'

In [3]:
# open the first result page for the chosen zip codes and keep its parsed html
browser = ms.StatefulBrowser()
browser.open(
    "https://www.boliga.dk/resultat?zipCodes=" + zipcodes + "&propertyType=1"
)
html = browser.get_current_page()

In [4]:
# text of the last element on the page whose class matches 'paging-stats'
pg_stats = html.find_all('div', attrs={'class': re.compile('paging-stats')})[-1].get_text()

In [5]:
# deduce how many result pages need to be iterated
# the last run of digits in the paging text is used as the total listing count
# (raw string: '\d' inside a plain literal is an invalid escape sequence)
listings_count = re.search(r'(\d+)(?!.*\d)', pg_stats).group(0)
# NOTE(review): 50 is presumably the number of listings per result page - confirm
pages = int(np.ceil(int(listings_count) / 50))
print('need to iterate', pages, 'pages...')

need to iterate 2 pages...


In [7]:
# collect the listing links from every result page
listings = []
for p in range(1, pages+1):
    print('fetching data from page', p)

    # load result page p
    url = "https://www.boliga.dk/resultat?zipCodes=" + zipcodes + "&page=" + str(p) + "&propertyType=1"
    browser.open(url)

    # keep the href of every anchor whose target starts with /bolig/
    html = browser.get_current_page()
    listings.extend(
        anchor.get('href')
        for anchor in html.find_all('a', attrs={'href': re.compile("^/bolig/")})
    )

# drop duplicate links (np.unique returns a sorted ndarray)
listings = np.unique(listings)

fetching data from page 1
fetching data from page 2


## Get Data per Listing

In [8]:
# fetch one listing page and return the bs4 objects needed for processing
def getSoup(bolig_id):
    """Open a listing's detail page and pick out the parts that get parsed.

    Uses the notebook-level ``browser`` to load the page.

    Parameters
    ----------
    bolig_id : str
        Site-relative path of the listing; it is appended to
        "https://www.boliga.dk" and must contain a "/<digits>/" segment.

    Returns
    -------
    tuple
        ``(short_id, soup_row, soup_icons)``:
        the first "/<digits>/" segment of ``bolig_id`` (slashes included),
        the first 'row no-gutters' div of the page, and every ``use``
        element found inside the first 'app-inner-details' div.

    Raises
    ------
    AttributeError
        If ``bolig_id`` contains no "/<digits>/" segment.
    IndexError
        If the page has no 'app-inner-details' or 'row no-gutters' div.
    """
    # extract the id first so a malformed path fails before any request is made
    # (raw string: '\d' inside a plain literal is an invalid escape sequence)
    short_id = re.search(r'/\d+/', bolig_id).group(0)

    # open page
    url = "https://www.boliga.dk" + bolig_id
    browser.open(url)
    soup = browser.get_current_page()

    # the icons live inside the inner-details container
    inner_details = soup.find_all('div', attrs={'class': 'app-inner-details'})[0]
    soup_icons = inner_details.find_all('use')

    # first row of the page
    soup_row = soup.find_all('div', attrs={'class': 'row no-gutters'})[0]

    return short_id, soup_row, soup_icons

In [67]:
# processing of bs4 objects, related to the icons on the page
def processIcons(soup):
    """Read the value displayed next to each recognised icon.

    Parameters
    ----------
    soup : iterable
        Icon elements (e.g. the ``use`` tags returned by getSoup). Each must
        offer ``get('xlink:href')`` and ``find_parent()``.

    Returns
    -------
    dict
        Maps the icon's ``xlink:href`` value (e.g. '#icon-rooms') to the
        stripped text of the second ``span`` inside the icon's grandparent
        element. Icons with an unrecognised href are skipped; if an href
        occurs more than once the last occurrence wins.

    Raises
    ------
    IndexError
        If a recognised icon's grandparent holds fewer than two spans.
    """
    # every icon is read the same way, so a single lookup collection suffices;
    # it is built once instead of on every loop iteration
    known_icons = ('#icon-rooms', '#icon-floor',
                   '#icon-square', '#icon-lot-size', '#icon-construction-year',
                   '#icon-energy', '#icon-taxes', '#icon-basement-size')

    d = {}

    for i in soup:
        icon_name = i.get('xlink:href')
        if icon_name not in known_icons:
            continue

        # the value is read from the second span of the icon's grandparent
        container = i.find_parent().find_parent()
        spans = container.find_all('span')
        d[icon_name] = spans[1].get_text().strip()

    return d

In [92]:
def processRow(soup):
    """Pull address, price and creation text out of a listing's first row.

    Parameters
    ----------
    soup : bs4 element
        The row returned by getSoup; it is searched with ``find_all``.

    Returns
    -------
    dict
        'address': text of the first span with class 'text-muted',
        'price':   text of the first span with class 'font-weight-bolder',
        'created': text of the last p with class 'ng-star-inserted'.

    Raises
    ------
    IndexError
        If any of the three elements is missing from the row.
    """
    # entries are evaluated top to bottom: address, then price, then created
    return {
        'address': soup.find_all('span', attrs={'class': 'text-muted'})[0].get_text(),
        'price': soup.find_all('span', attrs={'class': 'font-weight-bolder'})[0].get_text(),
        'created': soup.find_all('p', attrs={'class': 'ng-star-inserted'})[-1].get_text(),
    }

In [105]:
def getListingData(url_list, limit=10):
    """Scrape the detail page of each listing into one DataFrame.

    Parameters
    ----------
    url_list : sequence of str
        Site-relative listing paths, each handed to getSoup.
    limit : int or None, optional
        Maximum number of entries of ``url_list`` to process, taken from the
        front. Defaults to 10; pass ``None`` to process every entry.

    Returns
    -------
    pandas.DataFrame
        One row per processed listing: an 'id' column followed by the keys
        produced by processRow and processIcons.
    """
    selected = url_list if limit is None else url_list[:limit]

    housingDetails = []
    for b in selected:

        # get soup
        short_id, soup_row, soup_icons = getSoup(b)

        # merge id, row details and icon details (in that order) into one record
        housingDetails.append({
            'id': short_id,
            **processRow(soup_row),
            **processIcons(soup_icons),
        })

    return pd.DataFrame(housingDetails)

### Process all to dataframe

In [106]:
# scrape the listing detail pages into a DataFrame and preview the first rows
# NOTE: as called here, getListingData only fetches the first 10 entries of `listings`
df = getListingData(listings)
df.head()

Unnamed: 0,id,price,address,created,#icon-square,#icon-lot-size,#icon-rooms,#icon-floor,#icon-construction-year,#icon-energy,#icon-taxes,#icon-basement-size
0,/1485565/,4.495.000 kr.,"Enhøjsvej 22, 3450 Allerød",Oprettet 24. sep. 2018,219 m²,1.182 m²,7,0. sal,1949,D,5.315 kr. / md.,102 m²
1,/1511475/,5.245.000 kr.,"Skovvejen 21 3450 Allerød, 3450 Allerød",Oprettet 27. dec. 2018,173 m²,681 m²,4,0. sal,2019,-,7.732 kr. / md.,0 m²
2,/1572772/,14.950.000 kr.,"Kollerød Bygade 23, Kollerød, 3450 Allerød",Oprettet 29. jun. 2019,250 m²,147.974 m²,10,0. sal,1996,A10,4.514 kr. / md.,0 m²
3,/1606136/,5.295.000 kr.,"Bybrøndstræde 1, 3500 Værløse",Oprettet 17. okt. 2019,185 m²,647 m²,6,0. sal,1850,C,5.061 kr. / md.,0 m²
4,/1620436/,13.500.000 kr.,"Kollerødvej 51, 3450 Allerød",Oprettet 10. dec. 2019,501 m²,4.059 m²,9,0. sal,1823,D,15.535 kr. / md.,0 m²
