In [1]:
import requests
from bs4 import BeautifulSoup

import re

import time
import pandas as pd
import numpy as np

from datetime import date

import json
import os

In [2]:
def _latest_free_sale(sales):
    """Return ``(price, date)`` for the first 'Fri handel' entry in ``sales``.

    The price string has its '.' characters stripped before conversion to
    ``int`` (e.g. ``'3.850.000'`` -> ``3850000``). ``(None, None)`` is returned
    when no entry has ``saleType == 'Fri handel'``.

    NOTE(review): "first" is treated as "latest" — this presumably relies on
    the site listing sales newest-first; confirm against the page payload.
    """
    for sale in sales:
        if sale['saleType'] == 'Fri handel':
            return int(sale['salePrice'].replace('.', '')), sale['saleDate']
    return None, None


def get_data(page_url, timeout=30):
    """Scrape one result page and return its property sales keyed by address id.

    The page body is searched for the first ``<script>`` element containing an
    assignment (``<script>name = {...}``); the assigned value is parsed as JSON
    and the records under ``searchResult -> result -> propertySales`` are
    flattened.

    Parameters
    ----------
    page_url : str
        URL of the page to download.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the notebook forever.

    Returns
    -------
    dict
        ``{addressId: {...}}`` with the keys ``square_meters``, ``postal``,
        ``city``, ``address``, ``lat``, ``lng``, ``latest_sale_price``,
        ``latest_sale_date`` and ``square_meters_price``. ``lat``/``lng`` are
        ``None`` when the record has no coordinates. The sale fields are
        ``None`` when the record has no 'Fri handel' sale, and
        ``square_meters_price`` is also ``None`` when the area is 0.
        An empty dict is returned when no matching ``<script>`` is found.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` is exceeded.
    json.JSONDecodeError
        When the text following the assignment is not valid JSON.
    KeyError
        When the payload does not have the expected structure.
    """
    page = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    body_html = str(soup.find("body"))

    # Locate the start of the value assigned inside the first <script> tag.
    payload_start = re.search(r"""<script>[^=]+=\s*""", body_html)
    if payload_start is None:
        return {}

    # raw_decode reads exactly one JSON value and ignores whatever follows it,
    # so a ';' inside a string value no longer truncates the payload.
    payload, _ = json.JSONDecoder().raw_decode(body_html, payload_start.end())
    records = payload['searchResult']['result']['propertySales']

    output = {}
    for r in records:
        square_meters = int(r['residentialArea'])

        if r['mapPosition']['hasCoordinates']:
            lat = r['mapPosition']['latLng']['lat']
            lng = r['mapPosition']['latLng']['lng']
        else:
            lat, lng = None, None

        # Both values are reset for every record, so a property without a
        # qualifying sale never inherits the previous property's date.
        latest_sale_price, latest_sale_date = _latest_free_sale(r['sales'])

        # Only compute a price per square metre when it is well defined.
        if latest_sale_price is not None and square_meters:
            square_meters_price = latest_sale_price / square_meters
        else:
            square_meters_price = None

        output[r['addressId']] = {
            'square_meters': square_meters,
            'postal': r['postal'],
            'city': r['city'],
            'address': r['address'],
            'lat': lat,
            'lng': lng,
            'latest_sale_price': latest_sale_price,
            'latest_sale_date': latest_sale_date,
            'square_meters_price': square_meters_price,
        }

    return output

In [3]:
# Smoke test: fetch a single result page (page 3 of sold owner-occupied flats
# in the koebenhavn municipality) and keep the parsed records for inspection.
test_url = (
    r"https://www.boligsiden.dk/salgspris/solgt/ejerlejlighed/3"
    r"?periode.from=2010-01-01&sortdescending=true&sort=salgsdato"
    r"&kommune=koebenhavn&periode.to=2022-12-31&salgstype=Free"
)

test_dict = get_data(page_url=test_url)

In [4]:
# Preview the first five scraped records, one row per address id.
pd.DataFrame.from_dict(test_dict, orient="index").head(5)

Unnamed: 0,square_meters,postal,city,address,lat,lng,latest_sale_price,latest_sale_date,square_meters_price
sommerstedgade-17-3-tv-1718-koebenhavn-v-01016724__17__3__tv,60,1718,København V,"Sommerstedgade 17, 3. tv",55.666298,12.556574,3850000,02-02-2022,64166.666667
peter-fabers-gade-18-3-tv-2200-koebenhavn-n-01015492__18__3__tv,51,2200,København N,"Peter Fabers Gade 18, 3. tv",55.691002,12.555524,3500000,02-02-2022,68627.45098
broenshoejvej-12a-1-tv-2700-broenshoej-01010944_12a__1__tv,96,2700,Brønshøj,"Brønshøjvej 12A, 1. tv",55.703332,12.497554,3650000,02-02-2022,38020.833333
ved-soenderport-18-5-tv-2300-koebenhavn-s-01018104__18__5__tv,62,2300,København S,"Ved Sønderport 18, 5. tv",55.666579,12.60178,3150000,01-02-2022,50806.451613
strandboulevarden-97-5-tv-2100-koebenhavn-oe-01016988__97__5__tv,88,2100,København Ø,"Strandboulevarden 97, 5. tv",55.707377,12.585439,4950000,01-02-2022,56250.0


In [16]:
# Scrape result pages 1..11 for one municipality and append the rows to a
# per-municipality CSV file (';'-separated, UTF-8).
# The values below do not change between pages, so they are set once up front.
kommune = 'HERLEV'
url_template = r"https://www.boligsiden.dk/salgspris/solgt/ejerlejlighed/{}?periode.from=2010-01-01&sortdescending=true&sort=salgsdato&kommune={}&periode.to=2022-12-31&salgstype=Free"
output_path = 'boligsiden_data_{}.csv'.format(kommune)

for i in range(1, 11 + 1):
    # Progress message every 20 pages (only reached on runs of 20+ pages).
    if i % 20 == 0:
        print('page: ', str(i))

    url = url_template.format(i, kommune)
    _dict = get_data(url)
    if not _dict:
        # An empty frame has no columns: writing it first would create the
        # file with a column-less header, and every later page would then be
        # appended without column names.
        continue

    _df = pd.DataFrame.from_dict(_dict, orient='index')

    # Append mode: the header is written only when the file does not exist yet.
    # NOTE: re-running this cell appends the same rows again; delete the file
    # first for a clean re-run.
    _df.to_csv(output_path, mode='a', sep=';', encoding='utf-8', header=not os.path.exists(output_path))