# Setup

In [1]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## Connect
Set up connection

In [2]:
# Boliga search-result page the scraper starts from.
# The query string is written one parameter per line so single filters are easy to spot.
URL = (
    'https://www.boliga.dk/salg/resultater'
    '?propertyType=1,2,3'
    '&municipality=101'
    '&salesDateMin=2015'
    '&salesDateMax=today'
    '&searchTab=1'
    '&sort=date-d'
    '&page=1'
)

# Request headers that mimic a desktop Chrome browser
''' Link to find user agent
https://httpbin.org/get
'''
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/97.0.4692.99 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'Accept-Encoding': 'gzip, deflate, br',
}

## Functions

### Function 1
Create a function that scrapes a single page of search results from Boliga.

In [3]:
def fun_scrape_boliga(
    wd, # Webdriver session
    do_print  = 0 # set 1 to print
):
    """Scrape the result table of the page currently loaded in ``wd``.

    Waits for the sold-properties table to be present, then reads the text of
    every ``<tr>`` element on the page, splits it on newlines and maps the
    pieces, in order, onto a fixed list of column names.

    Parameters
    ----------
    wd : selenium WebDriver
        Session that has already navigated to a Boliga search-result page.
    do_print : int or bool, default 0
        Truthy to print one progress line per parsed row.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` element, indexed 0..n-1, with the nine columns
        listed in ``l_col_names``. Rows with fewer text fields than columns
        get NaN in the missing columns; surplus fields are dropped.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the table (10 s) or its rows (20 s) do not appear in time.
    """

    # Column names, in the order the fields appear in a row's text
    l_col_names = ['Type', 'StreetAddress','Area', 'Price', 'Date','SellType', 'Size', 'Price_m2', 'Rooms_YearBuilt_PriceChange']

    # Block until the result table is in the DOM; the element itself is not needed
    WebDriverWait(wd, 10).until(
        EC.presence_of_element_located((By.XPATH, '/html/body/app-root/app-scroll-position-restoration/app-main-layout/app-sold-properties-list/div[3]/app-sold-list-table/table'))
    )

    # Find table rows (looked up page-wide, not only inside the table above)
    l_table_rows = WebDriverWait(wd, 20).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, 'tr'))
    )

    # Collect one dict per row and build the frame once at the end; this avoids
    # creating a DataFrame per row and gives the result a unique 0..n-1 index
    l_records = []
    for count, r in enumerate(l_table_rows, start=1):

        # Each field of a row is rendered on its own line of text
        l_values = r.text.split('\n')

        # zip stops at the shorter sequence, so short rows simply lack keys
        l_records.append(dict(zip(l_col_names, l_values)))

        # Print row number
        if do_print:
            print('Row {} parsed'.format(count))

    # Passing the column list keeps all nine columns even if every row is short
    df_out = pd.DataFrame(l_records, columns=l_col_names)

    print('Page parsed')
    return df_out


### Function 2
Create a function that loops through all result pages of a Boliga search.

In [4]:
def fun_loop_boliga(
    wd, # Webdriver session
    date, # date to add to file names
    max_pages = 2000, # maximum number of pages to search through
    sleep_time = 2 # wait time
):
    """Scrape consecutive result pages and return them as one data frame.

    Reads the page count from the numeric 'page-button' elements, caps it at
    ``max_pages``, then for each page: sleeps, scrapes the page with
    ``fun_scrape_boliga``, writes it to
    ``Files/BoligaScraping_<date>_Page_<i>.csv`` and, unless it was the last
    page to scrape, clicks the next-page link.

    Parameters
    ----------
    wd : selenium WebDriver
        Session already showing the first result page.
    date : any
        Value inserted into the per-page file names via ``str(date)``.
    max_pages : int, default 2000
        Upper bound on the number of pages to scrape.
    sleep_time : int or float, default 2
        Seconds to sleep before scraping each page.

    Returns
    -------
    pandas.DataFrame
        All scraped pages stacked with a fresh 0..n-1 index; an empty frame
        when no page was scraped (e.g. ``max_pages <= 0``).
    """

    ## Find number of pages ##
    # Find buttons (find_elements + By also works on Selenium versions where
    # the find_elements_by_* helpers no longer exist)
    l_buttons = wd.find_elements(By.CLASS_NAME, 'page-button')

    # For each button, keep its label if it is a page number
    l_n_pages = []
    for button in l_buttons:
        try:
            l_n_pages.append(int(button.text))
        except (TypeError, ValueError):
            # Non-numeric buttons (arrows, ellipsis, empty labels) are skipped
            continue

    # Highest page number shown; with no numeric buttons assume a single page
    n_pages = max(l_n_pages, default=1)
    print('Number of pages: {}'.format(n_pages))

    # Change number of pages if maximum is exceeded
    if max_pages < n_pages:
        n_pages = max_pages
        print('Number of pages set to max. pages: {}'.format(n_pages))


    ## Loop through pages ##
    # Make sure the folder for the per-page backups exists before scraping
    os.makedirs('Files', exist_ok=True)

    # Initialize empty list of data frames
    l_dfs = []

    for i in range(1, n_pages + 1):

        # Give the page time to render before reading it
        time.sleep(sleep_time)

        # Call function to scrape single page
        df = fun_scrape_boliga(wd)

        file = 'Files/BoligaScraping_' + str(date) + '_Page_{}.csv'.format(i)

        # Print each page to a file, in case of lost connection
        df.to_csv(file,
                 index = False)

        print(i)

        # Append data frame to list
        l_dfs.append(df)

        # Get next page; skipped after the final page, where a failing click
        # would otherwise discard everything scraped so far
        if i < n_pages:
            wd.find_element(By.XPATH, '/html/body/app-root/app-scroll-position-restoration/app-main-layout/app-sold-properties-list/div[3]/div/div/app-pagination/div/div[4]/a').click()


    # Concatenate data frames; pd.concat rejects an empty list, so return an
    # empty frame when nothing was scraped
    if l_dfs:
        df_out = pd.concat(l_dfs, ignore_index=True)
    else:
        df_out = pd.DataFrame()

    print('Done!')
    return df_out
                               
                               

# Scrape 

In [5]:
# Initialize Chrome
# NOTE(review): `executable_path` is the Selenium 3 spelling; it is deprecated in
# Selenium 4 and rejected by later 4.x releases, which expect a Service object.
wd = webdriver.Chrome(executable_path='chromedriver.exe')

try:
    # Go to page
    wd.get(URL)

    # Click to pass Cookie pop-up; wait until the button is clickable, because
    # the pop-up may not be rendered yet when the page load returns
    WebDriverWait(wd, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="coiPage-1"]/div[2]/div[1]/button[2]'))
    ).click()

    # Run function to scrape website
    df = fun_loop_boliga(wd,
                         date = 20220314,
                         max_pages = 1,
                         sleep_time = 3)
finally:
    # Always close the browser so repeated runs do not leave Chrome and
    # chromedriver processes behind, even when scraping fails
    wd.quit()


Number of pages: 1087
Number of pages set to max. pages: 1
Page parsed
1
Done!


Check data

In [7]:
# Sanity check: frame dimensions as a (rows, columns) tuple
print(str(df.shape))

# Preview the first five scraped rows; left as the cell's last expression
df.head(n=5)

(50, 9)


Unnamed: 0,Type,StreetAddress,Area,Price,Date,SellType,Size,Price_m2,Rooms_YearBuilt_PriceChange
0,E,"Damagervej 5B, st. tv",2450 København SV,1.300.000 kr.,02-03-2022,Fam. Salg,66 m²,19.697 kr/m²,2 1953 Aktuel værdi
0,E,"Hobrogade 4, 4. tv",2100 København Ø,1.955.000 kr.,02-03-2022,Fam. Salg,72 m²,27.153 kr/m²,3 1900 Aktuel værdi
0,E,"Havdrupvej 11, st. tv",2700 Brønshøj,2.000.000 kr.,02-03-2022,Alm. Salg,54 m²,37.037 kr/m²,2 1937 -5% Aktuel værdi
0,E,"Esbern Snares Gade 14, 4. th",1725 København V,1.147.500 kr.,01-03-2022,Fam. Salg,61 m²,18.811 kr/m²,2 1907 Aktuel værdi
0,E,"Victor Borges Plads 4, 1. th",2100 København Ø,6.384.354 kr.,01-03-2022,Alm. Salg,171 m²,37.335 kr/m²,6 1900 Aktuel værdi


Write data

In [None]:
# Optional export: uncomment to write the scraped frame to a single CSV file.
#df.to_csv('Data_Boliga_20220208.csv', index  = False)