# Exam Project - Scraping BoligPortalen

In [43]:
# Import packages:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import tqdm
import time

# Import packages:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm  # Import tqdm for the progress bar

# Define function to get the data from the website: BoligPortal 
def scrape_data(offset):
    # Define the base URL:
    base_url = 'https://www.boligportal.dk/lejeboliger/?min_rental_period=0&offset={}'

    # Create the complete URL with the given offset:
    url = base_url.format(offset)

    # Connect to site:
    response = requests.get(url, headers={'name':'Jesper Højberg Knudsen','email':'fmw786@econ.ku.dk'})

    # Parse data with BeautifulSoup:
    soup = BeautifulSoup(response.content, 'lxml')

    # Find links to the individual ads:
    links = soup.find_all('a', class_='AdCardSrp__Link css-17x8ssx')

    # Make a list of URLs:
    url_list = [link['href'] for link in links]

    return url_list

# Set the initial offset and the step size:
initial_offset = 0
step_size = 18

# Define the number of iterations you want to perform:
num_iterations = 10  # You can change this as needed

# Create an empty list to store all URLs:
all_urls = []

# Loop through the desired number of iterations:
for i in tqdm(range(num_iterations), desc="Scraping URLs"):  # Use tqdm for progress bar
    offset = initial_offset + (i * step_size)
    urls = scrape_data(offset)
    all_urls.extend(urls)

# Make final list of working URLs:
final_urls = []

for url in all_urls:
    temp_url = 'https://www.boligportal.dk' + url
    final_urls.append(temp_url)

# Print all final scraped URLs:
for url in final_urls:
    print(url)


Scraping URLs: 100%|██████████| 10/10 [00:29<00:00,  2.99s/it]

https://www.boligportal.dk/lejligheder/aalborg/102m2-4-vaer-id-5378405
https://www.boligportal.dk/lejligheder/n%C3%A6stved/63m2-2-vaer-id-5150925
https://www.boligportal.dk/huse/bredsten/114m2-4-vaer-id-5378377
https://www.boligportal.dk/lejligheder/stubbek%C3%B8bing/66m2-2-vaer-id-5314357
https://www.boligportal.dk/lejligheder/aalborg/60m2-2-vaer-id-5378335
https://www.boligportal.dk/lejligheder/k%C3%B8benhavn/100m2-4-vaer-id-5334883
https://www.boligportal.dk/lejligheder/aalborg/65m2-2-vaer-id-5378340
https://www.boligportal.dk/lejligheder/k%C3%B8benhavn/99m2-4-vaer-id-5378402
https://www.boligportal.dk/lejligheder/hj%C3%B8rring/35m2-1-vaer-id-5378349
https://www.boligportal.dk/lejligheder/aalborg/77m2-3-vaer-id-5378332
https://www.boligportal.dk/lejligheder/k%C3%B8benhavn/102m2-4-vaer-id-5359564
https://www.boligportal.dk/r%C3%A6kkehuse/bryrup/101m2-5-vaer-id-5378400
https://www.boligportal.dk/lejligheder/horsens/48m2-2-vaer-id-5378381
https://www.boligportal.dk/lejligheder/aalborg/




In [44]:
# List of information we want to scarpe:
adress_list = []
monthly_rent_list = []
aconto_list = []
sqm_list = []
nr_rooms_list = []
floor_list = []
property_type_list = []
furnished_list = []
shareable_list = []
pets_allowed_list = []
elevator_list = []
senior_frendly_list = []
only_for_students_list = []
balcony_or_terrace_list = []
parking_list = []
energy_label_list = []
description_list = []

# Initialize a counter to keep track of the number of scraped URLs
scraped_count = 0

# Loop through all the pages
for i in tqdm(range(len(final_urls)), desc="Scraping Data"):
    # Scraping
    url = final_urls[i]
    response = requests.get(url, headers={'name':'Jesper Højberg Knudsen','email':'fmw786@econ.ku.dk'})
    soup = BeautifulSoup(response.text, 'lxml')
    
    # Adress
    # Find the label "css-11fbmqw" within the section
    element = soup.find('span', class_='css-11fbmqw')
    if element:
        # Extract the corresponding value
        adress_element = element.find_next('div', class_='css-v49nss')

        if adress_element:
            # Extract the text content and add it to the list
            adress_list.append(adress_element.text.strip())
        else:
            adress_list.append('N/A') # Handle case if the value is not found

    else:
        adress_list.append('N/A') # Handle case if the label is not found


    # Monthly rent
    # Find the label "Månedlig leje" within the section
    label_element = soup.find('span', text='Månedlig leje', class_='css-arxwps')
    
    if label_element:
        # Extract the corresponding value
        monthly_rent_element = label_element.find_next('span', class_='css-1h46kg2')
        
        if monthly_rent_element:
            # Extract the text content and add it to the list
            monthly_rent_list.append(monthly_rent_element.text.strip())
        else:
            monthly_rent_list.append('N/A')  # Handle case if the value is not found
    else:
        monthly_rent_list.append('N/A')  # Handle case if the label is not found

    # Aconto
    # Find the label "Aconto" within the section
    aconto_element = soup.find('span', class_='css-arxwps', text='Aconto')

    if aconto_element:
        # Extract the corresponding value
        aconto_element = aconto_element.find_next('span', class_='css-1h46kg2')

        if aconto_element:
            # Extract the text content and add it to the list
            aconto_list.append(aconto_element.text.strip())
        else:
            aconto_list.append('N/A')
    else:
        aconto_list.append('N/A')
        
    # Square meters
    # Find the label "Størrelse" within the section
    square_element = soup.find('span', class_='css-arxwps', text='Størrelse')

    if square_element:
        # Extract the corresponding value
        square_element = square_element.find_next('span', class_='css-1h46kg2')

        if square_element:
            # Extract the text content and add it to the list
            sqm_list.append(square_element.text.strip())
        else:
            sqm_list.append('N/A')
    else:
        sqm_list.append('N/A')


    # Number of rooms
    # Find the label "Værelser" within the section
    nr_rooms_element = soup.find('span', class_='css-arxwps', text='Værelser')

    if nr_rooms_element:
        # Extract the corresponding value
        nr_rooms_element = nr_rooms_element.find_next('span', class_='css-1h46kg2')

        if nr_rooms_element:
            # Extract the text content and add it to the list
            nr_rooms_list.append(nr_rooms_element.text.strip())
        else:
            nr_rooms_list.append('N/A')
    else:
        nr_rooms_list.append('N/A')
    
    # Floor
    # Find the label "Etage" within the section
    floor_element = soup.find('span', class_='css-arxwps', text='Etage')

    if floor_element:
        # Extract the corresponding value
        floor_element = floor_element.find_next('span', class_='css-1h46kg2')

        if floor_element:
            # Extract the text content and add it to the list
            floor_list.append(floor_element.text.strip())
        else:
            floor_list.append('N/A')
    else:
        floor_list.append('N/A')

    # Property type
    # Find the label "Boligtype" within the section
    property_type_element = soup.find('span', class_='css-arxwps', text='Boligtype')
    if property_type_element:
        # Extract the corresponding value
        property_type = property_type_element.find_next('span', class_='css-1h46kg2')
        if property_type:
            # Extract the text content and add it to the list
            property_type_list.append(property_type.text.strip())
        else:
            property_type_list.append('N/A')
    else:
        property_type_list.append('N/A')

    # Furnished
    # Find the label "Møbleret" within the section
    furnished_element = soup.find('span', class_='css-arxwps', text='Møbleret')

    if furnished_element:
        # Extract the corresponding value
        furnished_element = furnished_element.find_next('span', class_='css-1h46kg2')

        if furnished_element:
            # Extract the text content and add it to the list
            furnished_list.append(furnished_element.text.strip())
        else:
            furnished_list.append('N/A')
    else:
        furnished_list.append('N/A')

    # Shareable
    # Find the label "Delevenlig" within the section
    shareable_element = soup.find('span', class_='css-arxwps', text='Delevenlig')

    if shareable_element:
        # Extract the corresponding value
        shareable_element = shareable_element.find_next('span', class_='css-1h46kg2')

        if shareable_element:
            # Extract the text content and add it to the list
            shareable_list.append(shareable_element.text.strip())
        else:
            shareable_list.append('N/A')
    else:
        shareable_list.append('N/A')

    # Pets allowed
    # Find the label "Husdyr tilladt" within the section
    pets_allowed_element = soup.find('span', class_='css-arxwps', text='Husdyr tilladt')

    if pets_allowed_element:
        # Extract the corresponding value
        pets_allowed_element = pets_allowed_element.find_next('span', class_='css-1h46kg2')

        if pets_allowed_element:
            # Extract the text content and add it to the list
            pets_allowed_list.append(pets_allowed_element.text.strip())
        else:
            pets_allowed_list.append('N/A')
    else:
        pets_allowed_list.append('N/A')

    # elevator
    # Find the label "Elevator" within the section
    elevator_element = soup.find('span', class_='css-arxwps', text='Elevator')

    if elevator_element:
        # Extract the corresponding value
        elevator_element = elevator_element.find_next('span', class_='css-1h46kg2')

        if elevator_element:
            # Extract the text content and add it to the list
            elevator_list.append(elevator_element.text.strip())
        else:
            elevator_list.append('N/A')
    else:
        elevator_list.append('N/A')

    # Seniors frendly
    # Find the label "Seniorvenlig" within the section
    seniors_frendly_element = soup.find('span', class_='css-arxwps', text='Seniorvenlig')

    if seniors_frendly_element:
        # Extract the corresponding value
        seniors_frendly_element = seniors_frendly_element.find_next('span', class_='css-1h46kg2')

        if seniors_frendly_element:
            # Extract the text content and add it to the list
            senior_frendly_list.append(seniors_frendly_element.text.strip())
        else:
            senior_frendly_list.append('N/A')
    else:
        senior_frendly_list.append('N/A')

    # Students only
    students_only_element = soup.find('span', class_='css-arxwps', text='Kun for studerende')

    if students_only_element:
        # Extract the corresponding value
        students_only_element = students_only_element.find_next('span', class_='css-1h46kg2')

        if students_only_element:
            # Extract the text content and add it to the list
            only_for_students_list.append(students_only_element.text.strip())
        else:
            only_for_students_list.append('N/A')
    else:
        only_for_students_list.append('N/A')

        
    # Balcony or terrace
    balcony_element = soup.find('span', class_='css-arxwps', text='Altan/terrasse')

    if balcony_element:
        # Extract the corresponding value
        balcony_element = balcony_element.find_next('span', class_='css-1h46kg2')

        if balcony_element:
            # Extract the text content and add it to the list
            balcony_or_terrace_list.append(balcony_element.text.strip())
        else:
            balcony_or_terrace_list.append('N/A')
    else:
        balcony_or_terrace_list.append('N/A')

    # Parking
    parking_element = soup.find('span', class_='css-arxwps', text='Parkering')

    if parking_element:
        # Extract the corresponding value
        parking_element = parking_element.find_next('span', class_='css-1h46kg2')

        if parking_element:
            # Extract the text content and add it to the list
            parking_list.append(parking_element.text.strip())
        else:
            parking_list.append('N/A')
    else:
        parking_list.append('N/A')

    # Energy label
    # Find: <img src="/static/images/energy_labels/C_str2.png" class="css-rdsunt">
    # Get "C_str2" from the src attribute
    energy_label_element = soup.find('span', class_='css-arxwps', text='Energimærke')
    energy_label_list1 = []

    if energy_label_element:
        # Extract the corresponding value
        energy_label_element = energy_label_element.find_next('img', class_='css-rdsunt')

        if energy_label_element:
            # Extract the text content and add it to the list
            energy_label_list1.append(energy_label_element['src'].split('/')[-1].split('_')[0])
            # Get the first letter of C_str2
            energy_label_list.append(energy_label_list1[0][0])

        else:
            energy_label_list.append('N/A')
    else:
        energy_label_list.append('N/A')

    # Find the div with class "css-1f7mpex" for the description
    description_element = soup.find('div', class_='css-1f7mpex')

    # Initialize an empty string to store the description
    description_text = ""

    # Check if the description element is found
    if description_element:
        # Extract the text content of the description
        description_text = description_element.get_text(separator=' ', strip=True)

    # If description_text is empty, set it to "N/A"
    if not description_text:
        description_text = "N/A"

    # Add the description text to the list
    description_list.append(description_text)

    # Increment the counter
    scraped_count += 1

    # Check if 100 URLs have been scraped and save the dataframe
    if scraped_count % 100 == 0:
        chunk_number = scraped_count // 100
        df_chunk = pd.DataFrame({
            'Adress': adress_list,
            'Monthly rent': monthly_rent_list,
            'Aconto': aconto_list,
            'Square meters': sqm_list,
            'Rooms': nr_rooms_list,
            'Floor': floor_list,
            'Property type': property_type_list,
            'Furnished': furnished_list,
            'Shareable': shareable_list,
            'Pets allowed': pets_allowed_list,
            'Elevator': elevator_list,
            'Senior frendly': senior_frendly_list,
            'Only for students': only_for_students_list,
            'Balcony/Terrace': balcony_or_terrace_list,
            'Parking': parking_list,
            'Energy label': energy_label_list,
            'Description': description_list,
            'Link': final_urls[:scraped_count]  # Save only up to the current count
        })
        
        # Save the dataframe
        df_chunk.to_csv(f'data_chunk_{chunk_number}.csv', index=False)

    # Pause for a short time before scraping the next page
    time.sleep(0.5)

all_chunks = []
for chunk_number in range(1, (scraped_count // 100) + 1):
    chunk_filename = f'data_chunk_{chunk_number}.csv'
    chunk_df = pd.read_csv(chunk_filename)
    all_chunks.append(chunk_df)

# Concatenate all chunks
final_df = pd.concat(all_chunks, ignore_index=True)

# Add the remaining data to the final dataframe
remaining_data = {
    'Adress': adress_list,
    'Monthly rent': monthly_rent_list,
    'Aconto': aconto_list,
    'Square meters': sqm_list,
    'Rooms': nr_rooms_list,
    'Floor': floor_list,
    'Property type': property_type_list,
    'Furnished': furnished_list,
    'Shareable': shareable_list,
    'Pets allowed': pets_allowed_list,
    'Elevator': elevator_list,
    'Senior frendly': senior_frendly_list,
    'Only for students': only_for_students_list,
    'Balcony/Terrace': balcony_or_terrace_list,
    'Parking': parking_list,
    'Energy label': energy_label_list,
    'Description': description_list,
    'Link': final_urls[:scraped_count]
}
remaining_df = pd.DataFrame(remaining_data)

# Concatenate the remaining data to the final dataframe
final_df = pd.concat([final_df, remaining_df], ignore_index=True)

# Save the final dataframe
final_df.to_csv('final_data.csv', index=False)


Scraping Data: 100%|██████████| 240/240 [03:35<00:00,  1.11it/s]


In [45]:
# Display the first 120 rows of the dataframe
final_df.head(120)

Unnamed: 0,Adress,Monthly rent,Aconto,Square meters,Rooms,Floor,Property type,Furnished,Shareable,Pets allowed,Elevator,Senior frendly,Only for students,Balcony/Terrace,Parking,Energy label,Description,Link
0,"Poul Anker Bechs Vej, 9200 Aalborg, Aalborg SV...",9.770 kr.,700 kr.,102 m²,4,2.,Lejlighed,Nej,Nej,Ja,Ja,Ja,Nej,Ja,Ja,,Hasserisparken ligger på det naturskønne Hasse...,https://www.boligportal.dk/lejligheder/aalborg...
1,"Kildemarksvej, 4700 Næstved - 1. sal",6.065 kr.,1.135 kr.,63 m²,2,1.,Lejlighed,Nej,Ja,Nej,Nej,Nej,Nej,Nej,Ja,,"Stor, lys 2 værelse lejlighed, bestående af en...",https://www.boligportal.dk/lejligheder/n%C3%A6...
2,"Vestervang, 7182 Bredsten",7.600 kr.,350 kr.,114 m²,4,-,Hus,Nej,Nej,Ja,Nej,Nej,Nej,Ja,Ja,E,Huset indeholder en stor vinkelstue med pilleb...,https://www.boligportal.dk/huse/bredsten/114m2...
3,"Møllegårds Alle, 4850 Stubbekøbing - 1. sal",5.500 kr.,600 kr.,66 m²,2,1.,Lejlighed,Nej,Nej,Nej,Nej,Nej,Nej,Nej,Nej,D,Hyggelig ejendom med fire lejemål med fælles h...,https://www.boligportal.dk/lejligheder/stubbek...
4,"Fyensgade, 9000 Aalborg, Aalborg - 4. sal",5.845 kr.,900 kr.,60 m²,2,4.,Lejlighed,Nej,Nej,Nej,Nej,Nej,Nej,Ja,Nej,,Lejligheden byder velkommen med en indbydende ...,https://www.boligportal.dk/lejligheder/aalborg...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,"Tornhøjvej, 9220 Aalborg, Aalborg Øst - 2. sal",7.232 kr.,425 kr.,82 m²,3,2.,Lejlighed,Nej,Nej,Ja,Ja,Nej,Nej,Ja,Nej,,"De nye lejeboliger er med 1-, 2- eller 3-værel...",https://www.boligportal.dk/lejligheder/aalborg...
116,"Miliegræsset, 5220 Odense, Odense SØ",7.800 kr.,,69 m²,3,-,Lejlighed,Nej,Nej,Ja,Nej,Nej,Nej,Ja,Ja,,I attraktive Fraugde udlejes dette dejlige 2 p...,https://www.boligportal.dk/lejligheder/odense/...
117,"Miliegræsset, 5220 Odense, Odense SØ",7.800 kr.,,69 m²,3,-,Lejlighed,Nej,Nej,Ja,Nej,Nej,Nej,Ja,Ja,,I attraktive Fraugde udlejes dette dejlige 2 p...,https://www.boligportal.dk/lejligheder/odense/...
118,"Elektronikvej, 2605 København, Brøndby - 2. sal",13.800 kr.,1.150 kr.,102 m²,4,2.,Lejlighed,Nej,Ja,Ja,Ja,Ja,Nej,Ja,Ja,A,Flyt ind på Mekanikken - nybygget ejendom fra ...,https://www.boligportal.dk/lejligheder/k%C3%B8...
