# Exam Project - Scraping BoligPortalen

In [1]:
# Import packages:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import tqdm
import time

# Import packages:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm  # Import tqdm for the progress bar

# Define function to get the data from the website: BoligPortal 
def scrape_data(offset):
    """Return the ad links found on one BoligPortal search-result page.

    Args:
        offset: Value substituted into the ``offset`` query parameter of the
            search URL (the caller advances it in steps to page through the
            results).

    Returns:
        list[str]: The ``href`` values at positions 3..20 (at most 18 items)
        of the anchors matching the ad-card link class. The hrefs are
        returned exactly as they appear in the page.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Define the base URL:
    base_url = 'https://www.boligportal.dk/lejeboliger/?min_rental_period=0&offset={}'

    # Create the complete URL with the given offset:
    url = base_url.format(offset)

    # Connect to site. requests has no default timeout, so without one a
    # single stalled connection would block the whole scraping loop forever.
    response = requests.get(
        url,
        headers={'name':'Jesper Højberg Knudsen','email':'fmw786@econ.ku.dk'},
        timeout=30,
    )

    # Parse data with BeautifulSoup:
    soup = BeautifulSoup(response.content, 'lxml')

    # Find all links to the individual ads:
    links = soup.find_all('a', class_='AdCardSrp__Link css-17x8ssx')

    # Make a list of URLs, ignoring any anchor that carries no href so a
    # malformed card cannot abort the run with a KeyError:
    url_list = [link['href'] for link in links if link.has_attr('href')]

    # Return the 4th to 21st URLs (indices 3 to 20).
    # NOTE(review): presumably the first three cards are not regular listings
    # (e.g. promoted ads) - confirm against the live page.
    return url_list[3:21]

# Set the initial offset and the step size (the offset is advanced by
# step_size for every result page that is requested):
initial_offset = 0
step_size = 18

# Define the number of iterations you want to perform:
num_iterations = 950  # You can change this as needed

# Create an empty list to store all URLs:
all_urls = []

# Loop through the desired number of iterations:
for i in tqdm(range(num_iterations), desc="Scraping URLs"):  # Use tqdm for progress bar
    offset = initial_offset + (i * step_size)
    urls = scrape_data(offset)
    all_urls.extend(urls)

# The scraped hrefs are site-relative; prefix the host to get working URLs:
final_urls_temp = ['https://www.boligportal.dk' + url for url in all_urls]

# Remove duplicates while keeping first-seen order. list(set(...)) would also
# de-duplicate, but its ordering changes from run to run (string hashing is
# randomised per process), which makes final_urls.csv and every file derived
# from it non-reproducible.
final_urls = list(dict.fromkeys(final_urls_temp))
num_duplicates_removed_url = len(final_urls_temp) - len(final_urls)

# Print number of duplicates removed:
print("Number of duplicates removed:", num_duplicates_removed_url)

# Save final urls to csv file:
pd.DataFrame(final_urls).to_csv('final_urls.csv', index=False, header=False)

# Print all final scraped URLs:
for url in final_urls:
    print(url)


Scraping URLs: 100%|██████████| 950/950 [16:03<00:00,  1.01s/it]

Number of duplicates removed: 2844
https://www.boligportal.dk/lejligheder/bramming/109m2-4-vaer-id-5375615
https://www.boligportal.dk/lejligheder/ikast/91m2-3-vaer-id-4590523
https://www.boligportal.dk/lejligheder/hedensted/83m2-3-vaer-id-5362866
https://www.boligportal.dk/lejligheder/kors%C3%B8r/98m2-3-vaer-id-5371905
https://www.boligportal.dk/lejligheder/viborg/100m2-3-vaer-id-4409083
https://www.boligportal.dk/lejligheder/engesvang/104m2-4-vaer-id-5359149
https://www.boligportal.dk/lejligheder/aarhus/114m2-4-vaer-id-5148557
https://www.boligportal.dk/lejligheder/randers/54m2-2-vaer-id-4826599
https://www.boligportal.dk/lejligheder/viborg/74m2-3-vaer-id-4950484
https://www.boligportal.dk/lejligheder/hiller%C3%B8d/94m2-3-vaer-id-5256818
https://www.boligportal.dk/lejligheder/horsens/67m2-2-vaer-id-5376236
https://www.boligportal.dk/r%C3%A6kkehuse/hedensted/84m2-3-vaer-id-5313249
https://www.boligportal.dk/lejligheder/hadsund/62m2-2-vaer-id-3767720
https://www.boligportal.dk/lejlighed




In [2]:
# Lists of information we want to scrape. Every scraped page appends exactly
# one entry to each list, so index k in every list refers to final_urls[k].
adress_list = []
monthly_rent_list = []
aconto_list = []
sqm_list = []
nr_rooms_list = []
floor_list = []
property_type_list = []
furnished_list = []
shareable_list = []
pets_allowed_list = []
elevator_list = []
senior_frendly_list = []
only_for_students_list = []
balcony_or_terrace_list = []
parking_list = []
energy_label_list = []
description_list = []

# Initialize a counter to keep track of the number of scraped URLs
scraped_count = 0

# Rows collected since the last chunk file was written
current_chunk = []

# Headers sent with every request
request_headers = {'name':'Jesper Højberg Knudsen','email':'fmw786@econ.ku.dk'}

# (output column, label text on the page, list collecting the values) for all
# facts that are rendered as a label span followed by a value span.
labelled_fields = [
    ('Monthly rent', 'Månedlig leje', monthly_rent_list),
    ('Aconto', 'Aconto', aconto_list),
    ('Square meters', 'Størrelse', sqm_list),
    ('Rooms', 'Værelser', nr_rooms_list),
    ('Floor', 'Etage', floor_list),
    ('Property type', 'Boligtype', property_type_list),
    ('Furnished', 'Møbleret', furnished_list),
    ('Shareable', 'Delevenlig', shareable_list),
    ('Pets allowed', 'Husdyr tilladt', pets_allowed_list),
    ('Elevator', 'Elevator', elevator_list),
    ('Senior frendly', 'Seniorvenlig', senior_frendly_list),
    ('Only for students', 'Kun for studerende', only_for_students_list),
    ('Balcony/Terrace', 'Altan/terrasse', balcony_or_terrace_list),
    ('Parking', 'Parkering', parking_list),
]


def _labelled_value(soup, label):
    """Return the stripped text of the value span following *label*, or 'N/A'.

    'N/A' is returned both when the label span is absent and when no value
    span follows it.
    """
    # `string=` is the current name of the keyword formerly spelled `text=`.
    label_element = soup.find('span', class_='css-arxwps', string=label)
    if label_element is None:
        return 'N/A'
    value_element = label_element.find_next('span', class_='css-1h46kg2')
    if value_element is None:
        return 'N/A'
    return value_element.text.strip()


def _address(soup):
    """Return the address text of the listing, or 'N/A' if it is not found."""
    marker = soup.find('span', class_='css-11fbmqw')
    if marker is None:
        return 'N/A'
    address_element = marker.find_next('div', class_='css-v49nss')
    if address_element is None:
        return 'N/A'
    return address_element.text.strip()


def _energy_label(soup):
    """Return the energy-label letter taken from the label image, or 'N/A'.

    The label is shown as an image such as
    <img src="/static/images/energy_labels/C_str2.png" class="css-rdsunt">;
    the letter is the first character of the file name ("C" here).
    """
    label_element = soup.find('span', class_='css-arxwps', string='Energimærke')
    if label_element is None:
        return 'N/A'
    image = label_element.find_next('img', class_='css-rdsunt')
    if image is None:
        return 'N/A'
    file_stem = image.get('src', '').split('/')[-1].split('_')[0]
    # An image without a usable src yields 'N/A' instead of an exception.
    return file_stem[:1] or 'N/A'


def _description(soup):
    """Return the listing description as one string, or 'N/A' if empty/missing."""
    description_element = soup.find('div', class_='css-1f7mpex')
    if description_element is None:
        return 'N/A'
    return description_element.get_text(separator=' ', strip=True) or 'N/A'


# Loop through all the pages
for i, url in enumerate(tqdm(final_urls, desc="Scraping Data")):
    # Scraping. The timeout keeps one stalled connection from blocking the
    # run indefinitely; request errors still propagate to the caller.
    response = requests.get(url, headers=request_headers, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')

    # Build the row in the column order used for the CSV files, mirroring
    # each value into its module-level list.
    row = {'Adress': _address(soup)}
    adress_list.append(row['Adress'])

    for column, label, values in labelled_fields:
        row[column] = _labelled_value(soup, label)
        values.append(row[column])

    row['Energy label'] = _energy_label(soup)
    energy_label_list.append(row['Energy label'])

    row['Description'] = _description(soup)
    description_list.append(row['Description'])

    row['Link'] = url

    # Append data to the current chunk
    current_chunk.append(row)

    # Increment the counter
    scraped_count += 1

    # Save after every 100 scraped URLs and once more after the last URL, so
    # an interrupted run loses at most the rows of the unfinished chunk.
    if scraped_count % 100 == 0 or i == len(final_urls) - 1:
        chunk_number = (scraped_count - 1) // 100 + 1  # 1-based chunk index

        df_chunk = pd.DataFrame(current_chunk)

        # Save the dataframe
        df_chunk.to_csv(f'data_chunk_{chunk_number}.csv', index=False)

        # Clear the current chunk for the next iteration
        current_chunk = []

    # Pause for a short time before scraping the next page
    time.sleep(0.5)

# Reassemble the intermediate chunk files into the final DataFrame.
# Ceiling division: one file exists per started block of 100 scraped pages.
num_chunks = -(-scraped_count // 100)

# Read the chunk files back in the order they were written
all_chunks = [
    pd.read_csv(f'data_chunk_{chunk_number}.csv')
    for chunk_number in range(1, num_chunks + 1)
]

# Stack all chunks on top of each other with a fresh 0..n-1 index
boligportalen_df = pd.concat(all_chunks, ignore_index=True)

# Row count before de-duplication
initial_row_count = boligportalen_df.shape[0]

# Keep only the first row for every distinct link
boligportalen_df = boligportalen_df.drop_duplicates(subset=['Link'])

# Row count after de-duplication and the resulting difference
final_row_count = boligportalen_df.shape[0]
duplicates_removed = initial_row_count - final_row_count

# Sanity check: no link may occur twice any more
duplicates_exist = boligportalen_df['Link'].duplicated().any()

# Persist the combined data set
boligportalen_df.to_csv('boligportalen_df.csv', index=False)

# Report the outcome of the de-duplication
print("Duplicates exist in the 'Link' column:", duplicates_exist)
print("Number of duplicates removed:", duplicates_removed)

boligportalen_df.head(1000)


Scraping Data: 100%|██████████| 14238/14238 [3:16:19<00:00,  1.21it/s]  


Duplicates exist in the 'Link' column: False
Number of duplicates removed: 0


Unnamed: 0,Adress,Monthly rent,Aconto,Square meters,Rooms,Floor,Property type,Furnished,Shareable,Pets allowed,Elevator,Senior frendly,Only for students,Balcony/Terrace,Parking,Energy label,Description,Link
0,"Sct Knuds Alle, 6740 Bramming - 1. sal",7.712 kr.,868 kr.,109 m²,4.0,1.,Lejlighed,Nej,Ja,Nej,Nej,Nej,Nej,Nej,Nej,,Flyt i lækkert nybyggeri centralt i Bramming\n...,https://www.boligportal.dk/lejligheder/brammin...
1,"Strøget, 7430 Ikast - 2. sal",5.725 kr.,900 kr.,91 m²,3.0,2.,Lejlighed,Nej,Ja,Nej,Nej,Nej,Nej,Ja,Nej,E,Central placeret 3 værelses lejlighed udlejes ...,https://www.boligportal.dk/lejligheder/ikast/9...
2,"Lykkehåbs Alle, 8722 Hedensted - 1. sal",8.545 kr.,1.000 kr.,83 m²,3.0,1.,Lejlighed,Nej,Nej,Ja,Ja,Ja,Nej,Ja,Ja,A,Lejlighederne er bygget i elegant skandinavisk...,https://www.boligportal.dk/lejligheder/hedenst...
3,"Annagade, 4220 Korsør - 1. sal",7.000 kr.,1.100 kr.,98 m²,3.0,1.,Lejlighed,Nej,Nej,Nej,Nej,Nej,Nej,Nej,Nej,,Stor 3 værelses lejlighed beliggende på Annaga...,https://www.boligportal.dk/lejligheder/kors%C3...
4,"Ramsvej, 8800 Viborg - 2. sal",6.990 kr.,1.000 kr.,100 m²,3.0,2.,Lejlighed,Nej,Ja,Nej,Nej,Nej,Nej,Nej,Nej,,Rigtig fin lejlighed med central beliggenhed ...,https://www.boligportal.dk/lejligheder/viborg/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"P. Knudsens Vej, 8930 Randers, Randers NØ - 2...",4.369 kr.,990 kr.,72 m²,2.0,2.,Lejlighed,Nej,Nej,Nej,Nej,Nej,Nej,Nej,Nej,,Lejemålet er beliggende på P. Knudsens Vej og ...,https://www.boligportal.dk/lejligheder/randers...
996,"Voltvej, 2605 København, Brøndby - 3. sal",13.000 kr.,1.050 kr.,97 m²,4.0,3.,Lejlighed,Nej,Ja,Ja,Ja,Nej,Nej,Ja,Nej,,Dette er historien om et område under forvandl...,https://www.boligportal.dk/lejligheder/k%C3%B8...
997,"Bifrostgade, 8230 Aarhus, Åbyhøj - Stuen",10.400 kr.,600 kr.,83 m²,3.0,Stuen,Lejlighed,Nej,Ja,Ja,Ja,Ja,Nej,Ja,Ja,A,"Åbyhave ligger i Åbyen, som er en del af det n...",https://www.boligportal.dk/lejligheder/aarhus/...
998,"Billeshavehegn, 5500 Middelfart, Billeshave",10.618 kr.,600 kr.,112 m²,4.0,-,Rækkehus,Nej,Ja,Ja,Nej,Ja,Nej,Ja,Ja,,Rummelige og velindrettede rækkehuse placeret ...,https://www.boligportal.dk/r%C3%A6kkehuse/midd...
