## ToDo
 * [x] get 10000 links
 * [x] locate required data in links
 * [ ] clean data (empty rows, numerical values, etc.)
 * [x] build the df and save it to a csv file


In [2]:
import requests
import lxml.html
import bs4
from bs4 import BeautifulSoup

import json
import pandas as pd

import random
import time

import logging
import collections

import selenium

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

## Getting the required number of links

For this project we needed to collect 10000 data points and save them to a CSV file. To get that many we used ImmoWeb, as it has a large number of listings for us to use. After adding the required filter (no life annuity sales), we set up our base link: https://www.immoweb.be/en/search/house/for-sale?countries=BE&isALifeAnnuitySale=false&page=1&orderBy=newest

In this link we use string formatting to replace the page number and thus loop through the necessary pages. We do this until we can no longer get a next page and encounter ImmoWeb's error page, after which we break out of our loop.

This does take a while, so we have included a text file with all the links we scraped. A cell further down contains the code you need to fill your list of links from this text file if necessary.

In [3]:
# Collect the listing URLs from every search-result page, in first-seen order.
links = []
seen_links = set()  # O(1) duplicate check; `links` itself keeps the order
page = 1

# One browser session serves the whole crawl instead of a new Firefox per page.
driver = webdriver.Firefox()
try:
    driver.implicitly_wait(2)

    while True:
        url = f"https://www.immoweb.be/en/search/house/for-sale?countries=BE&isALifeAnnuitySale=false&page={page}&orderBy=newest"
        driver.get(url)

        # find_elements returns an empty list when nothing matches, so the
        # "ran past the last page" check needs no exception handling.
        if driver.find_elements(By.CLASS_NAME, "page-error"):
            break

        # Same parser as the listing-page cells, stated explicitly so the
        # result does not depend on which parsers happen to be installed.
        link_soup = BeautifulSoup(driver.page_source, "lxml")

        for list_item in link_soup.find_all("li", attrs={"class": "search-results__item"}):
            for link in list_item.find_all("a", attrs={"class": "card__title-link"}):
                href = link.get("href")
                # Skip anchors without a target and links collected earlier.
                if href and href not in seen_links:
                    seen_links.add(href)
                    links.append(href)

        page += 1
finally:
    # quit() ends the browser session even when a page load fails;
    # close() would only close the current window.
    driver.quit()

This code fills the list of links from a file of previously saved links. The assumed name and location of the file are stored in `filename`, and the links are assumed to be separated by `\n`. If the file you use is structured or located differently, edit the necessary variables before running the code.

In [3]:

# Rebuild `links` from a text file holding one link per line.
links = []
filename = "./data/links.txt" #If you have the links saved elsewhere, just replace the path

# `with` closes the file even if reading fails part-way.
with open(filename, "r") as my_file:
    for line in my_file:
        link = line.strip()
        # Ignore blank lines (e.g. the one produced by a trailing newline);
        # a file without any blank line is handled without raising.
        if link:
            links.append(link)

Now that we have our links, we need to remove any duplicates.

In [36]:
# Show the link count, drop repeated links (dict keys are unique and keep
# first-seen order), then show the count again.
print(len(links))
links = [*dict.fromkeys(links)]
print(len(links))

10201
10200


If we don't have enough links yet, you can run this code to recheck the newest listings every so often and see if any new properties have been added, until our list is filled to the required amount.

In [20]:
# Keep polling the first ("newest") results page until 10000 links are collected.
seen_links = set(links)  # O(1) duplicate check; `links` itself keeps the order

while len(links) < 10000:
    url = "https://www.immoweb.be/en/search/house/for-sale?countries=BE&isALifeAnnuitySale=false&page=1&orderBy=newest"

    driver = webdriver.Firefox()
    try:
        driver.implicitly_wait(2)
        driver.get(url)
        # Parser stated explicitly, matching the listing-page cells.
        link_soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() ends the browser session even when the page load fails;
        # close() would only close the current window.
        driver.quit()

    for list_item in link_soup.find_all("li", attrs={"class": "search-results__item"}):
        for link in list_item.find_all("a", attrs={"class": "card__title-link"}):
            href = link.get("href")
            # Skip anchors without a target and links collected earlier.
            if href and href not in seen_links:
                seen_links.add(href)
                links.append(href)

    if len(links) < 10000:
        # Still short: wait three minutes before looking for new listings.
        time.sleep(180)

print("Done")
print(len(links))

Done
10201


If you really want to, you could save the links to a text file. This will make it so you do not need to scrape all the same data over and over again.

In [6]:
# Write every collected link to a text file, one link per line.
filename = "./data/links.txt"

# `with` flushes and closes the file even if a write fails part-way.
with open(filename, "w") as my_file:
    for link in links:
        my_file.write(link + "\n")

## Get the data from the links

To give a short overview of what we'll be doing from now on: we first make a soup from just one of the links, so we can look at the data we have and select which parts we need. Further below we will loop over all the links, building a soup for each one and saving the required data to our dataframe. For this step we can just use a regular GET request alongside BeautifulSoup to find the data we need.

In [43]:
# Download the first listing page and parse it with the lxml parser.
# The timeout keeps the cell from hanging forever if the server never answers.
r = requests.get(links[0], timeout=30)
property_soup = BeautifulSoup(r.content, "lxml")

Now we'll take our soup and look for the relevant data. Since ImmoWeb fills its site using data from a database, we simply need to locate this data, parse it and store the relevant parts for easier access.

In [44]:
# The listing data sits in the <script> inside the "classified" div.
div_with_script = property_soup.find("div", class_="classified")

# Keep what follows the first "= " and drop the final character after
# trimming trailing whitespace (presumably a closing ";" - verify against
# the page source), leaving text that json.loads can parse.
raw_script = div_with_script.script.text
script_text = raw_script.split("= ", 1)[1]
json_data = json.loads(script_text.rstrip()[:-1])

property_data, price_data = json_data["property"], json_data["price"]

# Print every top-level property key, its value, and a blank separator line.
for key in property_data:
    print(key, f"{property_data[key]}\n", sep="\n")

type
HOUSE


subtype
APARTMENT_BLOCK


title
None


description
None


name
None


isHolidayProperty
None


bedroomCount
3


bedrooms
[]


bathroomCount
2


bathrooms
[]


location
{'country': 'Belgium', 'region': 'Flanders', 'province': 'Antwerp', 'district': 'Antwerp', 'locality': 'Antwerp', 'postalCode': '2100', 'street': 'Alfons Schneiderlaan', 'number': '166', 'box': None, 'propertyName': None, 'floor': None, 'latitude': 51.2258129, 'longitude': 4.4734032, 'distance': None, 'approximated': None, 'regionCode': 'FLANDERS', 'type': 'URBAN', 'hasSeaView': None, 'pointsOfInterest': [{'type': 'SCHOOL', 'distance': 0}, {'type': 'SHOPS', 'distance': 0}, {'type': 'TRANSPORT', 'distance': 0}], 'placeName': None}


netHabitableSurface
222


roomCount
None


monthlyCosts
None


attic
None


hasAttic
None


basement
{'surface': None}


hasBasement
True


hasDressingRoom
None


diningRoom
None


hasDiningRoom
None


building
{'annexCount': None, 'condition': 'JUST_RENOVATED', 'constructionYear'

As you can see, in the captured json, we get pretty much all the data we need. For the data we are currently missing it's just a matter of accessing the different levels and attributes of the json. Luckily I already went through the entire structure of the json to locate all the necessary parts as seen below.

## Finish the project

With all this knowledge and info, we can now simply loop over our links, make a call for every single one and put the required data in a dataframe. This is going to take a while, so drink some coffee, take a nap, review some code or just do anything you like. And if anyone asks what you're doing, just tell them you're creating the dataframe.

ETA: 4h 10min

In [10]:
# Column lists consumed by the DataFrame-building cell. Position i of every
# list must describe the same listing, so rows are only ever added as a whole.
location = []
property_type = []
property_subtype = []
price = []
type_of_sale = []
number_of_bedrooms = []
living_area = []
kitchen = []
furnished = []
open_fireplace = []
terrace = []
terrace_orientation = []
garden = []
garden_orientation = []
surface_area_land = []
number_of_facades = []
pool = []
condition = []

# The same lists, in the order in which `_extract_row` returns its values.
_columns = (
    location,
    property_type,
    property_subtype,
    price,
    type_of_sale,
    number_of_bedrooms,
    living_area,
    kitchen,
    furnished,
    open_fireplace,
    terrace,
    terrace_orientation,
    garden,
    garden_orientation,
    surface_area_land,
    number_of_facades,
    pool,
    condition,
)


def _or_unknown(value):
    """Return `value`, or "Unknown" when the listing leaves it unset (None)."""
    return "Unknown" if value is None else value


def _extract_row(json_data):
    """Return one listing's values as a tuple ordered like `_columns`.

    Parameters:
        json_data: the parsed JSON object embedded in a listing page.

    Returns:
        A tuple with one value per column list.

    Raises:
        KeyError / TypeError: when an expected key or nested object is
        missing; the caller then skips the listing as a whole.
    """
    property_data = json_data["property"]
    price_data = json_data["price"]

    location_data = property_data["location"]
    kitchen_data = property_data["kitchen"]
    land_data = property_data["land"]
    building_data = property_data["building"]
    has_terrace = property_data["hasTerrace"]
    has_garden = property_data["hasGarden"]

    # The orientation is only looked up when the listing does not explicitly
    # state that there is no terrace / garden.
    if has_terrace == False:
        terrace_orientation_value = "No Terrace"
    else:
        terrace_orientation_value = _or_unknown(property_data["terraceOrientation"])

    if has_garden == False:
        garden_orientation_value = "No Garden"
    else:
        garden_orientation_value = _or_unknown(property_data["gardenOrientation"])

    return (
        location_data["locality"] if location_data is not None else "Unknown",
        property_data["type"],
        property_data["subtype"],
        price_data["mainValue"],
        price_data["type"],
        property_data["bedroomCount"],
        property_data["netHabitableSurface"],
        kitchen_data["type"] if kitchen_data is not None else "Unknown",
        json_data["transaction"]["sale"]["isFurnished"],
        property_data["fireplaceExists"],
        _or_unknown(has_terrace),
        terrace_orientation_value,
        _or_unknown(has_garden),
        garden_orientation_value,
        land_data["surface"] if land_data is not None else "NaN",
        building_data["facadeCount"] if building_data is not None else "Unknown",
        property_data["hasSwimmingPool"],
        building_data["condition"] if building_data is not None else "Unknown",
    )


index = 0

for index, link in enumerate(links, start=1):
    try:
        # The timeout keeps one unresponsive request from stalling the run.
        r = requests.get(link, timeout=30)
        soup = BeautifulSoup(r.content, "lxml")

        div_with_script = soup.find("div", attrs={"class": "classified"})
        script_text = div_with_script.script.text.split("= ", 1)[1]
        json_data = json.loads(script_text.rstrip()[:-1])

        row = _extract_row(json_data)
    except Exception as exception:
        # Best effort: report the failing listing and move on to the next one.
        print(type(exception).__name__)
        print(index)
        print(link)
    else:
        # Append only after every value was extracted, so a failure half-way
        # through a listing can never leave the lists with unequal lengths.
        for column, value in zip(_columns, row):
            column.append(value)

    # Random pause between requests.
    time.sleep(random.uniform(1.0, 2.0))

print("Done")

<class 'Exception'>
67
https://www.immoweb.be/en/classified/villa/for-sale/sint-amandsberg/9040/9780299?searchId=621e32823e872
<class 'Exception'>
87
https://www.immoweb.be/en/classified/house/for-sale/brugge/8200/9780258?searchId=621e32823e872
<class 'Exception'>
241
https://www.immoweb.be/en/classified/villa/for-sale/lovendegem-vinderhoute/9921/9779784?searchId=621e32d02baed
<class 'Exception'>
255
https://www.immoweb.be/en/classified/villa/for-sale/gentinnes/1450/9779735?searchId=621e32d02baed
<class 'Exception'>
712
https://www.immoweb.be/en/classified/apartment-block/for-sale/nalinnes/6120/9777759?searchId=621e334eebd49
<class 'Exception'>
713
https://www.immoweb.be/en/classified/house/for-sale/nalinnes/6120/9777758?searchId=621e334eebd49
<class 'Exception'>
714
https://www.immoweb.be/en/classified/house/for-sale/nalinnes/6120/9777757?searchId=621e334eebd49
<class 'Exception'>
737
https://www.immoweb.be/en/classified/bungalow/for-sale/damme/8340/9777675?searchId=621e33580dfec
<cla

<class 'Exception'>
7005
https://www.immoweb.be/en/classified/house/for-sale/oostende/8400/9720397?searchId=621e3c954d683
<class 'Exception'>
7114
https://www.immoweb.be/en/classified/house/for-sale/roeselare/8800/9752797?searchId=621e3cb8b7a65
<class 'Exception'>
7176
https://www.immoweb.be/en/classified/house/for-sale/antwerp/2610/9752537?searchId=621e3cca6e167
<class 'Exception'>
7214
https://www.immoweb.be/en/classified/house/for-sale/koekelberg/1081/9752330?searchId=621e3cd2e3ea0
<class 'Exception'>
7259
https://www.immoweb.be/en/classified/house/for-sale/zulte/9870/9752086?searchId=621e3cdbad1db
<class 'Exception'>
7348
https://www.immoweb.be/en/classified/house/for-sale/lier/2500/9751726?searchId=621e3cf649155
<class 'Exception'>
7509
https://www.immoweb.be/en/classified/house/for-sale/kasterlee/2460/9750944?searchId=621e3d296d2ac
<class 'Exception'>
7564
https://www.immoweb.be/en/classified/house/for-sale/hoboken/2660/9750701?searchId=621e3d3c50c49
<class 'Exception'>
7578
http

Unnamed: 0,Location,Property type,Property subtype,Price,Type of sale,Number of bedrooms,Living area,Kitchen,Furnished,Open fireplace,Terrace,Terrace orientation,Garden,Garden orientation,Surface area land,Number of facades,Pool,Condition
0,Verviers,HOUSE,HOUSE,296607.0,residential_sale,3.0,130.0,Unknown,False,False,True,Unknown,True,Unknown,239,3,,AS_NEW
1,Haasdonk,HOUSE,HOUSE,560000.0,residential_sale,4.0,,Unknown,,False,Unknown,Unknown,Unknown,Unknown,626,3,,AS_NEW
2,VIANE,HOUSE,HOUSE,299000.0,residential_sale,5.0,200.0,SEMI_EQUIPPED,False,False,Unknown,Unknown,Unknown,Unknown,1150,,,GOOD
3,Gerpinnes,HOUSE,HOUSE,300000.0,residential_sale,0.0,270.0,Unknown,False,False,True,SOUTH,True,SOUTH,498,3,False,
4,Nivelles,HOUSE,HOUSE,195000.0,residential_sale,3.0,135.0,INSTALLED,False,False,True,Unknown,Unknown,Unknown,165,2,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9901,Hoeselt,HOUSE,APARTMENT_BLOCK,196900.0,residential_sale,5.0,205.0,SEMI_EQUIPPED,False,False,True,Unknown,True,WEST,162,2,,GOOD
9902,SINT-NIKLAAS,HOUSE,HOUSE,339000.0,residential_sale,2.0,159.0,SEMI_EQUIPPED,False,False,Unknown,Unknown,True,SOUTH_EAST,320,3,,GOOD
9903,RIJMENAM,HOUSE,HOUSE,345000.0,residential_sale,3.0,,SEMI_EQUIPPED,False,False,Unknown,Unknown,True,NORTH_WEST,0,3,,GOOD
9904,Vilvoorde (Peutie),HOUSE,HOUSE,275000.0,residential_sale,3.0,170.0,Unknown,False,False,Unknown,Unknown,Unknown,Unknown,444,3,False,


We now have all the necessary data saved across multiple lists, so now we simply build a dataframe using these lists, and a csv using this dataframe and we're all done. 

Congratulations, you just scraped a ton of data and saved it to a csv! This should conclude this project.

In [37]:
# Column header -> list of scraped values, in the order the columns appear.
column_values = {
    "Location": location,
    "Property type": property_type,
    "Property subtype": property_subtype,
    "Price": price,
    "Type of sale": type_of_sale,
    "Number of bedrooms": number_of_bedrooms,
    "Living area": living_area,
    "Kitchen": kitchen,
    "Furnished": furnished,
    "Open fireplace": open_fireplace,
    "Terrace": terrace,
    "Terrace orientation": terrace_orientation,
    "Garden": garden,
    "Garden orientation": garden_orientation,
    "Surface area land": surface_area_land,
    "Number of facades": number_of_facades,
    "Pool": pool,
    "Condition": condition,
}

# Start from an empty frame and attach the columns one by one, left to right.
df = pd.DataFrame({})
for column_name, values in column_values.items():
    df[column_name] = values

# Write the frame, including its row index, to CSV and display it.
df.to_csv("./data/houses.csv", index=True)
df

Unnamed: 0,Location,Property type,Property subtype,Price,Type of sale,Number of bedrooms,Living area,Kitchen,Furnished,Open fireplace,Terrace,Terrace orientation,Garden,Garden orientation,Surface area land,Number of facades,Pool,Condition
0,Verviers,HOUSE,HOUSE,296607.0,residential_sale,3.0,130.0,Unknown,False,False,True,Unknown,True,Unknown,239,3,,AS_NEW
1,Haasdonk,HOUSE,HOUSE,560000.0,residential_sale,4.0,,Unknown,,False,Unknown,Unknown,Unknown,Unknown,626,3,,AS_NEW
2,VIANE,HOUSE,HOUSE,299000.0,residential_sale,5.0,200.0,SEMI_EQUIPPED,False,False,Unknown,Unknown,Unknown,Unknown,1150,,,GOOD
3,Gerpinnes,HOUSE,HOUSE,300000.0,residential_sale,0.0,270.0,Unknown,False,False,True,SOUTH,True,SOUTH,498,3,False,
4,Nivelles,HOUSE,HOUSE,195000.0,residential_sale,3.0,135.0,INSTALLED,False,False,True,Unknown,Unknown,Unknown,165,2,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10077,Charleroi,HOUSE,HOUSE,220000.0,residential_sale,3.0,,,False,False,Unknown,Unknown,Unknown,Unknown,0,3,,JUST_RENOVATED
10078,Gosselies,HOUSE,HOUSE,99000.0,residential_sale,2.0,,INSTALLED,,False,Unknown,Unknown,Unknown,Unknown,80,2,,TO_BE_DONE_UP
10079,Rhode-Saint-Genèse,HOUSE,VILLA,1650000.0,residential_sale,6.0,530.0,INSTALLED,,False,True,Unknown,True,SOUTH,1400,4,True,AS_NEW
10080,Hasselt,HOUSE,HOUSE,150000.0,first_session_with_reserve_price,4.0,239.0,Unknown,,False,Unknown,Unknown,Unknown,Unknown,775,3,,TO_RENOVATE


Next step: data cleaning, because sometimes we actually want fully unique and usable values.

In [28]:
# Row count of the full frame, then of the frame without identical rows.
total_rows = len(df.index)
unique_rows = len(df.drop_duplicates().index)
print(total_rows)
print(unique_rows)

10082
9807
