## Steps
 * get 10000 links
 * get data from links
 * see where all required data is saved
 * clean data (empty rows, numerical values, etc.)


In [35]:
import bs4
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import json
import re
import lxml.html
import time
from time import gmtime, strftime
import random
from random import randint
import logging
import collections

from tabulate import tabulate
import os

date = strftime("%Y-%m-%d")

import selenium

# The selenium.webdriver module provides all the implementations of WebDriver
# Currently supported are Firefox, Chrome, IE and Remote. The `Keys` class provides keys on
# the keyboard such as RETURN, F1, ALT etc.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

## Get the initial link and its data

Using the page number in the URL, we loop through the result pages until we have as many links as we need (currently 10000). This takes a while, because each page needs time to load fully before we can read its data. Alternatively, if you have the links saved in a text file you can just load those; the code for this is included below, but it is commented out so you don't overwrite the list right after you made it.

In [36]:
# Collect listing URLs from the "newest first" search results, one result page at a time.
TARGET_LINK_COUNT = 10000
# Give up after this many consecutive pages without a single link, so a blocked
# or half-rendered site cannot keep the loop running forever.
MAX_CONSECUTIVE_EMPTY_PAGES = 3

links = []
page = 1
empty_pages = 0

# One browser session is reused for all pages instead of starting Firefox per page.
driver = webdriver.Firefox()
try:
    # Element lookups poll for up to 2 seconds before concluding nothing matches.
    driver.implicitly_wait(2)

    while len(links) < TARGET_LINK_COUNT:
        url = f"https://www.immoweb.be/en/search/house-and-apartment/for-sale?countries=BE&page={page}&orderBy=newest"
        driver.get(url)

        # find_elements returns an empty list when nothing matches, so no
        # exception handling is needed to detect the error marker.
        # NOTE(review): "page-error" presumably appears once we page past the
        # last result page - confirm against the live site.
        if driver.find_elements(By.CLASS_NAME, "page-error"):
            break

        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed.
        link_soup = BeautifulSoup(driver.page_source, "lxml")

        page_links = [
            anchor.get("href")
            for list_item in link_soup.find_all("li", attrs={"class": "search-results__item"})
            for anchor in list_item.find_all("a", attrs={"class": "card__title-link"})
            if anchor.get("href")  # skip anchors without an href (get() returns None)
        ]
        links.extend(page_links)

        if page_links:
            empty_pages = 0
        else:
            empty_pages += 1
            if empty_pages >= MAX_CONSECUTIVE_EMPTY_PAGES:
                break

        page += 1
finally:
    # quit() ends the whole browser session, even when a page load raised.
    driver.quit()

The code below fills the list of links from a file. The assumed name and location of the file are stored in `filename` and the assumed separator is `\n`. If the file you use is structured or located differently, edit the necessary variables before running the code.

In [None]:
# Disabled on purpose: the loader below is wrapped in a string so that running
# this cell does not replace a freshly scraped `links` list. Remove the triple
# quotes to load the links from disk instead of scraping them.
"""
filename = "./data/links.txt" #If you have the links saved elsewhere, just replace the path
with open(filename, "r") as my_file:
    # Every saved link is followed by a newline, so splitting leaves an empty
    # last entry; blank entries are dropped.
    links = [line for line in my_file.read().split("\n") if line]
"""

Now that we have our links, we need to remove any duplicates we might have accidentally collected.

In [45]:
# Show the size of the list before and after removing duplicates.
print(len(links))
# Dict keys are unique and keep insertion order, so each link survives once,
# at the position of its first occurrence.
links = list({href: None for href in links})
print(len(links))

10020
10020


If we don't have enough links yet, we can periodically recheck the newest listings to see whether any new properties have been added, until our list reaches the required amount.

In [46]:
# Keep polling the first page of the newest listings until the list is full.
TARGET_LINK_COUNT = 10000
RECHECK_INTERVAL_SECONDS = 180

# Set of everything collected so far: constant-time duplicate checks while
# `links` itself keeps the collection order.
known_links = set(links)

while len(links) < TARGET_LINK_COUNT:
    url = "https://www.immoweb.be/en/search/house-and-apartment/for-sale?countries=BE&page=1&orderBy=newest"

    driver = webdriver.Firefox()
    try:
        # NOTE(review): the implicit wait only affects element lookups, and none
        # is made here, so the page source is read as soon as get() returns.
        driver.implicitly_wait(2)
        driver.get(url)
        link_soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # Always end the browser session, even if loading or parsing failed.
        driver.quit()

    for list_item in link_soup.find_all("li", attrs={"class": "search-results__item"}):
        for anchor in list_item.find_all("a", attrs={"class": "card__title-link"}):
            href = anchor.get("href")
            # Skip anchors without an href and links we already have.
            if href and href not in known_links:
                known_links.add(href)
                links.append(href)

    # Only wait when another round is actually needed.
    if len(links) < TARGET_LINK_COUNT:
        time.sleep(RECHECK_INTERVAL_SECONDS)

print("Done")
print(len(links))

Done
10020


If you really want to, you could save the links to a text file. This will make it so you do not need to scrape all the same data over and over again.

In [47]:
filename = "./data/links.txt"

# open() does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Append mode keeps what is already in the file: running this cell twice stores
# every link twice. The context manager closes the file even if a write fails.
with open(filename, "a") as my_file:
    # One link per line; empty/None entries are skipped.
    my_file.writelines(f"{link}\n" for link in links if link)

## Get the data from one of the links

To give a short overview of what we'll be doing from now on: we make a soup from just one of the links so we can look at the data we have and select which data we need. Further below we will loop over all the links, building a soup for each link and saving the required data to our dataframe. For this step we can simply use a regular GET request alongside BeautifulSoup to find the data we need.

In [43]:
# Fetch the first listing so its structure can be explored.
# Without a timeout, requests waits indefinitely on a stalled connection.
r = requests.get(links[0], timeout=30)
# Fail here with a clear HTTP error instead of parsing an error page later on.
r.raise_for_status()
property_soup = BeautifulSoup(r.content, "lxml")

Now we'll take our soup and look for the relevant data. Since ImmoWeb fills its site using data from a database, we simply need to locate this data, parse it and store the relevant parts for easier access.

In [44]:
# The listing data sits in the <script> child of <div class="classified">.
div_with_script = property_soup.find("div", class_="classified")

# Keep only what follows the first "= ".
# NOTE(review): presumably the script is a JavaScript assignment of the form
# `name = {...};` - confirm against a live page.
script_text = div_with_script.script.text.split("= ", 1)[1]

# Trim trailing whitespace and drop the final character (presumably the closing
# ";") so that what remains parses as JSON.
json_data = json.loads(script_text.rstrip()[:-1])

property_data = json_data["property"]
price_data = json_data["price"]

# Dump every top-level property field: the key, its value, then a blank gap.
for key in property_data:
    print(key, property_data[key], "\n", sep="\n")

type
HOUSE


subtype
APARTMENT_BLOCK


title
None


description
None


name
None


isHolidayProperty
None


bedroomCount
3


bedrooms
[]


bathroomCount
2


bathrooms
[]


location
{'country': 'Belgium', 'region': 'Flanders', 'province': 'Antwerp', 'district': 'Antwerp', 'locality': 'Antwerp', 'postalCode': '2100', 'street': 'Alfons Schneiderlaan', 'number': '166', 'box': None, 'propertyName': None, 'floor': None, 'latitude': 51.2258129, 'longitude': 4.4734032, 'distance': None, 'approximated': None, 'regionCode': 'FLANDERS', 'type': 'URBAN', 'hasSeaView': None, 'pointsOfInterest': [{'type': 'SCHOOL', 'distance': 0}, {'type': 'SHOPS', 'distance': 0}, {'type': 'TRANSPORT', 'distance': 0}], 'placeName': None}


netHabitableSurface
222


roomCount
None


monthlyCosts
None


attic
None


hasAttic
None


basement
{'surface': None}


hasBasement
True


hasDressingRoom
None


diningRoom
None


hasDiningRoom
None


building
{'annexCount': None, 'condition': 'JUST_RENOVATED', 'constructionYear'

As you can see, in the captured json, we get pretty much all the data we need. For the data we are currently missing it's just a matter of accessing the different levels and attributes of the json. Luckily I already went through the entire structure of the json to locate all the necessary parts as seen below.

## Finish the project

With all this knowledge and info, we can now simply loop over our links, make a call for every single one and put the required data in a dataframe. This is going to take a while, so drink some coffee, take a nap, review some code or just do anything you like. And if anyone asks what you're doing, just tell them you're creating the dataframe.

ETA: 4h 10min

In [34]:
# Column order of the final table. Passing it to the DataFrame also keeps the
# header intact when no listing could be scraped.
COLUMNS = [
    "Location",
    "Property type",
    "Property subtype",
    "Price",
    "Type of sale",
    "Number of bedrooms",
    "Living area",
    "Kitchen",
    "Furnished",
    "Open fireplace",
    "Terrace",
    "Terrace orientation",
    "Garden",
    "Garden orientation",
    "Surface area land",
    "Number of facades",
    "Pool",
    "Condition",
]
REQUEST_TIMEOUT_SECONDS = 30


def _child_or(parent, key, fallback):
    """Return ``parent[key]``, or ``fallback`` when ``parent`` itself is None."""
    return fallback if parent is None else parent[key]


def _known(value):
    """Return ``value``, or the label "Unknown" when ``value`` is None."""
    return "Unknown" if value is None else value


def fetch_listing_json(link):
    """Download one listing page and return the JSON object embedded in it.

    Raises:
        requests.RequestException: the request failed, timed out, or returned
            an HTTP error status.
        AttributeError, IndexError, ValueError: the page does not contain the
            expected ``<div class="classified"><script>`` payload.
    """
    response = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")

    div_with_script = soup.find("div", attrs={"class": "classified"})
    # Keep what follows the first "= ", then drop trailing whitespace and the
    # final character so the remainder parses as JSON.
    script_text = div_with_script.script.text.split("= ", 1)[1]
    return json.loads(script_text.rstrip()[:-1])


def extract_record(json_data):
    """Turn the JSON of one listing into a flat dict with one entry per column.

    Missing parent objects are replaced by the same placeholder labels the
    table has always used ("Unknown", "NaN", "No Terrace", "No Garden").
    """
    property_data = json_data["property"]
    price_data = json_data["price"]
    building = property_data["building"]
    has_terrace = property_data["hasTerrace"]
    has_garden = property_data["hasGarden"]

    return {
        "Location": _child_or(property_data["location"], "locality", "Unknown"),
        "Property type": property_data["type"],
        "Property subtype": property_data["subtype"],
        "Price": price_data["mainValue"],
        "Type of sale": price_data["type"],
        "Number of bedrooms": property_data["bedroomCount"],
        "Living area": property_data["netHabitableSurface"],
        "Kitchen": _child_or(property_data["kitchen"], "type", "Unknown"),
        "Furnished": json_data["transaction"]["sale"]["isFurnished"],
        "Open fireplace": property_data["fireplaceExists"],
        "Terrace": _known(has_terrace),
        # An explicit "no terrace" gets its own label; otherwise report the
        # orientation, or "Unknown" when it is not given.
        "Terrace orientation": (
            "No Terrace" if has_terrace is False else _known(property_data["terraceOrientation"])
        ),
        "Garden": _known(has_garden),
        "Garden orientation": (
            "No Garden" if has_garden is False else _known(property_data["gardenOrientation"])
        ),
        "Surface area land": _child_or(property_data["land"], "surface", "NaN"),
        "Number of facades": _child_or(building, "facadeCount", "Unknown"),
        "Pool": property_data["hasSwimmingPool"],
        # `building` can be None (the facade count above already allows for it),
        # so guard it here too instead of indexing into None.
        "Condition": _known(_child_or(building, "condition", None)),
    }


records = []
failed_links = []

for link in links:
    # A record is added only once it is complete, so one broken listing can
    # neither abort the whole run nor leave the table with ragged columns.
    try:
        records.append(extract_record(fetch_listing_json(link)))
    except (
        requests.RequestException,
        AttributeError,
        IndexError,
        KeyError,
        TypeError,
        ValueError,
    ) as error:
        logging.warning("Skipping %s: %r", link, error)
        failed_links.append(link)

    # Random pause between requests so the site is not hit at a fixed rate.
    time.sleep(random.uniform(1.0, 2.0))


df = pd.DataFrame(records, columns=COLUMNS)

os.makedirs("./data", exist_ok=True)
df.to_csv("./data/houses.csv", index=True)
df

Unnamed: 0,Location,Property type,Property subtype,Price,Type of sale,Number of bedrooms,Living area,Kitchen,Furnished,Open fireplace,Terrace,Terrace orientation,Garden,Garden orientation,Surface area land,Number of facades,Pool,Condition
0,Drogenbos,APARTMENT,APARTMENT,199500,residential_sale,1,60,Unknown,False,False,True,SOUTH_WEST,Unknown,Unknown,,,False,Unknown
1,Zwalm,HOUSE,HOUSE,595000,residential_sale,5,234,INSTALLED,False,False,True,Unknown,True,Unknown,1662.0,3.0,False,GOOD
