## Steps
 * get 10000 links
 * get data from links
 * see where all required data is saved
 * clean data (empty rows, numerical values, etc.)


In [2]:
import bs4
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import json
import re
import lxml.html
import time
import random
from random import randint
import logging
import collections
from time import gmtime, strftime

import re
from tabulate import tabulate
import os

date = strftime("%Y-%m-%d")

import selenium

# The selenium.webdriver module provides all the implementations of WebDriver
# Currently supported are Firefox, Chrome, IE and Remote. The `Keys` class provides keys on
# the keyboard such as RETURN, F1, ALT etc.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

## Get the initial link and its data

Using the page in the url we loop through the pages untill we have as many links as we need (currently 100). This takes a while, due to needing time to fully load the page before we can get the data (30 links per page though, maybe something I can do with asynchronous stuff so we don't have to wait as long(just rand time between 0 and 2 sec maybe))

In [3]:
links = []

page = 1

while len(links) < 100:
    url = f"https://www.immoweb.be/en/search/house-and-apartment/for-sale?countries=BE&page={page}&orderBy=newest"

    driver = webdriver.Firefox()
    driver.implicitly_wait(5)
    driver.get(url)

    #python_button = driver.find_elements_by_xpath('//div[@data-reactid="269"]')[0]
    #python_button.click()

    # And then we use Beautiful soup
    soup = BeautifulSoup(driver.page_source)

    driver.close()

    for list_item in soup.find_all("li", attrs={"class": "search-results__item"}):
        for link in list_item.find_all("a", attrs={"class": "card__title-link"}):
            links.append(link.get("href"))
    
    page += 1

## Print the links

Just to get a bit of an overview, can we maybe spot a pattern? (Sadly, no)

In [5]:
#for link in links:
    #print(link)

## Get the data from one of the links

We now make a soup from just 1 of the links so we can start looking at the data and which data we need. Later we will replace this with a loop so we can start building soups from the different links. Currently using just a regular get request, hopefully we can get all the necessary data with this, not really looking forward to having to start and close drivers for every link.

In [10]:
r = requests.get('https://www.immoweb.be/en/classified/house/for-sale/soignies/7060/9722093?searchId=621cb380e1a80')
print(links[0], r.status_code)
soup = BeautifulSoup(r.content, "lxml")

https://www.immoweb.be/en/classified/house/for-sale/gilly-(charleroi)/6060/9777760?searchId=621cb10780c2f 200


Lets take a look at what's in our soup

In [11]:
script_text = soup.find_all("script")
script_text = soup.find_all("script")[1].text.split("= ", 1)[1]

json_data = json.loads(script_text[:script_text.find(";")])
print(json_data[0]['classified'])

{'id': '9722093', 'type': 'house', 'subtype': 'house', 'price': '700000', 'transactionType': 'for sale', 'zip': '7060', 'visualisationOption': 'xl', 'kitchen': {'type': 'hyper equipped'}, 'building': {'constructionYear': '', 'condition': 'as new'}, 'energy': {'heatingType': 'fueloil'}, 'certificates': {'primaryEnergyConsumptionLevel': ''}, 'bedroom': {'count': '0'}, 'land': {'surface': '14000'}, 'atticExists': '', 'basementExists': '', 'outdoor': {'garden': {'surface': '50'}, 'terrace': {'exists': 'true'}}, 'specificities': {'SME': {'office': {'exists': 'true'}}}, 'wellnessEquipment': {'hasSwimmingPool': 'true'}, 'parking': {'parkingSpaceCount': {'indoor': '', 'outdoor': '10'}}, 'condition': {'isNewlyBuilt': 'false'}}


In [7]:
# Locality
# <span class="classified__information--address-row"> 
# -->                                       Rue des Francais 1,41 
#                                <!----></span>


# Type of property (House/apartment)
# Located in link as indicated below, maybe start fitering before getting data, so we can split the poroperty types
# https://www.immoweb.be/en/classified/ [[apartment]] /for-sale/ans/4430/9777582?searchId=621c956edcf6b 

print(json_data[0]['classified']['type'])

# Subtype of property (Bungalow, Chalet, Mansion, ...)
# I don't directly see this in the link, but there is an option to filter this on the main page so there should be a way to get this info

print(json_data[0]['classified']['subtype'])

# Price
# <p class="classified__price"><span aria-hidden="true">
# -->                                   €204,900
#                                </span> <span class="sr-only">204900€</span></p>

print(json_data[0]['classified']['price'])

# Type of sale (Exclusion of life sales)
# Not sure what is asked here specifically

print(json_data[0]['classified']['transactionType']) #?

# Number of rooms
# Is this number of bedrooms? casue if so:
#<tr class="classified-table__row"><th scope="row" class="classified-table__header">
#                                    Bedrooms
#                                </th> <td class="classified-table__data">
# -->                                   3
#                                </td></tr>

print(json_data[0]['classified']['bedroom']['count'])

#Area
#<tr class="classified-table__row"><th scope="row" class="classified-table__header">
#                                    Living area
#                                </th> <td class="classified-table__data">
# -->                                   90
#
#                                    <span class="abbreviation"><span aria-hidden="true">
# -->                                           m²                                        </span> <span class="sr-only">
#                                            square meters
#                                        </span></span></td></tr>

# Fully equipped kitchen (Yes/No)
# Not sure but maybe
# <tr class="classified-table__row"><th scope="row" class="classified-table__header">Kitchen type</th> <td class="classified-table__data">
# -->                                   Installed
#                                </td></tr>

print(json_data[0]['classified']['kitchen']) #get rest of data from here

#Furnished (Yes/No)
#<tr class="classified-table__row"><th scope="row" class="classified-table__header">Furnished</th> <td class="classified-table__data">
# -->                                   No
#                                </td></tr>


#Open fire (Yes/No)
# Not sure but maybe
#<tr class="classified-table__row"><th scope="row" class="classified-table__header">Heating type</th> <td class="classified-table__data">
# -->                               Gas
#                            </td></tr>

#<tr class="classified-table__row"><th scope="row" class="classified-table__header">
#                                    How many fireplaces?
#                            </th> <td class="classified-table__data">
# -->                                   2
#                            </td></tr>

#Terrace (Yes/No)
    #If yes: Area

print(json_data[0]['classified']['terrace'])    
    
#Garden (Yes/No)
    #If yes: Area

print(json_data[0]['classified']['outdoor']['garden'])    
    
#Surface of the land
# see area?

#Surface area of the plot of land

print(json_data[0]['classified']['land']['surface'])

#Number of facades
# <tr class="classified-table__row"><th scope="row" class="classified-table__header">
#                                    Number of frontages
#                                </th> <td class="classified-table__data">
# -->                                   4
#                                </td></tr>

pass

#Swimming pool (Yes/No)

print(json_data[0]['classified']['wellnessEquipment']) #get rest of data from this

#State of the building (New, to be renovated, ...)

print(json_data[0]['classified']['condition']) #get rest of data from this