In [1]:
# Importing the necessary libraries
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time

In [470]:
# Example listing used to prototype the extraction steps in the cells below
link = 'https://www.immoweb.be/en/classified/apartment/for-sale/hoboken/2660/20315431'

# Browser-like User-Agent sent with every request
# (presumably so the site does not reject the scraper -- confirm)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
}

In [471]:
# Fetch the listing page and parse it into a BeautifulSoup tree

# A timeout keeps the cell from hanging forever if the server never answers
req = requests.get(link, headers=headers, timeout=30)
print(req.status_code)

content = req.content
# Name the parser explicitly: 'html' is only a feature keyword, so bs4 has to
# guess which parser to use (and warns about it). 'lxml' is the parser that
# get_details() uses, so both code paths now build the same tree.
soup = BeautifulSoup(content, features='lxml')


200


In [472]:
# City / locality: third path segment from the end of the listing URL
# (.../<locality>/<presumably the postal code>/<listing id>)
locality = link.split('/')[-3]
locality

'hoboken'

In [None]:
# Property type
# URL sub-type slugs grouped by the broad property type they are mapped to
# further down ('house' / 'apartment'); any other slug ends up as 'other'.

houses = ('house', 'villa', 'mansion', 'town-house', 'mixed-use-building', 'exceptional-property')
appartments = ('apartment', 'ground-floor', 'penthouse', 'flat-studio','apartment-block', 'duplex')



In [477]:
# Property sub-type: fifth path segment from the end of the listing URL
sub_type = link.split('/')[-5]
sub_type

'apartment'

In [479]:
# Collapse the URL sub-type into one of three broad property types
p_type = (
    'house' if sub_type in houses
    else 'apartment' if sub_type in appartments
    else 'other'
)

p_type


'apartment'

In [480]:
# Price: take the first <span class="sr-only"> on the page and keep only its digits
price_tag = soup.find('span', attrs={'class': 'sr-only'})
price = re.sub(r'[\D]', "", price_tag.text)
price

'229000'

In [182]:
###################
# Sale Type TO BE DONE
# TODO: the sale type is not extracted yet. For now this only grabs the first
# <span class="flag-list__text"> so its content can be inspected.

sale_type = soup.find('span', attrs={'class': 'flag-list__text'})

sale_type



In [481]:
# Get the tables holding all other property details.
# find_all returns a list with every matching <table>; the next cell queries
# `soup` directly for the <th>/<td> cells rather than going through this list.
table = soup.find_all('table', class_='classified-table')

In [482]:
# Serialise every header / data cell of the details tables back to raw HTML
labels = list(map(str, soup.find_all('th', class_='classified-table__header')))
values = list(map(str, soup.find_all('td', class_='classified-table__data')))

# Labels (the future keys): everything between the opening <th ...> and </th>
label_pattern = re.compile(r'<th.*?>(.*?)</th>', flags=re.DOTALL)
new_labels = [label_pattern.search(label) for label in labels]
clean_labels = [
    'NA' if match is None else match.group(1).strip()
    for match in new_labels
]

# Values: the text between the opening <td ...> and the next tag
value_pattern = re.compile(r'<td[^>]*>(.*?)<', flags=re.DOTALL)
new_values = [value_pattern.search(value) for value in values]
clean_values = [
    'Not found' if match is None else match.group(1).strip()
    for match in new_values
]

# Pair labels and values by position
details_dict = dict(zip(clean_labels, clean_values))

details_dict


{'Available as of': 'After signing the deed',
 'Construction year': '1934',
 'Floor': '1',
 'Number of floors': '1',
 'Building condition': 'As new',
 'Number of frontages': '2',
 'Asbestos certificate is available': 'Yes',
 'Living area': '100',
 'Living room surface': '35',
 'Dining room': 'Yes',
 'Kitchen surface': '6',
 'Bedrooms': '3',
 'Bedroom 1 surface': '22',
 'Bedroom 2 surface': '22',
 'Bedroom 3 surface': '19',
 'Bathrooms': '1',
 'Toilets': '1',
 'Terrace surface': '26',
 'TV cable': 'Yes',
 'Primary energy consumption': '152',
 'Energy class': 'B',
 'Reference number of the EPC report': '20230905-0002981351-RES-1',
 'CO₂ emission': 'Not specified',
 'Yearly theoretical total energy consumption': 'Not specified',
 'As built plan': 'No',
 'Conformity certification for fuel tanks': 'Not specified',
 'Heating type': 'Gas',
 'Double glazing': 'Yes',
 'Planning permission obtained': 'Yes',
 'Subdivision permit': 'No',
 'Possible priority purchase right': 'No',
 'Proceedings for

In [483]:
# Pick the fields of interest out of details_dict.
# dict.get() yields None for a label the listing does not have; the yes/no
# features are stored as 1 / 0.

# Kitchen types that count as an equipped kitchen
equiped = ('Hyper equipped', 'Installed', 'Semi equipped')

new_house = {
    'bedrooms': details_dict.get('Bedrooms'),
    'living_area': details_dict.get('Living area'),
    'equip_kitchen': int(details_dict.get('Kitchen type') in equiped),
    'furnished': int(details_dict.get('Furnished') == 'Yes'),
    # any 'How many fireplaces?' entry counts as an open fire
    'open_fire': int(details_dict.get('How many fireplaces?') is not None),
    # a terrace is assumed as soon as either terrace label is listed
    'terrace': int(
        details_dict.get('Terrace') is not None
        or details_dict.get('Terrace surface') is not None
    ),
    'terrace_surface': details_dict.get('Terrace surface'),
}

new_house

# 'bedrooms' : '',
# 'living_area' : '',
# 'equip_kit' : '',   # Yes, no
# 'furnished' : '' , # Yes, no
# 'open_fire' : '', # Yes, no
# 'terrace' : '', # Yes, no  # If yes, area:

{'bedrooms': '3',
 'living_area': '100',
 'equip_kitchen': 0,
 'furnished': 0,
 'open_fire': 0,
 'terrace': 1,
 'terrace_surface': '26'}

In [2]:
################### FOR FINAL PY DOC
def _area_to_int(value):
    """Return a scraped surface value as an int; missing or non-numeric text counts as 0."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def _is_listed(value):
    """Tell whether a yes/no feature is really there: the label exists and its value is not 'No'."""
    return value is not None and value != 'No'


def get_details(link: str, timeout: float = 30) -> dict:
    """
    Goes over a property listing website and gets the relevant details.

    PARAMS:
    link - str: the url of the property listing to scrape
    timeout - float: seconds to wait for the server before giving up

    RETURNS:
    house - dict: label-value pairs of the relevant property details.
    None is returned instead when the page could not be fetched
    (network error, timeout, or a status code other than 200).

    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    }

    # One unreachable listing must not abort a run over thousands of links:
    # report it to the caller as None, exactly like a non-200 answer.
    try:
        req = requests.get(link, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None

    # Check the status before spending time on parsing
    if req.status_code != 200:
        return None

    soup = BeautifulSoup(req.content, features='lxml')

    # URL sub-type slugs and the broad property type they belong to
    houses = ('house', 'villa', 'mansion', 'town-house', 'mixed-use-building', 'exceptional-property')
    apartments = ('apartment', 'ground-floor', 'penthouse', 'flat-studio', 'apartment-block', 'duplex')

    # Path segments of the url, counted from the end:
    # .../<sub_type>/<sale type>/<locality>/<postal code>/<id>
    # Query string, fragment and trailing slash are dropped first so they
    # cannot shift the positions or leak into the id.
    url_parts = link.split('?', 1)[0].split('#', 1)[0].rstrip('/').split('/')
    sub_type = url_parts[-5]

    # The price is read from the first <span class="sr-only"> of the page
    price_tag = soup.find('span', attrs={'class': 'sr-only'})

    # Raw html of every header / data cell of the details tables
    labels = [str(th) for th in soup.find_all('th', class_='classified-table__header')]
    values = [str(td) for td in soup.find_all('td', class_='classified-table__data')]

    # Labels: everything between the opening <th ...> and </th>
    label_matches = [re.search(r'<th.*?>(.*?)</th>', label, flags=re.DOTALL) for label in labels]
    clean_labels = [match.group(1).strip() if match else 'NA' for match in label_matches]

    # Values: the text between the opening <td ...> and the next tag
    value_matches = [re.search(r'<td[^>]*>(.*?)<', value, flags=re.DOTALL) for value in values]
    clean_values = [match.group(1).strip() if match else 'Not found' for match in value_matches]

    # All details of the listing, paired by position
    details = dict(zip(clean_labels, clean_values))

    # Populate the house dict (insertion order is the column order of the csv)
    house = {}

    house['id'] = url_parts[-1]
    house['locality'] = url_parts[-3]

    # Property type and sub type
    if sub_type in houses:
        house['p_type'] = 'house'
    elif sub_type in apartments:
        house['p_type'] = 'apartment'
    else:
        house['p_type'] = 'other'
    house['sub_type'] = sub_type

    # Price: digits only; None when the page has no price span
    house['price'] = None if price_tag is None else re.sub(r'[\D]', "", price_tag.text)

    # Bedrooms and living area
    house['bedrooms'] = details.get('Bedrooms')
    house['living_area'] = details.get('Living area')

    # Kitchen
    equipped = ('Hyper equipped', 'Installed', 'Semi equipped')
    house['equip_kitchen'] = 1 if details.get('Kitchen type') in equipped else 0

    # Furnished
    house['furnished'] = 1 if details.get('Furnished') == 'Yes' else 0

    # Open fire: any 'How many fireplaces?' entry counts
    house['open_fire'] = 0 if details.get('How many fireplaces?') is None else 1

    # Terrace: listed (and not answered with 'No') or a surface is given
    has_terrace = _is_listed(details.get('Terrace')) or details.get('Terrace surface') is not None
    house['terrace'] = 1 if has_terrace else 0
    house['terrace_area'] = details.get('Terrace surface')

    # Garden: same rule as the terrace
    has_garden = _is_listed(details.get('Garden')) or details.get('Garden surface') is not None
    house['garden'] = 1 if has_garden else 0
    house['garden_area'] = details.get('Garden surface')

    # land_surface: living + terrace + garden area; an area that is missing
    # or not a plain number contributes 0 instead of raising
    house['land_surface'] = (
        _area_to_int(house['living_area'])
        + _area_to_int(house['terrace_area'])
        + _area_to_int(house['garden_area'])
    )

    # Surface area of the plot of land (0 when not listed)
    plot_surface = details.get('Surface of the plot')
    house['surface of the plot'] = 0 if plot_surface is None else plot_surface

    # State of the building
    house['state'] = details.get('Building condition')

    # Facades (0 when not listed)
    frontages = details.get('Number of frontages')
    house['facades'] = 0 if frontages is None else frontages

    # Swimming pool: a listed 'No' must not count as a pool
    house['swim_pool'] = 1 if _is_listed(details.get('Swimming pool')) else 0

    return house

In [586]:
# Try get_details on one real listing; the returned dict is the cell output
get_details('https://www.immoweb.be/en/classified/mixed-use-building/for-sale/geraardsbergen/9500/20318212')

{'id': '20318212',
 'locality': 'geraardsbergen',
 'p_type': 'house',
 'sub_type': 'mixed-use-building',
 'price': '235000',
 'bedrooms': '3',
 'living_area': '150',
 'equip_kitchen': 1,
 'furnished': 0,
 'open_fire': 1,
 'terrace': 1,
 'terrace_area': '15',
 'garden': 0,
 'garden_area': None,
 'land_surface': 165,
 'surface of the plot': '285',
 'state': 'Good',
 'facades': '3',
 'swim_pool': 0}

In [None]:
# Not ready yet

def check_status(link):
    """
    Scrape a single listing and report whether it could be processed.

    get_details already returns None when the page does not answer with
    status 200, so the page is requested only once here instead of once for
    the status check and a second time for the scraping itself.

    PARAMS:
    link -str: the url of the property listing

    RETURNS:
    The dict built by get_details when the link could be processed
    (the link and 'All good' are printed), None otherwise
    (an 'Unable to process' message is printed).
    """

    details = get_details(link)

    if details is None:
        print('Unable to process this link: ' + link)
    else:
        print(link)
        print('All good')

    return details

In [29]:
# Three sample listings (apartment, house, country cottage) for quick manual runs
links = [
    'https://www.immoweb.be/en/classified/apartment/for-sale/antwerpen/2100/20315980',
    'https://www.immoweb.be/en/classified/house/for-sale/jette/1090/20231308',
    'https://www.immoweb.be/en/classified/country-cottage/for-sale/vaux-sur-sure/6640/20057933',
]

In [3]:
def convert(details, path='be_properties.csv'):
    """
    Converts the list of dictionaries to
    a dataframe and saves it as csv file.

    PARAMS:
    details -list: of dictionaries (one per property)
    path -str: where to write the csv file; defaults to 'be_properties.csv'
               in the current working directory

    RETURN:
    df: the dataFrame that was written to the csv file
    """

    df = pd.DataFrame(details)
    # index=False keeps the automatic row index out of the file
    df.to_csv(path, index=False)
    return df

In [None]:
def make_list(file, delay=0.5):
    """
    Gets the details of each property listed in a links file, collects one
    dictionary per property and saves them all through convert().

    The number of links, every link that could not be processed, and the
    number of scraped properties are printed along the way.

    PARAMS:
    file -str: path of a text file with one listing url per line
    delay -float: seconds to pause after every request

    RETURN
    Whatever convert() returns for the collected list of dictionaries
    (the dataframe that was saved). A dataframe is returned rather than the
    raw list so that calling this as the last line of a cell does not flood
    the output with thousands of dictionaries.
    """

    with open(file, 'r', encoding='utf-8') as links_file:
        # Blank lines (e.g. an empty line at the end of the file) are not links
        links_to_check = [line.strip() for line in links_file if line.strip()]

    print(len(links_to_check))

    details = []

    for link in links_to_check:
        property_info = get_details(link)
        if property_info is None:
            print(link)
        else:
            details.append(property_info)

        # Pause after every request, including the ones that failed
        time.sleep(delay)

    print(len(details))
    return convert(details)


In [5]:
# Scrape every link in the file. The output shows the number of links,
# followed by each link that get_details could not process.
make_list('Final_Links_Final.txt')

6458
https://www.immoweb.be/en/classified/offices/for-sale/sint-amandsberg/9040/20229926
https://www.immoweb.be/en/classified/offices/for-sale/zwevezele/8750/20135645
https://www.immoweb.be/en/classified/office-block/for-sale/leuven-heverlee/3001/11455299
https://www.immoweb.be/en/classified/offices/for-sale/antwerpen/2000/11329710
https://www.immoweb.be/en/classified/office-block/for-sale/geel/2440/10026262
https://www.immoweb.be/en/classified/offices/for-sale/pepinster/4860/20291706
https://www.immoweb.be/en/classified/offices/for-sale/halle/1500/20277738
https://www.immoweb.be/en/classified/offices/for-sale/bruxelles-1/1000/20301457
https://www.immoweb.be/en/classified/offices/for-sale/arlon/6700/11057152
https://www.immoweb.be/en/classified/offices/for-sale/theux/4910/11251358
https://www.immoweb.be/en/classified/offices/for-sale/bruxelles/1000/10529633
https://www.immoweb.be/en/classified/offices/for-sale/boortmeerbeek/3190/10939151
https://www.immoweb.be/en/classified/offices/for