In [18]:
# Importing the necessary libraries
import html
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [19]:
link = 'https://www.immoweb.be/en/classified/villa/for-sale/seraing/4100/20315332'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
}

In [20]:
# Fetch the listing page and parse it into a BeautifulSoup tree

# The timeout (in seconds) keeps the cell from hanging indefinitely
# when the server never answers.
req = requests.get(link, headers=headers, timeout=30)
print(req.status_code)

content = req.content
soup = BeautifulSoup(content, 'html')


200


In [21]:
# City / Locality: the third '/'-separated segment counted from the end of the URL
locality = link.rsplit('/', 3)[-3]
locality

'seraing'

In [22]:
# Property Type

# URL sub-type slugs grouped by the broad property type they map to.
houses = ('house', 'villa', 'mansion', 'town-house', 'mixed-use-building', 'exceptional-property')
# The listing URLs spell the slug "apartment" (single "p"). The previous
# "appartment" entries could never match, so every apartment listing ended up
# classified as 'other'.
appartments = ('apartment', 'ground-floor', 'penthouse', 'flat-studio', 'apartment-block', 'duplex')

# Sub-type slug: the fifth '/'-separated segment counted from the end of the URL
sub_type = link.rsplit('/', 5)[-5]
sub_type

'villa'

In [23]:
# Map the sub-type slug onto a broad property type;
# slugs found in neither tuple are labelled 'other'.
p_type = (
    'house' if sub_type in houses
    else 'appartment' if sub_type in appartments
    else 'other'
)

p_type


'house'

In [24]:
# Price: text of the first <span class="sr-only">, with every non-digit removed
price_span = soup.find('span', attrs={'class': 'sr-only'})
price = re.sub(r'[\D]', "", price_span.text)
price

'369000'

In [25]:
###################
# Sale Type -- TODO: extraction still to be done.
# For now this only looks up the first <span class="flag-list__text"> tag.

sale_type = soup.find('span', class_='flag-list__text')

sale_type



In [26]:
# Collect every <table class="classified-table">; these hold the other property details
table = soup.find_all('table', attrs={'class': 'classified-table'})

In [27]:
# Raw HTML of every header (<th>) and data (<td>) cell of the detail tables
labels = [str(th) for th in soup.find_all('th', class_='classified-table__header')]
values = [str(td) for td in soup.find_all('td', class_='classified-table__data')]

# Labels: inner content of each <th>; these become the dict keys
new_labels = [re.search(r'<th.*?>(.*?)</th>', label, flags=re.DOTALL) for label in labels]
# html.unescape turns entities such as "&amp;" back into the character they
# stand for, so a key reads 'Gas, water & electricity' instead of
# 'Gas, water &amp; electricity'. Unescaping happens before strip() so that a
# decoded non-breaking space at either end is trimmed as well.
clean_labels = [
    html.unescape(new_label.group(1)).strip() if new_label else 'NA'
    for new_label in new_labels
]

# Values: only the text between the opening <td> and the next tag is kept;
# these become the dict values
new_values = [re.search(r'<td[^>]*>(.*?)<', value, flags=re.DOTALL) for value in values]
clean_values = [
    html.unescape(new_value.group(1)).strip() if new_value else 'Not found'
    for new_value in new_values
]

# Pair labels with values by position; a repeated label keeps its last value
details_dict = dict(zip(clean_labels, clean_values))

details_dict


{'Available as of': 'After signing the deed',
 'Construction year': '1979',
 'Building condition': 'To be done up',
 'Street frontage width': '11 m',
 'Number of frontages': '4',
 'Covered parking spaces': '2',
 'Outdoor parking spaces': '2',
 'Asbestos certificate is available': 'No',
 'Living area': '242',
 'Living room surface': '60',
 'How many fireplaces?': '1',
 'Kitchen type': 'Semi equipped',
 'Bedrooms': '4',
 'Bathrooms': '1',
 'Toilets': '2',
 'Basement': 'Yes',
 'Attic': 'Yes',
 'Surface of the plot': '712',
 'Width of the lot on the street': '',
 'Connection to sewer network': 'Connected',
 'Gas, water &amp; electricity': 'Yes',
 'Garden surface': '400',
 'Garden orientation': 'South West',
 'Terrace surface': '50',
 'TV cable': 'Yes',
 'Primary energy consumption': '339',
 'Energy class': 'D',
 'Reference number of the EPC report': '20240412024792',
 'CO₂ emission': '62 kg CO₂/m²',
 'Yearly theoretical total energy consumption': '81974 kWh/year',
 'Conformity certificatio

In [28]:
# Pick the fields of interest out of details_dict.
# Values copied straight from the table are None when the label is missing,
# unless a fallback is spelled out below. Yes/no style fields are encoded
# as flags.
# NOTE(review): the key names 'equip_kitchen' and 'surface of the plot' differ
# from 'equip_kit' / 'land_surface' in the `house` template, and 'swim_pool'
# uses "yes"/"no" while the other flags use 1/0 -- confirm which is intended.

# Kitchen types that count as an equipped kitchen
equiped = ('Hyper equipped', 'Installed', 'Semi equipped')

new_house = {
    # Bedrooms and living area
    'bedrooms': details_dict.get('Bedrooms'),
    'living_area': details_dict.get('Living area'),

    # Kitchen: 1 when the kitchen type is one of the equipped variants
    'equip_kitchen': int(details_dict.get('Kitchen type') in equiped),

    # Furnished: 1 only for an explicit 'Yes'
    'furnished': int(details_dict.get('Furnished') == 'Yes'),

    # Open fire: 1 as soon as the fireplace label is present, whatever its value
    'open_fire': int(details_dict.get('How many fireplaces?') is not None),

    # Terrace: 1 when either terrace label is present
    'terrace': int(any(
        details_dict.get(label) is not None
        for label in ('Terrace', 'Terrace surface')
    )),

    # Terrace surface
    'terrace_surface': details_dict.get('Terrace surface'),

    # Garden: 1 when either garden label is present
    'garden': int(any(
        details_dict.get(label) is not None
        for label in ('Garden', 'Garden surface')
    )),

    # Land surface, 0 when the label is missing
    # (details_dict only ever stores strings, so a default is equivalent
    # to checking the looked-up value against None)
    'surface of the plot': details_dict.get('Surface of the plot', 0),

    # Facades, 0 when the label is missing
    'facades': details_dict.get('Number of frontages', 0),

    # Swim pool: "yes" when the label is present, whatever its value
    'swim_pool': "yes" if details_dict.get('Swimming pool') is not None else "no",

    # State, 0 when the label is missing
    'state': details_dict.get('Building condition', 0),
}

new_house

# 'bedrooms' : '',
# 'living_area' : '',
# 'equip_kit' : '',   # Yes, no
# 'furnished' : '' , # Yes, no
# 'open_fire' : '', # Yes, no
# 'terrace' : '', # Yes, no  # If yes, area:

{'bedrooms': '4',
 'living_area': '242',
 'equip_kitchen': 1,
 'furnished': 0,
 'open_fire': 1,
 'terrace': 1,
 'terrace_surface': '50',
 'garden': 1,
 'surface of the plot': '712',
 'facades': '4',
 'swim_pool': 'no',
 'state': 'To be done up'}

In [29]:
# Template of the fields to collect for a listing; every field starts out empty.
house = dict.fromkeys(
    (
        'Locality',
        'Type',          # (House/apartment)
        'Subtype',       # (Bungalow, Chalet, Mansion, ...)
        'Price',
        'sale_type',
        'bedrooms',
        'living_area',
        'equip_kit',     # Yes, no
        'furnished',     # Yes, no
        'open_fire',     # Yes, no
        'terrace',       # Yes, no  # If yes, area:
        'garden',        # Yes, no  # If yes, area:
        'land_surface',  # Surface area of the plot of land (?)
        'facades',
        'swim_pool',     # Yes, no
        'state',         # (New, to be renovated, ...)
    ),
    '',
)

In [30]:
def get_details(link) -> dict:
    """
    Takes a link for a listing and returns its details.

    PARAMS:
    link - str: the url of the property listing to scrape. Leading/trailing
        whitespace (e.g. the newline kept on lines read from a file) is ignored.

    RETURNS:
    house - dict: the keys 'id', 'locality', 'p_type' and 'price', all strings
        ('price' is empty when no price tag is found on the page).
        None is returned when the page does not answer with status code 200.

    """

    # A stray newline would otherwise end up in the requested url and in 'id'
    link = link.strip()

    # Headers for our req so it does not get blocked
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    }

    # Making the request; the timeout (seconds) avoids hanging forever on a dead server
    req = requests.get(link, headers=headers, timeout=30)

    if req.status_code != 200:
        print("Unable to process")
        return None

    # Get content and make soup
    soup = BeautifulSoup(req.content, features='lxml')

    # The price is read from the first <span class="sr-only">; guard against
    # pages where that tag is absent instead of failing on `None.text`
    price_tag = soup.find('span', attrs={'class': 'sr-only'})
    price = price_tag.text if price_tag is not None else ''

    # TODO: the 'classified-table' header/data cells are not extracted yet.

    # Populate house dict with the info taken from the url and the price tag
    url_parts = link.split('/')
    house = {
        'id': url_parts[-1],
        'locality': url_parts[-3],
        # NOTE(review): this is the sub-type slug from the url (e.g. 'villa'),
        # not a broad house/apartment category -- confirm the intended meaning.
        'p_type': url_parts[-5],
        'price': re.sub(r'[\D]', "", price),
    }

    return house


In [31]:
# A few sample listings to try the scraper on
links = [
    'https://www.immoweb.be/en/classified/apartment/for-sale/antwerpen/2100/20315980',
    'https://www.immoweb.be/en/classified/house/for-sale/jette/1090/20231308',
    'https://www.immoweb.be/en/classified/country-cottage/for-sale/vaux-sur-sure/6640/20057933',
]

In [32]:
def make_list(file):
    """
    Reads listing links from a text file, gets the details of each property
    and saves them to 'Test.csv'.

    PARAMS:
    file -str: path of a text file holding one listing url per line;
        blank lines are skipped

    RETURN
    details -list: of dictionaries, one per listing that could be fetched
    """

    # The `with` block closes the file again. Each line is stripped so that its
    # trailing newline does not end up in the requested url or the listing id.
    with open(file, 'r') as links_file:
        links = [line.strip() for line in links_file if line.strip()]

    # get_details returns None for pages it could not fetch; leave those out
    # so the DataFrame is built from dictionaries only
    fetched = (get_details(link) for link in links)
    details = [record for record in fetched if record is not None]

    df = pd.DataFrame(details)
    df.to_csv('Test.csv', index=False)

    return details

In [33]:
def convert(details, filename='Test.csv'):
    """
    Converts the list of dictionaries to
    a dataframe and saves it as csv file.

    PARAMS:
    details -list: of dictionaries
    filename -str: path of the csv file to write; defaults to 'Test.csv'

    RETURN:
    df: dataFrame that was saved as csv file (written without the index column)
    """

    df = pd.DataFrame(details)
    df.to_csv(filename, index=False)
    return df

In [34]:
# Scrape every link listed in the text file. The path is relative, so the file
# has to be present in the notebook's working directory.
links_file_path = 'immo_links.txt'
make_list(links_file_path)

FileNotFoundError: [Errno 2] No such file or directory: 'immo_links.txt'