In [1]:
import json
import pandas as pd
import unicodedata

def load_data(path='data/communes.json'):
    """Load records from a JSON Lines file (one JSON document per line).

    Parameters
    ----------
    path : str or path-like, optional
        Location of the file to read. Defaults to ``'data/communes.json'``.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly: relying on the platform default encoding
    # (e.g. cp1252 on Windows) would garble accented characters in the names.
    with open(path, encoding='utf-8') as f:
        # Blank lines (such as a trailing empty line) carry no record.
        return [json.loads(line) for line in f if line.strip()]

def format_for_url(feature):
    """Build the ville-data.com housing URL for one commune record.

    ``feature`` is indexed by ``'name'`` and ``'id'`` (a dict or a DataFrame
    row both work). The name is turned into a slug: combining accent marks are
    removed, spaces and apostrophes become hyphens, and any remaining
    character that is neither alphanumeric nor a hyphen is dropped.

    Returns the URL as a string.
    """
    raw_name = feature['name']
    code = feature['id']

    slug_chars = []
    # NFD splits an accented letter into its base letter followed by a
    # combining mark (category 'Mn'), which lets us discard the mark alone.
    for ch in unicodedata.normalize('NFD', raw_name):
        if unicodedata.category(ch) == 'Mn':
            continue
        # Spaces and apostrophes act as word separators in the slug.
        if ch in (" ", "'"):
            ch = "-"
        # Everything else that is not a letter, digit or hyphen is removed.
        if ch == '-' or ch.isalnum():
            slug_chars.append(ch)
    slug = ''.join(slug_chars)

    return f"https://ville-data.com/logement/{slug}-33-{code}"

# Build the communes table and attach each commune's ville-data.com URL.
communes = pd.DataFrame(load_data()).assign(
    url=lambda frame: frame.apply(format_for_url, axis=1)
)
communes

Unnamed: 0,name,id,postal,url
0,Ambarès-et-Lagrave,33003,33440,https://ville-data.com/logement/Ambares-et-Lag...
1,Ambès,33004,33810,https://ville-data.com/logement/Ambes-33-33004
2,Artigues-près-Bordeaux,33013,33370,https://ville-data.com/logement/Artigues-pres-...
3,Ayguemorte-les-Graves,33023,33640,https://ville-data.com/logement/Ayguemorte-les...
4,Bassens,33032,33530,https://ville-data.com/logement/Bassens-33-33032
...,...,...,...,...
77,Talence,33522,33400,https://ville-data.com/logement/Talence-33-33522
78,Le Tourne,33534,33550,https://ville-data.com/logement/Le-Tourne-33-3...
79,Tresses,33535,33370,https://ville-data.com/logement/Tresses-33-33535
80,Villenave-d'Ornon,33550,33140,https://ville-data.com/logement/Villenave-d-Or...


In [2]:
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Longest time (in seconds) to wait for the cookie-consent button to appear.
CONSENT_TIMEOUT = 10

# Open the browser on the site's home page
driver = webdriver.Chrome()
driver.get("https://ville-data.com/")

# Accept the ville-data.com cookie-consent dialog.
# An explicit wait polls until the button is actually clickable, so the cell
# neither fails when the dialog shows up late nor waits longer than needed.
query = '//button[@class="fc-button fc-cta-consent fc-primary-button"]'
try:
    buttons = WebDriverWait(driver, CONSENT_TIMEOUT).until(
        EC.element_to_be_clickable((By.XPATH, query))
    )
except TimeoutException:
    # No consent dialog was offered within the timeout: nothing to dismiss.
    buttons = None
    print("Cookie-consent button not found; continuing without clicking it.")
else:
    buttons.click()

The chromedriver version (115.0.5790.102) detected in PATH at /opt/homebrew/bin/chromedriver might not be compatible with the detected chrome version (115.0.5790.170); currently, chromedriver 115.0.5790.170 is recommended for chrome 115.*, so it is advised to delete the driver in PATH and retry


In [3]:
import re
import time

from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.by import By

def _section_text(id_fragment):
    """Return the text of the first <p> inside the <div> whose id contains ``id_fragment``.

    Reads the page currently loaded in the module-level ``driver``. Returns
    ``None`` when the section (or its paragraph) is not present on the page.
    """
    query = f'//div[contains(@id, "{id_fragment}")]'
    try:
        return driver.find_element(By.XPATH, query).find_element(By.TAG_NAME, 'p').text
    except (NoSuchElementException, StaleElementReferenceException):
        # Only "the element is not (or no longer) there" is treated as missing
        # data; other failures (and Ctrl-C) are deliberately left to propagate.
        return None


def _parse_count(pattern, text):
    """Return the integer captured by group 1 of ``pattern`` in ``text``, or ``None`` if absent."""
    if not text:
        return None
    match = re.search(pattern, text)
    if match is None:
        return None
    # Remove every whitespace character that the pattern's ``\s`` can match
    # (e.g. a non-breaking space used as thousands separator), not only ASCII
    # spaces; otherwise int() would reject the captured digits.
    return int(re.sub(r'\s+', '', match.group(1)))


def extract_values():
    """Scrape the housing figures from the page currently loaded in ``driver``.

    Four sections of the page are looked up by a fragment of their ``id``
    attribute, and counts are parsed from the first paragraph of each.

    Returns
    -------
    dict
        Keys ``'housing'``, ``'house'``, ``'apartment'``, ``'principal'``,
        ``'t1'``, ``'t2'``, ``'t3'``, ``'t4'`` and ``'t5+'``. Each value is an
        ``int``, or ``None`` when the section is missing from the page or the
        expected sentence is not found in its text.
    """
    # A count is a run of digit groups optionally separated by whitespace,
    # followed by the noun that identifies which figure it is.
    count = r'(\d+(?:\s*\d+)*)\s*'

    housing_text = _section_text("Nombre de logements à")
    house_text = _section_text("Nombre de maisons à")
    apartment_text = _section_text("Nombre d'appartements à")
    # The "quality" section holds the main-residence total and the breakdown
    # by number of rooms, all within one paragraph.
    quality_text = _section_text("Qualité des logements à")

    return {
        'housing': _parse_count(count + r'logements', housing_text),
        'house': _parse_count(count + r'maisons', house_text),
        'apartment': _parse_count(count + r'appartements', apartment_text),
        'principal': _parse_count(count + r'logements.*?résidence principale', quality_text),
        't1': _parse_count(count + r'logements de 1 pièce', quality_text),
        't2': _parse_count(count + r'logements de 2 pièces', quality_text),
        't3': _parse_count(count + r'résidences principales de 3 pièces', quality_text),
        't4': _parse_count(count + r'logements de 4 pièces', quality_text),
        't5+': _parse_count(count + r'logements de 5 pièces ou plus', quality_text)
    }

def update_values(row, delay=1):
    """Visit a commune's page and return its row extended with the scraped figures.

    Parameters
    ----------
    row : pandas.Series
        A row providing the page address under ``'url'``.
    delay : float, optional
        Seconds to pause after navigating and before reading the page.
        Defaults to 1.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with one entry per key returned by
        ``extract_values()`` added or overwritten.
    """
    driver.get(row['url'])
    # Presumably gives the page time to finish rendering (and spaces out the
    # requests) before the figures are read.
    time.sleep(delay)

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    updated = row.copy()
    for key, value in extract_values().items():
        updated[key] = value
    return updated


# Visit every commune's page in turn (one browser navigation per row) and
# replace the table with the version that carries the scraped columns.
communes = communes.apply(update_values, axis="columns")
communes



In [5]:
# Persist the enriched table as JSON Lines: one record (row) per line.
communes.to_json(
    path_or_buf='data/communes-housing.json',
    orient='records',
    lines=True,
)