# Load communes

In [1]:
import json
import pandas as pd
import unicodedata

def load_data(path='data/communes.json'):
    """Load the commune records from a JSON Lines file.

    Args:
        path: Location of the file to read, holding one JSON document per
            line. Defaults to ``data/communes.json``.

    Returns:
        list: The decoded JSON documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Commune names contain accented characters: read as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) carry no record: skip them
        # rather than letting json.loads fail on an empty string.
        return [json.loads(line) for line in f if line.strip()]

def format_for_url(feature):
    """Build the ville-data.com housing page URL of a commune.

    Args:
        feature: Mapping-like record (dict or DataFrame row) exposing a
            ``name`` (str) and an ``id`` entry.

    Returns:
        str: ``https://ville-data.com/logement/{slug}-33-{id}`` where ``slug``
        is the name without accents, with spaces and apostrophes turned into
        hyphens and every other non-alphanumeric character removed. The
        ``33`` segment is hard-coded.
    """
    # Read the commune name and code
    name, code = feature['name'], feature['id']

    # The "oe" ligature has no Unicode decomposition, so the accent stripping
    # below leaves it untouched: expand both cases by hand.
    name = name.replace("œ", "oe").replace("Œ", "Oe")

    # Strip accents: decompose, then drop the combining marks
    decomposed = unicodedata.normalize('NFD', name)
    name = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    # Spaces and apostrophes (straight or typographic) become hyphens.
    # Without the typographic variant, "d’Ornon" would collapse to "dOrnon".
    for separator in (" ", "'", "’"):
        name = name.replace(separator, "-")

    # Any remaining non-alphanumeric character is dropped
    name = ''.join(c for c in name if c.isalnum() or c == '-')

    return f"https://ville-data.com/logement/{name}-33-{code}"

# Build the communes table, then attach the ville-data.com housing URL
# computed row by row from each commune's name and code.
communes = pd.DataFrame(load_data())
communes = communes.assign(url=communes.apply(format_for_url, axis='columns'))
communes

Unnamed: 0,name,id,postal,url
0,Ambarès-et-Lagrave,33003,33440,https://ville-data.com/logement/Ambares-et-Lag...
1,Ambès,33004,33810,https://ville-data.com/logement/Ambes-33-33004
2,Artigues-près-Bordeaux,33013,33370,https://ville-data.com/logement/Artigues-pres-...
3,Ayguemorte-les-Graves,33023,33640,https://ville-data.com/logement/Ayguemorte-les...
4,Bassens,33032,33530,https://ville-data.com/logement/Bassens-33-33032
...,...,...,...,...
77,Talence,33522,33400,https://ville-data.com/logement/Talence-33-33522
78,Le Tourne,33534,33550,https://ville-data.com/logement/Le-Tourne-33-3...
79,Tresses,33535,33370,https://ville-data.com/logement/Tresses-33-33535
80,Villenave-d'Ornon,33550,33140,https://ville-data.com/logement/Villenave-d-Or...


# Scrape housing and details

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Imported here because they are only needed for the consent handling below.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Open the browser on the site's home page
driver = webdriver.Chrome()
driver.get("https://ville-data.com/")

# Accept the ville-data.com cookie banner.
# Wait explicitly (up to 10 s) for the button instead of sleeping a fixed
# time: a slow page no longer breaks the lookup, and a missing banner is
# reported instead of raising and stopping the notebook.
query = '//button[@class="fc-button fc-cta-consent fc-primary-button"]'
try:
    consent_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, query))
    )
except TimeoutException:
    print("Cookie consent button not found; continuing without accepting.")
else:
    consent_button.click()

The chromedriver version (115.0.5790.102) detected in PATH at /opt/homebrew/bin/chromedriver might not be compatible with the detected chrome version (115.0.5790.170); currently, chromedriver 115.0.5790.170 is recommended for chrome 115.*, so it is advised to delete the driver in PATH and retry


In [3]:
from selenium.webdriver.common.by import By
import re 
import time

# Patterns applied to the text of the "Qualité des logements" section,
# keyed by the entry they fill in the dictionary returned by extract_values.
_QUALITY_PATTERNS = {
    'principal': r'(\d+(?:\s*\d+)*)\s*logements.*?résidence principale',
    't1': r'(\d+(?:\s*\d+)*)\s*logements de 1 pièce',
    't2': r'(\d+(?:\s*\d+)*)\s*logements de 2 pièces',
    't3': r'(\d+(?:\s*\d+)*)\s*résidences principales de 3 pièces',
    't4': r'(\d+(?:\s*\d+)*)\s*logements de 4 pièces',
    't5+': r'(\d+(?:\s*\d+)*)\s*logements de 5 pièces ou plus',
}


def _section_text(id_fragment):
    """Return the text of the <p> found inside the <div> whose id contains `id_fragment`."""
    query = f'//div[contains(@id, "{id_fragment}")]'
    return driver.find_element(By.XPATH, query).find_element(By.TAG_NAME, 'p').text


def _parse_count(pattern, text):
    """Return group 1 of `pattern` in `text` as an int, or None when there is no match."""
    match = re.search(pattern, text)
    if match is None:
        return None
    # The patterns accept any whitespace between digit groups (`\s` also
    # matches non-breaking spaces), so strip all of it, not only ASCII
    # spaces, before converting.
    return int(re.sub(r'\s+', '', match.group(1)))


def _scrape_count(id_fragment, pattern):
    """Read one count from one page section; None when the section or the number is missing."""
    try:
        return _parse_count(pattern, _section_text(id_fragment))
    except Exception:
        # Best effort: a missing section must not stop the scraping run.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        return None


def extract_values():
    """Extract the housing figures from the page currently loaded in `driver`.

    Reads the global Selenium `driver`; the caller is expected to have
    navigated to a commune's housing page beforehand.

    Returns:
        dict: Keys ``housing``, ``house``, ``apartment``, ``principal``,
        ``t1``, ``t2``, ``t3``, ``t4`` and ``t5+``. Each value is an int, or
        None when the matching section or number was not found on the page.
    """
    counts = {
        'housing': _scrape_count("Nombre de logements à", r'(\d+(?:\s*\d+)*)\s*logements'),
        'house': _scrape_count("Nombre de maisons à", r'(\d+(?:\s*\d+)*)\s*maisons'),
        'apartment': _scrape_count("Nombre d'appartements à", r'(\d+(?:\s*\d+)*)\s*appartements'),
    }

    # Housing quality: a single section holds the principal-residence total
    # and the breakdown by number of rooms, so its text is fetched once.
    try:
        quality_text = _section_text("Qualité des logements à")
        quality = {
            key: _parse_count(pattern, quality_text)
            for key, pattern in _QUALITY_PATTERNS.items()
        }
    except Exception:
        # Section missing or unreadable: report every figure as unknown.
        quality = dict.fromkeys(_QUALITY_PATTERNS)

    counts.update(quality)
    return counts

def update_values(row):
    """Scrape the housing figures of one commune and add them to its record.

    Args:
        row: Mapping-like record (DataFrame row or dict) with a ``url`` entry
            pointing to the commune's housing page. It is left unmodified.

    Returns:
        A copy of `row` with one extra entry per key returned by
        `extract_values`.
    """
    driver.get(row['url'])
    # Fixed one-second pause before reading the page
    # (presumably to let it finish rendering).
    time.sleep(1)

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by `DataFrame.apply`.
    updated = row.copy()
    for key, value in extract_values().items():
        updated[key] = value
    return updated


# Visit every commune page with Selenium and add the scraped figures as new columns
communes = communes.apply(func=update_values, axis='columns')
communes.head()



Unnamed: 0,name,id,postal,url,housing,house,apartment,principal,t1,t2,t3,t4,t5+
0,Ambarès-et-Lagrave,33003,33440,https://ville-data.com/logement/Ambares-et-Lag...,7021.0,5473.0,1509.0,6689,80.0,654.0,1396,2188,2372
1,Ambès,33004,33810,https://ville-data.com/logement/Ambes-33-33004,1422.0,1105.0,308.0,1278,9.0,148.0,198,403,521
2,Artigues-près-Bordeaux,33013,33370,https://ville-data.com/logement/Artigues-pres-...,3874.0,2741.0,1128.0,3701,8.0,415.0,762,1106,1410
3,Ayguemorte-les-Graves,33023,33640,https://ville-data.com/logement/Ayguemorte-les...,518.0,500.0,18.0,501,,19.0,55,166,260
4,Bassens,33032,33530,https://ville-data.com/logement/Bassens-33-33032,3400.0,2170.0,1208.0,3211,114.0,363.0,717,1024,993


In [4]:
# Figures that could not be scraped come back empty: count them as 0.
# The result is assigned back instead of calling
# `communes.apartment.fillna(0, inplace=True)`, which acts on a temporary
# Series and is not guaranteed to update the frame (pandas Copy-on-Write).
room_columns = ['t1', 't2', 't3', 't4', 't5+']
count_columns = ['apartment', 'house', 'principal'] + room_columns
communes[count_columns] = communes[count_columns].fillna(0)

# Missing total housing: fall back to apartments + houses
missing_housing = communes['housing'].isna()
communes.loc[missing_housing, 'housing'] = (
    communes.loc[missing_housing, 'apartment'] + communes.loc[missing_housing, 'house']
)

# Convert columns to int
int_columns = ['id', 'housing', 'house', 'apartment', 'principal'] + room_columns
communes[int_columns] = communes[int_columns].astype(int)

# The room-count columns (t1..t5+) only cover principal residences: scale
# them by housing / principal to estimate the figure over all housing.
# Rows without a principal-residence total keep their value unchanged
# instead of producing inf/NaN, which would make astype(int) fail.
# NOTE: re-running this cell scales the columns a second time.
has_principal = communes['principal'] > 0
for col in room_columns:
    scaled = communes[col] * communes['housing'] / communes['principal']
    communes[col] = scaled.where(has_principal, communes[col]).round().astype(int)

communes.head()

Unnamed: 0,name,id,postal,url,housing,house,apartment,principal,t1,t2,t3,t4,t5+
0,Ambarès-et-Lagrave,33003,33440,https://ville-data.com/logement/Ambares-et-Lag...,7021,5473,1509,6689,84,686,1465,2297,2490
1,Ambès,33004,33810,https://ville-data.com/logement/Ambes-33-33004,1422,1105,308,1278,10,165,220,448,580
2,Artigues-près-Bordeaux,33013,33370,https://ville-data.com/logement/Artigues-pres-...,3874,2741,1128,3701,8,434,798,1158,1476
3,Ayguemorte-les-Graves,33023,33640,https://ville-data.com/logement/Ayguemorte-les...,518,500,18,501,0,20,57,172,269
4,Bassens,33032,33530,https://ville-data.com/logement/Bassens-33-33032,3400,2170,1208,3211,121,384,759,1084,1051


In [5]:
# Save the scraped housing figures as JSON Lines (one commune per line)
communes.to_json(path_or_buf='data/communes-housing.json', lines=True, orient='records')

# Chargement des DVF

In [6]:
import pandas as pd

columns = [
    'Nature mutation',
    'Valeur fonciere',
    'Code postal',
    'Commune',
    'Code departement',
    'Code commune',
    'Section',
    'No plan',
    'Type local',
    'Surface reelle bati',
    'Nombre pieces principales'
]

# Load the yearly DVF files. Only the columns kept below are parsed
# (`usecols`), which avoids holding every column of five files in memory.
dvf = pd.concat([
    pd.read_csv(f'data/dvf{year}.txt', sep='|', usecols=columns, low_memory=False)
    for year in (2022, 2021, 2020, 2019, 2018)
])

# Keep houses and apartments with a known surface and room count.
# A missing room count would make the integer conversion below fail.
is_dwelling = dvf['Type local'].isin(['Appartement', 'Maison'])
has_figures = dvf[['Surface reelle bati', 'Nombre pieces principales']].notna().all(axis=1)
dvf = dvf.loc[is_dwelling & has_figures, columns].reset_index(drop=True)

# Keep only the selected communes of the departement
CODE_DEP = 33
code_communes = communes['id'] - CODE_DEP * 1000
# Compare as zero-padded strings so the filter matches whether pandas parsed
# the departement column as text or as integers.
in_departement = dvf['Code departement'].astype(str).str.zfill(2) == str(CODE_DEP).zfill(2)
dvf = dvf[in_departement & dvf['Code commune'].isin(code_communes)].reset_index(drop=True)

# Convert to int
int_columns = ['Code commune', 'Nombre pieces principales', 'Surface reelle bati']
dvf[int_columns] = dvf[int_columns].astype(int)

# Group every property with 5 rooms or more under "5"
dvf['Nombre pieces principales'] = dvf['Nombre pieces principales'].clip(upper=5)

# Remove properties declared with 0 rooms
dvf = dvf[dvf['Nombre pieces principales'] > 0]

dvf

# Calcul des tailles moyennes des maisons et appartements en fonction de leur nombre de pièces

In [116]:
# Minimum number of sales a (commune, rooms, type) group needs for its own
# mean to be used
MINIMUM_DATA = 5


def get_mean_surface_by_type(df, minimum_data=None):
    """Mean built surface per commune, number of rooms and property type.

    Groups with fewer than `minimum_data` rows get the mean computed over all
    communes of `df` for the same number of rooms and property type.

    Args:
        df: DataFrame with the columns ``Code commune``,
            ``Nombre pieces principales``, ``Type local`` and
            ``Surface reelle bati``.
        minimum_data: Minimum group size; defaults to ``MINIMUM_DATA``.

    Returns:
        pandas.Series: Mean surface indexed by
        (``Code commune``, ``Nombre pieces principales``, ``Type local``).
    """
    if minimum_data is None:
        minimum_data = MINIMUM_DATA

    surface = 'Surface reelle bati'
    type_keys = ['Nombre pieces principales', 'Type local']
    commune_keys = ['Code commune'] + type_keys

    # All statistics are computed on the `df` argument. The previous version
    # ignored it and always read the global `dvf`.

    # Overall mean for each (rooms, type) pair
    general_mean = df.groupby(type_keys)[surface].mean()

    # Mean and number of rows for each (commune, rooms, type) group
    by_commune = df.groupby(commune_keys)[surface]
    mean_by_commune = by_commune.mean()
    count_by_commune = by_commune.size()

    # Groups with too few rows fall back to the overall mean
    too_small = count_by_commune[count_by_commune < minimum_data]
    for commune, n_pieces, local_type in too_small.index:
        mean_by_commune.loc[(commune, n_pieces, local_type)] = general_mean.loc[(n_pieces, local_type)]

    return mean_by_commune

# Mean surfaces as a table: one row per commune, one column per
# (number of rooms, property type) pair
dfs = get_mean_surface_by_type(dvf).unstack(level=[1, 2])
# Column label = first letter of the type + "T" + number of rooms,
# e.g. (2, 'Appartement') -> "AT2"
dfs.columns = [f"{col[1][0]}T{col[0]}" for col in dfs.columns]
dfs = dfs.reset_index()

# Fill the missing combinations with the mean of their column.
# The result is assigned back: `dfs[column].fillna(..., inplace=True)` acts
# on a temporary Series and is not guaranteed to update `dfs`.
columns = ['AT1', 'MT1', 'AT2', 'MT2', 'AT3', 'MT3', 'AT4', 'MT4', 'AT5', 'MT5']
dfs[columns] = dfs[columns].fillna(dfs[columns].mean())

# Add the department code to the city code
dfs['Code commune'] = dfs['Code commune'] + CODE_DEP * 1000
dfs.head()

Unnamed: 0,Code commune,AT1,MT1,AT2,MT2,AT3,MT3,AT4,MT4,AT5,MT5
0,33003,31.731707,54.478261,45.537634,51.196078,61.84058,76.276995,69.727273,96.487805,81.0,128.912621
1,33004,30.0,63.166667,43.6,70.272727,65.852941,75.444444,84.450691,86.97,112.716142,135.828125
2,33013,26.691303,42.774648,46.745455,49.869565,63.026316,69.986842,84.450691,101.02765,110.800853,136.255556
3,33023,29.626326,42.774648,44.599976,66.555556,64.997666,79.454545,84.450691,107.363636,112.716142,144.698113
4,33032,26.691303,24.0,42.388889,46.411765,63.608696,76.188679,83.578947,89.135338,112.716142,107.093023


In [117]:
# Attach the mean surfaces to each commune (default inner join on the commune
# code), then drop the duplicated join key to get the final table
df = communes.merge(dfs, left_on='id', right_on='Code commune').drop(columns='Code commune')
df.head()

Unnamed: 0,name,id,postal,url,housing,house,apartment,principal,t1,t2,...,AT1,MT1,AT2,MT2,AT3,MT3,AT4,MT4,AT5,MT5
0,Ambarès-et-Lagrave,33003,33440,https://ville-data.com/logement/Ambares-et-Lag...,7021,5473,1509,6689,80,654,...,31.731707,54.478261,45.537634,51.196078,61.84058,76.276995,69.727273,96.487805,81.0,128.912621
1,Ambès,33004,33810,https://ville-data.com/logement/Ambes-33-33004,1422,1105,308,1278,9,148,...,30.0,63.166667,43.6,70.272727,65.852941,75.444444,84.450691,86.97,112.716142,135.828125
2,Artigues-près-Bordeaux,33013,33370,https://ville-data.com/logement/Artigues-pres-...,3874,2741,1128,3701,8,415,...,26.691303,42.774648,46.745455,49.869565,63.026316,69.986842,84.450691,101.02765,110.800853,136.255556
3,Ayguemorte-les-Graves,33023,33640,https://ville-data.com/logement/Ayguemorte-les...,518,500,18,501,0,19,...,29.626326,42.774648,44.599976,66.555556,64.997666,79.454545,84.450691,107.363636,112.716142,144.698113
4,Bassens,33032,33530,https://ville-data.com/logement/Bassens-33-33032,3400,2170,1208,3211,114,363,...,26.691303,24.0,42.388889,46.411765,63.608696,76.188679,83.578947,89.135338,112.716142,107.093023


In [118]:
# Save the final table as JSON Lines (one commune per line)
df.to_json(path_or_buf='data/communes-ready.json', lines=True, orient='records')

# Calcul du prix d'une ville

In [None]:
def func(x):
    """Placeholder, not implemented yet: ignores `x` and returns None."""
    return None