# Load communes

In [9]:
import json
import pandas as pd
import unicodedata

def load_data(path='data/paris/communes.json'):
    """Load the communes from a JSON Lines file.

    Args:
        path: File to read, one JSON object per line. Defaults to the
            file this notebook originally hard-coded.

    Returns:
        list: One decoded object per non-blank line, in file order.
    """
    # Explicit UTF-8: commune names contain accented characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(path, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        return [json.loads(line) for line in f if line.strip()]

def format_url_ville_data(feature, kind="log", dep=None):
    """Build the ville-data.com URL of a commune's statistics page.

    Args:
        feature: Mapping (dict or DataFrame row) with a 'name' and an 'id' entry.
        kind: "log" for the housing page, "pop" for the population page.
        dep: Department code inserted between the name and the id. When
            omitted it is taken from the first two digits of the id,
            zero-padded to 5 digits.
            NOTE(review): this derivation does not cover 3-digit overseas
            departments or Corsican codes; pass `dep` explicitly for those.

    Returns:
        str: URL of the form
        ``https://ville-data.com/<page>/<slug>-<dep>-<id>``.

    Raises:
        KeyError: If `kind` is not "log" or "pop".
    """
    # Commune name and code
    name, code = feature['name'], feature['id']

    # Derive the department from the commune code instead of hard-coding "75",
    # so the same function works for communes outside Paris.
    if dep is None:
        dep = str(code).zfill(5)[:2]

    # Strip accents: decompose, then drop the combining marks
    name = ''.join(c for c in unicodedata.normalize('NFD', name) if unicodedata.category(c) != 'Mn')

    # Replace spaces and apostrophes with dashes, expand the "œ" ligature,
    # then drop every remaining character that is neither alphanumeric nor a dash
    name = name.replace(" ", "-")
    name = name.replace("'", "-")
    name = name.replace("œ", "oe")
    name = ''.join(e for e in name if e.isalnum() or e == '-')

    # URL path segment for each kind of statistic
    kinds = {
        "log": "logement",
        "pop": "nombre-d-habitants"
    }

    return f"https://ville-data.com/{kinds[kind]}/{name}-{dep}-{code}"



# One row per commune, plus one ville-data.com URL column per statistics page
communes = pd.DataFrame(load_data())
for kind in ("log", "pop"):
    # Extra keyword arguments of DataFrame.apply are forwarded to the function
    communes[f'url_{kind}'] = communes.apply(format_url_ville_data, axis=1, kind=kind)
communes

Unnamed: 0,name,id,postal,price_apart,price_house,url_log,url_pop
0,Paris 1er Arrondissement,75101,75001,12820,13952,https://ville-data.com/logement/Paris-1er-Arro...,https://ville-data.com/nombre-d-habitants/Pari...
1,Paris 2e Arrondissement,75102,75002,11395,11029,https://ville-data.com/logement/Paris-2e-Arron...,https://ville-data.com/nombre-d-habitants/Pari...
2,Paris 3e Arrondissement,75103,75003,12461,12991,https://ville-data.com/logement/Paris-3e-Arron...,https://ville-data.com/nombre-d-habitants/Pari...
3,Paris 4e Arrondissement,75104,75004,13692,16863,https://ville-data.com/logement/Paris-4e-Arron...,https://ville-data.com/nombre-d-habitants/Pari...
4,Paris 5e Arrondissement,75105,75005,12804,14565,https://ville-data.com/logement/Paris-5e-Arron...,https://ville-data.com/nombre-d-habitants/Pari...
5,Paris 6e Arrondissement,75106,75006,16267,17346,https://ville-data.com/logement/Paris-6e-Arron...,https://ville-data.com/nombre-d-habitants/Pari...
6,Paris 7e Arrondissement,75107,75007,14687,15209,https://ville-data.com/logement/Paris-7e-Arron...,https://ville-data.com/nombre-d-habitants/Pari...
7,Paris 8e Arrondissement,75108,75008,12718,15020,https://ville-data.com/logement/Paris-8e-Arron...,https://ville-data.com/nombre-d-habitants/Pari...
8,Paris 9e Arrondissement,75109,75009,10984,13535,https://ville-data.com/logement/Paris-9e-Arron...,https://ville-data.com/nombre-d-habitants/Pari...
9,Paris 10e Arrondissement,75110,75010,9506,10573,https://ville-data.com/logement/Paris-10e-Arro...,https://ville-data.com/nombre-d-habitants/Pari...


# Scrape housing and population data from the web

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Explicit-wait helpers, imported here because only this cell needs them
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Open the browser
driver = webdriver.Chrome()
driver.get("https://ville-data.com/")

# Consent ville-data.com cookies.
# Wait until the consent button is clickable (up to 10 s) instead of sleeping
# a fixed 2 s: a slow page no longer breaks the lookup, and a missing banner
# no longer crashes the cell.
query = '//button[@class="fc-button fc-cta-consent fc-primary-button"]'
try:
    buttons = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, query))
    )
    buttons.click()
except TimeoutException:
    print("No cookie consent button found; continuing without clicking it")

The chromedriver version (115.0.5790.102) detected in PATH at /opt/homebrew/bin/chromedriver might not be compatible with the detected chrome version (115.0.5790.170); currently, chromedriver 115.0.5790.170 is recommended for chrome 115.*, so it is advised to delete the driver in PATH and retry


### Scrape population

In [3]:
from selenium.webdriver.common.by import By
import re 
import time

def extract_pop():
    """Read the population figure from the page currently loaded in `driver`.

    Looks up the first ``div`` whose ``id`` contains ``"Population d"``, takes
    the text of the first ``p`` element inside it, and parses the number in
    the sentence "Il y a <n> habitants".

    Returns:
        dict: ``{'population': <int>}``, or ``{'population': None}`` when the
        element cannot be found or the sentence does not match.
    """
    try:
        query = '//div[contains(@id, "Population d")]'
        text = driver.find_element(By.XPATH, query).find_element(By.TAG_NAME, 'p').text
        match = re.search(r'Il y a (\d+(?:\s*\d+)*) habitants', text)
        # The pattern accepts any whitespace between digit groups (including
        # non-breaking spaces), so strip all of it, not only ' '.
        population = int(re.sub(r'\s+', '', match.group(1)))
    except Exception:
        # Best-effort scrape: a missing figure is reported as None.
        # `Exception` rather than a bare except, so that interrupting the
        # kernel (KeyboardInterrupt) still stops the scraping loop.
        population = None

    return {
        'population': population
    }

def update_pop(row):
    """Load the row's population page and copy the scraped values into the row.

    Navigates `driver` to ``row['url_pop']``, pauses one second, then stores
    every key/value returned by `extract_pop()` on the row.

    Returns:
        The same `row` object, updated in place.
    """
    driver.get(row['url_pop'])
    time.sleep(1)
    for column, scraped in extract_pop().items():
        row[column] = scraped
    return row


# Update values with selenium
# Each row triggers one page load plus a one-second pause (see update_pop),
# so this cell is slow and depends on the live website.
communes = communes.apply(update_pop, axis=1)
communes.head()

Unnamed: 0,name,id,postal,url_log,url_pop,population
0,Ambarès-et-Lagrave,33003,33440,https://ville-data.com/logement/Ambares-et-Lag...,https://ville-data.com/nombre-d-habitants/Amba...,17246
1,Ambès,33004,33810,https://ville-data.com/logement/Ambes-33-33004,https://ville-data.com/nombre-d-habitants/Ambe...,3086
2,Artigues-près-Bordeaux,33013,33370,https://ville-data.com/logement/Artigues-pres-...,https://ville-data.com/nombre-d-habitants/Arti...,8952
3,Ayguemorte-les-Graves,33023,33640,https://ville-data.com/logement/Ayguemorte-les...,https://ville-data.com/nombre-d-habitants/Aygu...,1461
4,Bassens,33032,33530,https://ville-data.com/logement/Bassens-33-33032,https://ville-data.com/nombre-d-habitants/Bass...,7673


### Scrape housing

In [4]:
from selenium.webdriver.common.by import By
import re 
import time

# A whole number whose digit groups may be separated by whitespace, e.g. "7 021"
_NUMBER = r'(\d+(?:\s*\d+)*)'

# Output key -> text that must follow the number in the "housing quality" paragraph
_QUALITY_SUFFIXES = {
    'principal': r'\s*logements.*?résidence principale',
    't1': r'\s*logements de 1 pièce',
    't2': r'\s*logements de 2 pièces',
    't3': r'\s*résidences principales de 3 pièces',
    't4': r'\s*logements de 4 pièces',
    't5+': r'\s*logements de 5 pièces ou plus',
}


def _section_text(title):
    """Return the text of the first <p> inside the first <div> whose id contains `title`."""
    query = f'//div[contains(@id, "{title}")]'
    return driver.find_element(By.XPATH, query).find_element(By.TAG_NAME, 'p').text


def _first_int(pattern, text):
    """Return the first capture group of `pattern` found in `text` as an int, or None."""
    match = re.search(pattern, text)
    if match is None:
        return None
    # The number pattern accepts any whitespace between digit groups
    # (including non-breaking spaces), so strip all of it, not only ' '.
    return int(re.sub(r'\s+', '', match.group(1)))


def _scrape_count(title, noun):
    """Return the number preceding `noun` in the section `title`, or None on any failure."""
    try:
        return _first_int(_NUMBER + r'\s*' + noun, _section_text(title))
    except Exception:
        # Best-effort scrape. `Exception` rather than a bare except, so that
        # interrupting the kernel still stops the scraping loop.
        return None


def extract_log():
    """Read the housing figures from the page currently loaded in `driver`.

    Four sections of the page are parsed independently: total housing,
    houses, apartments, and the "housing quality" paragraph (principal
    residences and their breakdown by number of rooms).

    Returns:
        dict: Keys 'housing', 'house', 'apartment', 'principal', 't1', 't2',
        't3', 't4', 't5+' (in that order). A value is None when its section
        is missing or its sentence is not found.
    """
    counts = {
        'housing': _scrape_count("Nombre de logements à", 'logements'),
        'house': _scrape_count("Nombre de maisons à", 'maisons'),
        'apartment': _scrape_count("Nombre d'appartements à", 'appartements'),
    }

    # Housing quality: every figure comes from the same paragraph. A sentence
    # that is absent yields None for that key only; a failure to read the
    # paragraph yields None for all six keys.
    try:
        text = _section_text("Qualité des logements à")
        quality = {
            key: _first_int(_NUMBER + suffix, text)
            for key, suffix in _QUALITY_SUFFIXES.items()
        }
    except Exception:
        quality = dict.fromkeys(_QUALITY_SUFFIXES)

    return {**counts, **quality}

def update_log(row):
    """Load the row's housing page and copy the scraped values into the row.

    Navigates `driver` to ``row['url_log']``, pauses one second, then stores
    every key/value returned by `extract_log()` on the row.

    Returns:
        The same `row` object, updated in place.
    """
    driver.get(row['url_log'])
    time.sleep(1)
    for column, scraped in extract_log().items():
        row[column] = scraped
    return row


# Update values with selenium
# Each row triggers one page load plus a one-second pause (see update_log),
# so this cell is slow and depends on the live website.
communes = communes.apply(update_log, axis=1)
communes.head()

Unnamed: 0,name,id,postal,url_log,url_pop,population,housing,house,apartment,principal,t1,t2,t3,t4,t5+
0,Ambarès-et-Lagrave,33003,33440,https://ville-data.com/logement/Ambares-et-Lag...,https://ville-data.com/nombre-d-habitants/Amba...,17246,7021.0,5473.0,1509.0,6689,80.0,654.0,1396,2188,2372
1,Ambès,33004,33810,https://ville-data.com/logement/Ambes-33-33004,https://ville-data.com/nombre-d-habitants/Ambe...,3086,1422.0,1105.0,308.0,1278,9.0,148.0,198,403,521
2,Artigues-près-Bordeaux,33013,33370,https://ville-data.com/logement/Artigues-pres-...,https://ville-data.com/nombre-d-habitants/Arti...,8952,3874.0,2741.0,1128.0,3701,8.0,415.0,762,1106,1410
3,Ayguemorte-les-Graves,33023,33640,https://ville-data.com/logement/Ayguemorte-les...,https://ville-data.com/nombre-d-habitants/Aygu...,1461,518.0,500.0,18.0,501,,19.0,55,166,260
4,Bassens,33032,33530,https://ville-data.com/logement/Bassens-33-33032,https://ville-data.com/nombre-d-habitants/Bass...,7673,3400.0,2170.0,1208.0,3211,114.0,363.0,717,1024,993


### Clean data

In [5]:
# Quit the driver, not needed anymore.
# quit() ends the whole WebDriver session; close() only closes the current window.
driver.quit()

# Fill no apartment and house with 0.
# The filled columns are assigned back to the frame: calling
# `fillna(..., inplace=True)` on a column obtained by attribute access is a
# chained in-place update, which does not modify the frame under pandas
# copy-on-write.
columns = ['apartment', 'house']
communes[columns] = communes[columns].fillna(0)

# Fill no t1, t2, t3, t4, t5+ with 0
columns = ['t1', 't2', 't3', 't4', 't5+']
communes[columns] = communes[columns].fillna(0)

# Fill no housing with sum of apartment and house
index = communes[communes.housing.isna()].index
communes.loc[index, 'housing'] = communes.loc[index, 'apartment'] + communes.loc[index, 'house']

# If no house, and no apartment, split the housing count using the overall
# share of houses across all communes
index = communes[(communes.house == 0) & (communes.apartment == 0)].index
percent_house = communes.house.sum() / communes.housing.sum()
communes.loc[index, 'house'] = communes.loc[index, 'housing'] * percent_house
communes.loc[index, 'apartment'] = communes.loc[index, 'housing'] * (1 - percent_house)

# Convert columns to int.
# NOTE(review): this raises if 'population' or 'principal' could not be
# scraped for some commune (they are not filled above) — confirm upstream.
columns = ['id', 'population', 'housing', 'house', 'apartment', 'principal', 't1', 't2', 't3', 't4', 't5+']
communes[columns] = communes[columns].astype(int)

# Rescale the t1..t5+ counts by housing / principal.
# The original comment spoke of "house & apartment", but the columns rescaled
# here are the per-room-count columns; presumably the website reports them
# for principal residences only, and this extends them to all housing.
columns = ['t1', 't2', 't3', 't4', 't5+']
for col in columns:
    communes[col] = (communes[col] * communes['housing'] / communes['principal']).round().astype(int)

# Drop url columns
communes = communes.drop(columns=['url_log', 'url_pop'])

communes.head()

Unnamed: 0,name,id,postal,population,housing,house,apartment,principal,t1,t2,t3,t4,t5+
0,Ambarès-et-Lagrave,33003,33440,17246,7021,5473,1509,6689,84,686,1465,2297,2490
1,Ambès,33004,33810,3086,1422,1105,308,1278,10,165,220,448,580
2,Artigues-près-Bordeaux,33013,33370,8952,3874,2741,1128,3701,8,434,798,1158,1476
3,Ayguemorte-les-Graves,33023,33640,1461,518,500,18,501,0,20,57,172,269
4,Bassens,33032,33530,7673,3400,2170,1208,3211,121,384,759,1084,1051


In [6]:
# Persist the cleaned table as JSON Lines (one record per line).
# NOTE(review): this writes under data/paris/ while the later merge cell reads
# data/bordeaux/communes-housing.json — confirm which directory is intended.
communes.to_json('data/paris/communes-housing.json', orient='records', lines=True)

# Load DVF

In [7]:
import pandas as pd

columns = [
    'Nature mutation', 
    'Valeur fonciere',
    'Code postal',
    'Commune', 
    'Code departement', 
    'Code commune',
    'Section', 
    'No plan', 
    'Type local',
    'Surface reelle bati', 
    'Nombre pieces principales'
]

# Load dvfs (one file per year, most recent first).
# `usecols` parses only the columns kept below instead of reading every
# column of every file and discarding most of them afterwards.
dvf = pd.concat([
    pd.read_csv(f'data/dvf{year}.txt', sep='|', low_memory=False, usecols=columns)
    for year in (2022, 2021, 2020, 2019, 2018)
])


# Keep apartments and houses with a known built surface
dvf = dvf[dvf['Type local'].isin(['Appartement', 'Maison'])]
dvf = dvf[dvf['Surface reelle bati'].notna()]
dvf = dvf.reset_index(drop=True)
dvf = dvf[columns]

# Keep only selected communes in departement.
# 'Code commune' in the DVF files is the commune number within its
# department, hence the department prefix is removed from the commune ids.
CODE_DEP = 33
code_communes = communes['id'] - CODE_DEP * 1000
dvf = dvf[(dvf['Code departement'] == str(CODE_DEP).zfill(2)) & (dvf['Code commune'].isin(code_communes))]
dvf = dvf.reset_index(drop=True)

# Convert to int
columns = ['Code commune', 'Nombre pieces principales', 'Surface reelle bati']
dvf[columns] = dvf[columns].astype(int)

# Set nombre pieces principales to 5 for all properties with more than 5 rooms
dvf['Nombre pieces principales'] = dvf['Nombre pieces principales'].clip(upper=5)

# Remove Nombre de pieces principales = 0
dvf = dvf[dvf['Nombre pieces principales'] > 0]

dvf

Unnamed: 0,Nature mutation,Valeur fonciere,Code postal,Commune,Code departement,Code commune,Section,No plan,Type local,Surface reelle bati,Nombre pieces principales
0,Vente,39600000,33000.0,BORDEAUX,33,63,HB,203,Maison,71,4
1,Vente,33283000,33290.0,LUDON MEDOC,33,256,AN,214,Maison,111,4
2,Vente,34500000,33140.0,VILLENAVE D ORNON,33,550,CE,223,Maison,95,3
3,Vente,34500000,33140.0,VILLENAVE D ORNON,33,550,CE,223,Maison,95,3
4,Vente,15850000,33000.0,BORDEAUX,33,63,HE,381,Appartement,40,1
...,...,...,...,...,...,...,...,...,...,...,...
97547,Adjudication,7600000,33360.0,CAMBLANES ET MEYNAC,33,85,AL,47,Appartement,103,3
97548,Vente,55327600,33100.0,BORDEAUX,33,63,BP,2,Maison,140,5
97549,Adjudication,22500000,33450.0,MONTUSSAN,33,293,C,919,Maison,138,5
97550,Adjudication,19600000,33310.0,LORMONT,33,249,AZ,375,Maison,63,3


# Compute the average size of houses and apartments according to their number of rooms per city

In [8]:
# Minimum number of sales required to trust a commune-level average
MINIMUM_DATA = 5

# Get the average surface for each type of property
def get_mean_surface_by_type(df):
    """Average built surface per commune, number of rooms and property type.

    Args:
        df: Sales table with the columns 'Code commune',
            'Nombre pieces principales', 'Type local' and
            'Surface reelle bati'.

    Returns:
        pandas.Series: Mean 'Surface reelle bati' indexed by
        (Code commune, Nombre pieces principales, Type local). Groups with
        fewer than MINIMUM_DATA rows get the mean computed over all communes
        for the same (rooms, type) pair instead of their own mean.
    """
    by_type = ['Nombre pieces principales', 'Type local']
    by_commune = ['Code commune'] + by_type

    # The statistics are computed on the `df` argument; the function
    # previously ignored it and read the global `dvf`.
    general_mean = df.groupby(by_type)['Surface reelle bati'].mean()
    surfaces = df.groupby(by_commune)['Surface reelle bati']
    mean_by_commune = surfaces.mean()
    count_by_commune = surfaces.size()

    # For combinations with too few sales, replace with the general mean
    for (commune, n_pieces, local_type), count in count_by_commune.items():
        if count < MINIMUM_DATA:
            mean_by_commune.loc[(commune, n_pieces, local_type)] = general_mean.loc[(n_pieces, local_type)]

    return mean_by_commune

# Get the average surface for each type of property
dfs = get_mean_surface_by_type(dvf)

# One row per commune, one column per (rooms, type) pair.
# Columns are named <A|H>T<rooms>: the first letter of 'Type local', with
# 'M' (Maison) renamed to 'H'.
dfs = dfs.unstack(level=[1, 2])
dfs.columns = [f"{'H' if col[1][0] == 'M' else col[1][0]}T{col[0]}" for col in dfs.columns]
dfs = dfs.reset_index()

# Fill nan with mean values of the column.
# The result is assigned back: `dfs[column].fillna(..., inplace=True)` is a
# chained in-place update, which does not modify the frame under pandas
# copy-on-write.
columns = ['AT1', 'HT1', 'AT2', 'HT2', 'AT3', 'HT3', 'AT4', 'HT4', 'AT5', 'HT5']
dfs[columns] = dfs[columns].fillna(dfs[columns].mean())

# Add the department code to the city code
dfs['Code commune'] = dfs['Code commune'] + CODE_DEP * 1000
dfs.head()

Unnamed: 0,Code commune,AT1,HT1,AT2,HT2,AT3,HT3,AT4,HT4,AT5,HT5
0,33003,31.731707,54.478261,45.537634,51.196078,61.84058,76.276995,69.727273,96.487805,81.0,128.912621
1,33004,30.0,63.166667,43.6,70.272727,65.852941,75.444444,84.450691,86.97,112.716142,135.828125
2,33013,26.691303,42.774648,46.745455,49.869565,63.026316,69.986842,84.450691,101.02765,110.800853,136.255556
3,33023,29.626326,42.774648,44.599976,66.555556,64.997666,79.454545,84.450691,107.363636,112.716142,144.698113
4,33032,26.691303,24.0,42.388889,46.411765,63.608696,76.188679,83.578947,89.135338,112.716142,107.093023


In [9]:
# Attach the average surfaces to the communes table (inner join on the
# commune code), then drop the duplicated key column
communes = pd.read_json('data/bordeaux/communes-housing.json', orient='records', lines=True)
df = (
    communes
    .merge(dfs, left_on='id', right_on='Code commune')
    .drop(columns='Code commune')
)
df.head()

Unnamed: 0,name,id,postal,population,housing,house,apartment,principal,t1,t2,...,AT1,HT1,AT2,HT2,AT3,HT3,AT4,HT4,AT5,HT5
0,Ambarès-et-Lagrave,33003,33440,17246,7021,5473,1509,6689,84,686,...,31.731707,54.478261,45.537634,51.196078,61.84058,76.276995,69.727273,96.487805,81.0,128.912621
1,Ambès,33004,33810,3086,1422,1105,308,1278,10,165,...,30.0,63.166667,43.6,70.272727,65.852941,75.444444,84.450691,86.97,112.716142,135.828125
2,Artigues-près-Bordeaux,33013,33370,8952,3874,2741,1128,3701,8,434,...,26.691303,42.774648,46.745455,49.869565,63.026316,69.986842,84.450691,101.02765,110.800853,136.255556
3,Ayguemorte-les-Graves,33023,33640,1461,518,500,18,501,0,20,...,29.626326,42.774648,44.599976,66.555556,64.997666,79.454545,84.450691,107.363636,112.716142,144.698113
4,Bassens,33032,33530,7673,3400,2170,1208,3211,121,384,...,26.691303,24.0,42.388889,46.411765,63.608696,76.188679,83.578947,89.135338,112.716142,107.093023


In [10]:
# Persist the merged table as JSON Lines (one record per line)
df.to_json('data/bordeaux/communes-surface.json', orient='records', lines=True)

# Add the price per square meter for each city

In [11]:
import pandas as pd

# Load data
dfc = pd.read_json('data/bordeaux/communes-surface.json', orient='records', lines=True)
dfp = pd.read_json('data/bordeaux/price_square_meter.json', lines=True, orient='records')

# Merge data
# concat(axis=1) places the two tables side by side, aligned on their row
# index; the 'name' column of the price table is dropped first.
# NOTE(review): this assumes both files list the communes in the same order
# — confirm, or merge on a shared key instead.
df = pd.concat([dfc, dfp.drop('name', axis=1)], axis=1)
df.to_json('data/bordeaux/communes-ready.json', orient='records', lines=True)
df.head()

Unnamed: 0,name,id,postal,population,housing,house,apartment,principal,t1,t2,...,AT2,HT2,AT3,HT3,AT4,HT4,AT5,HT5,price_apart,price_house
0,Ambarès-et-Lagrave,33003,33440,17246,7021,5473,1509,6689,84,686,...,45.537634,51.196078,61.84058,76.276995,69.727273,96.487805,81.0,128.912621,2918,2894
1,Ambès,33004,33810,3086,1422,1105,308,1278,10,165,...,43.6,70.272727,65.852941,75.444444,84.450691,86.97,112.716142,135.828125,2248,2232
2,Artigues-près-Bordeaux,33013,33370,8952,3874,2741,1128,3701,8,434,...,46.745455,49.869565,63.026316,69.986842,84.450691,101.02765,110.800853,136.255556,3490,3513
3,Ayguemorte-les-Graves,33023,33640,1461,518,500,18,501,0,20,...,44.599976,66.555556,64.997666,79.454545,84.450691,107.363636,112.716142,144.698113,3494,3115
4,Bassens,33032,33530,7673,3400,2170,1208,3211,121,384,...,42.388889,46.411765,63.608696,76.188679,83.578947,89.135338,112.716142,107.093023,2981,3131


# Compute the price of each city

In [12]:
import pandas as pd

# Load data (JSON Lines, one commune per line)
df = pd.read_json('data/bordeaux/communes-ready.json', orient='records', lines=True)

# Count price of one TX for one city
def count_price_tx(row, x):
    """Estimated total value of the dwellings with `x` rooms in one commune.

    Args:
        row: Mapping with the keys 't1'..'t4' and 't5+', 'apartment', 'house',
            'principal', 'AT<x>', 'HT<x>', 'price_apart' and 'price_house'.
        x: Number of rooms (int or numeric string); 5 and above read the
            't5+' count.

    Returns:
        count * ((apartment / principal) * AT<x> * price_apart
                 + (house / principal) * HT<x> * price_house)

    NOTE(review): the apartment and house weights are divided by 'principal';
    whether they are meant to sum to 1 should be confirmed.
    """
    rooms = str(x)
    count_key = 't5+' if int(rooms) >= 5 else 't' + rooms
    dwellings = row[count_key]
    apartments, houses, principal = row['apartment'], row['house'], row['principal']

    apartment_value = (apartments / principal) * row['AT' + rooms] * row['price_apart']
    house_value = (houses / principal) * row['HT' + rooms] * row['price_house']
    return dwellings * (apartment_value + house_value)

# Count price of all TX for one city
def count_price(row):
    """Return the sum of count_price_tx(row, x) for x = 1..5."""
    return sum(count_price_tx(row, rooms) for rooms in range(1, 6))

# Count price for all cities
# One estimated total value per commune, stored in 'city_price' and saved
# as JSON Lines for the mapping cells
df['city_price'] = df.apply(count_price, axis=1)
df.to_json('data/bordeaux/communes-price.json', orient='records', lines=True)
df

Unnamed: 0,name,id,postal,population,housing,house,apartment,principal,t1,t2,...,HT2,AT3,HT3,AT4,HT4,AT5,HT5,price_apart,price_house,city_price
0,Ambarès-et-Lagrave,33003,33440,17246,7021,5473,1509,6689,84,686,...,51.196078,61.840580,76.276995,69.727273,96.487805,81.000000,128.912621,2918,2894,1.963639e+09
1,Ambès,33004,33810,3086,1422,1105,308,1278,10,165,...,70.272727,65.852941,75.444444,84.450691,86.970000,112.716142,135.828125,2248,2232,3.506792e+08
2,Artigues-près-Bordeaux,33013,33370,8952,3874,2741,1128,3701,8,434,...,49.869565,63.026316,69.986842,84.450691,101.027650,110.800853,136.255556,3490,3513,1.383424e+09
3,Ayguemorte-les-Graves,33023,33640,1461,518,500,18,501,0,20,...,66.555556,64.997666,79.454545,84.450691,107.363636,112.716142,144.698113,3494,3115,2.028383e+08
4,Bassens,33032,33530,7673,3400,2170,1208,3211,121,384,...,46.411765,63.608696,76.188679,83.578947,89.135338,112.716142,107.093023,2981,3131,9.192998e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,Talence,33522,33400,45975,26117,7478,18186,24055,7203,4951,...,53.780303,64.484568,70.607527,79.248899,93.465291,94.556150,130.980973,3650,5236,7.157304e+09
78,Le Tourne,33534,33550,825,396,352,45,362,7,28,...,55.866526,57.600000,75.142857,107.600000,94.413793,110.800853,135.400000,2291,2890,1.276792e+08
79,Tresses,33535,33370,5270,2001,1916,79,1937,9,86,...,54.857143,66.875000,89.709677,84.450691,100.203125,110.800853,149.666667,3885,3544,9.027491e+08
80,Villenave-d'Ornon,33550,33140,42747,16909,9504,7341,16039,609,2640,...,51.311111,61.900391,75.836735,75.366197,92.385863,88.897959,122.016743,3309,3871,5.233548e+09


# Add GeoJSON cadastre to each city 

In [2]:
import geopandas as gpd
import pandas as pd

# Commune outlines from the cadastre GeoJSON, without the 'nom', 'created'
# and 'updated' attributes; 'id' is cast to int to match the price table
gdf = gpd.read_file('data/bordeaux/cadastre-33-communes.json').drop(
    columns=['nom', 'created', 'updated']
)
gdf['id'] = gdf['id'].astype('int')

# Price table computed earlier (JSON Lines)
df = pd.read_json('data/bordeaux/communes-price.json', orient='records', lines=True)

# Right join on 'id': one row per commune of the price table
gdf = gdf.merge(df, how='right', on='id')
gdf.head()

Unnamed: 0,id,geometry,name,postal,population,housing,house,apartment,principal,t1,...,HT2,AT3,HT3,AT4,HT4,AT5,HT5,price_apart,price_house,city_price
0,33003,"MULTIPOLYGON (((-0.48303 44.91540, -0.48297 44...",Ambarès-et-Lagrave,33440,17246,7021,5473,1509,6689,84,...,51.196078,61.84058,76.276995,69.727273,96.487805,81.0,128.912621,2918,2894,1963639000.0
1,33004,"MULTIPOLYGON (((-0.53850 44.98504, -0.53838 44...",Ambès,33810,3086,1422,1105,308,1278,10,...,70.272727,65.852941,75.444444,84.450691,86.97,112.716142,135.828125,2248,2232,350679200.0
2,33013,"MULTIPOLYGON (((-0.49620 44.84147, -0.49557 44...",Artigues-près-Bordeaux,33370,8952,3874,2741,1128,3701,8,...,49.869565,63.026316,69.986842,84.450691,101.02765,110.800853,136.255556,3490,3513,1383424000.0
3,33023,"MULTIPOLYGON (((-0.49705 44.68500, -0.49666 44...",Ayguemorte-les-Graves,33640,1461,518,500,18,501,0,...,66.555556,64.997666,79.454545,84.450691,107.363636,112.716142,144.698113,3494,3115,202838300.0
4,33032,"MULTIPOLYGON (((-0.51709 44.88795, -0.51708 44...",Bassens,33530,7673,3400,2170,1208,3211,121,...,46.411765,63.608696,76.188679,83.578947,89.135338,112.716142,107.093023,2981,3131,919299800.0


In [2]:
import folium
import numpy as np

# Bounds of the log city price, used to scale fill opacity and label size.
# Series.min()/.max() skip missing values, whereas the Python builtins
# min()/max() give order-dependent results when a NaN is present.
MIN_PRICE = np.log(gdf['city_price'].min())
MAX_PRICE = np.log(gdf['city_price'].max())

# Define the style of each location
def style_function(feature):
    """Return the style dict of one GeoJSON feature.

    The fill opacity grows linearly with the log of the feature's
    'city_price' property, from 0.05 at MIN_PRICE to 0.8 at MAX_PRICE.
    """
    lowest, highest = 0.05, 0.8
    log_price = np.log(feature['properties']['city_price'])
    fill_opacity = lowest + (highest - lowest) * (log_price - MIN_PRICE) / (MAX_PRICE - MIN_PRICE)

    return dict(
        fillColor='#CC4652',         # fill colour
        color='rgba(0, 0, 0, 0.2)',  # outline colour
        weight=2,                    # outline width
        fillOpacity=fill_opacity,    # fill opacity
        clickable=False,             # if True, the area reacts to clicks
    )

# Rewrite price to readable format
def format_price(price):
    """Format a price with the largest fitting unit (B€, M€ or K€).

    The value is divided by the unit and rounded to a whole number, e.g.
    350_679_200 -> '351M€'.

    Returns:
        str for prices of 1000 and above; prices below 1000 are returned
        unchanged (same object, not converted to str).
    """
    units = (
        (1000000000, 'B€'),
        (1000000, 'M€'),
        (1000, 'K€'),
    )
    for threshold, suffix in units:
        # `>=` so that a price exactly on a threshold uses that unit
        # (1_000_000 -> '1M€', not '1000K€').
        if price >= threshold:
            return f'{int(round(price / threshold, 0))}{suffix}'
    return price


# Define a custom function to create the tooltip (hover popup)
tooltip = folium.GeoJsonTooltip(
    fields=['name'], 
    sticky=False
)

# Centroid of the union of all commune geometries, computed once
# (the union was previously built twice, once per coordinate)
map_center = gdf.geometry.unary_union.centroid

# Create a map centred on that point, shifted slightly south
m = folium.Map(
    height=2000,
    width=3500,
    location=[map_center.y - 0.01, map_center.x],
    zoom_start=12,
    tiles='https://cartodb-basemaps-{s}.global.ssl.fastly.net/light_nolabels/{z}/{x}/{y}.png',
    attr='&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors &copy; <a href="https://carto.com/attributions">CARTO</a>'
)

# Add the data to the map with folium
folium.GeoJson(
    gdf, 
    style_function=style_function,
    tooltip=tooltip
).add_to(m)

# Label font size bounds in px; the size scales with the log of the city price
min_size_text = 12
max_size_text = 36

# Add names centered on each location
for _, row in gdf.iterrows():
    centroid = row['geometry'].centroid
    location = [centroid.y, centroid.x]
    size_text = int(min_size_text + (max_size_text - min_size_text) * (np.log(row['city_price']) - MIN_PRICE) / (MAX_PRICE - MIN_PRICE))
    # The label for 'Bordeaux' gets an extra offset from its centroid
    folium.Marker(location, icon=folium.DivIcon(
        html=f"""
            <div style="width: 300px; transform: translate(-50%, -50%); {"margin-left: 40px; margin-top: 20px;" if row['name'] == 'Bordeaux' else ''}">
                <h5 style="font-family: 'Arial', sans-serif; font-size: {size_text}px; text-align: center;">
                    {row['name']}
                </h5>
                <h5 style="font-family: 'Arial', sans-serif; font-size: {size_text}px; text-align: center;">
                    {format_price(row['city_price'])}
                </h5>
            </div>
        """
    )).add_to(m)

# Display the map
m

In [39]:
import io
from PIL import Image

# Render the map to PNG bytes, decode them with PIL and save as image.png.
# NOTE(review): `_to_png` is a private folium method; the argument 5 is
# presumably a rendering delay in seconds — confirm against the installed
# folium version.
img_data = m._to_png(5)
img = Image.open(io.BytesIO(img_data))
img.save('image.png')