# ⚙️ **CLEANED DATA IMPORT**

In [1]:
import pandas as pd
import numpy as np
import os

DATA_PATH = '../data/cleaned'

POI_FILENAME = 'poi_df_cleaned.csv'
SITE_FILENAME = 'site_df_cleaned.csv'
SALARY_FILENAME = 'salary_df_cleaned.csv'
GEOREF_FILENAME = 'georef_df_cleaned.csv'
STOCK_FILENAME = 'stock_df_cleaned.csv'
SALES_FILENAME = 'sales_df_cleaned.csv'
POPULATION_FILENAME = 'population_df_cleaned.csv'
POVERTY_FILENAME = 'poverty_df_cleaned.csv'
REAL_ESTATE_FILENAME = 'real_estate_df_cleaned.csv'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME))
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME))
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME))
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME))
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME))
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME))
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))


###____TOURISM (2 KPIS)____###
# Nombre de sites touristiques par départements : num_sites_per_department
# Répartition des catégories touristiques par départements :tourism_category_per_department


###____REAL ESTATE & SECONDARY HOME (5 KPIS)____###
# Prix moyen du m2 par département : average_price_per_m2
# Stock de biens par départements : total_stock_per_department
# Superficie moyenne des logements vendus par départements :average_surface_per_department
# Taux de répartition des maisons secondaires par départements : secondary_home_rate_per_department
# Évolution du % des maisons secondaires par département (entre 2008 et 2018) : secondary_home_rate_evolution_department
# Nombre de maisons vacantes (en 2019) : vacants_housing_per_department
# Taxe d'habitation (valeur et nombre) en 2023 par département : tax_df


###___LIFE QUALITY (4 KPIS)____###
# Salaire moyen par département : avg_salary_per_department
# Nombre de professionnels de santé pour 100 000 habitants par départements (en 2023) : health_df
# Taux de criminalité pour 1000 habitants par départements (en 2020) : criminality_per_department
# Nombre de jours de soleil par an par départements : sunny_df_per_department
# Fusion de tous DF Life Quality  par départements (POUR NORMALISATIN AU SCORING) : life_quality_df

In [None]:
# DF CLEANED CHECK
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
print (stock_df.info())
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

In [None]:
#test
stock_df_test = stock_df.merge(georef_df, on="municipality_code")
stock_df_test = stock_df_test.groupby(["year", "department_name"])["nb_second_home"].sum().reset_index()
filtered_df = stock_df_test[stock_df_test["department_name"] == "Guyane"]
filtered_df


### CLEANING

##### DF_SALES CLEANING

In [2]:


# SALES_DF: Suppression des doublons > nous passons de 4,3M de lignes à 3,821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

(3448398, 13)

In [3]:
# SALES_DF: Check si les doublons on été enlevés : OK
sales_df.duplicated().sum()

np.int64(0)

In [4]:
# SALES_DF: Suppression des prix au m2 supérieur à 30K€ et inférieur à 1K€ > nous passons à 3,3399M de lignes
sales_df = sales_df[(sales_df['sales_price_m2'] <= 30000) & (sales_df['sales_price_m2'] >= 500)]
sales_df.shape

(3448398, 13)

In [5]:
# SALES_DF:
s2 = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [6]:
# SALES_DF:
sales_df = sales_df[sales_df['sales_amount'] > 1] # on enlève les 166 fois ou sales_amount = 1€
sales_df.shape

(3448398, 13)

In [7]:
# SALES_DF: changement du type sales_date en datetime
sales_df['sales_date'] = pd.to_datetime(sales_df['sales_date'])
sales_df["municipality_code"].nunique()

32743

##### DF_SALARY CLEANING

In [8]:
# DF_SALARY: ROUND avg_net_salary
salary_df['avg_net_salary'] = salary_df['avg_net_salary'].round()
salary_df.head()

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.0,2019
1,1007,26341.0,2019
2,1014,25897.0,2019
3,1024,25695.0,2019
4,1025,26055.0,2019


##### DF_REAL_ESTATE CLEANING

In [9]:
# DF_REAL_ESTATE: suppression des nulls
real_estate_df = real_estate_df.dropna(axis=1)
real_estate_df.isnull().sum()

municipality_code         0
intensite_tension_immo    0
rental_max_apartment      0
rental_min_apartment      0
rental_med_all            0
rental_max_all            0
rental_min_all            0
dtype: int64

##### DF_SITE CLEANING

In [10]:
# SITE_DF: tri avec les données entre parenthèses de la colonne "name" inclues

import re

site_df['data_inside_parenthesis'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

#suppression de la colonne "name" dans un second temps

site_df.drop(columns=["name"])

#check pour savoir les informations présentes dans la colonne "poi", et si elles correspondent aux valeurs présentes dans la colonne "type"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

#faire un mapping des colonnes poi, qui sont en fait plus pertinentes que celles de la colonne "type"

poi
castle            5611
2                 5207
water             4500
museum            3400
theatre           2439
forest            2233
beach             1010
protected_area     927
wetland            784
cinema             667
vineyard           641
golf_course        612
theme_park         358
cliff              337
marina             255
water_park         239
1                  236
zoo                234
valley             223
ridge              213
islet              163
casino             158
volcano            145
park               132
meadow              78
monument            76
sand                35
heritage            32
viewpoint           21
cave_entrance       15
wreck               12
rock                11
waterfall           10
attraction           8
dune                 5
national_park        4
allotments           2
geyser               1
Name: count, dtype: int64
data_inside_parenthesis
                       9899
Monument historique    5090
Eau             

In [11]:
# SITE_DF: création d'un dictionnaire intégrant toutes les différentes valeurs inclues dans la colonne "poi"
s = site_df["poi"].value_counts()[site_df["poi"]]
{k: "toto" for k in s.index}

{'1': 'toto',
 '2': 'toto',
 'zoo': 'toto',
 'dune': 'toto',
 'park': 'toto',
 'rock': 'toto',
 'sand': 'toto',
 'beach': 'toto',
 'cliff': 'toto',
 'islet': 'toto',
 'ridge': 'toto',
 'water': 'toto',
 'wreck': 'toto',
 'casino': 'toto',
 'castle': 'toto',
 'cinema': 'toto',
 'forest': 'toto',
 'geyser': 'toto',
 'marina': 'toto',
 'meadow': 'toto',
 'museum': 'toto',
 'valley': 'toto',
 'theatre': 'toto',
 'volcano': 'toto',
 'wetland': 'toto',
 'heritage': 'toto',
 'monument': 'toto',
 'vineyard': 'toto',
 'viewpoint': 'toto',
 'waterfall': 'toto',
 'allotments': 'toto',
 'attraction': 'toto',
 'theme_park': 'toto',
 'water_park': 'toto',
 'golf_course': 'toto',
 'cave_entrance': 'toto',
 'national_park': 'toto',
 'protected_area': 'toto'}

In [12]:
# SITE_DF: création d'un dictionnaire avec les catégories associées aux valeurs de la colonne POI

category_dict = {'1': 'Patrimoine',
 '2': 'Patrimoine',
 'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [13]:
# SITE_DF: création de la colonne "catégorie"
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category
0,1,Fortifications de Vauban (Patrimoine mondial),46.396160,1.472600,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.219890,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.127170,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.346210,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Patrimoine mondial,Patrimoine
...,...,...,...,...,...,...,...,...,...
31029,viewpoint,Tour Eiffel 3e étage (Point de vue),48.858262,2.294497,75056,0.053782,Tour Eiffel 3e étage,Point de vue,Nature
31030,theme_park,Jardin d'Acclimatation (Parc à thème),48.878145,2.264686,75056,0.087097,Jardin d'Acclimatation,Parc à thème,Entertainment
31031,theme_park,Foire du Trône (Parc à thème),48.832003,2.404337,75056,0.060000,Foire du Trône,Parc à thème,Entertainment
31032,golf_course,Golf du Bois de Boulogne (Terrain de golf),48.857776,2.232651,75056,0.060073,Golf du Bois de Boulogne,Terrain de golf,Entertainment


In [None]:
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

In [None]:
poi_df.head(1)

In [None]:
site_df.head(1) 

In [None]:
salary_df.head(1)

In [None]:
georef_df.head(1) 

In [None]:
stock_df.head(1) 

In [None]:
sales_df.head(1)

In [None]:
population_df.head(1) 

In [None]:
poverty_df.head(1)

In [None]:
real_estate_df.head(1)

# 🧪 **DATA TRANSFORMATION**

### KPIS AGGREGATION BY DEPARTMENT

##### 1. TOURISM

In [14]:
#création de tables permettant de scorer le potentiel touristique de chaque département
site_df = site_df.merge (georef_df, on=["municipality_code"])
site_df.head(5)

Unnamed: 0,poi,name,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,1,Fortifications de Vauban (Patrimoine mondial),46.39616,1.4726,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine,Mouhet,MOUHET,municipality,46.389251,1.442651,36,200035137.0,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine,Vareilles,VAREILLES,municipality,46.305016,1.456031,23,242300135.0,Creuse
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.21989,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine,Béziers,BEZIERS,municipality,43.347588,3.230768,34,243400769.0,Hérault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.12717,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine,Hénin-Beaumont,HENIN BEAUMONT,municipality,50.409234,2.958997,62,246200299.0,Pas-de-Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.34621,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Patrimoine mondial,Patrimoine,Bédouès-Cocurès,BEDOUES COCURES,municipality,44.353946,3.61956,48,200069151.0,Lozère


In [20]:
#sélection des colonnes dont on aura besoin pour le calcul
site_df_department = site_df[["poi", "name", "municipality_code", "importance", "name_reprocessed", "department_name"]]
site_df_department

Unnamed: 0,poi,name,municipality_code,importance,name_reprocessed,department_name
0,1,Fortifications de Vauban (Patrimoine mondial),36134,0.139527,Fortifications de Vauban,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Creuse
2,1,Canal du Midi (Patrimoine mondial),34032,0.129531,Canal du Midi,Hérault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,62427,0.127170,Bassin minier du Nord-Pas de Calais,Pas-de-Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Lozère
...,...,...,...,...,...,...
31019,viewpoint,Tour Eiffel 3e étage (Point de vue),75056,0.053782,Tour Eiffel 3e étage,Paris
31020,theme_park,Jardin d'Acclimatation (Parc à thème),75056,0.087097,Jardin d'Acclimatation,Paris
31021,theme_park,Foire du Trône (Parc à thème),75056,0.060000,Foire du Trône,Paris
31022,golf_course,Golf du Bois de Boulogne (Terrain de golf),75056,0.060073,Golf du Bois de Boulogne,Paris


In [21]:
#groupement par département, puis classement par le département ayant le + d'atouts touristiques
group_site = site_df_department.groupby("department_name")[["importance"]].sum()
group_site
group_site.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Gironde,52.539958
Bouches-du-Rhône,47.068588
Finistère,46.685274
Isère,44.230787
Loire-Atlantique,43.525109
...,...
Haute-Marne,7.416165
Val-de-Marne,6.699159
Lozère,6.230406
Seine-Saint-Denis,5.718311


In [17]:
#même calcul que précédemment, mais pour la partie concernant les logements/lieux de villégiature
poi_df = poi_df.merge (georef_df, on=["municipality_code"])
poi_df.head(5)

Unnamed: 0,poi,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med,Les Mathes,MATHES,municipality,45.705988,-1.170867,17,241700640.0,Charente-Maritime
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands Cépages,Sorgues,SORGUES,municipality,44.014576,4.867405,84,248400293.0,Vaucluse
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances,Fort-Mahon-Plage,FORT MAHON PLAGE,municipality,50.345059,1.577068,80,200070936.0,Somme
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances,Grimaud,GRIMAUD,municipality,43.282028,6.533032,83,200036077.0,Var
4,hotel,43.54105,3.752852,34095,0.077542,Le Domaine du Golf,Fabrègues,FABREGUES,municipality,43.534477,3.77193,34,243400017.0,Hérault


In [18]:
#sélection des colonnes dont on aura besoin pour le calcul
poi_df_department = poi_df[["poi", "municipality_code", "importance", "department_name"]]
poi_df_department


Unnamed: 0,poi,municipality_code,importance,department_name
0,hotel,17225,0.078556,Charente-Maritime
1,hotel,84129,0.078419,Vaucluse
2,hotel,80333,0.077999,Somme
3,hotel,83068,0.077702,Var
4,hotel,34095,0.077542,Hérault
...,...,...,...,...
26202,camp_site,19164,0.040000,Corrèze
26203,camp_site,03238,0.040000,Allier
26204,camp_site,19241,0.040000,Corrèze
26205,camp_site,23131,0.040000,Creuse


In [19]:
#groupement par département, puis classement par le département ayant le + de logements/lieux de villégiature
group_poi = poi_df_department.groupby("department_name")[["importance"]].sum()
group_poi
group_poi.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Paris,70.275305
Savoie,37.401407
Haute-Savoie,35.158395
Hérault,33.793973
Alpes-Maritimes,32.802552
...,...
Eure-et-Loir,3.871754
Haute-Marne,3.670584
Ardennes,3.541133
Mayenne,3.154595


In [22]:
#ajout des 2 calculs d'importance
department_merged_df = group_poi.merge (group_site, on=["department_name"])
#department_merged_df["somme_importance"]=department_merged_df["importance_x"]+department_merged_df["importance_y"]
#department_merged_df = department_merged_df.drop(columns=["importance_x", "importance_y"])
#department_merged_df
#department_merged_df.sort_values("somme_importance", ascending =False)
department_merged_df



Unnamed: 0_level_0,importance_x,importance_y
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,11.423180,34.061847
Aisne,4.937153,15.687868
Allier,8.386644,21.401418
Alpes-Maritimes,32.802552,32.967630
Alpes-de-Haute-Provence,11.293578,14.722338
...,...,...
Vendée,27.816510,32.287377
Vienne,8.205227,21.203995
Vosges,12.007083,14.973759
Yonne,7.211247,16.595920


##### 2. REAL ESTATE

In [23]:
# 2.1 calcul du loyer au m2 médian par municipality_code
rental_med = real_estate_df [["municipality_code", "rental_med_all"]]
rental_med

Unnamed: 0,municipality_code,rental_med_all
0,57133,9.53
1,57446,11.09
2,77013,12.26
3,77026,9.53
4,77072,11.47
...,...,...
34436,81126,8.93
34437,33425,11.09
34438,85146,10.88
34439,53062,8.69


In [None]:
sales_df

In [24]:
#calcul du prix d'achat au m2 médian par municipality_code
sales_df
sales_df_grouped = sales_df.groupby(["municipality_code"])[["sales_amount", "surface", "premise_type"]].agg({"sales_amount": "sum", "surface": "sum", "premise_type": "count"})
sales_df_grouped = pd.DataFrame (sales_df_grouped)
sales_df_grouped

Unnamed: 0_level_0,sales_amount,surface,premise_type
municipality_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01001,1.362286e+07,6781.0,60
01002,4.067389e+06,1901.0,17
01004,1.852356e+08,87209.0,1062
01005,3.579244e+07,13876.0,138
01006,2.575955e+06,1425.0,13
...,...,...,...
97420,5.335366e+07,16160.0,188
97421,3.143598e+06,1872.0,24
97422,2.541138e+08,99088.0,1237
97423,1.326098e+07,3780.0,54


In [25]:
#jointure pour rajouter dans cette table le loyer médian par municipality_code
sales_df
real_estate_grouped = sales_df_grouped.merge(rental_med, on="municipality_code")
real_estate_grouped

Unnamed: 0,municipality_code,sales_amount,surface,premise_type,rental_med_all
0,01001,1.362286e+07,6781.0,60,10.66
1,01002,4.067389e+06,1901.0,17,10.16
2,01004,1.852356e+08,87209.0,1062,11.25
3,01005,3.579244e+07,13876.0,138,13.28
4,01006,2.575955e+06,1425.0,13,12.70
...,...,...,...,...,...
31892,95676,5.974304e+06,2486.0,23,12.29
31893,95678,1.222182e+07,3568.0,38,18.53
31894,95680,8.569815e+07,32626.0,471,16.22
31895,95682,1.463606e+06,533.0,5,16.57


In [26]:
#ajout du nom du département correspondant à chaque municipality code
real_estate_department = real_estate_grouped.merge(georef_df, on="municipality_code")
real_estate_department
real_estate_department = real_estate_department [["municipality_code", "sales_amount", "surface", "rental_med_all", "department_name", "premise_type"]]

In [None]:
real_estate_department

In [27]:
#calcul du prix au m2 par département
average_price_per_m2 = real_estate_department.groupby(["department_name"])[["sales_amount", "surface"]].agg({"sales_amount": "sum", "surface": "sum"})
average_price_per_m2
average_price_per_m2["average_price_per_m2"] = average_price_per_m2["sales_amount"]/average_price_per_m2["surface"]
average_price_per_m2.sort_values("average_price_per_m2")

Unnamed: 0_level_0,sales_amount,surface,average_price_per_m2
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Creuse,3.494798e+08,250929.0,1392.743706
Indre,9.313114e+08,643735.0,1446.731035
Nièvre,9.686520e+08,663808.0,1459.235257
Haute-Marne,6.485911e+08,444240.0,1460.001467
Meuse,8.157466e+08,550437.0,1481.998032
...,...,...,...
Corse-du-Sud,3.267670e+09,739291.0,4420.005565
Alpes-Maritimes,3.837944e+10,7959808.0,4821.653974
Val-de-Marne,1.529389e+10,2931831.0,5216.497456
Hauts-de-Seine,2.665847e+10,3573916.0,7459.175300


In [28]:
#calcul du loyer médian par départment
real_estate_department["intermediate_sum"]=real_estate_department["rental_med_all"]*real_estate_department["premise_type"]
real_estate_department
average_rental = real_estate_department.groupby(["department_name"])[["intermediate_sum", "premise_type"]].agg({"intermediate_sum": "sum", "premise_type": "sum"})
average_rental
average_rental["average_rental"]= average_rental["intermediate_sum"]/average_rental["premise_type"]
average_rental

Unnamed: 0_level_0,intermediate_sum,premise_type,average_rental
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,499987.29,37925,13.183580
Aisne,196297.97,19789,9.919550
Allier,153111.91,15287,10.015825
Alpes-Maritimes,2105196.54,120087,17.530595
Alpes-de-Haute-Provence,141928.94,12555,11.304575
...,...,...,...
Vendée,350608.65,33234,10.549698
Vienne,129985.21,13286,9.783623
Vosges,93517.06,9719,9.622087
Yonne,107147.28,10815,9.907284


In [29]:
#regroupement des colonnes avec le loyer moyen au m2 par département et le prix d'achat au m2 moyen par département
yield_calculation = average_price_per_m2.merge(average_rental, on="department_name")
yield_calculation
yield_calculation = yield_calculation.drop(columns=["sales_amount", "surface", "intermediate_sum", "premise_type"])
yield_calculation["yield_rate"]=yield_calculation["average_rental"]*12/yield_calculation["average_price_per_m2"]*100
yield_calculation.sort_values("yield_rate", ascending=True)


Unnamed: 0_level_0,average_price_per_m2,average_rental,yield_rate
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Paris,11432.173601,29.809690,3.129031
Corse-du-Sud,4420.005565,13.879070,3.768068
Hauts-de-Seine,7459.175300,24.168113,3.888062
Var,4058.417572,14.124893,4.176473
Savoie,3472.500028,12.601657,4.354784
...,...,...,...
Cher,1549.532038,9.742943,7.545201
Aube,1690.589163,10.689934,7.587840
Allier,1577.489243,10.015825,7.619063
Aisne,1561.099105,9.919550,7.625051


In [None]:
#Informations sur la rentabilité locative
yield_calculation

In [30]:
# 2.2 calcul de la variation entre 2018 et 2021

#ajout d'une colonne "year"
sales_df.info()
sales_df["year"]=sales_df["sales_date"].dt.year

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448398 entries, 0 to 3448397
Data columns (total 13 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   sales_date                 datetime64[ns]
 1   sales_amount               float64       
 2   street_number              float64       
 3   street_code                object        
 4   street_name                object        
 5   nom_commune                object        
 6   municipality_code          object        
 7   premise_type               object        
 8   surface                    float64       
 9   number_of_principal_rooms  int64         
 10  sales_price_m2             float64       
 11  latitude                   float64       
 12  longitude                  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(5)
memory usage: 342.0+ MB


In [31]:
#merge pour rajouter le département
sales_info_per_department = sales_df.merge (georef_df, on=["municipality_code"])
sales_info_per_department

Unnamed: 0,sales_date,sales_amount,street_number,street_code,street_name,nom_commune,municipality_code,premise_type,surface,number_of_principal_rooms,...,longitude_x,year,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,2018-02-06,5000000.0,63.0,0040,AV PRINCE RAINIER III,Saint-Jean-Cap-Ferrat,06121,Maison,292.0,10,...,7.330651,2018,Saint-Jean-Cap-Ferrat,SAINT JEAN CAP FERRAT,municipality,43.687179,7.329936,06,200030195.0,Alpes-Maritimes
1,2018-11-26,93060.0,5308.0,B061,LE SERRET,Vallées-d'Antraigues-Asperjoc,07011,Maison,16.0,0,...,4.360185,2018,Vallées-d'Antraigues-Asperjoc,VALLEES D ANTRAIGUES ASPERJOC,municipality,44.739318,4.355176,07,200073245.0,Ardèche
2,2018-06-08,95000.0,161.0,0683,IMP COL EMILE VIGUIER,Millau,12145,Maison,21.0,0,...,3.054594,2018,Millau,MILLAU,municipality,44.097625,3.117054,12,241200567.0,Aveyron
3,2018-07-17,3912000.0,690.0,1868,CHE DE MAZARGUES,Aix-en-Provence,13001,Maison,610.0,13,...,5.428194,2018,Aix-en-Provence,AIX EN PROVENCE,municipality,43.536071,5.398574,13,200054807.0,Bouches-du-Rhône
4,2018-03-27,810000.0,5000.0,0162,CHE DES OLIVIERS,Eygalières,13034,Maison,296.0,9,...,4.957214,2018,Eygalières,EYGALIERES,municipality,43.761705,4.952120,13,241300375.0,Bouches-du-Rhône
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444144,2017-11-10,160000.0,9001.0,A055,RES LA COUPIANE,La Valette-du-Var,83144,Appartement,103.0,5,...,5.988491,2017,La Valette-du-Var,VALETTE DU VAR,municipality,43.149915,5.992225,83,248300543.0,Var
3444145,2019-03-15,290000.0,9001.0,A278,RES LES TROIS MAGES,Aix-en-Provence,13001,Appartement,103.0,6,...,5.470374,2019,Aix-en-Provence,AIX EN PROVENCE,municipality,43.536071,5.398574,13,200054807.0,Bouches-du-Rhône
3444146,2018-04-26,540000.0,9001.0,A370,RES LE GAUGUIN CHEM BRUNET,Aix-en-Provence,13001,Appartement,115.0,5,...,5.438490,2018,Aix-en-Provence,AIX EN PROVENCE,municipality,43.536071,5.398574,13,200054807.0,Bouches-du-Rhône
3444147,2021-08-25,184000.0,9001.0,0002,BD ABEL LEFEVRE,Ézy-sur-Eure,27230,Appartement,150.0,3,...,1.422561,2021,Ézy-sur-Eure,EZY SUR EURE,municipality,48.870981,1.412630,27,200040277.0,Eure


In [32]:
#filtre uniquement sur les années 2020 et 2021 (car ce sont les seules années où nous avons toutes les informations)
sales_info_per_department = sales_info_per_department[sales_info_per_department['year'].isin([2020, 2021])]
sales_info_per_department

Unnamed: 0,sales_date,sales_amount,street_number,street_code,street_name,nom_commune,municipality_code,premise_type,surface,number_of_principal_rooms,...,longitude_x,year,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
29,2020-05-22,253902.0,41.0,0114,RUE DE LA REPUBLIQUE,L'Argentière-la-Bessée,05006,Maison,160.0,9,...,6.557181,2020,L'Argentière-la-Bessée,ARGENTIERE LA BESSEE,municipality,44.782173,6.472145,05,240500462.0,Hautes-Alpes
30,2020-08-27,2400000.0,760.0,0560,CHE DE LA GRANDE BASTIDE,Mougins,06085,Maison,257.0,9,...,6.985652,2020,Mougins,MOUGINS,municipality,43.596141,7.001294,06,200039915.0,Alpes-Maritimes
31,2020-11-27,920000.0,139.0,0380,CHE DE L ETANG,Mougins,06085,Maison,285.0,9,...,7.019548,2020,Mougins,MOUGINS,municipality,43.596141,7.001294,06,200039915.0,Alpes-Maritimes
32,2020-12-17,1778080.0,168.0,0947,RTE DES ROMARINS,Mougins,06085,Maison,244.0,9,...,7.020948,2020,Mougins,MOUGINS,municipality,43.596141,7.001294,06,200039915.0,Alpes-Maritimes
33,2020-11-20,793750.0,282.0,0275,CHE DE MONTFORT,La Colle-sur-Loup,06044,Maison,227.0,9,...,7.110638,2020,La Colle-sur-Loup,COLLE SUR LOUP,municipality,43.687170,7.097736,06,240600585.0,Alpes-Maritimes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444138,2021-01-15,169000.0,9001.0,A070,RES DE LA THEUILLERIE,Ris-Orangis,91521,Appartement,99.0,5,...,2.397566,2021,Ris-Orangis,RIS ORANGIS,municipality,48.645194,2.407936,91,200059228.0,Essonne
3444141,2020-09-08,389685.0,9001.0,0667,ALL DES CIGALES,Aix-en-Provence,13001,Appartement,103.0,5,...,5.459952,2020,Aix-en-Provence,AIX EN PROVENCE,municipality,43.536071,5.398574,13,200054807.0,Bouches-du-Rhône
3444142,2021-12-07,236000.0,9001.0,A100,RES LES PAMPRES,Les Ulis,91692,Appartement,103.0,5,...,2.172815,2021,Les Ulis,ULIS,municipality,48.680321,2.185190,91,200056232.0,Essonne
3444147,2021-08-25,184000.0,9001.0,0002,BD ABEL LEFEVRE,Ézy-sur-Eure,27230,Appartement,150.0,3,...,1.422561,2021,Ézy-sur-Eure,EZY SUR EURE,municipality,48.870981,1.412630,27,200040277.0,Eure


In [33]:
#groupement par année et par département
sales_df_per_year = sales_info_per_department.groupby(["department_name", "year"])[["sales_amount", "surface"]].agg({"sales_amount": "sum", "surface": "sum"})
sales_df_per_year

Unnamed: 0_level_0,Unnamed: 1_level_0,sales_amount,surface
department_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,2020,2.210313e+09,837639.0
Ain,2021,2.802408e+09,990520.0
Aisne,2020,6.593465e+08,420712.0
Aisne,2021,8.480408e+08,523582.0
Allier,2020,4.703144e+08,295301.0
...,...,...,...
Vosges,2021,7.157524e+08,405495.0
Yonne,2020,5.685219e+08,359444.0
Yonne,2021,7.649412e+08,464540.0
Yvelines,2020,6.951062e+09,1674525.0


In [34]:
#calcul du prix moyen au m2
sales_df_per_year["average_price_m2"]=sales_df_per_year["sales_amount"]/sales_df_per_year["surface"]
sales_df_per_year
sales_df_per_year.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,sales_amount,surface,average_price_m2
department_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ain,2020,2210313000.0,837639.0,2638.742163
Ain,2021,2802408000.0,990520.0,2829.229384
Aisne,2020,659346500.0,420712.0,1567.215852
Aisne,2021,848040800.0,523582.0,1619.690449
Allier,2020,470314400.0,295301.0,1592.661195
Allier,2021,610503600.0,372329.0,1639.688435
Alpes-Maritimes,2020,8099021000.0,1684862.0,4806.934575
Alpes-Maritimes,2021,10506120000.0,2027161.0,5182.678071
Alpes-de-Haute-Provence,2020,487872100.0,211239.0,2309.573765
Alpes-de-Haute-Provence,2021,540421400.0,227226.0,2378.343326


In [35]:
#calcul de l'évolution entre 2018 et 2021
sales_df_per_year['price_m2_growth'] = sales_df_per_year.groupby('department_name')['average_price_m2'].pct_change()
sales_df_per_year


Unnamed: 0_level_0,Unnamed: 1_level_0,sales_amount,surface,average_price_m2,price_m2_growth
department_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ain,2020,2.210313e+09,837639.0,2638.742163,
Ain,2021,2.802408e+09,990520.0,2829.229384,0.072189
Aisne,2020,6.593465e+08,420712.0,1567.215852,
Aisne,2021,8.480408e+08,523582.0,1619.690449,0.033483
Allier,2020,4.703144e+08,295301.0,1592.661195,
...,...,...,...,...,...
Vosges,2021,7.157524e+08,405495.0,1765.132472,0.039378
Yonne,2020,5.685219e+08,359444.0,1581.670165,
Yonne,2021,7.649412e+08,464540.0,1646.663833,0.041092
Yvelines,2020,6.951062e+09,1674525.0,4151.064983,


In [36]:
#calcul final de l'évolution
sales_df_per_year = sales_df_per_year.dropna()
sales_df_per_year.drop (columns=["sales_amount", "surface"])
sales_df_per_year.sort_values ("price_m2_growth", ascending=False)


Unnamed: 0_level_0,Unnamed: 1_level_0,sales_amount,surface,average_price_m2,price_m2_growth
department_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pyrénées-Atlantiques,2021,3.821283e+09,1068164.0,3577.430704,0.140719
Landes,2021,2.409002e+09,818313.0,2943.864249,0.112565
Corse-du-Sud,2021,9.380425e+08,193855.0,4838.887261,0.106444
Finistère,2021,3.056316e+09,1421428.0,2150.172909,0.099954
Maine-et-Loire,2021,2.644990e+09,1162430.0,2275.396966,0.096707
...,...,...,...,...,...
Haute-Marne,2021,1.928814e+08,129888.0,1484.982081,0.009958
Haute-Saône,2021,3.172671e+08,202655.0,1565.552673,0.008060
Cantal,2021,2.238980e+08,138821.0,1612.854279,-0.011984
La Réunion,2021,1.260015e+09,384936.0,3273.309935,-0.024756


In [37]:
#calcul du nb de maisons vacantes en 2019
stock_df_2018 = stock_df[stock_df['year'].isin([2018])]
stock_df_2018
stock_df_2018 = stock_df_2018.merge (georef_df, on=["municipality_code"])
vacants_housing_per_department = stock_df_2018.groupby("department_name")["nb_vacants_housing"].sum()
vacants_housing_per_department = pd.DataFrame(vacants_housing_per_department)
vacants_housing_per_department

Unnamed: 0_level_0,nb_vacants_housing
department_name,Unnamed: 1_level_1
Ain,25849
Aisne,26001
Allier,30479
Alpes-Maritimes,64643
Alpes-de-Haute-Provence,10768
...,...
Vendée,21818
Vienne,23264
Vosges,24154
Yonne,23303


In [None]:
stock_df_2018


In [38]:
# 2.3 taxe d'habitation sur les maisons secondaires par département

TAX_FILENAME = 'taxe_habitation.xlsx'

tax_df = pd.read_excel(os.path.join(DATA_PATH, TAX_FILENAME))
tax_df.head()
tax_df = tax_df.rename(columns={'RÉGIONS': 'department_name'})
tax_df

Unnamed: 0,department_name,Taxe d'habitation moyenne en 2023,Nombre d'avis d'impôt
0,Ain,847,23 000
1,Aisne,732,14 000
2,Allier,801,18 000
3,Alpes-de-Haute-Provence,551,35 000
4,Hautes-Alpes,567,52 000
...,...,...,...
96,Guadeloupe,1080,26 000
97,Martinique,986,18 000
98,Guyane,766,9 000
99,La Réunion,1182,15 000


##### 3. SECONDARY HOME

In [39]:
# 3.1 Superficie moyenne des logements vendus par départements

# Joindre les informations de géolocalisation pour obtenir les départements
real_estate_sales_dep = sales_df.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer la surface moyenne des logements vendus par département
average_surface_municipality = real_estate_sales_dep.groupby('department_name')['surface'].mean().reset_index()

average_surface_municipality

Unnamed: 0,department_name,surface
0,Ain,95.491503
1,Aisne,91.956053
2,Allier,85.308972
3,Alpes-Maritimes,66.283678
4,Alpes-de-Haute-Provence,73.283742
...,...,...
92,Vendée,84.926212
93,Vienne,87.584873
94,Vosges,88.986998
95,Yonne,88.105673


In [40]:
# 3.2 Évolution du % des maisons secondaires par département

# Filtrer les données pour les années 2008 et 2018
housing_2008 = stock_df[stock_df['year'] == 2008]
housing_2018 = stock_df[stock_df['year'] == 2018]

# Renommer les colonnes pour les années spécifiques
housing_2008 = housing_2008[['municipality_code', 'nb_second_home']].rename(columns={'nb_second_home': 'nb_second_home_2008'})
housing_2018 = housing_2018[['municipality_code', 'nb_second_home']].rename(columns={'nb_second_home': 'nb_second_home_2018'})

# Joindre les données pour les années 2008 et 2018 sur le code de municipalité
secondary_home_rate_comparison = housing_2008.merge(housing_2018, on='municipality_code')

# Joindre les informations de géolocalisation pour obtenir les départements
secondary_home_rate_comparison = secondary_home_rate_comparison.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer l'évolution moyenne du pourcentage de maisons secondaires par département
secondary_home_rate_evolution_department = secondary_home_rate_comparison.groupby(['department_name'])[['nb_second_home_2008', "nb_second_home_2018"]].agg({'nb_second_home_2008': "sum", "nb_second_home_2018": "sum"})
secondary_home_rate_evolution_department["evolution_secondary_homes"]=((secondary_home_rate_evolution_department["nb_second_home_2018"]-secondary_home_rate_evolution_department["nb_second_home_2008"])/secondary_home_rate_evolution_department["nb_second_home_2008"])*100
secondary_home_rate_evolution_department.head(50)

Unnamed: 0_level_0,nb_second_home_2008,nb_second_home_2018,evolution_secondary_homes
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,16960,17574,3.620283
Aisne,10185,9393,-7.776141
Allier,14797,15032,1.58816
Alpes-Maritimes,166798,195108,16.972626
Alpes-de-Haute-Provence,38403,40199,4.676718
Ardennes,5070,5003,-1.321499
Ardèche,35985,36779,2.206475
Ariège,25341,27107,6.968944
Aube,6930,7825,12.914863
Aude,61952,67047,8.224109


In [41]:
# 3.3 taxe d'habitation sur les maisons secondaires par département

TAX_FILENAME = 'taxe_habitation.xlsx'

tax_df = pd.read_excel(os.path.join(DATA_PATH, TAX_FILENAME))
tax_df.head()
tax_df = tax_df.rename(columns={'RÉGIONS': 'department_name'})
tax_df

Unnamed: 0,department_name,Taxe d'habitation moyenne en 2023,Nombre d'avis d'impôt
0,Ain,847,23 000
1,Aisne,732,14 000
2,Allier,801,18 000
3,Alpes-de-Haute-Provence,551,35 000
4,Hautes-Alpes,567,52 000
...,...,...,...
96,Guadeloupe,1080,26 000
97,Martinique,986,18 000
98,Guyane,766,9 000
99,La Réunion,1182,15 000


##### 4. LIFE QUALITY

In [42]:
# 4.1 Professionnels de santé pour 100 000 habitants par départements en 2023
DATA_PATH = '../data/cleaned'
HEALTH_FILENAME = 'health_df_cleaned.csv'

health_df = pd.read_csv(os.path.join(DATA_PATH, HEALTH_FILENAME))
health_df.head(50)

Unnamed: 0,department_code,department_name,ensemble des médecins,ensemble des médecins.1,dont généralistes,dont spécialistes,chirurg. dentistes,pharm.
0,01,Ain,1 162,174,99,75,53,78
1,02,Aisne,1 107,211,99,111,45,91
2,03,Allier,917,275,134,141,51,111
3,04,Alpes-de-Haute-Provence,483,291,165,125,54,103
4,05,Hautes-Alpes,705,503,291,213,72,135
5,06,Alpes-Maritimes,5 095,461,173,288,123,123
6,07,Ardèche,777,235,121,113,48,97
7,08,Ardennes,642,241,126,115,42,104
8,09,Ariège,408,264,163,102,60,97
9,10,Aube,714,229,112,117,48,87


In [43]:
# 4.2 Taux de criminalité pour 1000 habitants par départements en 2020

CRIMINALITY_FILENAME = 'criminality_df_cleaned.csv'

criminality_df = pd.read_csv(os.path.join(DATA_PATH, CRIMINALITY_FILENAME))

# Convertir criminality_per_1000 en type numérique (si nécessaire)
criminality_df['criminality_per_1000'] = pd.to_numeric(criminality_df['criminality_per_1000'].str.replace(',', '.'))

# Agréger georef_df par département_name pour obtenir une seule ligne par département
georef_aggregated = georef_df.groupby('department_name').first().reset_index()

# Effectuer une fusion (merge) pour ajouter department_code à criminality_aggregated en utilisant department_name comme clé
criminality_aggregated = criminality_df.groupby('department_name')['criminality_per_1000'].mean().reset_index()
criminality_per_department = criminality_aggregated.merge(georef_aggregated[['department_name', 'department_code']], on='department_name')

# Afficher les premières lignes du dataframe mis à jour
criminality_per_department.tail(50)

Unnamed: 0,department_name,criminality_per_1000,department_code
51,Isère,51.65,38
52,Jura,34.68,39
53,La Réunion,24.72,974
54,Landes,34.7,40
55,Loir-et-Cher,30.62,41
56,Loire,41.18,42
57,Loire-Atlantique,52.79,44
58,Loiret,39.49,45
59,Lot,30.8,46
60,Lot-et-Garonne,34.94,47


In [44]:
# 4.3 Nombre de jours de soleil par an par départements

SUNNY_FILENAME = 'heures_ensoleillement.xlsx'

sunny_df = pd.read_excel(os.path.join(DATA_PATH, SUNNY_FILENAME))
sunny_df.head()
sunny_df = sunny_df.rename(columns={'Départements Français et Dom Tom': 'department_name'})
sunny_df
sunny_df_per_department = sunny_df.drop (columns=["Num dép", "Classement"])
sunny_df_per_department.tail(50)

Unnamed: 0,department_name,Ensoleillement (heures)
46,Saône-et-Loire,1849
47,Indre,1835
48,Indre-et-Loire,1799
49,Doubs,1797
50,Hauts-de-Seine,1796
51,Côte-d’Or,1789
52,Cher,1787
53,Val-de-Marne,1774
54,Aube,1771
55,Nièvre,1764


# 🚀 ENRICHED EXPORT

In [None]:
# Chemin du dossier où les fichiers seront enregistrés
output_folder = "../data/enriched"

# Assurez-vous que le dossier existe
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Liste des DataFrames et leurs noms
dataframes = {
    "num_sites_per_department": num_sites_per_department,
    "tourism_category_per_department": tourism_category_per_department,    
    "average_price_per_m2_per_department": avg_price_per_m2_per_department,
    "total_stock_per_department": total_stock_per_department,    
    "average_surface_per_department": average_surface_per_department,
    "secondary_home_rate_per_department": secondary_home_rate_per_department,
    "secondary_home_rate_evolution_department": secondary_home_rate_evolution_department,
    "vacants_housing_per_department": vacants_housing_per_department,
    "avg_salary_per_department": avg_salary_per_department,
    "health_df_per_derpartment": health_df,
    "criminality_per_department": criminality_per_department,
    "sunny_df_per_department": sunny_df_per_department,
    "life_quality_df": life_quality_df
    "taxe_habitation_per_department": tax_df,
}

# Exportation des DataFrames en CSV
for name, df in dataframes.items():
    output_path = os.path.join(output_folder, f"{name}_enriched.csv")
    df.to_csv(output_path, index=False)
    print(f"DataFrame {name} exporté vers {output_path}")

# SCORING

##### 1. TOURISM

In [45]:
department_merged_df["ranking_hosting"]= department_merged_df["importance_x"]
department_merged_df["ranking_touristic_sites"]= department_merged_df["importance_y"]
calculation_tourism_scoring = department_merged_df.drop (columns=["importance_x", "importance_y"])
calculation_tourism_scoring

Unnamed: 0_level_0,ranking_hosting,ranking_touristic_sites
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,11.423180,34.061847
Aisne,4.937153,15.687868
Allier,8.386644,21.401418
Alpes-Maritimes,32.802552,32.967630
Alpes-de-Haute-Provence,11.293578,14.722338
...,...,...
Vendée,27.816510,32.287377
Vienne,8.205227,21.203995
Vosges,12.007083,14.973759
Yonne,7.211247,16.595920


##### 2. REAL ESTATE

In [46]:
#fusion des différents dataframes pour réaliser le scoring
real_estate_scoring_merge_1 = yield_calculation.merge(sales_df_per_year, on="department_name")
real_estate_scoring_merge_2 = real_estate_scoring_merge_1.merge(vacants_housing_per_department, on="department_name")
real_estate_scoring_merge_2

Unnamed: 0_level_0,average_price_per_m2,average_rental,yield_rate,sales_amount,surface,average_price_m2,price_m2_growth,nb_vacants_housing
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ain,2635.784095,13.183580,6.002122,2.802408e+09,990520.0,2829.229384,0.072189,25849
Aisne,1561.099105,9.919550,7.625051,8.480408e+08,523582.0,1619.690449,0.033483,26001
Allier,1577.489243,10.015825,7.619063,6.105036e+08,372329.0,1639.688435,0.029527,30479
Alpes-Maritimes,4821.653974,17.530595,4.362966,1.050612e+10,2027161.0,5182.678071,0.078167,64643
Alpes-de-Haute-Provence,2281.499046,11.304575,5.945867,5.404214e+08,227226.0,2378.343326,0.029776,10768
...,...,...,...,...,...,...,...,...
Vendée,2401.896369,10.549698,5.270684,3.281171e+09,1290262.0,2543.026785,0.091690,21818
Vienne,1752.067837,9.783623,6.700852,1.002048e+09,549295.0,1824.243054,0.066504,23264
Vosges,1713.030737,9.622087,6.740395,7.157524e+08,405495.0,1765.132472,0.039378,24154
Yonne,1607.734995,9.907284,7.394714,7.649412e+08,464540.0,1646.663833,0.041092,23303


In [47]:
#fusion du dernier dataframe
real_estate_scoring_merge_3 = real_estate_scoring_merge_2.merge(tax_df, on="department_name")
real_estate_scoring_merge_3
real_estate_scoring_merge_3 = real_estate_scoring_merge_3.drop(columns=["average_price_per_m2","sales_amount","surface","average_price_m2","average_rental","Nombre d'avis d'impôt"
])
real_estate_scoring_merge_3

Unnamed: 0,department_name,yield_rate,price_m2_growth,nb_vacants_housing,Taxe d'habitation moyenne en 2023
0,Ain,6.002122,0.072189,25849,847
1,Aisne,7.625051,0.033483,26001,732
2,Allier,7.619063,0.029527,30479,801
3,Alpes-Maritimes,4.362966,0.078167,64643,1686
4,Alpes-de-Haute-Provence,5.945867,0.029776,10768,551
...,...,...,...,...,...
88,Vendée,5.270684,0.091690,21818,759
89,Vienne,6.700852,0.066504,23264,746
90,Vosges,6.740395,0.039378,24154,611
91,Yonne,7.394714,0.041092,23303,682


In [48]:
#éléments pour le calcul du scoring immo
calculation_real_estate_scoring = real_estate_scoring_merge_3
calculation_real_estate_scoring.head(50)
calculation_real_estate_scoring["Taxe d'habitation moyenne en 2023"].astype(float)


0      847.0
1      732.0
2      801.0
3     1686.0
4      551.0
       ...  
88     759.0
89     746.0
90     611.0
91     682.0
92    1443.0
Name: Taxe d'habitation moyenne en 2023, Length: 93, dtype: float64

In [49]:
calculation_real_estate_scoring.tail(50)
calculation_real_estate_scoring = calculation_real_estate_scoring.drop(columns="Taxe d'habitation moyenne en 2023")
calculation_real_estate_scoring

Unnamed: 0,department_name,yield_rate,price_m2_growth,nb_vacants_housing
0,Ain,6.002122,0.072189,25849
1,Aisne,7.625051,0.033483,26001
2,Allier,7.619063,0.029527,30479
3,Alpes-Maritimes,4.362966,0.078167,64643
4,Alpes-de-Haute-Provence,5.945867,0.029776,10768
...,...,...,...,...
88,Vendée,5.270684,0.091690,21818
89,Vienne,6.700852,0.066504,23264
90,Vosges,6.740395,0.039378,24154
91,Yonne,7.394714,0.041092,23303


In [None]:
calculation_real_estate_scoring

##### 3. SECONDARY HOME

In [50]:
#fusion des différents dataframes pour réaliser le scoring
calculation_secondary_home_scoring_merge_1 = average_surface_municipality.merge(secondary_home_rate_evolution_department, on="department_name")
calculation_secondary_home_scoring_merge_2 = calculation_secondary_home_scoring_merge_1.merge(tax_df, on="department_name")
calculation_secondary_home_scoring_merge_2
calculation_secondary_home_scoring=calculation_secondary_home_scoring_merge_2.drop (columns=["nb_second_home_2008", "nb_second_home_2018", "Nombre d'avis d'impôt"])
calculation_secondary_home_scoring

Unnamed: 0,department_name,surface,evolution_secondary_homes,Taxe d'habitation moyenne en 2023
0,Ain,95.491503,3.620283,847
1,Aisne,91.956053,-7.776141,732
2,Allier,85.308972,1.588160,801
3,Alpes-Maritimes,66.283678,16.972626,1686
4,Alpes-de-Haute-Provence,73.283742,4.676718,551
...,...,...,...,...
92,Vendée,84.926212,-1.718634,759
93,Vienne,87.584873,15.877164,746
94,Vosges,88.986998,12.625825,611
95,Yonne,88.105673,-9.106951,682


##### 4. LIFE QUALITY

In [51]:
# MERGE DES 3 DF
life_quality_df = sunny_df_per_department.merge(criminality_per_department, on='department_name', how='inner')
life_quality_df = life_quality_df.merge(health_df, on='department_name', how='outer')

# Remplacer les NaN par des valeurs nulles
life_quality_df = life_quality_df.fillna(0)  # Vous pouvez remplacer 0 par d'autres valeurs par défaut si nécessaire

# Supprimer les colonnes redondantes department_code_x et department_code_y
life_quality_df = life_quality_df.drop(columns=['department_code_x', 'department_code_y', "ensemble des médecins", "dont généralistes", "dont spécialistes", "chirurg. dentistes", "pharm."])

calculation_life_quality_scoring = pd.DataFrame (life_quality_df)
calculation_life_quality_scoring.tail(50)

Unnamed: 0,department_name,Ensoleillement (heures),criminality_per_1000,ensemble des médecins.1
51,Isère,2020.0,51.65,337
52,Jura,1889.0,34.68,246
53,La Réunion,0.0,0.0,364
54,Landes,1852.0,34.7,295
55,Loir-et-Cher,1737.0,30.62,231
56,Loire,2007.0,41.18,345
57,Loire-Atlantique,1690.0,52.79,355
58,Loiret,1710.0,39.49,231
59,Lot,2054.0,30.8,260
60,Lot-et-Garonne,1957.0,34.94,250


In [52]:
#modification des 3 lignes nulles (je n'ai pas compris pourquoi elles l'étaient)
calculation_life_quality_scoring.at[21,"criminality_per_1000"]=36.72
calculation_life_quality_scoring.at[22,"criminality_per_1000"]=30.92
calculation_life_quality_scoring.at[21,"Ensoleillement (heures)"]=1789
calculation_life_quality_scoring.at[22,"Ensoleillement (heures)"]=1512
calculation_life_quality_scoring.at[92,"Ensoleillement (heures)"]=1719
calculation_life_quality_scoring.at[92,"criminality_per_1000"]=43.79

calculation_life_quality_scoring.head(50)

Unnamed: 0,department_name,Ensoleillement (heures),criminality_per_1000,ensemble des médecins.1
0,Ain,1928.0,35.0,174
1,Aisne,1609.0,41.71,211
2,Allier,1857.0,35.12,275
3,Alpes-Maritimes,2668.0,55.66,461
4,Alpes-de-Haute-Provence,2596.0,44.57,291
5,Ardennes,1440.0,38.67,241
6,Ardèche,2390.0,34.69,235
7,Ariège,1900.0,39.98,264
8,Aube,1771.0,45.96,229
9,Aude,2106.0,38.95,288


# **SCALING**

##### 1. TOURISM

In [53]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
scaler = MinMaxScaler()

#scaling pour calculer le scoring
calculation_tourism_scoring_numeric = calculation_tourism_scoring.select_dtypes(include="number")
df_scaled_tourism = scaler.fit_transform(calculation_tourism_scoring_numeric)
df_scaled_tourism = pd.DataFrame(df_scaled_tourism, columns=calculation_tourism_scoring_numeric.columns, index=calculation_tourism_scoring.index)
df_scaled_tourism.head(50)

Unnamed: 0_level_0,ranking_hosting,ranking_touristic_sites
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,0.142733,0.627787
Aisne,0.048254,0.257672
Allier,0.098501,0.372763
Alpes-Maritimes,0.454155,0.605746
Alpes-de-Haute-Provence,0.140845,0.238223
Ardennes,0.027919,0.092048
Ardèche,0.437353,0.368949
Ariège,0.106684,0.14551
Aube,0.056686,0.093658
Aude,0.190056,0.336015


In [54]:
#cleaning des différents KPI
df_scaled_tourism['ranking_hosting'] = round(df_scaled_tourism['ranking_hosting'], 2)
df_scaled_tourism['ranking_touristic_sites'] = round(df_scaled_tourism['ranking_touristic_sites'], 2)
df_scaled_tourism

Unnamed: 0_level_0,ranking_hosting,ranking_touristic_sites
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,0.14,0.63
Aisne,0.05,0.26
Allier,0.10,0.37
Alpes-Maritimes,0.45,0.61
Alpes-de-Haute-Provence,0.14,0.24
...,...,...
Vendée,0.38,0.59
Vienne,0.10,0.37
Vosges,0.15,0.24
Yonne,0.08,0.28


##### 2. REAL ESTATE

In [55]:
#scaling pour calculer le scoring
calculation_real_estate_scoring_numeric = calculation_real_estate_scoring.select_dtypes(include="number")
scaler = MinMaxScaler()
df_scaled_real_estate = scaler.fit_transform(calculation_real_estate_scoring_numeric)
df_scaled_real_estate = pd.DataFrame(df_scaled_real_estate, index=real_estate_scoring_merge_2.index, columns=calculation_real_estate_scoring_numeric.columns)
df_scaled_real_estate

Unnamed: 0_level_0,yield_rate,price_m2_growth,nb_vacants_housing
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,0.633355,0.551218,0.197095
Aisne,0.991120,0.297747,0.198424
Allier,0.989800,0.271845,0.237583
Alpes-Maritimes,0.272014,0.590368,0.536334
Alpes-de-Haute-Provence,0.620955,0.273472,0.065217
...,...,...,...
Vendée,0.472114,0.678925,0.161845
Vienne,0.787386,0.513990,0.174490
Vosges,0.796103,0.336356,0.182273
Yonne,0.940344,0.347576,0.174831


In [56]:
#cleaning des différents KPI
df_scaled_real_estate['yield_rate'] = round(df_scaled_real_estate['yield_rate'], 2)
df_scaled_real_estate['price_m2_growth'] = round(df_scaled_real_estate['price_m2_growth'], 2)
df_scaled_real_estate['nb_vacants_housing'] = round(df_scaled_real_estate['nb_vacants_housing'], 2)
df_scaled_real_estate

Unnamed: 0_level_0,yield_rate,price_m2_growth,nb_vacants_housing
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,0.63,0.55,0.20
Aisne,0.99,0.30,0.20
Allier,0.99,0.27,0.24
Alpes-Maritimes,0.27,0.59,0.54
Alpes-de-Haute-Provence,0.62,0.27,0.07
...,...,...,...
Vendée,0.47,0.68,0.16
Vienne,0.79,0.51,0.17
Vosges,0.80,0.34,0.18
Yonne,0.94,0.35,0.17


In [None]:
calculation_real_estate_scoring.tail(50)

##### 3. SECONDARY HOME

In [57]:
#scaling pour calculer le scoring
calculation_secondary_home_scoring_numeric = calculation_secondary_home_scoring.select_dtypes(include="number")
df_scaled_secondary_home = scaler.fit_transform(calculation_secondary_home_scoring_numeric)
df_scaled_secondary_home = pd.DataFrame(df_scaled_secondary_home, columns=calculation_secondary_home_scoring_numeric.columns, index=calculation_secondary_home_scoring["department_name"])
df_scaled_secondary_home.head(50)

Unnamed: 0_level_0,surface,evolution_secondary_homes,Taxe d'habitation moyenne en 2023
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,0.849569,0.13895,0.213806
Aisne,0.777615,0.014529,0.158677
Allier,0.642331,0.116764,0.191755
Alpes-Maritimes,0.255123,0.284725,0.616012
Alpes-de-Haute-Provence,0.39759,0.150484,0.071908
Ardennes,0.776964,0.084998,0.084372
Ardèche,0.679231,0.123515,0.05465
Ariège,0.61961,0.175509,0.053212
Aube,0.626639,0.240424,0.139981
Aude,0.489839,0.189213,0.088207


In [58]:
#cleaning des différents KPI
df_scaled_secondary_home['surface'] = round(df_scaled_secondary_home['surface'], 2)
df_scaled_secondary_home['evolution_secondary_homes'] = round(df_scaled_secondary_home['evolution_secondary_homes'], 2)
df_scaled_secondary_home["Taxe d'habitation moyenne en 2023"] = round(1 - df_scaled_secondary_home["Taxe d'habitation moyenne en 2023"], 2)
df_scaled_secondary_home

Unnamed: 0_level_0,surface,evolution_secondary_homes,Taxe d'habitation moyenne en 2023
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,0.85,0.14,0.79
Aisne,0.78,0.01,0.84
Allier,0.64,0.12,0.81
Alpes-Maritimes,0.26,0.28,0.38
Alpes-de-Haute-Provence,0.40,0.15,0.93
...,...,...,...
Vendée,0.63,0.08,0.83
Vienne,0.69,0.27,0.83
Vosges,0.72,0.24,0.90
Yonne,0.70,0.00,0.87


##### 4. LIFE QUALITY

In [65]:
#scaling pour calculer le scoring
calculation_life_quality_scoring_numeric = calculation_life_quality_scoring.select_dtypes(include="number")
df_scaled_life_quality = scaler.fit_transform(calculation_life_quality_scoring_numeric)
df_scaled_life_quality = pd.DataFrame(df_scaled_life_quality, columns=calculation_life_quality_scoring_numeric.columns, index=calculation_life_quality_scoring["department_name"])
df_scaled_life_quality.tail(50)

Unnamed: 0_level_0,Ensoleillement (heures),criminality_per_1000,ensemble des médecins.1
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Isère,0.721171,0.524738,0.310388
Jura,0.674402,0.352332,0.196496
La Réunion,0.0,0.0,0.34418
Landes,0.661192,0.352535,0.257822
Loir-et-Cher,0.620136,0.311084,0.177722
Loire,0.71653,0.418368,0.320401
Loire-Atlantique,0.603356,0.53632,0.332916
Loiret,0.610496,0.401199,0.177722
Lot,0.73331,0.312913,0.214018
Lot-et-Garonne,0.698679,0.354973,0.201502


In [66]:
#cleaning des différents KPI
df_scaled_life_quality['Ensoleillement'] = round(df_scaled_life_quality['Ensoleillement (heures)'], 2)
df_scaled_life_quality['Criminality'] = round(1 - df_scaled_life_quality['criminality_per_1000'], 2)
df_scaled_life_quality['Health'] = round(df_scaled_life_quality['ensemble des médecins.1'], 2)

df_scaled_life_quality = df_scaled_life_quality.drop(columns=["Ensoleillement (heures)", "criminality_per_1000", "ensemble des médecins.1"])
df_scaled_life_quality

Unnamed: 0_level_0,Ensoleillement,Criminality,Health
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,0.69,0.64,0.11
Aisne,0.57,0.58,0.15
Allier,0.66,0.64,0.23
Alpes-Maritimes,0.95,0.43,0.47
Alpes-de-Haute-Provence,0.93,0.55,0.25
...,...,...,...
Vendée,0.63,0.69,0.17
Vienne,0.67,0.62,0.33
Vosges,0.62,0.65,0.20
Yonne,0.63,0.54,0.15


# **NORMALIZATION**

##### 1. TOURISM

In [59]:
tourism_scoring = pd.DataFrame()
tourism_scoring['Hosting_score'] = df_scaled_tourism['ranking_hosting']*10
tourism_scoring['Touristic_sites_score'] = df_scaled_tourism['ranking_touristic_sites']*10
tourism_scoring['Global_tourism_score'] = round((tourism_scoring['Hosting_score'] + tourism_scoring['Touristic_sites_score'])/2,1)
tourism_scoring.sort_values("Global_tourism_score", ascending=False).head()

Unnamed: 0_level_0,Hosting_score,Touristic_sites_score,Global_tourism_score
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Paris,10.0,7.4,8.7
Gironde,3.4,10.0,6.7
Finistère,4.0,8.8,6.4
Savoie,5.2,7.3,6.2
Bouches-du-Rhône,3.3,8.9,6.1


##### 2. REAL ESTATE

In [60]:
real_estate_scoring= pd.DataFrame()
real_estate_scoring['Rentability_score'] = df_scaled_real_estate['yield_rate']*10
real_estate_scoring['Growth_score'] = df_scaled_real_estate['price_m2_growth']*10
real_estate_scoring['Vacancy_score'] = df_scaled_real_estate['nb_vacants_housing']*10
real_estate_scoring['Global_real_estate_score'] = round((real_estate_scoring['Rentability_score'] + real_estate_scoring['Growth_score'] + (real_estate_scoring['Vacancy_score']/2))/2.5,1)
real_estate_scoring.sort_values("Global_real_estate_score", ascending=False).head()

Unnamed: 0_level_0,Rentability_score,Growth_score,Vacancy_score,Global_real_estate_score
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Finistère,7.1,7.3,3.3,6.4
Nord,6.9,4.3,7.9,6.1
Eure,8.6,5.5,1.9,6.0
Aube,9.8,4.4,1.1,5.9
Meurthe-et-Moselle,8.5,4.8,2.8,5.9


##### 3. SECONDARY HOME

In [63]:
secondary_home_scoring= pd.DataFrame()
secondary_home_scoring['Surface_score'] = df_scaled_secondary_home['surface']*10
secondary_home_scoring['Secondary_home_growth_score'] = df_scaled_secondary_home['evolution_secondary_homes']*10
secondary_home_scoring['Tax_score'] = df_scaled_secondary_home["Taxe d'habitation moyenne en 2023"]*10
secondary_home_scoring['Global_secondary_home_score'] = round(((secondary_home_scoring['Surface_score']/2) + secondary_home_scoring['Secondary_home_growth_score'] + secondary_home_scoring['Tax_score'])/2.5,1)
secondary_home_scoring.sort_values("Global_secondary_home_score", ascending=False).head()

Unnamed: 0_level_0,Surface_score,Secondary_home_growth_score,Tax_score,Global_secondary_home_score
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Guadeloupe,4.7,9.8,6.7,7.5
La Réunion,4.3,10.0,6.3,7.4
Martinique,4.2,8.4,7.2,7.1
Guyane,5.0,6.6,8.3,7.0
Nord,6.7,6.6,6.7,6.7


##### 4. LIFE QUALITY

In [67]:
life_quality_scoring= pd.DataFrame()
life_quality_scoring['Sun_score'] = df_scaled_life_quality['Ensoleillement']*10
life_quality_scoring['Safety_score'] = df_scaled_life_quality['Criminality']*10
life_quality_scoring['Health_score'] = df_scaled_life_quality["Health"]*10
life_quality_scoring['Global_life_quality_score'] = round(((life_quality_scoring['Sun_score'])/2 + life_quality_scoring['Safety_score'] + life_quality_scoring['Health_score'])/2.5,1)
life_quality_scoring.sort_values("Global_life_quality_score", ascending=False).head()

Unnamed: 0_level_0,Sun_score,Safety_score,Health_score,Global_life_quality_score
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Hautes-Alpes,8.7,6.3,5.2,6.3
Pyrénées-Atlantiques,6.7,6.7,4.3,5.7
Corse-du-Sud,9.7,6.0,3.3,5.7
Haute-Corse,9.0,6.8,2.6,5.6
Haute-Vienne,6.6,6.9,3.8,5.6


##### 5. GLOBAL SCORE

In [68]:
global_scoring_merge_1 = tourism_scoring.merge(real_estate_scoring, on="department_name")
global_scoring_merge_2 = global_scoring_merge_1.merge(secondary_home_scoring, on="department_name")
global_scoring_merge_3 = global_scoring_merge_2.merge(life_quality_scoring, on="department_name")
global_scoring_merge_3["Global_scoring"]= round((global_scoring_merge_3["Global_tourism_score"]+global_scoring_merge_3["Global_real_estate_score"]+global_scoring_merge_3["Global_secondary_home_score"]+global_scoring_merge_3["Global_life_quality_score"])/4,1)
global_scoring_table=global_scoring_merge_3
global_scoring_table.sort_values("Global_scoring", ascending=False).head(25)

Unnamed: 0_level_0,Hosting_score,Touristic_sites_score,Global_tourism_score,Rentability_score,Growth_score,Vacancy_score,Global_real_estate_score,Surface_score,Secondary_home_growth_score,Tax_score,Global_secondary_home_score,Sun_score,Safety_score,Health_score,Global_life_quality_score,Global_scoring
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Finistère,4.0,8.8,6.4,7.1,7.3,3.3,6.4,6.8,2.2,7.2,5.1,5.3,6.6,3.5,5.1,5.8
Nord,2.4,6.9,4.6,6.9,4.3,7.9,6.1,6.7,6.6,6.7,6.7,5.8,5.7,3.4,4.8,5.6
Bouches-du-Rhône,3.3,8.9,6.1,4.2,4.6,6.5,4.8,4.1,6.1,4.9,5.2,10.0,3.6,4.6,5.3,5.4
Gironde,3.4,10.0,6.7,3.4,5.7,4.5,4.5,6.5,3.2,6.4,5.1,7.1,4.3,4.4,4.9,5.3
Maine-et-Loire,1.4,6.7,4.0,6.4,7.1,2.1,5.8,7.4,3.1,8.0,5.9,6.0,6.6,3.3,5.2,5.2
Pyrénées-Atlantiques,4.2,5.2,4.7,3.4,10.0,2.6,5.9,6.0,2.9,5.7,4.6,6.7,6.7,4.3,5.7,5.2
Isère,3.4,8.3,5.8,6.4,5.8,4.1,5.7,5.5,1.9,7.6,4.9,7.2,4.8,3.1,4.6,5.2
Morbihan,3.5,7.1,5.3,4.9,7.0,2.7,5.3,7.0,1.6,7.9,5.2,6.2,6.7,3.1,5.2,5.2
Côte-d'Or,1.6,8.0,4.8,6.8,4.4,1.9,4.9,5.5,2.0,8.4,5.3,6.4,6.3,4.1,5.4,5.1
Hérault,4.7,6.5,5.6,4.9,5.2,4.4,4.9,3.6,1.5,7.1,4.2,9.3,4.3,4.4,5.3,5.0


In [85]:
#rajout dans le tableau "global score" de toutes les informations sur les départements qui ont permis de calculer le Global Score
global_scoring_info_merge_1 = global_scoring_table.merge(calculation_real_estate_scoring, on="department_name", how="outer")
global_scoring_info_merge_2 = global_scoring_info_merge_1.merge(calculation_secondary_home_scoring, on="department_name", how="outer")
global_scoring_info_merge_3 = global_scoring_info_merge_2.merge(calculation_life_quality_scoring, on="department_name", how="outer")

# Fonction pour remplacer les points par des virgules dans les valeurs flottantes
def replace_dot_with_comma(value):
    if isinstance(value, float):
        return str(value).replace('.', ',')
    return value

# Appliquer la fonction à toutes les cellules du DataFrame
global_scoring_info_merge_3 = global_scoring_info_merge_3.apply(lambda x: x.apply(replace_dot_with_comma))


global_scoring_info_merge_3.head(80)

Unnamed: 0,department_name,Hosting_score,Touristic_sites_score,Global_tourism_score,Rentability_score,Growth_score,Vacancy_score,Global_real_estate_score,Surface_score,Secondary_home_growth_score,...,Global_scoring,yield_rate,price_m2_growth,nb_vacants_housing,surface,evolution_secondary_homes,Taxe d'habitation moyenne en 2023,Ensoleillement (heures),criminality_per_1000,ensemble des médecins.1
0,Ain,14000000000000001,63,38,63,55,20,51,85,14000000000000001,...,47,600212157576975,007218864494118082,258490,9549150254923524,36202830188679243,8470,19280,350,174
1,Aisne,05,26,16,99,30,20,56,7800000000000001,01,...,41,76250506190679275,0033482686264434625,260010,9195605323269442,-7776141384388807,7320,16090,4171,211
2,Allier,10,37,24,99,27,24,55,64,12,...,44,7619062710594496,0029527460032250152,304790,853089722675367,15881597621139418,8010,18570,3512,275
3,Alpes-Maritimes,45,61,53,27,58999999999999995,54,45,26,28000000000000003,...,46,4362966296588599,007816696705265946,646430,6628367766702475,16972625571050013,16860,26680,5566,461
4,Alpes-de-Haute-Provence,14000000000000001,24,19,62,27,07000000000000001,37,40,15,...,40,5945867086723071,002977586676012889,107680,7328374221030212,4676717964742338,5510,25960,4457,291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Orne,06,23000000000000003,15,80,34000000000000004,13,48,76,12,...,41,67506570019665135,004035608404340407,186010,9100721732157177,1601736681530157,6550,16150,3521,235
76,Paris,100,74,87,00,17000000000000002,100,27,00,66000000000000005,...,48,3129031252651794,0013567914107807555,1176660,53748363426187694,5176636055265988,24870,16300,9843,888
77,Pas-de-Calais,25,67,46,7199999999999999,44,45,55,7199999999999999,13,...,48,63961011132888625,0054476639555179895,546810,8896594456278552,2624123043712898,10570,17340,4002,253
78,Puy-de-Dôme,25,75,50,71,28000000000000003,32,46,5300000000000001,20,...,50,6345400473102771,003006633041378559,403670,7984791697553743,894181206758731,7170,18980,371,373


In [86]:
# Chemin du dossier où les fichiers seront enregistrés
output_folder = "../data/enriched"

# Assurez-vous que le dossier existe
os.makedirs(output_folder, exist_ok=True)

dataframes = {
    "global_scoring_per_department": global_scoring_info_merge_3,
}

# Exportation des DataFrames en CSV
for name, df in dataframes.items():
    output_path = os.path.join(output_folder, f"{name}_enriched.csv")
    df.to_csv(output_path, index=False)
    print(f"DataFrame {name} exporté vers {output_path}")

DataFrame global_scoring_per_department exporté vers ../data/enriched/global_scoring_per_department_enriched.csv
