# ⚙️ **CLEANED DATA IMPORT**

In [152]:
import pandas as pd
import numpy as np
import os

DATA_PATH = '../data/cleaned'

POI_FILENAME = 'poi_df_cleaned.csv'
SITE_FILENAME = 'site_df_cleaned.csv'
SALARY_FILENAME = 'salary_df_cleaned.csv'
GEOREF_FILENAME = 'georef_df_cleaned.csv'
STOCK_FILENAME = 'stock_df_cleaned.csv'
SALES_FILENAME = 'sales_df_cleaned.csv'
POPULATION_FILENAME = 'population_df_cleaned.csv'
POVERTY_FILENAME = 'poverty_df_cleaned.csv'
REAL_ESTATE_FILENAME = 'real_estate_df_cleaned.csv'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME))
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME))
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME))
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME))
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME))
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME))
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))

In [130]:
# DF CLEANED CHECK
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
print (stock_df.info())
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26207 entries, 0 to 26206
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   poi                   26207 non-null  object 
 1   latitude_x            26207 non-null  float64
 2   longitude_x           26207 non-null  float64
 3   municipality_code     26207 non-null  object 
 4   importance            26207 non-null  float64
 5   name_reprocessed      26207 non-null  object 
 6   city_name             26207 non-null  object 
 7   city_name_normalized  26207 non-null  object 
 8   municipality_type     26207 non-null  object 
 9   latitude_y            26207 non-null  float64
 10  longitude_y           26207 non-null  float64
 11  department_code       26207 non-null  object 
 12  epci_code             24598 non-null  float64
 13  department_name       26207 non-null  object 
dtypes: float64(6), object(8)
memory usage: 2.8+ MB
<class 'pandas.core.fra

In [158]:
#test
stock_df_test = stock_df.merge(georef_df, on="municipality_code")
stock_df_test = stock_df_test.groupby(["year", "department_name"])["nb_second_home"].sum().reset_index()
filtered_df = stock_df_test[stock_df_test["department_name"] == "Guyane"]
filtered_df


Unnamed: 0,year,department_name,nb_second_home
35,1968,Guyane,345
135,1975,Guyane,469
235,1982,Guyane,1297
335,1990,Guyane,1820
435,1999,Guyane,2519
535,2008,Guyane,1566
635,2013,Guyane,1771
735,2018,Guyane,2367


### CLEANING

##### DF_SALES CLEANING

In [3]:
# SALES_DF: Suppression des doublons > nous passons de 4,3M de lignes à 3,821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

(3448398, 13)

In [4]:
# SALES_DF: Check si les doublons on été enlevés : OK
sales_df.duplicated().sum()

np.int64(0)

In [5]:
# SALES_DF: Suppression des prix au m2 supérieur à 30K€ et inférieur à 1K€ > nous passons à 3,3399M de lignes
sales_df = sales_df[(sales_df['sales_price_m2'] <= 30000) & (sales_df['sales_price_m2'] >= 500)]
sales_df.shape

(3448398, 13)

In [6]:
# SALES_DF:
s2 = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [7]:
# SALES_DF:
sales_df = sales_df[sales_df['sales_amount'] > 1] # on enlève les 166 fois ou sales_amount = 1€
sales_df.shape

(3448398, 13)

In [8]:
# SALES_DF: changement du type sales_date en datetime
sales_df['sales_date'] = pd.to_datetime(sales_df['sales_date'])
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448398 entries, 0 to 3448397
Data columns (total 13 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   sales_date                 datetime64[ns]
 1   sales_amount               float64       
 2   street_number              float64       
 3   street_code                object        
 4   street_name                object        
 5   nom_commune                object        
 6   municipality_code          object        
 7   premise_type               object        
 8   surface                    float64       
 9   number_of_principal_rooms  int64         
 10  sales_price_m2             float64       
 11  latitude                   float64       
 12  longitude                  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(5)
memory usage: 342.0+ MB


##### DF_SALARY CLEANING

In [9]:
# DF_SALARY: ROUND avg_net_salary
salary_df['avg_net_salary'] = salary_df['avg_net_salary'].round()
salary_df.head()

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.0,2019
1,1007,26341.0,2019
2,1014,25897.0,2019
3,1024,25695.0,2019
4,1025,26055.0,2019


##### DF_REAL_ESTATE CLEANING

In [10]:
# DF_REAL_ESTATE: suppression des nulls
real_estate_df = real_estate_df.dropna(axis=1)
real_estate_df.isnull().sum()

municipality_code         0
intensite_tension_immo    0
rental_max_apartment      0
rental_min_apartment      0
rental_med_all            0
rental_max_all            0
rental_min_all            0
dtype: int64

##### DF_SITE CLEANING

In [11]:
# SITE_DF: tri avec les données entre parenthèses de la colonne "name" inclues

import re

site_df['data_inside_parenthesis'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

#suppression de la colonne "name" dans un second temps

site_df.drop(columns=["name"])

#check pour savoir les informations présentes dans la colonne "poi", et si elles correspondent aux valeurs présentes dans la colonne "type"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

#faire un mapping des colonnes poi, qui sont en fait plus pertinentes que celles de la colonne "type"

poi
castle            5611
2                 5207
water             4500
museum            3400
theatre           2439
forest            2233
beach             1010
protected_area     927
wetland            784
cinema             667
vineyard           641
golf_course        612
theme_park         358
cliff              337
marina             255
water_park         239
1                  236
zoo                234
valley             223
ridge              213
islet              163
casino             158
volcano            145
park               132
meadow              78
monument            76
sand                35
heritage            32
viewpoint           21
cave_entrance       15
wreck               12
rock                11
waterfall           10
attraction           8
dune                 5
national_park        4
allotments           2
geyser               1
Name: count, dtype: int64
data_inside_parenthesis
                       9899
Monument historique    5090
Eau             

In [12]:
# SITE_DF: création d'un dictionnaire intégrant toutes les différentes valeurs inclues dans la colonne "poi"
s = site_df["poi"].value_counts()[site_df["poi"]]
{k: "toto" for k in s.index}

{'1': 'toto',
 '2': 'toto',
 'zoo': 'toto',
 'dune': 'toto',
 'park': 'toto',
 'rock': 'toto',
 'sand': 'toto',
 'beach': 'toto',
 'cliff': 'toto',
 'islet': 'toto',
 'ridge': 'toto',
 'water': 'toto',
 'wreck': 'toto',
 'casino': 'toto',
 'castle': 'toto',
 'cinema': 'toto',
 'forest': 'toto',
 'geyser': 'toto',
 'marina': 'toto',
 'meadow': 'toto',
 'museum': 'toto',
 'valley': 'toto',
 'theatre': 'toto',
 'volcano': 'toto',
 'wetland': 'toto',
 'heritage': 'toto',
 'monument': 'toto',
 'vineyard': 'toto',
 'viewpoint': 'toto',
 'waterfall': 'toto',
 'allotments': 'toto',
 'attraction': 'toto',
 'theme_park': 'toto',
 'water_park': 'toto',
 'golf_course': 'toto',
 'cave_entrance': 'toto',
 'national_park': 'toto',
 'protected_area': 'toto'}

In [13]:
# SITE_DF: création d'un dictionnaire avec les catégories associées aux valeurs de la colonne POI

category_dict = {'1': 'Patrimoine',
 '2': 'Patrimoine',
 'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [14]:
# SITE_DF: création de la colonne "catégorie"
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category
0,1,Fortifications de Vauban (Patrimoine mondial),46.396160,1.472600,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.219890,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.127170,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.346210,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Patrimoine mondial,Patrimoine
...,...,...,...,...,...,...,...,...,...
31029,viewpoint,Tour Eiffel 3e étage (Point de vue),48.858262,2.294497,75056,0.053782,Tour Eiffel 3e étage,Point de vue,Nature
31030,theme_park,Jardin d'Acclimatation (Parc à thème),48.878145,2.264686,75056,0.087097,Jardin d'Acclimatation,Parc à thème,Entertainment
31031,theme_park,Foire du Trône (Parc à thème),48.832003,2.404337,75056,0.060000,Foire du Trône,Parc à thème,Entertainment
31032,golf_course,Golf du Bois de Boulogne (Terrain de golf),48.857776,2.232651,75056,0.060073,Golf du Bois de Boulogne,Terrain de golf,Entertainment


In [15]:
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26216 entries, 0 to 26215
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                26216 non-null  object 
 1   latitude           26216 non-null  float64
 2   longitude          26216 non-null  float64
 3   municipality_code  26216 non-null  object 
 4   importance         26216 non-null  float64
 5   name_reprocessed   26216 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31034 entries, 0 to 31033
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   poi                      31034 non-null  object 
 1   name                     31034 non-null  object 
 2   latitude                 31034 non-null  float64
 3   longitude                31034 non-null  float64
 4   municipality_code  

In [16]:
poi_df.head(1)

Unnamed: 0,poi,latitude,longitude,municipality_code,importance,name_reprocessed
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med


In [17]:
site_df.head(1) 

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category
0,1,Fortifications de Vauban (Patrimoine mondial),46.39616,1.4726,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine


In [18]:
salary_df.head(1)

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.0,2019


In [19]:
georef_df.head(1) 

Unnamed: 0,municipality_code,city_name,city_name_normalized,municipality_type,latitude,longitude,department_code,epci_code,department_name
0,1005,Ambérieux-en-Dombes,AMBERIEUX EN DOMBES,municipality,45.99618,4.912273,1,200042497.0,Ain


In [20]:
stock_df.head(1) 

Unnamed: 0,municipality_code,year,nb_principal_home,nb_second_home,nb_vacants_housing,nb_tot_housing,secondary_home_rate,principal_home_rate,vacants_housing_rate
0,1339,1968,109,155,0,264,0.587121,0.412879,0.0


In [21]:
sales_df.head(1)

Unnamed: 0,sales_date,sales_amount,street_number,street_code,street_name,nom_commune,municipality_code,premise_type,surface,number_of_principal_rooms,sales_price_m2,latitude,longitude
0,2018-02-06,5000000.0,63.0,40,AV PRINCE RAINIER III,Saint-Jean-Cap-Ferrat,6121,Maison,292.0,10,17123.0,43.678892,7.330651


In [22]:
population_df.head(1) 

Unnamed: 0,municipality_code,year,population
0,4213,1968,0.0


In [23]:
poverty_df.head(1)

Unnamed: 0,municipality_code,year,population
0,4213,1968,0.0


In [24]:
real_estate_df.head(1)

Unnamed: 0,municipality_code,intensite_tension_immo,rental_max_apartment,rental_min_apartment,rental_med_all,rental_max_all,rental_min_all
0,57133,8,12.27,9.07,9.53,13.77,7.25


# 🧪 **DATA TRANSFORMATION**

### KPIS AGGREGATION BY DEPARTMENT

##### 1. TOURISM

In [197]:
#création de tables permettant de scorer le potentiel touristique de chaque département
site_df = site_df.merge (georef_df, on=["municipality_code"])
site_df.head(5)

Unnamed: 0,poi,name,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category,city_name_x,...,epci_code_x,department_name_x,city_name_y,city_name_normalized_y,municipality_type_y,latitude,longitude,department_code_y,epci_code_y,department_name_y
0,1,Fortifications de Vauban (Patrimoine mondial),46.39616,1.4726,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine,Mouhet,...,200035137.0,Indre,Mouhet,MOUHET,municipality,46.389251,1.442651,36,200035137.0,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine,Vareilles,...,242300135.0,Creuse,Vareilles,VAREILLES,municipality,46.305016,1.456031,23,242300135.0,Creuse
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.21989,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine,Béziers,...,243400769.0,Hérault,Béziers,BEZIERS,municipality,43.347588,3.230768,34,243400769.0,Hérault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.12717,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine,Hénin-Beaumont,...,246200299.0,Pas-de-Calais,Hénin-Beaumont,HENIN BEAUMONT,municipality,50.409234,2.958997,62,246200299.0,Pas-de-Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.34621,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Patrimoine mondial,Patrimoine,Bédouès-Cocurès,...,200069151.0,Lozère,Bédouès-Cocurès,BEDOUES COCURES,municipality,44.353946,3.61956,48,200069151.0,Lozère


In [198]:
#sélection des colonnes dont on aura besoin pour le calcul
site_df_department = site_df[["poi", "name", "municipality_code", "importance", "name_reprocessed", "department_name"]]
site_df_department

KeyError: "['department_name'] not in index"

In [199]:
#groupement par département, puis classement par le département ayant le + d'atouts touristiques
group_site = site_df_department.groupby("department_name")[["importance"]].sum()
group_site
group_site.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Gironde,52.539958
Bouches-du-Rhône,47.068588
Finistère,46.685274
Isère,44.230787
Loire-Atlantique,43.525109
...,...
Haute-Marne,7.416165
Val-de-Marne,6.699159
Lozère,6.230406
Seine-Saint-Denis,5.718311


In [200]:
#même calcul que précédemment, mais pour la partie concernant les logements/lieux de villégiature
poi_df = poi_df.merge (georef_df, on=["municipality_code"])
poi_df.head(5)

Unnamed: 0,poi,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,city_name_x,city_name_normalized_x,municipality_type_x,latitude_y,...,epci_code_x,department_name_x,city_name_y,city_name_normalized_y,municipality_type_y,latitude,longitude,department_code_y,epci_code_y,department_name_y
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med,Les Mathes,MATHES,municipality,45.705988,...,241700640.0,Charente-Maritime,Les Mathes,MATHES,municipality,45.705988,-1.170867,17,241700640.0,Charente-Maritime
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands Cépages,Sorgues,SORGUES,municipality,44.014576,...,248400293.0,Vaucluse,Sorgues,SORGUES,municipality,44.014576,4.867405,84,248400293.0,Vaucluse
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances,Fort-Mahon-Plage,FORT MAHON PLAGE,municipality,50.345059,...,200070936.0,Somme,Fort-Mahon-Plage,FORT MAHON PLAGE,municipality,50.345059,1.577068,80,200070936.0,Somme
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances,Grimaud,GRIMAUD,municipality,43.282028,...,200036077.0,Var,Grimaud,GRIMAUD,municipality,43.282028,6.533032,83,200036077.0,Var
4,hotel,43.54105,3.752852,34095,0.077542,Le Domaine du Golf,Fabrègues,FABREGUES,municipality,43.534477,...,243400017.0,Hérault,Fabrègues,FABREGUES,municipality,43.534477,3.77193,34,243400017.0,Hérault


In [201]:
#sélection des colonnes dont on aura besoin pour le calcul
poi_df_department = poi_df[["poi", "municipality_code", "importance", "department_name"]]
poi_df_department


KeyError: "['department_name'] not in index"

In [202]:
#groupement par département, puis classement par le département ayant le + de logements/lieux de villégiature
group_poi = poi_df_department.groupby("department_name")[["importance"]].sum()
group_poi
group_poi.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Paris,70.275305
Savoie,37.401407
Haute-Savoie,35.158395
Hérault,33.793973
Alpes-Maritimes,32.802552
...,...
Eure-et-Loir,3.871754
Haute-Marne,3.670584
Ardennes,3.541133
Mayenne,3.154595


In [215]:
#ajout des 2 calculs d'importance
department_merged_df = group_poi.merge (group_site, on=["department_name"])
#department_merged_df["somme_importance"]=department_merged_df["importance_x"]+department_merged_df["importance_y"]
#department_merged_df = department_merged_df.drop(columns=["importance_x", "importance_y"])
#department_merged_df
#department_merged_df.sort_values("somme_importance", ascending =False)
department_merged_df



Unnamed: 0_level_0,importance_x,importance_y
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,11.423180,34.061847
Aisne,4.937153,15.687868
Allier,8.386644,21.401418
Alpes-Maritimes,32.802552,32.967630
Alpes-de-Haute-Provence,11.293578,14.722338
...,...,...
Vendée,27.816510,32.287377
Vienne,8.205227,21.203995
Vosges,12.007083,14.973759
Yonne,7.211247,16.595920


##### 2. REAL ESTATE

In [32]:
# 2.1 calcul du loyer au m2 médian par municipality_code
rental_med = real_estate_df [["municipality_code", "rental_med_all"]]
rental_med

Unnamed: 0,municipality_code,rental_med_all
0,57133,9.53
1,57446,11.09
2,77013,12.26
3,77026,9.53
4,77072,11.47
...,...,...
34436,81126,8.93
34437,33425,11.09
34438,85146,10.88
34439,53062,8.69


In [33]:
sales_df

Unnamed: 0,sales_date,sales_amount,street_number,street_code,street_name,nom_commune,municipality_code,premise_type,surface,number_of_principal_rooms,sales_price_m2,latitude,longitude
0,2018-02-06,5000000.0,63.0,0040,AV PRINCE RAINIER III,Saint-Jean-Cap-Ferrat,06121,Maison,292.0,10,17123.0,43.678892,7.330651
1,2018-11-26,93060.0,5308.0,B061,LE SERRET,Vallées-d'Antraigues-Asperjoc,07011,Maison,16.0,0,5816.0,44.714072,4.360185
2,2018-06-08,95000.0,161.0,0683,IMP COL EMILE VIGUIER,Millau,12145,Maison,21.0,0,4524.0,44.093714,3.054594
3,2018-07-17,3912000.0,690.0,1868,CHE DE MAZARGUES,Aix-en-Provence,13001,Maison,610.0,13,6413.0,43.502833,5.428194
4,2018-03-27,810000.0,5000.0,0162,CHE DES OLIVIERS,Eygalières,13034,Maison,296.0,9,2736.0,43.756881,4.957214
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3448393,2017-11-10,160000.0,9001.0,A055,RES LA COUPIANE,La Valette-du-Var,83144,Appartement,103.0,5,1553.0,43.127825,5.988491
3448394,2019-03-15,290000.0,9001.0,A278,RES LES TROIS MAGES,Aix-en-Provence,13001,Appartement,103.0,6,2816.0,43.516165,5.470374
3448395,2018-04-26,540000.0,9001.0,A370,RES LE GAUGUIN CHEM BRUNET,Aix-en-Provence,13001,Appartement,115.0,5,4696.0,43.538090,5.438490
3448396,2021-08-25,184000.0,9001.0,0002,BD ABEL LEFEVRE,Ézy-sur-Eure,27230,Appartement,150.0,3,1227.0,48.862603,1.422561


In [34]:
#calcul du prix d'achat au m2 médian par municipality_code
sales_df
sales_df_grouped = sales_df.groupby(["municipality_code"])[["sales_amount", "surface", "premise_type"]].agg({"sales_amount": "sum", "surface": "sum", "premise_type": "count"})
sales_df_grouped = pd.DataFrame (sales_df_grouped)
sales_df_grouped

Unnamed: 0_level_0,sales_amount,surface,premise_type
municipality_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01001,1.362286e+07,6781.0,60
01002,4.067389e+06,1901.0,17
01004,1.852356e+08,87209.0,1062
01005,3.579244e+07,13876.0,138
01006,2.575955e+06,1425.0,13
...,...,...,...
97420,5.335366e+07,16160.0,188
97421,3.143598e+06,1872.0,24
97422,2.541138e+08,99088.0,1237
97423,1.326098e+07,3780.0,54


In [35]:
#jointure pour rajouter dans cette table le loyer médian par municipality_code
sales_df
real_estate_grouped = sales_df_grouped.merge(rental_med, on="municipality_code")
real_estate_grouped

Unnamed: 0,municipality_code,sales_amount,surface,premise_type,rental_med_all
0,01001,1.362286e+07,6781.0,60,10.66
1,01002,4.067389e+06,1901.0,17,10.16
2,01004,1.852356e+08,87209.0,1062,11.25
3,01005,3.579244e+07,13876.0,138,13.28
4,01006,2.575955e+06,1425.0,13,12.70
...,...,...,...,...,...
31892,95676,5.974304e+06,2486.0,23,12.29
31893,95678,1.222182e+07,3568.0,38,18.53
31894,95680,8.569815e+07,32626.0,471,16.22
31895,95682,1.463606e+06,533.0,5,16.57


In [36]:
#ajout du nom du département correspondant à chaque municipality code
real_estate_department = real_estate_grouped.merge(georef_df, on="municipality_code")
real_estate_department
real_estate_department = real_estate_department [["municipality_code", "sales_amount", "surface", "rental_med_all", "department_name", "premise_type"]]

In [37]:
real_estate_department

Unnamed: 0,municipality_code,sales_amount,surface,rental_med_all,department_name,premise_type
0,01001,1.362286e+07,6781.0,10.66,Ain,60
1,01002,4.067389e+06,1901.0,10.16,Ain,17
2,01004,1.852356e+08,87209.0,11.25,Ain,1062
3,01005,3.579244e+07,13876.0,13.28,Ain,138
4,01006,2.575955e+06,1425.0,12.70,Ain,13
...,...,...,...,...,...,...
31892,95676,5.974304e+06,2486.0,12.29,Val-d'Oise,23
31893,95678,1.222182e+07,3568.0,18.53,Val-d'Oise,38
31894,95680,8.569815e+07,32626.0,16.22,Val-d'Oise,471
31895,95682,1.463606e+06,533.0,16.57,Val-d'Oise,5


In [38]:
#calcul du prix au m2 par département
average_price_per_m2 = real_estate_department.groupby(["department_name"])[["sales_amount", "surface"]].agg({"sales_amount": "sum", "surface": "sum"})
average_price_per_m2
average_price_per_m2["average_price_per_m2"] = average_price_per_m2["sales_amount"]/average_price_per_m2["surface"]
average_price_per_m2

Unnamed: 0_level_0,sales_amount,surface,average_price_per_m2
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,9.607929e+09,3645188.0,2635.784095
Aisne,2.840598e+09,1819614.0,1561.099105
Allier,2.057614e+09,1304360.0,1577.489243
Alpes-Maritimes,3.837944e+10,7959808.0,4821.653974
Alpes-de-Haute-Provence,2.097409e+09,919312.0,2281.499046
...,...,...,...
Vendée,6.771054e+09,2819045.0,2401.896369
Vienne,2.024136e+09,1155284.0,1752.067837
Vosges,1.482116e+09,865201.0,1713.030737
Yonne,1.533758e+09,953987.0,1607.734995


In [39]:
#calcul du loyer médian par départment
real_estate_department["intermediate_sum"]=real_estate_department["rental_med_all"]*real_estate_department["premise_type"]
real_estate_department
average_rental = real_estate_department.groupby(["department_name"])[["intermediate_sum", "premise_type"]].agg({"intermediate_sum": "sum", "premise_type": "sum"})
average_rental
average_rental["average_rental"]= average_rental["intermediate_sum"]/average_rental["premise_type"]
average_rental

Unnamed: 0_level_0,intermediate_sum,premise_type,average_rental
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,499987.29,37925,13.183580
Aisne,196297.97,19789,9.919550
Allier,153111.91,15287,10.015825
Alpes-Maritimes,2105196.54,120087,17.530595
Alpes-de-Haute-Provence,141928.94,12555,11.304575
...,...,...,...
Vendée,350608.65,33234,10.549698
Vienne,129985.21,13286,9.783623
Vosges,93517.06,9719,9.622087
Yonne,107147.28,10815,9.907284


In [40]:
#regroupement des colonnes avec le loyer moyen au m2 par département et le prix d'achat au m2 moyen par département
yield_calculation = average_price_per_m2.merge(average_rental, on="department_name")
yield_calculation
yield_calculation = yield_calculation.drop(columns=["sales_amount", "surface", "intermediate_sum", "premise_type"])
yield_calculation["yield_rate"]=yield_calculation["average_rental"]*12/yield_calculation["average_price_per_m2"]*100
yield_calculation.sort_values("yield_rate", ascending=True)


Unnamed: 0_level_0,average_price_per_m2,average_rental,yield_rate
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Paris,11432.173601,29.809690,3.129031
Corse-du-Sud,4420.005565,13.879070,3.768068
Hauts-de-Seine,7459.175300,24.168113,3.888062
Var,4058.417572,14.124893,4.176473
Savoie,3472.500028,12.601657,4.354784
...,...,...,...
Cher,1549.532038,9.742943,7.545201
Aube,1690.589163,10.689934,7.587840
Allier,1577.489243,10.015825,7.619063
Aisne,1561.099105,9.919550,7.625051


In [41]:
#Informations sur la rentabilité locative
yield_calculation

Unnamed: 0_level_0,average_price_per_m2,average_rental,yield_rate
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,2635.784095,13.183580,6.002122
Aisne,1561.099105,9.919550,7.625051
Allier,1577.489243,10.015825,7.619063
Alpes-Maritimes,4821.653974,17.530595,4.362966
Alpes-de-Haute-Provence,2281.499046,11.304575,5.945867
...,...,...,...
Vendée,2401.896369,10.549698,5.270684
Vienne,1752.067837,9.783623,6.700852
Vosges,1713.030737,9.622087,6.740395
Yonne,1607.734995,9.907284,7.394714


In [42]:
# 2.2 calcul de la variation entre 2018 et 2021

#ajout d'une colonne "year"
sales_df.info()
sales_df["year"]=sales_df["sales_date"].dt.year

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448398 entries, 0 to 3448397
Data columns (total 13 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   sales_date                 datetime64[ns]
 1   sales_amount               float64       
 2   street_number              float64       
 3   street_code                object        
 4   street_name                object        
 5   nom_commune                object        
 6   municipality_code          object        
 7   premise_type               object        
 8   surface                    float64       
 9   number_of_principal_rooms  int64         
 10  sales_price_m2             float64       
 11  latitude                   float64       
 12  longitude                  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(5)
memory usage: 342.0+ MB


In [43]:
#merge pour rajouter le département
sales_info_per_department = sales_df.merge (georef_df, on=["municipality_code"])
sales_info_per_department

Unnamed: 0,sales_date,sales_amount,street_number,street_code,street_name,nom_commune,municipality_code,premise_type,surface,number_of_principal_rooms,...,longitude_x,year,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,2018-02-06,5000000.0,63.0,0040,AV PRINCE RAINIER III,Saint-Jean-Cap-Ferrat,06121,Maison,292.0,10,...,7.330651,2018,Saint-Jean-Cap-Ferrat,SAINT JEAN CAP FERRAT,municipality,43.687179,7.329936,06,200030195.0,Alpes-Maritimes
1,2018-11-26,93060.0,5308.0,B061,LE SERRET,Vallées-d'Antraigues-Asperjoc,07011,Maison,16.0,0,...,4.360185,2018,Vallées-d'Antraigues-Asperjoc,VALLEES D ANTRAIGUES ASPERJOC,municipality,44.739318,4.355176,07,200073245.0,Ardèche
2,2018-06-08,95000.0,161.0,0683,IMP COL EMILE VIGUIER,Millau,12145,Maison,21.0,0,...,3.054594,2018,Millau,MILLAU,municipality,44.097625,3.117054,12,241200567.0,Aveyron
3,2018-07-17,3912000.0,690.0,1868,CHE DE MAZARGUES,Aix-en-Provence,13001,Maison,610.0,13,...,5.428194,2018,Aix-en-Provence,AIX EN PROVENCE,municipality,43.536071,5.398574,13,200054807.0,Bouches-du-Rhône
4,2018-03-27,810000.0,5000.0,0162,CHE DES OLIVIERS,Eygalières,13034,Maison,296.0,9,...,4.957214,2018,Eygalières,EYGALIERES,municipality,43.761705,4.952120,13,241300375.0,Bouches-du-Rhône
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444144,2017-11-10,160000.0,9001.0,A055,RES LA COUPIANE,La Valette-du-Var,83144,Appartement,103.0,5,...,5.988491,2017,La Valette-du-Var,VALETTE DU VAR,municipality,43.149915,5.992225,83,248300543.0,Var
3444145,2019-03-15,290000.0,9001.0,A278,RES LES TROIS MAGES,Aix-en-Provence,13001,Appartement,103.0,6,...,5.470374,2019,Aix-en-Provence,AIX EN PROVENCE,municipality,43.536071,5.398574,13,200054807.0,Bouches-du-Rhône
3444146,2018-04-26,540000.0,9001.0,A370,RES LE GAUGUIN CHEM BRUNET,Aix-en-Provence,13001,Appartement,115.0,5,...,5.438490,2018,Aix-en-Provence,AIX EN PROVENCE,municipality,43.536071,5.398574,13,200054807.0,Bouches-du-Rhône
3444147,2021-08-25,184000.0,9001.0,0002,BD ABEL LEFEVRE,Ézy-sur-Eure,27230,Appartement,150.0,3,...,1.422561,2021,Ézy-sur-Eure,EZY SUR EURE,municipality,48.870981,1.412630,27,200040277.0,Eure


In [44]:
#filtre uniquement sur les années 2020 et 2021 (car ce sont les seules années où nous avons toutes les informations)
sales_info_per_department = sales_info_per_department[sales_info_per_department['year'].isin([2020, 2021])]
sales_info_per_department

Unnamed: 0,sales_date,sales_amount,street_number,street_code,street_name,nom_commune,municipality_code,premise_type,surface,number_of_principal_rooms,...,longitude_x,year,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
29,2020-05-22,253902.0,41.0,0114,RUE DE LA REPUBLIQUE,L'Argentière-la-Bessée,05006,Maison,160.0,9,...,6.557181,2020,L'Argentière-la-Bessée,ARGENTIERE LA BESSEE,municipality,44.782173,6.472145,05,240500462.0,Hautes-Alpes
30,2020-08-27,2400000.0,760.0,0560,CHE DE LA GRANDE BASTIDE,Mougins,06085,Maison,257.0,9,...,6.985652,2020,Mougins,MOUGINS,municipality,43.596141,7.001294,06,200039915.0,Alpes-Maritimes
31,2020-11-27,920000.0,139.0,0380,CHE DE L ETANG,Mougins,06085,Maison,285.0,9,...,7.019548,2020,Mougins,MOUGINS,municipality,43.596141,7.001294,06,200039915.0,Alpes-Maritimes
32,2020-12-17,1778080.0,168.0,0947,RTE DES ROMARINS,Mougins,06085,Maison,244.0,9,...,7.020948,2020,Mougins,MOUGINS,municipality,43.596141,7.001294,06,200039915.0,Alpes-Maritimes
33,2020-11-20,793750.0,282.0,0275,CHE DE MONTFORT,La Colle-sur-Loup,06044,Maison,227.0,9,...,7.110638,2020,La Colle-sur-Loup,COLLE SUR LOUP,municipality,43.687170,7.097736,06,240600585.0,Alpes-Maritimes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444138,2021-01-15,169000.0,9001.0,A070,RES DE LA THEUILLERIE,Ris-Orangis,91521,Appartement,99.0,5,...,2.397566,2021,Ris-Orangis,RIS ORANGIS,municipality,48.645194,2.407936,91,200059228.0,Essonne
3444141,2020-09-08,389685.0,9001.0,0667,ALL DES CIGALES,Aix-en-Provence,13001,Appartement,103.0,5,...,5.459952,2020,Aix-en-Provence,AIX EN PROVENCE,municipality,43.536071,5.398574,13,200054807.0,Bouches-du-Rhône
3444142,2021-12-07,236000.0,9001.0,A100,RES LES PAMPRES,Les Ulis,91692,Appartement,103.0,5,...,2.172815,2021,Les Ulis,ULIS,municipality,48.680321,2.185190,91,200056232.0,Essonne
3444147,2021-08-25,184000.0,9001.0,0002,BD ABEL LEFEVRE,Ézy-sur-Eure,27230,Appartement,150.0,3,...,1.422561,2021,Ézy-sur-Eure,EZY SUR EURE,municipality,48.870981,1.412630,27,200040277.0,Eure


In [45]:
#groupement par année et par département
sales_df_per_year = sales_info_per_department.groupby(["department_name", "year"])[["sales_amount", "surface"]].agg({"sales_amount": "sum", "surface": "sum"})
sales_df_per_year

Unnamed: 0_level_0,Unnamed: 1_level_0,sales_amount,surface
department_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,2020,2.210313e+09,837639.0
Ain,2021,2.802408e+09,990520.0
Aisne,2020,6.593465e+08,420712.0
Aisne,2021,8.480408e+08,523582.0
Allier,2020,4.703144e+08,295301.0
...,...,...,...
Vosges,2021,7.157524e+08,405495.0
Yonne,2020,5.685219e+08,359444.0
Yonne,2021,7.649412e+08,464540.0
Yvelines,2020,6.951062e+09,1674525.0


In [46]:
#calcul du prix moyen au m2
sales_df_per_year["average_price_m2"]=sales_df_per_year["sales_amount"]/sales_df_per_year["surface"]
sales_df_per_year
sales_df_per_year.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,sales_amount,surface,average_price_m2
department_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ain,2020,2210313000.0,837639.0,2638.742163
Ain,2021,2802408000.0,990520.0,2829.229384
Aisne,2020,659346500.0,420712.0,1567.215852
Aisne,2021,848040800.0,523582.0,1619.690449
Allier,2020,470314400.0,295301.0,1592.661195
Allier,2021,610503600.0,372329.0,1639.688435
Alpes-Maritimes,2020,8099021000.0,1684862.0,4806.934575
Alpes-Maritimes,2021,10506120000.0,2027161.0,5182.678071
Alpes-de-Haute-Provence,2020,487872100.0,211239.0,2309.573765
Alpes-de-Haute-Provence,2021,540421400.0,227226.0,2378.343326


In [47]:
#calcul de l'évolution entre 2018 et 2021
sales_df_per_year['price_m2_growth'] = sales_df_per_year.groupby('department_name')['average_price_m2'].pct_change()
sales_df_per_year


Unnamed: 0_level_0,Unnamed: 1_level_0,sales_amount,surface,average_price_m2,price_m2_growth
department_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ain,2020,2.210313e+09,837639.0,2638.742163,
Ain,2021,2.802408e+09,990520.0,2829.229384,0.072189
Aisne,2020,6.593465e+08,420712.0,1567.215852,
Aisne,2021,8.480408e+08,523582.0,1619.690449,0.033483
Allier,2020,4.703144e+08,295301.0,1592.661195,
...,...,...,...,...,...
Vosges,2021,7.157524e+08,405495.0,1765.132472,0.039378
Yonne,2020,5.685219e+08,359444.0,1581.670165,
Yonne,2021,7.649412e+08,464540.0,1646.663833,0.041092
Yvelines,2020,6.951062e+09,1674525.0,4151.064983,


In [48]:
#calcul final de l'évolution
sales_df_per_year = sales_df_per_year.dropna()
sales_df_per_year.drop (columns=["sales_amount", "surface"])
sales_df_per_year.sort_values ("price_m2_growth", ascending=False)


Unnamed: 0_level_0,Unnamed: 1_level_0,sales_amount,surface,average_price_m2,price_m2_growth
department_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pyrénées-Atlantiques,2021,3.821283e+09,1068164.0,3577.430704,0.140719
Landes,2021,2.409002e+09,818313.0,2943.864249,0.112565
Corse-du-Sud,2021,9.380425e+08,193855.0,4838.887261,0.106444
Finistère,2021,3.056316e+09,1421428.0,2150.172909,0.099954
Maine-et-Loire,2021,2.644990e+09,1162430.0,2275.396966,0.096707
...,...,...,...,...,...
Haute-Marne,2021,1.928814e+08,129888.0,1484.982081,0.009958
Haute-Saône,2021,3.172671e+08,202655.0,1565.552673,0.008060
Cantal,2021,2.238980e+08,138821.0,1612.854279,-0.011984
La Réunion,2021,1.260015e+09,384936.0,3273.309935,-0.024756


In [49]:
#calcul du nb de maisons vacantes en 2019
stock_df_2018 = stock_df[stock_df['year'].isin([2018])]
stock_df_2018
stock_df_2018 = stock_df_2018.merge (georef_df, on=["municipality_code"])
vacants_housing_per_department = stock_df_2018.groupby("department_name")["nb_vacants_housing"].sum()
vacants_housing_per_department = pd.DataFrame(vacants_housing_per_department)
vacants_housing_per_department

Unnamed: 0_level_0,nb_vacants_housing
department_name,Unnamed: 1_level_1
Ain,25849
Aisne,26001
Allier,30479
Alpes-Maritimes,64643
Alpes-de-Haute-Provence,10768
...,...
Vendée,21818
Vienne,23264
Vosges,24154
Yonne,23303


In [50]:
stock_df_2018


Unnamed: 0,municipality_code,year,nb_principal_home,nb_second_home,nb_vacants_housing,nb_tot_housing,secondary_home_rate,principal_home_rate,vacants_housing_rate,city_name,city_name_normalized,municipality_type,latitude,longitude,department_code,epci_code,department_name
0,01066,2018,38,53,0,91,0.582418,0.417582,0.000000,La Burbanche,BURBANCHE,municipality,45.859871,5.546097,01,200040350.0,Ain
1,04090,2018,110,130,0,240,0.541667,0.458333,0.000000,Le Fugeret,FUGERET,municipality,44.014500,6.665607,04,200068625.0,Alpes-de-Haute-Provence
2,04159,2018,41,49,0,90,0.544444,0.455556,0.000000,Redortiers,REDORTIERS,municipality,44.098121,5.614146,04,200071025.0,Alpes-de-Haute-Provence
3,05026,2018,145,629,0,774,0.812661,0.187339,0.000000,Ceillac,CEILLAC,municipality,44.652652,6.803662,05,200067452.0,Hautes-Alpes
4,06063,2018,42,76,0,118,0.644068,0.355932,0.000000,Gars,GARS,municipality,43.860523,6.816763,06,200039857.0,Alpes-Maritimes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34930,13105,2018,2951,48,202,3201,0.014995,0.921899,0.063105,Sénas,SENAS,municipality,43.744607,5.086879,13,200054807.0,Bouches-du-Rhône
34931,78380,2018,2325,48,205,2578,0.018619,0.901862,0.079519,Maule,MAULE,municipality,48.907855,1.839595,78,200034130.0,Yvelines
34932,95352,2018,1918,48,212,2178,0.022039,0.880624,0.097337,Luzarches,LUZARCHES,municipality,49.115121,2.440826,95,200073013.0,Val-d'Oise
34933,59279,2018,8137,48,471,8656,0.005545,0.940042,0.054413,Halluin,HALLUIN,municipality,50.774748,3.126925,59,200093201.0,Nord


In [89]:
# 2.3 taxe d'habitation sur les maisons secondaires par département

TAX_FILENAME = 'taxe_habitation.xlsx'

tax_df = pd.read_excel(os.path.join(DATA_PATH, TAX_FILENAME))
tax_df.head()
tax_df = tax_df.rename(columns={'RÉGIONS': 'department_name'})
tax_df

Unnamed: 0,department_name,Taxe d'habitation moyenne en 2023,Nombre d'avis d'impôt
0,Ain,847,23 000
1,Aisne,732,14 000
2,Allier,801,18 000
3,Alpes-de-Haute-Provence,551,35 000
4,Hautes-Alpes,567,52 000
...,...,...,...
96,Guadeloupe,1080,26 000
97,Martinique,986,18 000
98,Guyane,766,9 000
99,La Réunion,1182,15 000


##### 3. SECONDARY HOME

In [63]:
# 3.1 Superficie moyenne des logements vendus par départements

# Joindre les informations de géolocalisation pour obtenir les départements
real_estate_sales_dep = sales_df.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer la surface moyenne des logements vendus par département
average_surface_municipality = real_estate_sales_dep.groupby('department_name')['surface'].mean().reset_index()

average_surface_municipality

Unnamed: 0,department_name,surface
0,Ain,95.491503
1,Aisne,91.956053
2,Allier,85.308972
3,Alpes-Maritimes,66.283678
4,Alpes-de-Haute-Provence,73.283742
...,...,...
92,Vendée,84.926212
93,Vienne,87.584873
94,Vosges,88.986998
95,Yonne,88.105673


In [62]:
# Filtre sur un departement en particulier (exemple paris 75)
print(average_surface_department[average_surface_department['department_code'] == '75'])

NameError: name 'average_surface_department' is not defined

In [178]:
# 3.2 Évolution du % des maisons secondaires par département

# Filtrer les données pour les années 2008 et 2018
housing_2008 = stock_df[stock_df['year'] == 2008]
housing_2018 = stock_df[stock_df['year'] == 2018]

# Renommer les colonnes pour les années spécifiques
housing_2008 = housing_2008[['municipality_code', 'nb_second_home']].rename(columns={'nb_second_home': 'nb_second_home_2008'})
housing_2018 = housing_2018[['municipality_code', 'nb_second_home']].rename(columns={'nb_second_home': 'nb_second_home_2018'})

# Joindre les données pour les années 2008 et 2018 sur le code de municipalité
secondary_home_rate_comparison = housing_2008.merge(housing_2018, on='municipality_code')

# Joindre les informations de géolocalisation pour obtenir les départements
secondary_home_rate_comparison = secondary_home_rate_comparison.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer l'évolution moyenne du pourcentage de maisons secondaires par département
secondary_home_rate_evolution_department = secondary_home_rate_comparison.groupby(['department_name'])[['nb_second_home_2008', "nb_second_home_2018"]].agg({'nb_second_home_2008': "sum", "nb_second_home_2018": "sum"})
secondary_home_rate_evolution_department["evolution_secondary_homes"]=((secondary_home_rate_evolution_department["nb_second_home_2018"]-secondary_home_rate_evolution_department["nb_second_home_2008"])/secondary_home_rate_evolution_department["nb_second_home_2008"])*100
secondary_home_rate_evolution_department.head(50)

Unnamed: 0_level_0,nb_second_home_2008,nb_second_home_2018,evolution_secondary_homes
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,16960,17574,3.620283
Aisne,10185,9393,-7.776141
Allier,14797,15032,1.58816
Alpes-Maritimes,166798,195108,16.972626
Alpes-de-Haute-Provence,38403,40199,4.676718
Ardennes,5070,5003,-1.321499
Ardèche,35985,36779,2.206475
Ariège,25341,27107,6.968944
Aube,6930,7825,12.914863
Aude,61952,67047,8.224109


In [169]:
secondary_home_rate_comparison

Unnamed: 0,municipality_code,nb_second_home_2008,nb_second_home_2018,department_code,department_name,secondary_home_rate_evolution
0,02550,110,330,02,Aisne,220
1,06134,275,236,06,Alpes-Maritimes,-39
2,07029,152,142,07,Ardèche,-10
3,07213,94,86,07,Ardèche,-8
4,07215,94,102,07,Ardèche,8
...,...,...,...,...,...,...
34930,33162,48,56,33,Gironde,8
34931,53084,48,42,53,Mayenne,-6
34932,38034,48,39,38,Isère,-9
34933,76462,48,33,76,Seine-Maritime,-15


##### 4. LIFE QUALITY

In [65]:
# 4.1 Professionnels de santé pour 100 000 habitants par départements en 2023
DATA_PATH = '../data/cleaned'
HEALTH_FILENAME = 'health_df_cleaned.csv'

health_df = pd.read_csv(os.path.join(DATA_PATH, HEALTH_FILENAME))
health_df.head()

Unnamed: 0,department_code,department_name,ensemble des médecins,ensemble des médecins.1,dont généralistes,dont spécialistes,chirurg. dentistes,pharm.
0,1,Ain,1 162,174,99,75,53,78
1,2,Aisne,1 107,211,99,111,45,91
2,3,Allier,917,275,134,141,51,111
3,4,Alpes-de-Haute-Provence,483,291,165,125,54,103
4,5,Hautes-Alpes,705,503,291,213,72,135


In [None]:
# 4.2 Taux de criminalité pour 1000 habitants par départements en 2020

CRIMINALITY_FILENAME = 'criminality_df_cleaned.csv'

criminality_df = pd.read_csv(os.path.join(DATA_PATH, CRIMINALITY_FILENAME))
criminality_df.head()

In [66]:
# 4.3 Nombre de jours de soleil par an par départements

SUNNY_FILENAME = 'sunny_df_cleaned.csv'

sunny_df = pd.read_csv(os.path.join(DATA_PATH, SUNNY_FILENAME))
sunny_df.head()
sunny_df.sort_values(by="sunny_days_per_year")

Unnamed: 0,department_name,sunny_days_per_year
89,Hauts-de-Seine,107
29,Gers,111
56,Nord,125
90,Seine-Saint-Denis,126
91,Val-de-Marne,126
...,...,...
27,Gard,238
3,Alpes de Haute Provence,241
81,Vaucluse,241
80,Var,245


# SCORING

##### 1. TOURISM

In [216]:
department_merged_df["ranking_hosting"]= department_merged_df["importance_x"]
department_merged_df["ranking_touristic_sites"]= department_merged_df["importance_y"]
calculation_tourism_scoring = department_merged_df.drop (columns=["importance_x", "importance_y"])
calculation_tourism_scoring

Unnamed: 0_level_0,ranking_hosting,ranking_touristic_sites
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,11.423180,34.061847
Aisne,4.937153,15.687868
Allier,8.386644,21.401418
Alpes-Maritimes,32.802552,32.967630
Alpes-de-Haute-Provence,11.293578,14.722338
...,...,...
Vendée,27.816510,32.287377
Vienne,8.205227,21.203995
Vosges,12.007083,14.973759
Yonne,7.211247,16.595920


##### 2. REAL ESTATE

In [95]:
#fusion des différents dataframes pour réaliser le scoring
real_estate_scoring_merge_1 = yield_calculation.merge(sales_df_per_year, on="department_name")
real_estate_scoring_merge_2 = real_estate_scoring_merge_1.merge(vacants_housing_per_department, on="department_name")
real_estate_scoring_merge_2

Unnamed: 0_level_0,average_price_per_m2,average_rental,yield_rate,sales_amount,surface,average_price_m2,price_m2_growth,nb_vacants_housing
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ain,2635.784095,13.183580,6.002122,2.802408e+09,990520.0,2829.229384,0.072189,25849
Aisne,1561.099105,9.919550,7.625051,8.480408e+08,523582.0,1619.690449,0.033483,26001
Allier,1577.489243,10.015825,7.619063,6.105036e+08,372329.0,1639.688435,0.029527,30479
Alpes-Maritimes,4821.653974,17.530595,4.362966,1.050612e+10,2027161.0,5182.678071,0.078167,64643
Alpes-de-Haute-Provence,2281.499046,11.304575,5.945867,5.404214e+08,227226.0,2378.343326,0.029776,10768
...,...,...,...,...,...,...,...,...
Vendée,2401.896369,10.549698,5.270684,3.281171e+09,1290262.0,2543.026785,0.091690,21818
Vienne,1752.067837,9.783623,6.700852,1.002048e+09,549295.0,1824.243054,0.066504,23264
Vosges,1713.030737,9.622087,6.740395,7.157524e+08,405495.0,1765.132472,0.039378,24154
Yonne,1607.734995,9.907284,7.394714,7.649412e+08,464540.0,1646.663833,0.041092,23303


In [96]:
#fusion du dernier dataframe
real_estate_scoring_merge_3 = real_estate_scoring_merge_2.merge(tax_df, on="department_name")
real_estate_scoring_merge_3
real_estate_scoring_merge_3 = real_estate_scoring_merge_3.drop(columns=["average_price_per_m2","sales_amount","surface","average_price_m2","average_rental","Nombre d'avis d'impôt"
])
real_estate_scoring_merge_3

Unnamed: 0,department_name,yield_rate,price_m2_growth,nb_vacants_housing,Taxe d'habitation moyenne en 2023
0,Ain,6.002122,0.072189,25849,847
1,Aisne,7.625051,0.033483,26001,732
2,Allier,7.619063,0.029527,30479,801
3,Alpes-Maritimes,4.362966,0.078167,64643,1686
4,Alpes-de-Haute-Provence,5.945867,0.029776,10768,551
...,...,...,...,...,...
88,Vendée,5.270684,0.091690,21818,759
89,Vienne,6.700852,0.066504,23264,746
90,Vosges,6.740395,0.039378,24154,611
91,Yonne,7.394714,0.041092,23303,682


In [97]:
#éléments pour le calcul du scoring immo
calculation_real_estate_scoring = real_estate_scoring_merge_3
calculation_real_estate_scoring.head(50)
calculation_real_estate_scoring["Taxe d'habitation moyenne en 2023"].astype(float)


0      847.0
1      732.0
2      801.0
3     1686.0
4      551.0
       ...  
88     759.0
89     746.0
90     611.0
91     682.0
92    1443.0
Name: Taxe d'habitation moyenne en 2023, Length: 93, dtype: float64

In [98]:
calculation_real_estate_scoring.tail(50)
calculation_real_estate_scoring

Unnamed: 0,department_name,yield_rate,price_m2_growth,nb_vacants_housing,Taxe d'habitation moyenne en 2023
0,Ain,6.002122,0.072189,25849,847
1,Aisne,7.625051,0.033483,26001,732
2,Allier,7.619063,0.029527,30479,801
3,Alpes-Maritimes,4.362966,0.078167,64643,1686
4,Alpes-de-Haute-Provence,5.945867,0.029776,10768,551
...,...,...,...,...,...
88,Vendée,5.270684,0.091690,21818,759
89,Vienne,6.700852,0.066504,23264,746
90,Vosges,6.740395,0.039378,24154,611
91,Yonne,7.394714,0.041092,23303,682


##### 3. SECONDARY HOME

In [184]:
#fusion des différents dataframes pour réaliser le scoring
calculation_secondary_home_scoring = average_surface_municipality.merge(secondary_home_rate_evolution_department, on="department_name")
calculation_secondary_home_scoring.tail(50)
calculation_secondary_home_scoring = calculation_secondary_home_scoring.drop(columns=["nb_second_home_2008", "nb_second_home_2018"])

##### 4. LIFE QUALITY

# **NORMALIZATION**

##### 1. TOURISM

In [218]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
scaler = MinMaxScaler()

#scaling pour calculer le scoring
calculation_tourism_scoring_numeric = calculation_tourism_scoring.select_dtypes(include="number")
df_scaled_tourism = scaler.fit_transform(calculation_tourism_scoring_numeric)
df_scaled_tourism = pd.DataFrame(df_scaled_tourism, columns=calculation_tourism_scoring_numeric.columns, index=calculation_tourism_scoring.index)
df_scaled_tourism.head(50)

Unnamed: 0_level_0,ranking_hosting,ranking_touristic_sites
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,0.142733,0.627787
Aisne,0.048254,0.257672
Allier,0.098501,0.372763
Alpes-Maritimes,0.454155,0.605746
Alpes-de-Haute-Provence,0.140845,0.238223
Ardennes,0.027919,0.092048
Ardèche,0.437353,0.368949
Ariège,0.106684,0.14551
Aube,0.056686,0.093658
Aude,0.190056,0.336015


##### 2. REAL ESTATE

In [109]:
#scaling pour calculer le scoring
calculation_real_estate_scoring_numeric = calculation_real_estate_scoring.select_dtypes(include="number")
scaler = MinMaxScaler()
df_scaled_real_estate = scaler.fit_transform(calculation_real_estate_scoring_numeric)
df_scaled_real_estate = pd.DataFrame(df_scaled_real_estate, index=real_estate_scoring_merge_2.index, columns=calculation_real_estate_scoring_numeric.columns)
df_scaled_real_estate

Unnamed: 0_level_0,yield_rate,price_m2_growth,nb_vacants_housing,Taxe d'habitation moyenne en 2023
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ain,0.633355,0.551218,0.197095,0.213806
Aisne,0.991120,0.297747,0.198424,0.158677
Allier,0.989800,0.271845,0.237583,0.191755
Alpes-Maritimes,0.272014,0.590368,0.536334,0.616012
Alpes-de-Haute-Provence,0.620955,0.273472,0.065217,0.071908
...,...,...,...,...
Vendée,0.472114,0.678925,0.161845,0.171620
Vienne,0.787386,0.513990,0.174490,0.165388
Vosges,0.796103,0.336356,0.182273,0.100671
Yonne,0.940344,0.347576,0.174831,0.134708


In [112]:
#cleaning des différents KPI
df_scaled_real_estate['yield_rate'] = round(df_scaled_real_estate['yield_rate'], 2)
df_scaled_real_estate['price_m2_growth'] = round(df_scaled_real_estate['price_m2_growth'], 2)
df_scaled_real_estate['nb_vacants_housing'] = round(df_scaled_real_estate['nb_vacants_housing'], 2)
df_scaled_real_estate["Taxe d'habitation moyenne en 2023"] = round(1 - df_scaled_real_estate["Taxe d'habitation moyenne en 2023"], 2)
df_scaled_real_estate

Unnamed: 0_level_0,yield_rate,price_m2_growth,nb_vacants_housing,Taxe d'habitation moyenne en 2023
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ain,0.63,0.55,0.20,0.21
Aisne,0.99,0.30,0.20,0.16
Allier,0.99,0.27,0.24,0.19
Alpes-Maritimes,0.27,0.59,0.54,0.62
Alpes-de-Haute-Provence,0.62,0.27,0.07,0.07
...,...,...,...,...
Vendée,0.47,0.68,0.16,0.17
Vienne,0.79,0.51,0.17,0.17
Vosges,0.80,0.34,0.18,0.10
Yonne,0.94,0.35,0.17,0.13


In [78]:
calculation_real_estate_scoring.tail(50)

Unnamed: 0_level_0,yield_rate,price_m2_growth,nb_vacants_housing,Taxe d'habitation moyenne en 2023
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Hérault,5.334787,0.067913,53371,998
Ille-et-Vilaine,5.500305,0.088263,37096,909
Indre,7.204237,0.047201,18514,552
Indre-et-Loire,5.952506,0.054554,28098,818
Isère,6.016491,0.076904,50307,904
Jura,7.049789,0.048344,14794,634
Landes,5.059902,0.112565,17224,823
Loir-et-Cher,6.923747,0.05764,20023,753
Loire,6.506399,0.038213,41323,793
Loire-Atlantique,4.964178,0.088958,41992,1 130


##### 3. SECONDARY HOME

In [185]:
#scaling pour calculer le scoring
calculation_secondary_home_scoring_numeric = calculation_secondary_home_scoring.select_dtypes(include="number")
df_scaled_secondary_home = scaler.fit_transform(calculation_secondary_home_scoring_numeric)
df_scaled_secondary_home = pd.DataFrame(df_scaled_secondary_home, columns=calculation_secondary_home_scoring_numeric.columns, index=calculation_secondary_home_scoring["department_name"])
df_scaled_secondary_home.head(50)

Unnamed: 0_level_0,surface,evolution_secondary_homes
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,0.849569,0.13895
Aisne,0.777615,0.014529
Allier,0.642331,0.116764
Alpes-Maritimes,0.255123,0.284725
Alpes-de-Haute-Provence,0.39759,0.150484
Ardennes,0.776964,0.084998
Ardèche,0.679231,0.123515
Ariège,0.61961,0.175509
Aube,0.626639,0.240424
Aude,0.489839,0.189213


In [189]:
#cleaning des différents KPI
df_scaled_secondary_home['surface'] = round(df_scaled_secondary_home['surface'], 2)
df_scaled_secondary_home['evolution_secondary_homes'] = round(df_scaled_secondary_home['evolution_secondary_homes'], 2)
df_scaled_secondary_home

Unnamed: 0_level_0,surface,evolution_secondary_homes
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ain,0.85,0.14
Aisne,0.78,0.01
Allier,0.64,0.12
Alpes-Maritimes,0.26,0.28
Alpes-de-Haute-Provence,0.40,0.15
...,...,...
Vendée,0.63,0.08
Vienne,0.69,0.27
Vosges,0.72,0.24
Yonne,0.70,0.00


##### 4. LIFE QUALITY

In [123]:
calculation_secondary_home_scoring_numeric

Unnamed: 0,surface,secondary_home_rate_evolution
0,95.491503,-8.064238
1,91.956053,-8.667438
2,85.308972,-5.866098
3,66.283678,-1.166718
4,73.283742,3.350143
...,...,...
92,84.926212,-0.937029
93,87.584873,2.714484
94,88.986998,1.137999
95,88.105673,-13.456559
