# ⚙️ **CLEANED DATA IMPORT**

In [1]:
import pandas as pd
import numpy as np
import os

DATA_PATH = '../data/cleaned'

POI_FILENAME = 'poi_df_cleaned.csv'
SITE_FILENAME = 'site_df_cleaned.csv'
SALARY_FILENAME = 'salary_df_cleaned.csv'
GEOREF_FILENAME = 'georef_df_cleaned.csv'
STOCK_FILENAME = 'stock_df_cleaned.csv'
SALES_FILENAME = 'sales_df_cleaned.csv'
POPULATION_FILENAME = 'population_df_cleaned.csv'
POVERTY_FILENAME = 'poverty_df_cleaned.csv'
REAL_ESTATE_FILENAME = 'real_estate_df_cleaned.csv'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME))
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME))
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME))
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME))
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME))
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME))
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))

In [6]:
# DF CLEANED CHECK
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26216 entries, 0 to 26215
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                26216 non-null  object 
 1   latitude           26216 non-null  float64
 2   longitude          26216 non-null  float64
 3   municipality_code  26216 non-null  object 
 4   importance         26216 non-null  float64
 5   name_reprocessed   26216 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31034 entries, 0 to 31033
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   poi                      31034 non-null  object 
 1   name                     31034 non-null  object 
 2   latitude                 31034 non-null  float64
 3   longitude                31034 non-null  float64
 4   municipality_code  

### CLEANING

##### DF_SALES CLEANING

In [8]:
# SALES_DF: Suppression des doublons > nous passons de 4,3M de lignes à 3,821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

(3448398, 13)

In [9]:
# SALES_DF: Check si les doublons on été enlevés : OK
sales_df.duplicated().sum()

np.int64(0)

In [10]:
# SALES_DF: Suppression des prix au m2 supérieur à 30K€ et inférieur à 1K€ > nous passons à 3,3399M de lignes
sales_df = sales_df[(sales_df['sales_price_m2'] <= 30000) & (sales_df['sales_price_m2'] >= 1000)]
sales_df.shape

(3448398, 13)

In [11]:
# SALES_DF:
s2 = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [12]:
# SALES_DF:
sales_df = sales_df[sales_df['sales_amount'] > 1] # on enlève les 166 fois ou sales_amount = 1€
sales_df.shape

(3448398, 13)

In [13]:
# SALES_DF: changement du type sales_date en datetime
sales_df['sales_date'] = pd.to_datetime(sales_df['sales_date'])
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448398 entries, 0 to 3448397
Data columns (total 13 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   sales_date                 datetime64[ns]
 1   sales_amount               float64       
 2   street_number              float64       
 3   street_code                object        
 4   street_name                object        
 5   nom_commune                object        
 6   municipality_code          object        
 7   premise_type               object        
 8   surface                    float64       
 9   number_of_principal_rooms  int64         
 10  sales_price_m2             float64       
 11  latitude                   float64       
 12  longitude                  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(5)
memory usage: 342.0+ MB


##### DF_SALARY CLEANING

In [14]:
# DF_SALARY: ROUND avg_net_salary
salary_df['avg_net_salary'] = salary_df['avg_net_salary'].round()
salary_df.head()

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.0,2019
1,1007,26341.0,2019
2,1014,25897.0,2019
3,1024,25695.0,2019
4,1025,26055.0,2019


##### DF_REAL_ESTATE CLEANING

In [15]:
# DF_REAL_ESTATE: suppression des nulls
real_estate_df = real_estate_df.dropna(axis=1)
real_estate_df.isnull().sum()

municipality_code         0
intensite_tension_immo    0
rental_max_apartment      0
rental_min_apartment      0
rental_med_all            0
rental_max_all            0
rental_min_all            0
dtype: int64

##### DF_SITE CLEANING

In [16]:
# SITE_DF: tri avec les données entre parenthèses de la colonne "name" inclues

import re

site_df['data_inside_parenthesis'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

#suppression de la colonne "name" dans un second temps

site_df.drop(columns=["name"])

#check pour savoir les informations présentes dans la colonne "poi", et si elles correspondent aux valeurs présentes dans la colonne "type"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

#faire un mapping des colonnes poi, qui sont en fait plus pertinentes que celles de la colonne "type"

poi
castle            5611
2                 5207
water             4500
museum            3400
theatre           2439
forest            2233
beach             1010
protected_area     927
wetland            784
cinema             667
vineyard           641
golf_course        612
theme_park         358
cliff              337
marina             255
water_park         239
1                  236
zoo                234
valley             223
ridge              213
islet              163
casino             158
volcano            145
park               132
meadow              78
monument            76
sand                35
heritage            32
viewpoint           21
cave_entrance       15
wreck               12
rock                11
waterfall           10
attraction           8
dune                 5
national_park        4
allotments           2
geyser               1
Name: count, dtype: int64
data_inside_parenthesis
                       9899
Monument historique    5090
Eau             

In [17]:
# SITE_DF: création d'un dictionnaire intégrant toutes les différentes valeurs inclues dans la colonne "poi"
s = site_df["poi"].value_counts()[site_df["poi"]]
{k: "toto" for k in s.index}

{'1': 'toto',
 '2': 'toto',
 'zoo': 'toto',
 'dune': 'toto',
 'park': 'toto',
 'rock': 'toto',
 'sand': 'toto',
 'beach': 'toto',
 'cliff': 'toto',
 'islet': 'toto',
 'ridge': 'toto',
 'water': 'toto',
 'wreck': 'toto',
 'casino': 'toto',
 'castle': 'toto',
 'cinema': 'toto',
 'forest': 'toto',
 'geyser': 'toto',
 'marina': 'toto',
 'meadow': 'toto',
 'museum': 'toto',
 'valley': 'toto',
 'theatre': 'toto',
 'volcano': 'toto',
 'wetland': 'toto',
 'heritage': 'toto',
 'monument': 'toto',
 'vineyard': 'toto',
 'viewpoint': 'toto',
 'waterfall': 'toto',
 'allotments': 'toto',
 'attraction': 'toto',
 'theme_park': 'toto',
 'water_park': 'toto',
 'golf_course': 'toto',
 'cave_entrance': 'toto',
 'national_park': 'toto',
 'protected_area': 'toto'}

In [18]:
# SITE_DF: création d'un dictionnaire avec les catégories associées aux valeurs de la colonne POI

category_dict = {'1': 'Patrimoine',
 '2': 'Patrimoine',
 'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [19]:
# SITE_DF: création de la colonne "catégorie"
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category
0,1,Fortifications de Vauban (Patrimoine mondial),46.396160,1.472600,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.219890,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.127170,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.346210,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Patrimoine mondial,Patrimoine
...,...,...,...,...,...,...,...,...,...
31029,viewpoint,Tour Eiffel 3e étage (Point de vue),48.858262,2.294497,75056,0.053782,Tour Eiffel 3e étage,Point de vue,Nature
31030,theme_park,Jardin d'Acclimatation (Parc à thème),48.878145,2.264686,75056,0.087097,Jardin d'Acclimatation,Parc à thème,Entertainment
31031,theme_park,Foire du Trône (Parc à thème),48.832003,2.404337,75056,0.060000,Foire du Trône,Parc à thème,Entertainment
31032,golf_course,Golf du Bois de Boulogne (Terrain de golf),48.857776,2.232651,75056,0.060073,Golf du Bois de Boulogne,Terrain de golf,Entertainment


In [20]:
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26216 entries, 0 to 26215
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                26216 non-null  object 
 1   latitude           26216 non-null  float64
 2   longitude          26216 non-null  float64
 3   municipality_code  26216 non-null  object 
 4   importance         26216 non-null  float64
 5   name_reprocessed   26216 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31034 entries, 0 to 31033
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   poi                      31034 non-null  object 
 1   name                     31034 non-null  object 
 2   latitude                 31034 non-null  float64
 3   longitude                31034 non-null  float64
 4   municipality_code  

In [None]:
poi_df.head(1)

In [None]:
site_df.head(1) 

In [None]:
salary_df.head(1)

In [None]:
georef_df.head(1) 

In [None]:
stock_df.head(1) 

In [None]:
sales_df.head(1)

In [None]:
population_df.head(1) 

In [None]:
poverty_df.head(1)

In [None]:
real_estate_df.head(1)

# 🧪 **DATA TRANSFORMATION**

### KPIS AGGREGATION BY DEPARTMENT

##### 1. POPULATION

In [None]:
# 1.1 Salaire moyen par département
# Joindre les informations de géolocalisation pour obtenir les départements
salary_dep_df = salary_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le salaire moyen par département
avg_salary_per_department = salary_dep_df.groupby(['department_code', 'department_name'])['avg_net_salary'].mean().reset_index().round()
avg_salary_per_department.rename(columns={'avg_net_salary': 'avg_salary'}, inplace=True)
avg_salary_per_department.head()

In [None]:
# 1.2 Évolution de la population par département
# Joindre les informations de géolocalisation pour obtenir les départements
population_dep_df = population_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer l'évolution de la population par département (différence entre les années)
pop_evolution = population_dep_df.groupby(['department_code', 'department_name', 'year'])['population'].sum().unstack().reset_index()
pop_evolution['evolution'] = (pop_evolution[pop_evolution.columns[-1]] - pop_evolution[pop_evolution.columns[-2]]) / pop_evolution[pop_evolution.columns[-2]] * 100
pop_evolution = pop_evolution[['department_code', 'department_name', 'evolution']]

pop_evolution.head()

In [None]:
# 1.3 Taux de pauvreté par département
# Joindre les données de population
poverty_df = poverty_df.merge(population_df[['municipality_code', 'population']], on='municipality_code', suffixes=('_poverty', '_population'))

# Merge avec georef pour avoir le department_name
poverty_df = poverty_df.merge(georef_df[['municipality_code', 'department_name']], on='municipality_code', how='left')

# Groupe par department_name pour calculer le taux de pauvreté par département
poverty_by_department = poverty_df.groupby('department_name').agg({
    'population_poverty': 'sum',
    'population_population': 'sum'
}).reset_index()

# Calculer le poverty_rate pour chaque département
poverty_by_department['poverty_rate'] = (poverty_by_department['population_poverty'] / poverty_by_department['population']) * 100

# Afficher le résultat
poverty_by_department.head()

# calcul = poverty_rate = poverty_population / total_population * 100
# il faut ensuite le DF qui a poverty_rate à georef pour récupérer le department_name

##### 2. TOURISM

In [22]:
#création de tables permettant de scorer le potentiel touristique de chaque département
site_df = site_df.merge (georef_df, on=["municipality_code"])
site_df.head(5)

Unnamed: 0,poi,name,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,1,Fortifications de Vauban (Patrimoine mondial),46.39616,1.4726,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine,Mouhet,MOUHET,municipality,46.389251,1.442651,36,200035137.0,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine,Vareilles,VAREILLES,municipality,46.305016,1.456031,23,242300135.0,Creuse
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.21989,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine,Béziers,BEZIERS,municipality,43.347588,3.230768,34,243400769.0,Hérault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.12717,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine,Hénin-Beaumont,HENIN BEAUMONT,municipality,50.409234,2.958997,62,246200299.0,Pas-de-Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.34621,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Patrimoine mondial,Patrimoine,Bédouès-Cocurès,BEDOUES COCURES,municipality,44.353946,3.61956,48,200069151.0,Lozère


In [23]:
#sélection des colonnes dont on aura besoin pour le calcul
site_df_department = site_df[["poi", "name", "municipality_code", "importance", "name_reprocessed", "department_name"]]
site_df_department

Unnamed: 0,poi,name,municipality_code,importance,name_reprocessed,department_name
0,1,Fortifications de Vauban (Patrimoine mondial),36134,0.139527,Fortifications de Vauban,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Creuse
2,1,Canal du Midi (Patrimoine mondial),34032,0.129531,Canal du Midi,Hérault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,62427,0.127170,Bassin minier du Nord-Pas de Calais,Pas-de-Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Lozère
...,...,...,...,...,...,...
31019,viewpoint,Tour Eiffel 3e étage (Point de vue),75056,0.053782,Tour Eiffel 3e étage,Paris
31020,theme_park,Jardin d'Acclimatation (Parc à thème),75056,0.087097,Jardin d'Acclimatation,Paris
31021,theme_park,Foire du Trône (Parc à thème),75056,0.060000,Foire du Trône,Paris
31022,golf_course,Golf du Bois de Boulogne (Terrain de golf),75056,0.060073,Golf du Bois de Boulogne,Paris


In [24]:
#groupement par département, puis classement par le département ayant le + d'atouts touristiques
group_site = site_df_department.groupby("department_name")[["importance"]].sum()
group_site
group_site.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Gironde,52.539958
Bouches-du-Rhône,47.068588
Finistère,46.685274
Isère,44.230787
Loire-Atlantique,43.525109
...,...
Haute-Marne,7.416165
Val-de-Marne,6.699159
Lozère,6.230406
Seine-Saint-Denis,5.718311


In [25]:
#même calcul que précédemment, mais pour la partie concernant les logements/lieux de villégiature
poi_df = poi_df.merge (georef_df, on=["municipality_code"])
poi_df.head(5)

Unnamed: 0,poi,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med,Les Mathes,MATHES,municipality,45.705988,-1.170867,17,241700640.0,Charente-Maritime
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands Cépages,Sorgues,SORGUES,municipality,44.014576,4.867405,84,248400293.0,Vaucluse
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances,Fort-Mahon-Plage,FORT MAHON PLAGE,municipality,50.345059,1.577068,80,200070936.0,Somme
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances,Grimaud,GRIMAUD,municipality,43.282028,6.533032,83,200036077.0,Var
4,hotel,43.54105,3.752852,34095,0.077542,Le Domaine du Golf,Fabrègues,FABREGUES,municipality,43.534477,3.77193,34,243400017.0,Hérault


In [26]:
#sélection des colonnes dont on aura besoin pour le calcul
poi_df_department = poi_df[["poi", "municipality_code", "importance", "department_name"]]
poi_df_department


Unnamed: 0,poi,municipality_code,importance,department_name
0,hotel,17225,0.078556,Charente-Maritime
1,hotel,84129,0.078419,Vaucluse
2,hotel,80333,0.077999,Somme
3,hotel,83068,0.077702,Var
4,hotel,34095,0.077542,Hérault
...,...,...,...,...
26202,camp_site,19164,0.040000,Corrèze
26203,camp_site,03238,0.040000,Allier
26204,camp_site,19241,0.040000,Corrèze
26205,camp_site,23131,0.040000,Creuse


In [27]:
#groupement par département, puis classement par le département ayant le + de logements/lieux de villégiature
group_poi = poi_df_department.groupby("department_name")[["importance"]].sum()
group_poi
group_poi.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Paris,70.275305
Savoie,37.401407
Haute-Savoie,35.158395
Hérault,33.793973
Alpes-Maritimes,32.802552
...,...
Eure-et-Loir,3.871754
Haute-Marne,3.670584
Ardennes,3.541133
Mayenne,3.154595


In [28]:
#ajout des 2 calculs d'importance
department_merged_df = group_poi.merge (group_site, on=["department_name"])
department_merged_df["somme_importance"]=department_merged_df["importance_x"]+department_merged_df["importance_y"]
department_merged_df = department_merged_df.drop(columns=["importance_x", "importance_y"])
department_merged_df
department_merged_df.sort_values("somme_importance", ascending =False)


Unnamed: 0_level_0,somme_importance
department_name,Unnamed: 1_level_1
Paris,110.051777
Gironde,77.287832
Savoie,76.393945
Finistère,75.977072
Bouches-du-Rhône,71.561051
...,...
Aube,13.061510
Seine-Saint-Denis,11.649267
Haute-Marne,11.086749
Ardennes,11.006796


##### 3. REAL ESTATE

In [42]:
#calcul du loyer au m2 médian par municipality_code
rental_med = real_estate_df [["municipality_code", "rental_med_all"]]
rental_med

Unnamed: 0,municipality_code,rental_med_all
0,57133,9.53
1,57446,11.09
2,77013,12.26
3,77026,9.53
4,77072,11.47
...,...,...
34436,81126,8.93
34437,33425,11.09
34438,85146,10.88
34439,53062,8.69


In [41]:
#calcul du prix d'achat au m2 médian par municipality_code
sales_df
sales_df_grouped = sales_df.groupby(["municipality_code"])[["sales_amount", "sales_price_m2"]].sum()
sales_df_grouped = pd.DataFrame (sales_df_grouped)
sales_df_grouped

Unnamed: 0_level_0,sales_amount,sales_price_m2
municipality_code,Unnamed: 1_level_1,Unnamed: 2_level_1
01001,1.362286e+07,122067.0
01002,4.067389e+06,39457.0
01004,1.852356e+08,2331076.0
01005,3.579244e+07,360368.0
01006,2.575955e+06,25962.0
...,...,...
97420,5.335366e+07,674894.0
97421,3.143598e+06,41730.0
97422,2.541138e+08,3559005.0
97423,1.326098e+07,197498.0


In [44]:
#jointure pour rajouter dans cette table le loyer médian par municipality_code
sales_df
real_estate_grouped = sales_df_grouped.merge(rental_med, on="municipality_code")
real_estate_grouped

Unnamed: 0,municipality_code,sales_amount,sales_price_m2,rental_med_all
0,01001,1.362286e+07,122067.0,10.66
1,01002,4.067389e+06,39457.0,10.16
2,01004,1.852356e+08,2331076.0,11.25
3,01005,3.579244e+07,360368.0,13.28
4,01006,2.575955e+06,25962.0,12.70
...,...,...,...,...
31892,95676,5.974304e+06,59985.0,12.29
31893,95678,1.222182e+07,146294.0,18.53
31894,95680,8.569815e+07,1278553.0,16.22
31895,95682,1.463606e+06,13640.0,16.57


In [47]:
#ajout du nom du département correspondant à chaque municipality code
real_estate_department = real_estate_grouped.merge(georef_df, on="municipality_code")
real_estate_department
real_estate_department = real_estate_department [["municipality_code", "sales_amount", "sales_price_m2", "rental_med_all", "department_name"]]

In [48]:
real_estate_department

Unnamed: 0,municipality_code,sales_amount,sales_price_m2,rental_med_all,department_name
0,01001,1.362286e+07,122067.0,10.66,Ain
1,01002,4.067389e+06,39457.0,10.16,Ain
2,01004,1.852356e+08,2331076.0,11.25,Ain
3,01005,3.579244e+07,360368.0,13.28,Ain
4,01006,2.575955e+06,25962.0,12.70,Ain
...,...,...,...,...,...
31892,95676,5.974304e+06,59985.0,12.29,Val-d'Oise
31893,95678,1.222182e+07,146294.0,18.53,Val-d'Oise
31894,95680,8.569815e+07,1278553.0,16.22,Val-d'Oise
31895,95682,1.463606e+06,13640.0,16.57,Val-d'Oise


In [None]:
#groupement des informations par département
real_estate_global_per_department = sales_df.groupby(["department_name"])[["sales_amount", "sales_price_m2"]].sum()[



Population
1.1 Salaire moyen par département
1.2 Évolution de la population par département
1.3 Taux de pauvreté par département

Tourisme
2.1 Nombre de sites touristiques par département
2.2 Importance moyenne des sites par département
2.3 Stock de logement par département

Immobilier
3.1 Rentabilité locative au m² par département
3.2 Tension immobilière par département
3.3 Part de maisons secondaires par département
3.4 Évolution du prix au m² par département
3.5 Prix moyen au m² des ventes immobilières par département
creer moi un système de scoring (avec pondération) pour avoir :

un score Population
un score Tourisme
un score Immobilier
puis un Score Global grâce aux 3 précédents scores

In [None]:
# Supposons que vous avez déjà calculé les données nécessaires pour les scores Population, Tourisme et Immobilier

# 1. Définition des poids pour chaque sous-score dans chaque catégorie
weights_population = {
    'average_salary': 0.4,
    'population_growth': 0.3,
    'poverty_rate': 0.3
}

weights_tourism = {
    'num_tourism_sites': 0.4,
    'average_importance': 0.3,
    'stock_housing': 0.3
}

weights_real_estate = {
    'rental_yield_per_m2': 0.2,
    'real_estate_tension': 0.2,
    'secondary_home_rate': 0.1,
    'price_growth': 0.2,
    'average_price_per_m2': 0.3
}

# 2. Calcul des scores pour chaque catégorie

# POPULATION
# Supposons que vous avez déjà les dataframes suivants : average_salary_by_department, population_evolution_by_department, average_poverty_rate_by_department

population_scores = (
    avg_salary_per_department['avg_salary'] * weights_population['average_salary'] +
    pop_evolution['evolution'] * weights_population['population_growth'] +
    (100 - average_poverty_rate_by_department['poverty_rate']) * weights_population['poverty_rate']
)

# TOURISM
# Supposons que vous avez déjà les dataframes suivants : tourism_sites_by_department, average_importance_by_department, stock_housing_by_department

tourism_scores = (
    tourism_sites_by_department['num_tourism_sites'] * weights_tourism['num_tourism_sites'] +
    average_importance_by_department['importance'] * weights_tourism['average_importance'] +
    stock_housing_by_department['stock_housing'] * weights_tourism['stock_housing']
)

# REAL ESTATE
# Supposons que vous avez déjà les dataframes suivants : rental_yield_per_m2_by_department, real_estate_tension_by_department, second_home_rate_by_department, price_growth_by_department, average_price_per_m2_by_department

real_estate_scores = (
    rental_yield_per_m2_by_department['rental_yield_per_m2'] * weights_real_estate['rental_yield_per_m2'] +
    (100 - real_estate_tension_by_department['intensite_tension_immo']) * weights_real_estate['real_estate_tension'] +
    (100 - second_home_rate_by_department['secondary_home_rate']) * weights_real_estate['secondary_home_rate'] +
    price_growth_by_department['price_growth'] * weights_real_estate['price_growth'] +
    average_price_per_m2_by_department['average_price_per_m2'] * weights_real_estate['average_price_per_m2']
)

# 3. Calcul du score global
# Supposons que les scores sont déjà calculés pour chaque catégorie

global_score = (
    population_scores +
    tourism_scores +
    real_estate_scores
)

# Afficher ou utiliser les résultats
print("Scores Population :\n", population_scores.head())
print("\nScores Tourisme :\n", tourism_scores.head())
print("\nScores Immobilier :\n", real_estate_scores.head())
print("\nScore Global :\n", global_score.head())


In [None]:
# POPULATION SCORE
from sklearn.preprocessing import MinMaxScaler

# Supposons que vous avez déjà calculé les sous-scores pour la catégorie Population : average_salary_by_department, population_evolution_by_department, average_poverty_rate_by_department

# Initialisation du MinMaxScaler
scaler_population = MinMaxScaler()

# Normalisation des données
population_scores_scaled = scaler_population.fit_transform(
    population_scores[['avg_net_salary', 'population_growth', 'poverty_rate']]
)

# Calcul des scores normalisés
population_scores_normalized = (
    population_scores_scaled[:, 0] * weights_population['average_salary'] +
    population_scores_scaled[:, 1] * weights_population['population_growth'] +
    population_scores_scaled[:, 2] * weights_population['poverty_rate']
)

# Afficher ou utiliser les résultats des scores normalisés
print("Scores Population normalisés :\n", population_scores_normalized.head())


In [None]:
# TOURISM SCORE
# Supposons que vous avez déjà calculé les sous-scores pour la catégorie Tourisme : tourism_sites_by_department, average_importance_by_department, stock_housing_by_department

# Initialisation du MinMaxScaler
scaler_tourism = MinMaxScaler()

# Normalisation des données
tourism_scores_scaled = scaler_tourism.fit_transform(
    tourism_scores[['num_tourism_sites', 'average_importance', 'stock_housing']]
)

# Calcul des scores normalisés
tourism_scores_normalized = (
    tourism_scores_scaled[:, 0] * weights_tourism['num_tourism_sites'] +
    tourism_scores_scaled[:, 1] * weights_tourism['average_importance'] +
    tourism_scores_scaled[:, 2] * weights_tourism['stock_housing']
)

# Afficher ou utiliser les résultats des scores normalisés
print("Scores Tourisme normalisés :\n", tourism_scores_normalized.head())


In [None]:
# 1 REAL ESTATE SCORE
# Supposons que vous avez déjà calculé les sous-scores pour la catégorie Immobilier : rental_yield_per_m2_by_department, real_estate_tension_by_department, second_home_rate_by_department, price_growth_by_department, average_price_per_m2_by_department

# Initialisation du MinMaxScaler
scaler_real_estate = MinMaxScaler()

# Normalisation des données
real_estate_scores_scaled = scaler_real_estate.fit_transform(
    real_estate_scores[['rental_yield_per_m2', 'real_estate_tension', 'secondary_home_rate', 'price_growth', 'average_price_per_m2']]
)

# Calcul des scores normalisés
real_estate_scores_normalized = (
    real_estate_scores_scaled[:, 0] * weights_real_estate['rental_yield_per_m2'] +
    real_estate_scores_scaled[:, 1] * weights_real_estate['real_estate_tension'] +
    real_estate_scores_scaled[:, 2] * weights_real_estate['secondary_home_rate'] +
    real_estate_scores_scaled[:, 3] * weights_real_estate['price_growth'] +
    real_estate_scores_scaled[:, 4] * weights_real_estate['average_price_per_m2']
)

# Afficher ou utiliser les résultats des scores normalisés
print("Scores Immobilier normalisés :\n", real_estate_scores_normalized.head())


In [None]:
1 GLOBAL SCORE
# Supposons que vous avez déjà les scores normalisés pour chaque catégorie
# population_scores_normalized, tourism_scores_normalized, real_estate_scores_normalized

# Définition des poids pour chaque catégorie
weights = {
    'population': 0.4,
    'tourism': 0.3,
    'real_estate': 0.3
}

# Calcul du score global pondéré
global_score = (
    population_scores_normalized * weights['population'] +
    tourism_scores_normalized * weights['tourism'] +
    real_estate_scores_normalized * weights['real_estate']
)

# Afficher ou utiliser le score global
print("Score Global :\n", global_score.head())


# 🚀 ENRICHED EXPORT