# ‚öôÔ∏è **CLEANED DATA IMPORT**

In [1]:
import pandas as pd
import numpy as np
import os

DATA_PATH = '../data/cleaned'

POI_FILENAME = 'poi_df_cleaned.csv'
SITE_FILENAME = 'site_df_cleaned.csv'
SALARY_FILENAME = 'salary_df_cleaned.csv'
GEOREF_FILENAME = 'georef_df_cleaned.csv'
STOCK_FILENAME = 'stock_df_cleaned.csv'
SALES_FILENAME = 'sales_df_cleaned.csv'
POPULATION_FILENAME = 'population_df_cleaned.csv'
POVERTY_FILENAME = 'poverty_df_cleaned.csv'
REAL_ESTATE_FILENAME = 'real_estate_df_cleaned.csv'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME))
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME))
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME))
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME))
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME))
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME))
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))

### CLEANING

##### DF_SALES CLEANING

In [2]:
# SALES_DF: Suppression des doublons > nous passons de 4,3M de lignes √† 3,821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

(3448398, 13)

In [3]:
# SALES_DF: Check si les doublons on √©t√© enlev√©s : OK
sales_df.duplicated().sum()

np.int64(0)

In [4]:
# SALES_DF: Suppression des prix au m2 sup√©rieur √† 30K‚Ç¨ et inf√©rieur √† 1K‚Ç¨ > nous passons √† 3,3399M de lignes
sales_df = sales_df[(sales_df['sales_price_m2'] <= 30000) & (sales_df['sales_price_m2'] >= 1000)]
sales_df.shape

(3448398, 13)

In [5]:
# SALES_DF:
s2 = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [6]:
# SALES_DF:
sales_df = sales_df[sales_df['sales_amount'] > 1] # on enl√®ve les 166 fois ou sales_amount = 1‚Ç¨
sales_df.shape

(3448398, 13)

In [7]:
# SALES_DF: changement du type sales_date en datetime
sales_df['sales_date'] = pd.to_datetime(sales_df['sales_date'])
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448398 entries, 0 to 3448397
Data columns (total 13 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   sales_date                 datetime64[ns]
 1   sales_amount               float64       
 2   street_number              float64       
 3   street_code                object        
 4   street_name                object        
 5   nom_commune                object        
 6   municipality_code          object        
 7   premise_type               object        
 8   surface                    float64       
 9   number_of_principal_rooms  int64         
 10  sales_price_m2             float64       
 11  latitude                   float64       
 12  longitude                  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(5)
memory usage: 342.0+ MB


##### DF_SALARY CLEANING

In [8]:
# DF_SALARY: ROUND avg_net_salary
salary_df['avg_net_salary'] = salary_df['avg_net_salary'].round()
salary_df.head()

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.0,2019
1,1007,26341.0,2019
2,1014,25897.0,2019
3,1024,25695.0,2019
4,1025,26055.0,2019


##### DF_REAL_ESTATE CLEANING

In [9]:
# DF_REAL_ESTATE: suppression des nulls
real_estate_df = real_estate_df.dropna(axis=1)
real_estate_df.isnull().sum()

municipality_code         0
intensite_tension_immo    0
rental_max_apartment      0
rental_min_apartment      0
rental_med_all            0
rental_max_all            0
rental_min_all            0
dtype: int64

##### DF_SITE CLEANING

In [10]:
# SITE_DF: tri avec les donn√©es entre parenth√®ses de la colonne "name" inclues

import re

site_df['data_inside_parenthesis'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

#suppression de la colonne "name" dans un second temps

site_df.drop(columns=["name"])

#check pour savoir les informations pr√©sentes dans la colonne "poi", et si elles correspondent aux valeurs pr√©sentes dans la colonne "type"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

#faire un mapping des colonnes poi, qui sont en fait plus pertinentes que celles de la colonne "type"

poi
castle            5611
2                 5207
water             4500
museum            3400
theatre           2439
forest            2233
beach             1010
protected_area     927
wetland            784
cinema             667
vineyard           641
golf_course        612
theme_park         358
cliff              337
marina             255
water_park         239
1                  236
zoo                234
valley             223
ridge              213
islet              163
casino             158
volcano            145
park               132
meadow              78
monument            76
sand                35
heritage            32
viewpoint           21
cave_entrance       15
wreck               12
rock                11
waterfall           10
attraction           8
dune                 5
national_park        4
allotments           2
geyser               1
Name: count, dtype: int64
data_inside_parenthesis
                       9899
Monument historique    5090
Eau             

In [11]:
# SITE_DF: cr√©ation d'un dictionnaire int√©grant toutes les diff√©rentes valeurs inclues dans la colonne "poi"
s = site_df["poi"].value_counts()[site_df["poi"]]
{k: "toto" for k in s.index}

{'1': 'toto',
 '2': 'toto',
 'zoo': 'toto',
 'dune': 'toto',
 'park': 'toto',
 'rock': 'toto',
 'sand': 'toto',
 'beach': 'toto',
 'cliff': 'toto',
 'islet': 'toto',
 'ridge': 'toto',
 'water': 'toto',
 'wreck': 'toto',
 'casino': 'toto',
 'castle': 'toto',
 'cinema': 'toto',
 'forest': 'toto',
 'geyser': 'toto',
 'marina': 'toto',
 'meadow': 'toto',
 'museum': 'toto',
 'valley': 'toto',
 'theatre': 'toto',
 'volcano': 'toto',
 'wetland': 'toto',
 'heritage': 'toto',
 'monument': 'toto',
 'vineyard': 'toto',
 'viewpoint': 'toto',
 'waterfall': 'toto',
 'allotments': 'toto',
 'attraction': 'toto',
 'theme_park': 'toto',
 'water_park': 'toto',
 'golf_course': 'toto',
 'cave_entrance': 'toto',
 'national_park': 'toto',
 'protected_area': 'toto'}

In [12]:
# SITE_DF: cr√©ation d'un dictionnaire avec les cat√©gories associ√©es aux valeurs de la colonne POI

category_dict = {'1': 'Patrimoine',
 '2': 'Patrimoine',
 'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [13]:
# SITE_DF: cr√©ation de la colonne "cat√©gorie"
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category
0,1,Fortifications de Vauban (Patrimoine mondial),46.396160,1.472600,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.219890,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.127170,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine
4,1,"Les Causses et les C√©vennes, paysage culturel ...",44.346210,3.613406,48050,0.124981,"Les Causses et les C√©vennes, paysage culturel ...",Patrimoine mondial,Patrimoine
...,...,...,...,...,...,...,...,...,...
31029,viewpoint,Tour Eiffel 3e √©tage (Point de vue),48.858262,2.294497,75056,0.053782,Tour Eiffel 3e √©tage,Point de vue,Nature
31030,theme_park,Jardin d'Acclimatation (Parc √† th√®me),48.878145,2.264686,75056,0.087097,Jardin d'Acclimatation,Parc √† th√®me,Entertainment
31031,theme_park,Foire du Tr√¥ne (Parc √† th√®me),48.832003,2.404337,75056,0.060000,Foire du Tr√¥ne,Parc √† th√®me,Entertainment
31032,golf_course,Golf du Bois de Boulogne (Terrain de golf),48.857776,2.232651,75056,0.060073,Golf du Bois de Boulogne,Terrain de golf,Entertainment


In [19]:
# Fusionner les donn√©es des sites avec les informations de g√©olocalisation pour obtenir les d√©partements
site_with_dep = site_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer la r√©partition des cat√©gories par d√©partement
tourism_category_per_department = site_with_dep.groupby(['department_code', 'department_name', 'Category']).size().unstack(fill_value=0).reset_index()

# Afficher les premi√®res lignes de la r√©partition des cat√©gories par d√©partement
tourism_category_per_department.head()

Category,department_code,department_name,Culture,Entertainment,Nature,Patrimoine
0,1,Ain,38,27,274,169
1,2,Aisne,40,7,63,107
2,3,Allier,42,10,102,169
3,4,Alpes-de-Haute-Provence,48,8,108,60
4,5,Hautes-Alpes,40,12,228,33


In [None]:
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

In [None]:
poi_df.head(1)

In [None]:
site_df.head(1) 

In [None]:
salary_df.head(1)

In [None]:
georef_df.head(1) 

In [None]:
stock_df.head(1) 

In [None]:
sales_df.head(1)

In [None]:
population_df.head(1) 

In [None]:
poverty_df.head(1)

In [None]:
real_estate_df.head(1)

# üß™ **DATA TRANSFORMATION**

### KPIS AGGREGATION BY DEPARTMENT

##### 1. TOURISM MIKE (not used for the scoring)

In [20]:
# MIKE 1.1 Nombre de sites touristiques par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
site_dep_df = site_df.merge(georef_df[['municipality_code', 'department_code']], on='municipality_code')

# Calculer le nombre de sites touristiques par d√©partement
num_sites_per_department = site_dep_df.groupby('department_code')['poi'].count().reset_index()
num_sites_per_department.rename(columns={'poi': 'num_sites'}, inplace=True)
num_sites_per_department

Unnamed: 0,department_code,num_sites
0,01,508
1,02,217
2,03,323
3,04,224
4,05,313
...,...,...
91,91,325
92,92,152
93,93,90
94,94,101


In [21]:
# MIKE 1.2 Importance moyenne des sites par d√©partement
# Calculer l'importance moyenne des sites touristiques par d√©partement
avg_site_importance_per_department = site_dep_df.groupby('department_code')['importance'].mean().reset_index() 
avg_site_importance_per_department.rename(columns={'importance': 'avg_site_importance'}, inplace=True)
avg_site_importance_per_department

Unnamed: 0,department_code,avg_site_importance
0,01,0.067051
1,02,0.072294
2,03,0.066258
3,04,0.065725
4,05,0.069074
...,...,...
91,91,0.066098
92,92,0.067027
93,93,0.063537
94,94,0.066328


In [22]:
# MIKE 1.3 Stock de logement par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
stock_dep_df = stock_df.merge(georef_df[['municipality_code', 'department_code']], on='municipality_code')

# Calculer le stock de logement par d√©partement (nombre total de logements)
total_stock_per_department = stock_dep_df.groupby('department_code')['nb_tot_housing'].sum().reset_index()
total_stock_per_department.rename(columns={'nb_tot_housing': 'total_stock'}, inplace=True)
total_stock_per_department

Unnamed: 0,department_code,total_stock
0,01,1781912
1,02,1829844
2,03,1462151
3,04,755894
4,05,763007
...,...,...
95,95,3134354
96,971,1203232
97,972,1161753
98,973,384652


In [23]:
# MIKE 1.4 √©partition des cat√©gories par d√©partement
# Fusionner les donn√©es des sites avec les informations de g√©olocalisation pour obtenir les d√©partements
site_with_dep = site_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer la r√©partition des cat√©gories par d√©partement
tourism_category_per_department = site_with_dep.groupby(['department_code', 'department_name', 'Category']).size().unstack(fill_value=0).reset_index()

# Afficher les premi√®res lignes de la r√©partition des cat√©gories par d√©partement
tourism_category_per_department.head()

Category,department_code,department_name,Culture,Entertainment,Nature,Patrimoine
0,1,Ain,38,27,274,169
1,2,Aisne,40,7,63,107
2,3,Allier,42,10,102,169
3,4,Alpes-de-Haute-Provence,48,8,108,60
4,5,Hautes-Alpes,40,12,228,33


##### 2. REAL ESTATE MIKE (not used for the scoring)

In [27]:
# 2.1 Rentabilit√© locative au m¬≤ par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
real_estate_dep_df = real_estate_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer la rentabilit√© locative moyenne au m¬≤ par d√©partement
real_estate_dep_df['avg_rental_yield'] = (real_estate_dep_df['rental_max_all'] + real_estate_dep_df['rental_min_all']) / 2
rental_yield_per_department = real_estate_dep_df.groupby(['department_code', 'department_name'])['avg_rental_yield'].mean().reset_index()
rental_yield_per_department.rename(columns={'avg_rental_yield': 'avg_rental_yield'}, inplace=True)
rental_yield_per_department

Unnamed: 0,department_code,department_name,avg_rental_yield
0,01,Ain,12.253130
1,02,Aisne,9.627683
2,03,Allier,9.244937
3,04,Alpes-de-Haute-Provence,11.311378
4,05,Hautes-Alpes,11.714241
...,...,...,...
91,91,Essonne,16.469145
92,92,Hauts-de-Seine,25.354306
93,93,Seine-Saint-Denis,21.065125
94,94,Val-de-Marne,22.158404


In [28]:
# 2.2 Tension immobili√®re par d√©partement
# Calculer la tension immobili√®re par d√©partement
housing_tension_per_department = real_estate_dep_df.groupby('department_code')['intensite_tension_immo'].mean().reset_index()
housing_tension_per_department.rename(columns={'intensite_tension_immo': 'avg_housing_tension'}, inplace=True)

In [29]:
# 2.3 Part de maisons secondaires par d√©partement
# Calculer la part de maisons secondaires par d√©partement
secondary_home_rate_per_department = stock_dep_df.groupby('department_code')['secondary_home_rate'].mean().reset_index()
secondary_home_rate_per_department.rename(columns={'secondary_home_rate': 'avg_secondary_home_rate'}, inplace=True)
secondary_home_rate_per_department

Unnamed: 0,department_code,avg_secondary_home_rate
0,01,0.138884
1,02,0.102122
2,03,0.132708
3,04,0.397630
4,05,0.400697
...,...,...
95,95,0.071613
96,971,0.076645
97,972,0.068388
98,973,0.062976


In [None]:
# 2.4 √âvolution du prix au m¬≤ par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
sales_dep_df = sales_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer l'√©volution du prix au m¬≤ par d√©partement
price_evolution = sales_dep_df.groupby(['department_code', 'department_name', 'sales_date'])['sales_price_m2'].mean().unstack().reset_index()
price_evolution['price_evolution'] = (price_evolution[price_evolution.columns[-1]] - price_evolution[price_evolution.columns[-2]]) / price_evolution[price_evolution.columns[-2]] * 100
price_evolution = price_evolution[['department_code', 'department_name', 'price_evolution']]
price_evolution.head()


In [None]:
# Calculer le prix moyen au m¬≤ des ventes immobili√®res par d√©partement
avg_price_per_m2_per_department = sales_dep_df.groupby(['department_code', 'department_name'])['sales_price_m2'].mean().reset_index()
avg_price_per_m2_per_department.rename(columns={'sales_price_m2': 'avg_sales_price_m2'}, inplace=True)
avg_price_per_m2_per_department.head()

##### POPULATION (not used for the scoring)

In [86]:
# 1.1 Salaire moyen par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
salary_dep_df = salary_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le salaire moyen par d√©partement
avg_salary_per_department = salary_dep_df.groupby(['department_code', 'department_name'])['avg_net_salary'].mean().reset_index().round()
avg_salary_per_department.rename(columns={'avg_net_salary': 'avg_salary'}, inplace=True)
avg_salary_per_department.head()

Unnamed: 0,department_code,department_name,avg_salary
0,1,Ain,25614.0
1,2,Aisne,22865.0
2,3,Allier,23272.0
3,4,Alpes-de-Haute-Provence,24147.0
4,5,Hautes-Alpes,22269.0


In [22]:
# 1.2 √âvolution de la population par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
population_dep_df = population_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer l'√©volution de la population par d√©partement (diff√©rence entre les ann√©es)
pop_evolution = population_dep_df.groupby(['department_code', 'department_name', 'year'])['population'].sum().unstack().reset_index()
pop_evolution['evolution'] = (pop_evolution[pop_evolution.columns[-1]] - pop_evolution[pop_evolution.columns[-2]]) / pop_evolution[pop_evolution.columns[-2]] * 100
pop_evolution = pop_evolution[['department_code', 'department_name', 'evolution']]

pop_evolution.head()

year,department_code,department_name,evolution
0,1,Ain,0.740851
1,2,Aisne,-0.369817
2,3,Allier,-0.354716
3,4,Alpes-de-Haute-Provence,0.146281
4,5,Hautes-Alpes,0.371007


In [26]:
# 1.3 Taux de pauvret√© par d√©partement
# Joindre les donn√©es de population
# poverty_df = poverty_df.merge(population_df[['municipality_code', 'population']], on='municipality_code', suffixes=('_poverty', '_population'))

# Merge avec georef pour avoir le department_name
# poverty_df = poverty_df.merge(georef_df[['municipality_code', 'department_name']], on='municipality_code', how='left')

# Groupe par department_name pour calculer le taux de pauvret√© par d√©partement
# poverty_by_department = poverty_df.groupby('department_name').agg({
    #'population_poverty': 'sum',
    #'population_population': 'sum'
#}).reset_index()

# Calculer le poverty_rate pour chaque d√©partement
#poverty_by_department['poverty_rate'] = (poverty_by_department['population_poverty'] / poverty_by_department['population']) * 100

# Afficher le r√©sultat
#poverty_by_department.head()

# calcul = poverty_rate = poverty_population / total_population * 100
# il faut ensuite le DF qui a poverty_rate √† georef pour r√©cup√©rer le department_name

In [24]:
# 2.1 Nombre de sites touristiques par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
site_dep_df = site_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le nombre de sites touristiques par d√©partement
num_sites_per_department = site_dep_df.groupby(['department_code', 'department_name'])['poi'].count().reset_index()
num_sites_per_department.rename(columns={'poi': 'nb_sites'}, inplace=True)
num_sites_per_department.head()

Unnamed: 0,department_code,department_name,nb_sites
0,1,Ain,508
1,2,Aisne,217
2,3,Allier,323
3,4,Alpes-de-Haute-Provence,224
4,5,Hautes-Alpes,313


In [25]:
# 2.2 Importance moyenne des sites par d√©partement
# Calculer l'importance moyenne des sites touristiques par d√©partement
avg_site_importance_per_department = site_dep_df.groupby(['department_code', 'department_name'])['importance'].mean().reset_index()
avg_site_importance_per_department.rename(columns={'importance': 'avg_site_importance'}, inplace=True)
avg_site_importance_per_department.head()

Unnamed: 0,department_code,department_name,avg_site_importance
0,1,Ain,0.067051
1,2,Aisne,0.072294
2,3,Allier,0.066258
3,4,Alpes-de-Haute-Provence,0.065725
4,5,Hautes-Alpes,0.069074


In [26]:
# 2.3 Stock de logement par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
stock_dep_df = stock_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le stock de logement par d√©partement (nombre total de logements)
total_stock_per_department = stock_dep_df.groupby(['department_code', 'department_name'])['nb_tot_housing'].sum().reset_index()
total_stock_per_department.rename(columns={'nb_tot_housing': 'total_stock'}, inplace=True)
total_stock_per_department.head()

Unnamed: 0,department_code,department_name,total_stock
0,1,Ain,1781912
1,2,Aisne,1829844
2,3,Allier,1462151
3,4,Alpes-de-Haute-Provence,755894
4,5,Hautes-Alpes,763007


##### 1. TOURISM ELIOTT

In [14]:
# 1.1 cr√©ation de tables permettant de scorer le potentiel touristique de chaque d√©partement
site_df = site_df.merge (georef_df, on=["municipality_code"])
site_df.head(5)

Unnamed: 0,poi,name,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,1,Fortifications de Vauban (Patrimoine mondial),46.39616,1.4726,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine,Mouhet,MOUHET,municipality,46.389251,1.442651,36,200035137.0,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine,Vareilles,VAREILLES,municipality,46.305016,1.456031,23,242300135.0,Creuse
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.21989,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine,B√©ziers,BEZIERS,municipality,43.347588,3.230768,34,243400769.0,H√©rault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.12717,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine,H√©nin-Beaumont,HENIN BEAUMONT,municipality,50.409234,2.958997,62,246200299.0,Pas-de-Calais
4,1,"Les Causses et les C√©vennes, paysage culturel ...",44.34621,3.613406,48050,0.124981,"Les Causses et les C√©vennes, paysage culturel ...",Patrimoine mondial,Patrimoine,B√©dou√®s-Cocur√®s,BEDOUES COCURES,municipality,44.353946,3.61956,48,200069151.0,Loz√®re


In [15]:
# 1.2 s√©lection des colonnes dont on aura besoin pour le calcul
site_df_department = site_df[["poi", "name", "municipality_code", "importance", "name_reprocessed", "department_name"]]
site_df_department

Unnamed: 0,poi,name,municipality_code,importance,name_reprocessed,department_name
0,1,Fortifications de Vauban (Patrimoine mondial),36134,0.139527,Fortifications de Vauban,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Creuse
2,1,Canal du Midi (Patrimoine mondial),34032,0.129531,Canal du Midi,H√©rault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,62427,0.127170,Bassin minier du Nord-Pas de Calais,Pas-de-Calais
4,1,"Les Causses et les C√©vennes, paysage culturel ...",48050,0.124981,"Les Causses et les C√©vennes, paysage culturel ...",Loz√®re
...,...,...,...,...,...,...
31019,viewpoint,Tour Eiffel 3e √©tage (Point de vue),75056,0.053782,Tour Eiffel 3e √©tage,Paris
31020,theme_park,Jardin d'Acclimatation (Parc √† th√®me),75056,0.087097,Jardin d'Acclimatation,Paris
31021,theme_park,Foire du Tr√¥ne (Parc √† th√®me),75056,0.060000,Foire du Tr√¥ne,Paris
31022,golf_course,Golf du Bois de Boulogne (Terrain de golf),75056,0.060073,Golf du Bois de Boulogne,Paris


In [16]:
# 1.3 groupement par d√©partement, puis classement par le d√©partement ayant le + d'atouts touristiques
group_site = site_df_department.groupby("department_name")[["importance"]].sum()
group_site
group_site.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Gironde,52.539958
Bouches-du-Rh√¥ne,47.068588
Finist√®re,46.685274
Is√®re,44.230787
Loire-Atlantique,43.525109
...,...
Haute-Marne,7.416165
Val-de-Marne,6.699159
Loz√®re,6.230406
Seine-Saint-Denis,5.718311


In [17]:
# 1.4 m√™me calcul que pr√©c√©demment, mais pour la partie concernant les logements/lieux de vill√©giature
poi_df = poi_df.merge (georef_df, on=["municipality_code"])
poi_df.head(5)

Unnamed: 0,poi,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med,Les Mathes,MATHES,municipality,45.705988,-1.170867,17,241700640.0,Charente-Maritime
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands C√©pages,Sorgues,SORGUES,municipality,44.014576,4.867405,84,248400293.0,Vaucluse
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances,Fort-Mahon-Plage,FORT MAHON PLAGE,municipality,50.345059,1.577068,80,200070936.0,Somme
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances,Grimaud,GRIMAUD,municipality,43.282028,6.533032,83,200036077.0,Var
4,hotel,43.54105,3.752852,34095,0.077542,Le Domaine du Golf,Fabr√®gues,FABREGUES,municipality,43.534477,3.77193,34,243400017.0,H√©rault


In [18]:
# 1.5 s√©lection des colonnes dont on aura besoin pour le calcul
poi_df_department = poi_df[["poi", "municipality_code", "importance", "department_name"]]
poi_df_department

Unnamed: 0,poi,municipality_code,importance,department_name
0,hotel,17225,0.078556,Charente-Maritime
1,hotel,84129,0.078419,Vaucluse
2,hotel,80333,0.077999,Somme
3,hotel,83068,0.077702,Var
4,hotel,34095,0.077542,H√©rault
...,...,...,...,...
26202,camp_site,19164,0.040000,Corr√®ze
26203,camp_site,03238,0.040000,Allier
26204,camp_site,19241,0.040000,Corr√®ze
26205,camp_site,23131,0.040000,Creuse


In [19]:
# 1.6 groupement par d√©partement, puis classement par le d√©partement ayant le + de logements/lieux de vill√©giature
group_poi = poi_df_department.groupby("department_name")[["importance"]].sum()
group_poi
group_poi.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Paris,70.275305
Savoie,37.401407
Haute-Savoie,35.158395
H√©rault,33.793973
Alpes-Maritimes,32.802552
...,...
Eure-et-Loir,3.871754
Haute-Marne,3.670584
Ardennes,3.541133
Mayenne,3.154595


In [20]:
# 1.7 ajout des 2 calculs d'importance
department_merged_df = group_poi.merge (group_site, on=["department_name"])
department_merged_df["somme_importance"]=department_merged_df["importance_x"]+department_merged_df["importance_y"]
department_merged_df = department_merged_df.drop(columns=["importance_x", "importance_y"])
department_merged_df
department_merged_df.sort_values("somme_importance", ascending =False)

Unnamed: 0_level_0,somme_importance
department_name,Unnamed: 1_level_1
Paris,110.051777
Gironde,77.287832
Savoie,76.393945
Finist√®re,75.977072
Bouches-du-Rh√¥ne,71.561051
...,...
Aube,13.061510
Seine-Saint-Denis,11.649267
Haute-Marne,11.086749
Ardennes,11.006796


##### 2. REAL ESTATE ELIOTT

In [103]:
# 2.1 calcul du loyer au m2 m√©dian par municipality_code
rental_med = real_estate_df [["municipality_code", "rental_med_all"]]
rental_med

Unnamed: 0,municipality_code,rental_med_all
0,57133,9.53
1,57446,11.09
2,77013,12.26
3,77026,9.53
4,77072,11.47
...,...,...
34436,81126,8.93
34437,33425,11.09
34438,85146,10.88
34439,53062,8.69


In [104]:
# 2.2 calcul du prix d'achat au m2 m√©dian par municipality_code
sales_df
sales_df_grouped = sales_df.groupby(["municipality_code"])[["sales_amount", "surface", "premise_type"]].agg({"sales_amount": "sum", "surface": "sum", "premise_type": "count"})
sales_df_grouped = pd.DataFrame (sales_df_grouped)
sales_df_grouped

Unnamed: 0_level_0,sales_amount,surface,premise_type
municipality_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01001,1.362286e+07,6781.0,60
01002,4.067389e+06,1901.0,17
01004,1.852356e+08,87209.0,1062
01005,3.579244e+07,13876.0,138
01006,2.575955e+06,1425.0,13
...,...,...,...
97420,5.335366e+07,16160.0,188
97421,3.143598e+06,1872.0,24
97422,2.541138e+08,99088.0,1237
97423,1.326098e+07,3780.0,54


In [105]:
# 2.3 jointure pour rajouter dans cette table le loyer m√©dian par municipality_code
sales_df
real_estate_grouped = sales_df_grouped.merge(rental_med, on="municipality_code")
real_estate_grouped

Unnamed: 0,municipality_code,sales_amount,surface,premise_type,rental_med_all
0,01001,1.362286e+07,6781.0,60,10.66
1,01002,4.067389e+06,1901.0,17,10.16
2,01004,1.852356e+08,87209.0,1062,11.25
3,01005,3.579244e+07,13876.0,138,13.28
4,01006,2.575955e+06,1425.0,13,12.70
...,...,...,...,...,...
31892,95676,5.974304e+06,2486.0,23,12.29
31893,95678,1.222182e+07,3568.0,38,18.53
31894,95680,8.569815e+07,32626.0,471,16.22
31895,95682,1.463606e+06,533.0,5,16.57


In [106]:
# 2.4 ajout du nom du d√©partement correspondant √† chaque municipality code
real_estate_department = real_estate_grouped.merge(georef_df, on="municipality_code")
real_estate_department
real_estate_department = real_estate_department [["municipality_code", "sales_amount", "surface", "rental_med_all", "department_name", "premise_type"]]

In [107]:
# 2.5 calcul du prix au m2 par d√©partement
average_price_per_m2 = real_estate_department.groupby(["department_name"])[["sales_amount", "surface"]].agg({"sales_amount": "sum", "surface": "sum"})
average_price_per_m2
average_price_per_m2["average_price_per_m2"] = average_price_per_m2["sales_amount"]/average_price_per_m2["surface"]
average_price_per_m2

Unnamed: 0_level_0,sales_amount,surface,average_price_per_m2
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ain,9.607929e+09,3645188.0,2635.784095
Aisne,2.840598e+09,1819614.0,1561.099105
Allier,2.057614e+09,1304360.0,1577.489243
Alpes-Maritimes,3.837944e+10,7959808.0,4821.653974
Alpes-de-Haute-Provence,2.097409e+09,919312.0,2281.499046
...,...,...,...
Vend√©e,6.771054e+09,2819045.0,2401.896369
Vienne,2.024136e+09,1155284.0,1752.067837
Vosges,1.482116e+09,865201.0,1713.030737
Yonne,1.533758e+09,953987.0,1607.734995


In [None]:
# 2.6 calcul du loyer m√©dian par d√©partment
real_estate_department["intermediate_sum"]=real_estate_department["rental_med_all"]*real_estate_department["premise_type"]
real_estate_department
average_rental = real_estate_department.groupby(["department_name"])[["intermediate_sum", "premise_type"]].agg({"intermediate_sum": "sum", "premise_type": "sum"})
average_rental
average_rental["average_rental"]= average_rental["intermediate_sum"]/average_rental["premise_type"]
average_rental

In [None]:
# 2.7 regroupement des colonnes avec le loyer moyen au m2 par d√©partement et le prix d'achat au m2 moyen par d√©partement
yield_calculation = average_price_per_m2.merge(average_rental, on="department_name")
yield_calculation
yield_calculation = yield_calculation.drop(columns=["sales_amount", "surface", "intermediate_sum", "premise_type"])
yield_calculation["yield_rate"]=yield_calculation["average_rental"]*12/yield_calculation["average_price_per_m2"]*100
yield_calculation.sort_values("yield_rate", ascending=True)

In [None]:
# 2.8 Informations sur la rentabilit√© locative
yield_calculation

In [None]:
# 2.9 calcul de la variation entre 2018 et 2021

#ajout d'une colonne "year"
sales_df.info()
sales_df["year"]=sales_df["sales_date"].dt.year

In [None]:
# 2.10 merge pour rajouter le d√©partement
sales_info_per_department = sales_df.merge (georef_df, on=["municipality_code"])
sales_info_per_department

In [None]:
# 2.11 filtre uniquement sur les ann√©es 2020 et 2021 (car ce sont les seules ann√©es o√π nous avons toutes les informations)
sales_info_per_department = sales_info_per_department[sales_info_per_department['year'].isin([2020, 2021])]
sales_info_per_department

In [None]:
# 2.12 groupement par ann√©e et par d√©partement
sales_df_per_year = sales_info_per_department.groupby(["department_name", "year"])[["sales_amount", "surface"]].agg({"sales_amount": "sum", "surface": "sum"})
sales_df_per_year

In [None]:
# 2.13 calcul du prix moyen au m2
sales_df_per_year["average_price_m2"]=sales_df_per_year["sales_amount"]/sales_df_per_year["surface"]
sales_df_per_year
sales_df_per_year.head(50)

In [None]:
# 2.14 calcul de l'√©volution entre 2018 et 2021
sales_df_per_year['price_m2_growth'] = sales_df_per_year.groupby('department_name')['average_price_m2'].pct_change()
sales_df_per_year

In [None]:
# 2.15 calcul final de l'√©volution
sales_df_per_year = sales_df_per_year.dropna()
sales_df_per_year.drop (columns=["sales_amount", "surface"])
sales_df_per_year.sort_values ("price_m2_growth", ascending=False)

In [109]:
# 2.16 calcul du nb de maisons vacantes en 2019
stock_df_2018 = stock_df[stock_df['year'].isin([2018])]
stock_df_2018
stock_df_2018 = stock_df_2018.merge (georef_df, on=["municipality_code"])
vacants_housing_per_department = stock_df_2018.groupby("department_name")["nb_vacants_housing"].sum()
vacants_housing_per_department = pd.DataFrame(vacants_housing_per_department)
vacants_housing_per_department

Unnamed: 0_level_0,nb_vacants_housing
department_name,Unnamed: 1_level_1
Ain,25849
Aisne,26001
Allier,30479
Alpes-Maritimes,64643
Alpes-de-Haute-Provence,10768
...,...
Vend√©e,21818
Vienne,23264
Vosges,24154
Yonne,23303


In [None]:
stock_df_2018

In [108]:
# 2.17 taxe d'habitation sur les maisons secondaires par d√©partement

TAX_FILENAME = 'taxe_habitation.xlsx'

tax_df = pd.read_excel(os.path.join(DATA_PATH, TAX_FILENAME))
tax_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/cleaned/taxe_habitation.xlsx'

##### 3. SECONDARY HOME MIKE

In [78]:
# 3.1 Superficie moyenne des logements vendus par d√©partements
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
real_estate_sales_dep = sales_df.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer la surface moyenne des logements vendus par d√©partement
average_surface_per_department = real_estate_sales_dep.groupby(
    ['department_code', 'department_name']
)['surface'].mean().reset_index()

# Renommer la colonne r√©sultante
average_surface_per_department.rename(columns={'surface': 'avg_surface'}, inplace=True)
average_surface_per_department

Unnamed: 0,department_code,department_name,avg_surface
0,01,Ain,95.491503
1,02,Aisne,91.956053
2,03,Allier,85.308972
3,04,Alpes-de-Haute-Provence,73.283742
4,05,Hautes-Alpes,63.986778
...,...,...,...
92,95,Val-d'Oise,79.078816
93,971,Guadeloupe,76.678240
94,972,Martinique,74.256095
95,973,Guyane,78.392100


In [79]:
# Filtre sur un departement en particulier (exemple paris 75)
print(average_surface_per_department[average_surface_per_department['department_code'] == '75'])

   department_code department_name  avg_surface
72              75           Paris    53.748363


In [33]:
# 3.2 √âvolution du % des maisons secondaires par d√©partement

# Filtrer les donn√©es pour les ann√©es 2008 et 2018
housing_2008 = stock_df[stock_df['year'] == 2008]
housing_2018 = stock_df[stock_df['year'] == 2018]

# Renommer les colonnes pour les ann√©es sp√©cifiques
housing_2008 = housing_2008[['municipality_code', 'secondary_home_rate']].rename(columns={'secondary_home_rate': 'secondary_home_rate_2008'})
housing_2018 = housing_2018[['municipality_code', 'secondary_home_rate']].rename(columns={'secondary_home_rate': 'secondary_home_rate_2018'})

# Joindre les donn√©es pour les ann√©es 2008 et 2018 sur le code de municipalit√©
secondary_home_rate_comparison = housing_2008.merge(housing_2018, on='municipality_code')

# Remplacer les valeurs 0 de 2008 pour √©viter la division par z√©ro
secondary_home_rate_comparison = secondary_home_rate_comparison.replace({'secondary_home_rate_2008': {0: np.nan}})

# Calculer l'√©volution du pourcentage de maisons secondaires par municipalit√©
secondary_home_rate_comparison['secondary_home_rate_evolution'] = (
    (secondary_home_rate_comparison['secondary_home_rate_2018'] - secondary_home_rate_comparison['secondary_home_rate_2008']) / 
    secondary_home_rate_comparison['secondary_home_rate_2008']
) * 100

# Remplacer les valeurs infinies et NaN par 0
secondary_home_rate_comparison = secondary_home_rate_comparison.replace({'secondary_home_rate_evolution': {np.inf: np.nan, -np.inf: np.nan}})
secondary_home_rate_comparison['secondary_home_rate_evolution'] = secondary_home_rate_comparison['secondary_home_rate_evolution'].fillna(0)

# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
secondary_home_rate_comparison = secondary_home_rate_comparison.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer l'√©volution moyenne du pourcentage de maisons secondaires par d√©partement
secondary_home_rate_evolution_department = secondary_home_rate_comparison.groupby(['department_code', 'department_name'])['secondary_home_rate_evolution'].mean().reset_index()
secondary_home_rate_evolution_department.rename(columns={'secondary_home_rate_evolution': 'secondary_home_rate_evolution'}, inplace=True)

secondary_home_rate_evolution_department.head(10)

Unnamed: 0,department_code,department_name,secondary_home_rate_evolution
0,1,Ain,-8.064238
1,2,Aisne,-8.667438
2,3,Allier,-5.866098
3,4,Alpes-de-Haute-Provence,3.350143
4,5,Hautes-Alpes,-4.386948
5,6,Alpes-Maritimes,-1.166718
6,7,Ard√®che,-7.774843
7,8,Ardennes,4.684543
8,9,Ari√®ge,-0.522571
9,10,Aube,2.34934


In [34]:
# Calculer le taux moyen de maisons secondaires par d√©partement
secondary_home_rate_per_department = stock_dep_df.groupby('department_code')['secondary_home_rate'].mean().reset_index()
secondary_home_rate_per_department.rename(columns={'secondary_home_rate': 'avg_secondary_home_rate'}, inplace=True)

# Fusionner avec georef_df pour ajouter le nom des d√©partements
secondary_home_rate_per_department = secondary_home_rate_per_department.merge(
    georef_df[['department_code', 'department_name']].drop_duplicates(),
    on='department_code',
    how='left'
)

# Afficher les r√©sultats
secondary_home_rate_per_department.head()

Unnamed: 0,department_code,avg_secondary_home_rate,department_name
0,1,0.138884,Ain
1,2,0.102122,Aisne
2,3,0.132708,Allier
3,4,0.39763,Alpes-de-Haute-Provence
4,5,0.400697,Hautes-Alpes


##### 4. LIFE QUALITY MIKE

In [64]:
# 4.1 Professionnels de sant√© pour 100 000 habitants par d√©partements en 2023
DATA_PATH = '../data/cleaned'
HEALTH_FILENAME = 'health_df_cleaned.csv'

health_df = pd.read_csv(os.path.join(DATA_PATH, HEALTH_FILENAME))
health_df

Unnamed: 0,department_code,department_name,ensemble des m√©decins,ensemble des m√©decins.1,dont g√©n√©ralistes,dont sp√©cialistes,chirurg. dentistes,pharm.
0,01,Ain,1‚ÄØ162,174,99,75,53,78
1,02,Aisne,1‚ÄØ107,211,99,111,45,91
2,03,Allier,917,275,134,141,51,111
3,04,Alpes-de-Haute-Provence,483,291,165,125,54,103
4,05,Hautes-Alpes,705,503,291,213,72,135
...,...,...,...,...,...,...,...,...
96,971,Guadeloupe,1‚ÄØ209,319,150,169,62,120
97,972,Martinique,1‚ÄØ171,332,152,180,68,99
98,973,Guyane,717,242,123,120,30,46
99,974,La R√©union,3‚ÄØ170,364,176,189,67,94


In [52]:
# 4.2 Taux de criminalit√© pour 1000 habitants par d√©partements en 2020

CRIMINALITY_FILENAME = 'criminality_df_cleaned.csv'

criminality_df = pd.read_csv(os.path.join(DATA_PATH, CRIMINALITY_FILENAME))

# Convertir criminality_per_1000 en type num√©rique (si n√©cessaire)
criminality_df['criminality_per_1000'] = pd.to_numeric(criminality_df['criminality_per_1000'].str.replace(',', '.'))

# Agr√©ger georef_df par d√©partement_name pour obtenir une seule ligne par d√©partement
georef_aggregated = georef_df.groupby('department_name').first().reset_index()

# Effectuer une fusion (merge) pour ajouter department_code √† criminality_aggregated en utilisant department_name comme cl√©
criminality_aggregated = criminality_df.groupby('department_name')['criminality_per_1000'].mean().reset_index()
criminality_per_department = criminality_aggregated.merge(georef_aggregated[['department_name', 'department_code']], on='department_name')

# Afficher les premi√®res lignes du dataframe mis √† jour
criminality_per_department

Unnamed: 0,department_name,criminality_per_1000,department_code
0,Ain,35.00,01
1,Aisne,41.71,02
2,Allier,35.12,03
3,Alpes-Maritimes,55.66,06
4,Alpes-de-Haute-Provence,44.57,04
...,...,...,...
96,Vend√©e,30.51,85
97,Vienne,37.29,86
98,Vosges,34.07,88
99,Yonne,44.85,89


In [65]:
# 4.3 Nombre de jours de soleil par an par d√©partements - ATTENTION: IL MANQUE 10 DEPARTEMENT

SUNNY_FILENAME = 'sunny_df_cleaned.csv'

# Agr√©ger georef_df par department_name pour obtenir une seule ligne par d√©partement
georef_agg = georef_df.groupby('department_name').first().reset_index()

# Remplacer les espaces par des tirets dans georef_agg
georef_agg['department_name'] = georef_agg['department_name'].str.replace(' ', '-')

# Fusionner sunny_df avec georef_agg pour ajouter department_code
sunny_df_per_department = sunny_df.merge(georef_agg[['department_name', 'department_code']], on='department_name')

# Afficher le DataFrame mis √† jour
sunny_df_per_department

Unnamed: 0,department_name,sunny_days_per_year,department_code
0,Ain,182,01
1,Aisne,142,02
2,Allier,161,03
3,Alpes-de-Haute-Provence,241,04
4,Hautes-Alpes,232,05
...,...,...,...
86,Essonne,148,91
87,Hauts-de-Seine,107,92
88,Seine-Saint-Denis,126,93
89,Val-de-Marne,126,94


In [95]:
# MERGE DES 3 DF CI-DESSUS
life_quality_df = sunny_df_per_department.merge(criminality_per_department, on='department_name', how='inner')
life_quality_df = life_quality_df.merge(health_df, on='department_name', how='outer')

# Remplacer les NaN par des valeurs nulles
life_quality_df = life_quality_df.fillna(0)  # Vous pouvez remplacer 0 par d'autres valeurs par d√©faut si n√©cessaire

# Supprimer les colonnes redondantes department_code_x et department_code_y
life_quality_df.drop(columns=['department_code_x', 'department_code_y'], inplace=True)

# R√©organiser les colonnes pour mettre 'department_code' en deuxi√®me position
columns_ordered = ['department_name', 'department_code', 'sunny_days_per_year', 'criminality_per_1000',
                   'ensemble des m√©decins', 'ensemble des m√©decins.1', 'dont g√©n√©ralistes', 
                   'dont sp√©cialistes', 'chirurg. dentistes', 'pharm.']
life_quality_df = life_quality_df.reindex(columns=columns_ordered)

# Afficher le DataFrame final
life_quality_df.head()

Unnamed: 0,department_name,department_code,sunny_days_per_year,criminality_per_1000,ensemble des m√©decins,ensemble des m√©decins.1,dont g√©n√©ralistes,dont sp√©cialistes,chirurg. dentistes,pharm.
0,Ain,1,182.0,35.0,1‚ÄØ162,174,99,75,53,78
1,Aisne,2,142.0,41.71,1‚ÄØ107,211,99,111,45,91
2,Allier,3,161.0,35.12,917,275,134,141,51,111
3,Alpes-Maritimes,6,253.0,55.66,5‚ÄØ095,461,173,288,123,123
4,Alpes-de-Haute-Provence,4,241.0,44.57,483,291,165,125,54,103


In [99]:
# Tri du DataFrame life_quality_df par sunny_days_per_year croissant
life_quality_df_sorted = life_quality_df.sort_values(by='sunny_days_per_year')
life_quality_df_sorted.head(50)

Unnamed: 0,department_name,department_code,sunny_days_per_year,criminality_per_1000,ensemble des m√©decins,ensemble des m√©decins.1,dont g√©n√©ralistes,dont sp√©cialistes,chirurg. dentistes,pharm.
29,Eure-et-Loir,28,0.0,0.0,834,194,86,108,41,103
21,C√¥te-d'Or,21,0.0,0.0,2‚ÄØ239,418,164,254,56,136
20,Creuse,23,0.0,0.0,250,219,128,91,33,119
19,Corse-du-Sud,2A,0.0,0.0,570,350,161,189,83,116
53,La R√©union,974,0.0,0.0,3‚ÄØ170,364,176,189,67,94
37,Haute-Corse,2B,0.0,0.0,556,299,157,143,61,112
34,Guadeloupe,971,0.0,0.0,1‚ÄØ209,319,150,169,62,120
35,Guyane,973,0.0,0.0,717,242,123,120,30,46
67,Mayotte,976,0.0,0.0,265,89,49,39,9,31
91,Territoire de Belfort,90,0.0,0.0,457,331,143,189,57,110


# üöÄ ENRICHED EXPORT

In [80]:
# Chemin du dossier o√π les fichiers seront enregistr√©s
output_folder = "../data/enriched"

# Assurez-vous que le dossier existe
os.makedirs(output_folder, exist_ok=True)

In [None]:
### TOURISM (2 KPIS)
# Nombre de sites touristiques par d√©partements : num_sites_per_department
# R√©partition des cat√©gories touristiques par d√©partements :tourism_category_per_department


### REAL ESTATE & SECONDARY HOME (5 KPIS)
# Prix moyen du m2 par d√©partement : average_price_per_m2
# Stock de biens par d√©partements : total_stock_per_department
# Superficie moyenne des logements vendus par d√©partements :average_surface_per_department
# Taux de r√©partition des maisons secondaires par d√©partements : secondary_home_rate_per_department
# √âvolution du % des maisons secondaires par d√©partement (entre 2008 et 2018) : secondary_home_rate_evolution_department
# Nombre de maisons vacantes (en 2019) : vacants_housing_per_department


### LIFE QUALITY (4 KPIS)
# Salaire moyen par d√©partement : avg_salary_per_department
# Nombre de professionnels de sant√© pour 100 000 habitants par d√©partements (en 2023) : health_df
# Taux de criminalit√© pour 1000 habitants par d√©partements (en 2020) : criminality_per_department
# Nombre de jours de soleil par an par d√©partements : sunny_df_per_department
# Fusion de tous DF Life Quality  par d√©partements (POUR NORMALISATIN AU SCORING) : life_quality_df

In [110]:
# Liste des DataFrames et leurs noms
dataframes = {
    "num_sites_per_department": num_sites_per_department,
    "tourism_category_per_department": tourism_category_per_department,    
    "average_price_per_m2_per_department": average_price_per_m2,
    "total_stock_per_department": total_stock_per_department,    
    "average_surface_per_department": average_surface_per_department,
    "secondary_home_rate_per_department": secondary_home_rate_per_department,
    "secondary_home_rate_evolution_department": secondary_home_rate_evolution_department,
    "vacants_housing_per_department": vacants_housing_per_department,
    "avg_salary_per_department": avg_salary_per_department,
    "health_df_per_derpartment": health_df,
    "criminality_per_department": criminality_per_department,
    "sunny_df_per_department": sunny_df_per_department,
    "life_quality_df": life_quality_df
}

    # Exportation des DataFrames en CSV
for name, df in dataframes.items():
    output_path = os.path.join(output_folder, f"{name}_enriched.csv")
    df.to_csv(output_path, index=False)
    print(f"DataFrame {name} export√© vers {output_path}")

DataFrame num_sites_per_department export√© vers ../data/enriched/num_sites_per_department_enriched.csv
DataFrame tourism_category_per_department export√© vers ../data/enriched/tourism_category_per_department_enriched.csv
DataFrame average_price_per_m2_per_department export√© vers ../data/enriched/average_price_per_m2_per_department_enriched.csv
DataFrame total_stock_per_department export√© vers ../data/enriched/total_stock_per_department_enriched.csv
DataFrame average_surface_per_department export√© vers ../data/enriched/average_surface_per_department_enriched.csv
DataFrame secondary_home_rate_per_department export√© vers ../data/enriched/secondary_home_rate_per_department_enriched.csv
DataFrame secondary_home_rate_evolution_department export√© vers ../data/enriched/secondary_home_rate_evolution_department_enriched.csv
DataFrame vacants_housing_per_department export√© vers ../data/enriched/vacants_housing_per_department_enriched.csv
DataFrame avg_salary_per_department export√© vers ..

# SCORING