# ‚öôÔ∏è **CLEANED DATA IMPORT**

In [1]:
import pandas as pd
import numpy as np
import os

DATA_PATH = '../data/cleaned'

POI_FILENAME = 'poi_df_cleaned.csv'
SITE_FILENAME = 'site_df_cleaned.csv'
SALARY_FILENAME = 'salary_df_cleaned.csv'
GEOREF_FILENAME = 'georef_df_cleaned.csv'
STOCK_FILENAME = 'stock_df_cleaned.csv'
SALES_FILENAME = 'sales_df_cleaned.csv'
POPULATION_FILENAME = 'population_df_cleaned.csv'
POVERTY_FILENAME = 'poverty_df_cleaned.csv'
REAL_ESTATE_FILENAME = 'real_estate_df_cleaned.csv'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME))
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME))
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME))
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME))
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME))
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME))
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))

### CLEANING

##### DF_SALES CLEANING

In [2]:
# SALES_DF: Suppression des doublons > nous passons de 4,3M de lignes √† 3,821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

(3448398, 13)

In [3]:
# SALES_DF: Check si les doublons on √©t√© enlev√©s : OK
sales_df.duplicated().sum()

np.int64(0)

In [4]:
# SALES_DF: Suppression des prix au m2 sup√©rieur √† 30K‚Ç¨ et inf√©rieur √† 1K‚Ç¨ > nous passons √† 3,3399M de lignes
sales_df = sales_df[(sales_df['sales_price_m2'] <= 30000) & (sales_df['sales_price_m2'] >= 1000)]
sales_df.shape

(3448398, 13)

In [5]:
# SALES_DF:
s2 = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [6]:
# SALES_DF:
sales_df = sales_df[sales_df['sales_amount'] > 1] # on enl√®ve les 166 fois ou sales_amount = 1‚Ç¨
sales_df.shape

(3448398, 13)

In [7]:
# SALES_DF: changement du type sales_date en datetime
sales_df['sales_date'] = pd.to_datetime(sales_df['sales_date'])
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448398 entries, 0 to 3448397
Data columns (total 13 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   sales_date                 datetime64[ns]
 1   sales_amount               float64       
 2   street_number              float64       
 3   street_code                object        
 4   street_name                object        
 5   nom_commune                object        
 6   municipality_code          object        
 7   premise_type               object        
 8   surface                    float64       
 9   number_of_principal_rooms  int64         
 10  sales_price_m2             float64       
 11  latitude                   float64       
 12  longitude                  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(5)
memory usage: 342.0+ MB


##### DF_SALARY CLEANING

In [8]:
# DF_SALARY: ROUND avg_net_salary
salary_df['avg_net_salary'] = salary_df['avg_net_salary'].round()
salary_df.head()

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.0,2019
1,1007,26341.0,2019
2,1014,25897.0,2019
3,1024,25695.0,2019
4,1025,26055.0,2019


##### DF_REAL_ESTATE CLEANING

In [9]:
# DF_REAL_ESTATE: suppression des nulls
real_estate_df = real_estate_df.dropna(axis=1)
real_estate_df.isnull().sum()

municipality_code         0
intensite_tension_immo    0
rental_max_apartment      0
rental_min_apartment      0
rental_med_all            0
rental_max_all            0
rental_min_all            0
dtype: int64

##### DF_SITE CLEANING

In [10]:
# SITE_DF: tri avec les donn√©es entre parenth√®ses de la colonne "name" inclues

import re

site_df['data_inside_parenthesis'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

#suppression de la colonne "name" dans un second temps

site_df.drop(columns=["name"])

#check pour savoir les informations pr√©sentes dans la colonne "poi", et si elles correspondent aux valeurs pr√©sentes dans la colonne "type"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

#faire un mapping des colonnes poi, qui sont en fait plus pertinentes que celles de la colonne "type"

poi
castle            5611
2                 5207
water             4500
museum            3400
theatre           2439
forest            2233
beach             1010
protected_area     927
wetland            784
cinema             667
vineyard           641
golf_course        612
theme_park         358
cliff              337
marina             255
water_park         239
1                  236
zoo                234
valley             223
ridge              213
islet              163
casino             158
volcano            145
park               132
meadow              78
monument            76
sand                35
heritage            32
viewpoint           21
cave_entrance       15
wreck               12
rock                11
waterfall           10
attraction           8
dune                 5
national_park        4
allotments           2
geyser               1
Name: count, dtype: int64
data_inside_parenthesis
                       9899
Monument historique    5090
Eau             

In [None]:
# SITE_DF: cr√©ation d'un dictionnaire int√©grant toutes les diff√©rentes valeurs inclues dans la colonne "poi"
s = site_df["poi"].value_counts()[site_df["poi"]]
{k: "toto" for k in s.index}

In [11]:
# SITE_DF: cr√©ation d'un dictionnaire avec les cat√©gories associ√©es aux valeurs de la colonne POI

category_dict = {'1': 'Patrimoine',
 '2': 'Patrimoine',
 'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [12]:
# SITE_DF: cr√©ation de la colonne "cat√©gorie"
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category
0,1,Fortifications de Vauban (Patrimoine mondial),46.396160,1.472600,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.219890,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.127170,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine
4,1,"Les Causses et les C√©vennes, paysage culturel ...",44.346210,3.613406,48050,0.124981,"Les Causses et les C√©vennes, paysage culturel ...",Patrimoine mondial,Patrimoine
...,...,...,...,...,...,...,...,...,...
31029,viewpoint,Tour Eiffel 3e √©tage (Point de vue),48.858262,2.294497,75056,0.053782,Tour Eiffel 3e √©tage,Point de vue,Nature
31030,theme_park,Jardin d'Acclimatation (Parc √† th√®me),48.878145,2.264686,75056,0.087097,Jardin d'Acclimatation,Parc √† th√®me,Entertainment
31031,theme_park,Foire du Tr√¥ne (Parc √† th√®me),48.832003,2.404337,75056,0.060000,Foire du Tr√¥ne,Parc √† th√®me,Entertainment
31032,golf_course,Golf du Bois de Boulogne (Terrain de golf),48.857776,2.232651,75056,0.060073,Golf du Bois de Boulogne,Terrain de golf,Entertainment


In [13]:
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26216 entries, 0 to 26215
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                26216 non-null  object 
 1   latitude           26216 non-null  float64
 2   longitude          26216 non-null  float64
 3   municipality_code  26216 non-null  object 
 4   importance         26216 non-null  float64
 5   name_reprocessed   26216 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31034 entries, 0 to 31033
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   poi                      31034 non-null  object 
 1   name                     31034 non-null  object 
 2   latitude                 31034 non-null  float64
 3   longitude                31034 non-null  float64
 4   municipality_code  

In [None]:
poi_df.head(1)

In [None]:
site_df.head(1) 

In [None]:
salary_df.head(1)

In [None]:
georef_df.head(1) 

In [None]:
stock_df.head(1) 

In [None]:
sales_df.head(1)

In [None]:
population_df.head(1) 

In [None]:
poverty_df.head(1)

In [None]:
real_estate_df.head(1)

# üß™ **DATA TRANSFORMATION**

### KPIS AGGREGATION BY DEPARTMENT

##### 1. TOURISM

##### 2. REAL ESTATE

In [None]:
# 3.2 Tension immobili√®re par d√©partement
# Calculer la tension immobili√®re par d√©partement
housing_tension_per_department = real_estate_dep_df.groupby(['department_code', 'department_name'])['intensite_tension_immo'].mean().reset_index()
housing_tension_per_department.rename(columns={'intensite_tension_immo': 'avg_housing_tension'}, inplace=True)
housing_tension_per_department

In [None]:
# 3.3 Part de maisons secondaires par d√©partement
# Calculer la part de maisons secondaires par d√©partement
secondary_home_rate_per_department = stock_dep_df.groupby(['department_code', 'department_name'])['secondary_home_rate'].mean().reset_index()
secondary_home_rate_per_department.rename(columns={'secondary_home_rate': 'avg_secondary_home_rate'}, inplace=True)
secondary_home_rate_per_department

In [None]:
# 3.4 √âvolution du prix au m¬≤ par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
sales_dep_df = sales_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer l'√©volution du prix au m¬≤ par d√©partement
price_evolution = sales_dep_df.groupby(['department_code', 'department_name', 'sales_date'])['sales_price_m2'].mean().unstack().reset_index()
price_evolution['price_evolution'] = (price_evolution[price_evolution.columns[-1]] - price_evolution[price_evolution.columns[-2]]) / price_evolution[price_evolution.columns[-2]] * 100
price_evolution = price_evolution[['department_code', 'department_name', 'price_evolution']]
price_evolution.head()


In [None]:
# Calculer le prix moyen au m¬≤ des ventes immobili√®res par d√©partement
avg_price_per_m2_per_department = sales_dep_df.groupby(['department_code', 'department_name'])['sales_price_m2'].mean().reset_index()
avg_price_per_m2_per_department.rename(columns={'sales_price_m2': 'avg_sales_price_m2'}, inplace=True)
avg_price_per_m2_per_department.head()

Population
1.1 Salaire moyen par d√©partement
1.2 √âvolution de la population par d√©partement
1.3 Taux de pauvret√© par d√©partement

Tourisme
2.1 Nombre de sites touristiques par d√©partement
2.2 Importance moyenne des sites par d√©partement
2.3 Stock de logement par d√©partement

Immobilier
3.1 Rentabilit√© locative au m¬≤ par d√©partement
3.2 Tension immobili√®re par d√©partement
3.3 Part de maisons secondaires par d√©partement
3.4 √âvolution du prix au m¬≤ par d√©partement
3.5 Prix moyen au m¬≤ des ventes immobili√®res par d√©partement
creer moi un syst√®me de scoring (avec pond√©ration) pour avoir :

un score Population
un score Tourisme
un score Immobilier
puis un Score Global gr√¢ce aux 3 pr√©c√©dents scores

In [None]:
# Supposons que vous avez d√©j√† calcul√© les donn√©es n√©cessaires pour les scores Population, Tourisme et Immobilier

# 1. D√©finition des poids pour chaque sous-score dans chaque cat√©gorie
weights_population = {
    'average_salary': 0.4,
    'population_growth': 0.3,
    'poverty_rate': 0.3
}

weights_tourism = {
    'num_tourism_sites': 0.4,
    'average_importance': 0.3,
    'stock_housing': 0.3
}

weights_real_estate = {
    'rental_yield_per_m2': 0.2,
    'real_estate_tension': 0.2,
    'secondary_home_rate': 0.1,
    'price_growth': 0.2,
    'average_price_per_m2': 0.3
}

# 2. Calcul des scores pour chaque cat√©gorie

# POPULATION
# Supposons que vous avez d√©j√† les dataframes suivants : average_salary_by_department, population_evolution_by_department, average_poverty_rate_by_department

population_scores = (
    avg_salary_per_department['avg_salary'] * weights_population['average_salary'] +
    pop_evolution['evolution'] * weights_population['population_growth'] +
    (100 - average_poverty_rate_by_department['poverty_rate']) * weights_population['poverty_rate']
)

# TOURISM
# Supposons que vous avez d√©j√† les dataframes suivants : tourism_sites_by_department, average_importance_by_department, stock_housing_by_department

tourism_scores = (
    tourism_sites_by_department['num_tourism_sites'] * weights_tourism['num_tourism_sites'] +
    average_importance_by_department['importance'] * weights_tourism['average_importance'] +
    stock_housing_by_department['stock_housing'] * weights_tourism['stock_housing']
)

# REAL ESTATE
# Supposons que vous avez d√©j√† les dataframes suivants : rental_yield_per_m2_by_department, real_estate_tension_by_department, second_home_rate_by_department, price_growth_by_department, average_price_per_m2_by_department

real_estate_scores = (
    rental_yield_per_m2_by_department['rental_yield_per_m2'] * weights_real_estate['rental_yield_per_m2'] +
    (100 - real_estate_tension_by_department['intensite_tension_immo']) * weights_real_estate['real_estate_tension'] +
    (100 - second_home_rate_by_department['secondary_home_rate']) * weights_real_estate['secondary_home_rate'] +
    price_growth_by_department['price_growth'] * weights_real_estate['price_growth'] +
    average_price_per_m2_by_department['average_price_per_m2'] * weights_real_estate['average_price_per_m2']
)

# 3. Calcul du score global
# Supposons que les scores sont d√©j√† calcul√©s pour chaque cat√©gorie

global_score = (
    population_scores +
    tourism_scores +
    real_estate_scores
)

# Afficher ou utiliser les r√©sultats
print("Scores Population :\n", population_scores.head())
print("\nScores Tourisme :\n", tourism_scores.head())
print("\nScores Immobilier :\n", real_estate_scores.head())
print("\nScore Global :\n", global_score.head())


In [None]:
# POPULATION SCORE
from sklearn.preprocessing import MinMaxScaler

# Supposons que vous avez d√©j√† calcul√© les sous-scores pour la cat√©gorie Population : average_salary_by_department, population_evolution_by_department, average_poverty_rate_by_department

# Initialisation du MinMaxScaler
scaler_population = MinMaxScaler()

# Normalisation des donn√©es
population_scores_scaled = scaler_population.fit_transform(
    population_scores[['avg_net_salary', 'population_growth', 'poverty_rate']]
)

# Calcul des scores normalis√©s
population_scores_normalized = (
    population_scores_scaled[:, 0] * weights_population['average_salary'] +
    population_scores_scaled[:, 1] * weights_population['population_growth'] +
    population_scores_scaled[:, 2] * weights_population['poverty_rate']
)

# Afficher ou utiliser les r√©sultats des scores normalis√©s
print("Scores Population normalis√©s :\n", population_scores_normalized.head())


In [None]:
# TOURISM SCORE
# Supposons que vous avez d√©j√† calcul√© les sous-scores pour la cat√©gorie Tourisme : tourism_sites_by_department, average_importance_by_department, stock_housing_by_department

# Initialisation du MinMaxScaler
scaler_tourism = MinMaxScaler()

# Normalisation des donn√©es
tourism_scores_scaled = scaler_tourism.fit_transform(
    tourism_scores[['num_tourism_sites', 'average_importance', 'stock_housing']]
)

# Calcul des scores normalis√©s
tourism_scores_normalized = (
    tourism_scores_scaled[:, 0] * weights_tourism['num_tourism_sites'] +
    tourism_scores_scaled[:, 1] * weights_tourism['average_importance'] +
    tourism_scores_scaled[:, 2] * weights_tourism['stock_housing']
)

# Afficher ou utiliser les r√©sultats des scores normalis√©s
print("Scores Tourisme normalis√©s :\n", tourism_scores_normalized.head())


In [None]:
# 1 REAL ESTATE SCORE
# Supposons que vous avez d√©j√† calcul√© les sous-scores pour la cat√©gorie Immobilier : rental_yield_per_m2_by_department, real_estate_tension_by_department, second_home_rate_by_department, price_growth_by_department, average_price_per_m2_by_department

# Initialisation du MinMaxScaler
scaler_real_estate = MinMaxScaler()

# Normalisation des donn√©es
real_estate_scores_scaled = scaler_real_estate.fit_transform(
    real_estate_scores[['rental_yield_per_m2', 'real_estate_tension', 'secondary_home_rate', 'price_growth', 'average_price_per_m2']]
)

# Calcul des scores normalis√©s
real_estate_scores_normalized = (
    real_estate_scores_scaled[:, 0] * weights_real_estate['rental_yield_per_m2'] +
    real_estate_scores_scaled[:, 1] * weights_real_estate['real_estate_tension'] +
    real_estate_scores_scaled[:, 2] * weights_real_estate['secondary_home_rate'] +
    real_estate_scores_scaled[:, 3] * weights_real_estate['price_growth'] +
    real_estate_scores_scaled[:, 4] * weights_real_estate['average_price_per_m2']
)

# Afficher ou utiliser les r√©sultats des scores normalis√©s
print("Scores Immobilier normalis√©s :\n", real_estate_scores_normalized.head())


In [None]:
1 GLOBAL SCORE
# Supposons que vous avez d√©j√† les scores normalis√©s pour chaque cat√©gorie
# population_scores_normalized, tourism_scores_normalized, real_estate_scores_normalized

# D√©finition des poids pour chaque cat√©gorie
weights = {
    'population': 0.4,
    'tourism': 0.3,
    'real_estate': 0.3
}

# Calcul du score global pond√©r√©
global_score = (
    population_scores_normalized * weights['population'] +
    tourism_scores_normalized * weights['tourism'] +
    real_estate_scores_normalized * weights['real_estate']
)

# Afficher ou utiliser le score global
print("Score Global :\n", global_score.head())


In [None]:
# 3.1 Rentabilit√© locative au m¬≤ par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
real_estate_dep_df = real_estate_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer la rentabilit√© locative moyenne au m¬≤ par d√©partement
real_estate_dep_df['avg_rental_yield'] = (real_estate_dep_df['rental_max_all'] + real_estate_dep_df['rental_min_all']) / 2
rental_yield_per_department = real_estate_dep_df.groupby(['department_code', 'department_name'])['avg_rental_yield'].mean().reset_index()
rental_yield_per_department.rename(columns={'avg_rental_yield': 'avg_rental_yield'}, inplace=True)
rental_yield_per_department

##### 3. SECONDARY HOME

In [33]:
# 3.1 Taille moyenne des maisons vendues en superficie

# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
real_estate_sales_dep = sales_df.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer la surface moyenne des logements vendus par municipalit√©
average_surface_municipality = real_estate_sales_dep.groupby('municipality_code')['surface'].mean().reset_index()
average_surface_municipality.rename(columns={'surface': 'average_surface'}, inplace=True)

# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
average_surface_municipality = average_surface_municipality.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer la surface moyenne des logements vendus par d√©partement
average_surface_department = average_surface_municipality.groupby(['department_code', 'department_name'])['average_surface'].mean().reset_index()
average_surface_department.rename(columns={'average_surface': 'avg_house_surface'}, inplace=True)

average_surface_department.head()

Unnamed: 0,department_code,department_name,avg_house_surface
0,1,Ain,103.252328
1,2,Aisne,101.221864
2,3,Allier,95.459944
3,4,Alpes-de-Haute-Provence,84.560073
4,5,Hautes-Alpes,80.168163


In [35]:
# Filtre sur un departement en particulier (exemple paris 75)
print(average_surface_department[average_surface_department['department_code'] == '75'])

   department_code department_name  avg_house_surface
72              75           Paris           55.11202


In [50]:
# 3.2 √âvolution du % des maisons secondaires

# Filtrer les donn√©es pour les ann√©es 2008 et 2018
housing_2008 = stock_df[stock_df['year'] == 2008]
housing_2018 = stock_df[stock_df['year'] == 2018]

# Renommer les colonnes pour les ann√©es sp√©cifiques
housing_2008 = housing_2008[['municipality_code', 'secondary_home_rate']].rename(columns={'secondary_home_rate': 'secondary_home_rate_2008'})
housing_2018 = housing_2018[['municipality_code', 'secondary_home_rate']].rename(columns={'secondary_home_rate': 'secondary_home_rate_2018'})

# Joindre les donn√©es pour les ann√©es 2008 et 2018 sur le code de municipalit√©
secondary_home_rate_comparison = housing_2008.merge(housing_2018, on='municipality_code')

# Remplacer les valeurs 0 de 2008 pour √©viter la division par z√©ro
secondary_home_rate_comparison['secondary_home_rate_2008'].replace(0, np.nan, inplace=True)

# Calculer l'√©volution du pourcentage de maisons secondaires par municipalit√©
secondary_home_rate_comparison['secondary_home_rate_evolution'] = (
    (secondary_home_rate_comparison['secondary_home_rate_2018'] - secondary_home_rate_comparison['secondary_home_rate_2008']) / 
    secondary_home_rate_comparison['secondary_home_rate_2008']
) * 100

# Remplacer les valeurs infinies et NaN par 0
secondary_home_rate_comparison['secondary_home_rate_evolution'].replace([np.inf, -np.inf], np.nan, inplace=True)
secondary_home_rate_comparison['secondary_home_rate_evolution'].fillna(0, inplace=True)

# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
secondary_home_rate_comparison = secondary_home_rate_comparison.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer l'√©volution moyenne du pourcentage de maisons secondaires par d√©partement
secondary_home_rate_evolution_department = secondary_home_rate_comparison.groupby(['department_code', 'department_name'])['secondary_home_rate_evolution'].mean().reset_index()
secondary_home_rate_evolution_department.rename(columns={'secondary_home_rate_evolution': 'secondary_home_rate_evolution'}, inplace=True)

print(secondary_home_rate_evolution_department.head())

   department_code          department_name  secondary_home_rate_evolution
0               01                      Ain                            inf
1               02                    Aisne                            inf
2               03                   Allier                            inf
3               04  Alpes-de-Haute-Provence                       3.350143
4               05             Hautes-Alpes                      -4.386948
5               06          Alpes-Maritimes                      -1.166718
6               07                  Ard√®che                      -7.774843
7               08                 Ardennes                            inf
8               09                   Ari√®ge                            inf
9               10                     Aube                            inf
10              11                     Aude                            inf
11              12                  Aveyron                      -0.210740
12              13     

##### 4. LIFE QUALITY

##### POPULATION

In [None]:
# 1.1 Salaire moyen par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
salary_dep_df = salary_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le salaire moyen par d√©partement
avg_salary_per_department = salary_dep_df.groupby(['department_code', 'department_name'])['avg_net_salary'].mean().reset_index().round()
avg_salary_per_department.rename(columns={'avg_net_salary': 'avg_salary'}, inplace=True)
avg_salary_per_department.head()

In [None]:
# 1.2 √âvolution de la population par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
population_dep_df = population_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer l'√©volution de la population par d√©partement (diff√©rence entre les ann√©es)
pop_evolution = population_dep_df.groupby(['department_code', 'department_name', 'year'])['population'].sum().unstack().reset_index()
pop_evolution['evolution'] = (pop_evolution[pop_evolution.columns[-1]] - pop_evolution[pop_evolution.columns[-2]]) / pop_evolution[pop_evolution.columns[-2]] * 100
pop_evolution = pop_evolution[['department_code', 'department_name', 'evolution']]

pop_evolution.head()

In [None]:
# 1.3 Taux de pauvret√© par d√©partement
# Joindre les donn√©es de population
poverty_df = poverty_df.merge(population_df[['municipality_code', 'population']], on='municipality_code', suffixes=('_poverty', '_population'))

# Merge avec georef pour avoir le department_name
poverty_df = poverty_df.merge(georef_df[['municipality_code', 'department_name']], on='municipality_code', how='left')

# Groupe par department_name pour calculer le taux de pauvret√© par d√©partement
poverty_by_department = poverty_df.groupby('department_name').agg({
    'population_poverty': 'sum',
    'population_population': 'sum'
}).reset_index()

# Calculer le poverty_rate pour chaque d√©partement
poverty_by_department['poverty_rate'] = (poverty_by_department['population_poverty'] / poverty_by_department['population']) * 100

# Afficher le r√©sultat
poverty_by_department.head()

# calcul = poverty_rate = poverty_population / total_population * 100
# il faut ensuite le DF qui a poverty_rate √† georef pour r√©cup√©rer le department_name

In [None]:
# 2.1 Nombre de sites touristiques par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
site_dep_df = site_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le nombre de sites touristiques par d√©partement
num_sites_per_department = site_dep_df.groupby(['department_code', 'department_name'])['poi'].count().reset_index()
num_sites_per_department.rename(columns={'poi': 'nb_sites'}, inplace=True)
num_sites_per_department.head()

In [None]:
# 2.2 Importance moyenne des sites par d√©partement
# Calculer l'importance moyenne des sites touristiques par d√©partement
avg_site_importance_per_department = site_dep_df.groupby(['department_code', 'department_name'])['importance'].mean().reset_index()
avg_site_importance_per_department.rename(columns={'importance': 'avg_site_importance'}, inplace=True)
avg_site_importance_per_department.head()

In [None]:
# 2.3 Stock de logement par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
stock_dep_df = stock_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le stock de logement par d√©partement (nombre total de logements)
total_stock_per_department = stock_dep_df.groupby(['department_code', 'department_name'])['nb_tot_housing'].sum().reset_index()
total_stock_per_department.rename(columns={'nb_tot_housing': 'total_stock'}, inplace=True)
total_stock_per_department.head()

# üöÄ ENRICHED EXPORT