# ‚öôÔ∏è **CLEANED DATA IMPORT**

In [None]:
import pandas as pd
import numpy as np
import os

DATA_PATH = '../data/cleaned'

POI_FILENAME = 'poi_df_cleaned.csv'
SITE_FILENAME = 'site_df_cleaned.csv'
SALARY_FILENAME = 'salary_df_cleaned.csv'
GEOREF_FILENAME = 'georef_df_cleaned.csv'
STOCK_FILENAME = 'stock_df_cleaned.csv'
SALES_FILENAME = 'sales_df_cleaned.csv'
POPULATION_FILENAME = 'population_df_cleaned.csv'
POVERTY_FILENAME = 'poverty_df_cleaned.csv'
REAL_ESTATE_FILENAME = 'real_estate_df_cleaned.csv'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME))
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME))
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME))
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME))
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME))
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME))
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))

### CLEANING

##### DF_SALES CLEANING

In [None]:
# SALES_DF: Suppression des doublons > nous passons de 4,3M de lignes √† 3,821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

In [None]:
# SALES_DF: Check si les doublons on √©t√© enlev√©s : OK
sales_df.duplicated().sum()

In [None]:
# SALES_DF: Suppression des prix au m2 sup√©rieur √† 30K‚Ç¨ et inf√©rieur √† 1K‚Ç¨ > nous passons √† 3,3399M de lignes
sales_df = sales_df[(sales_df['sales_price_m2'] <= 30000) & (sales_df['sales_price_m2'] >= 1000)]
sales_df.shape

In [None]:
# SALES_DF:
s2 = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [None]:
# SALES_DF:
sales_df = sales_df[sales_df['sales_amount'] > 1] # on enl√®ve les 166 fois ou sales_amount = 1‚Ç¨
sales_df.shape

In [None]:
# SALES_DF: changement du type sales_date en datetime
sales_df['sales_date'] = pd.to_datetime(sales_df['sales_date'])
sales_df.info()

##### DF_SALARY CLEANING

In [None]:
# DF_SALARY: ROUND avg_net_salary
salary_df['avg_net_salary'] = salary_df['avg_net_salary'].round()
salary_df.head()

##### DF_REAL_ESTATE CLEANING

In [None]:
# DF_REAL_ESTATE: suppression des nulls
real_estate_df = real_estate_df.dropna(axis=1)
real_estate_df.isnull().sum()

##### DF_SITE CLEANING

In [None]:
# SITE_DF: tri avec les donn√©es entre parenth√®ses de la colonne "name" inclues

import re

site_df['data_inside_parenthesis'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

#suppression de la colonne "name" dans un second temps

site_df.drop(columns=["name"])

#check pour savoir les informations pr√©sentes dans la colonne "poi", et si elles correspondent aux valeurs pr√©sentes dans la colonne "type"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

#faire un mapping des colonnes poi, qui sont en fait plus pertinentes que celles de la colonne "type"

In [None]:
# SITE_DF: cr√©ation d'un dictionnaire int√©grant toutes les diff√©rentes valeurs inclues dans la colonne "poi"
s = site_df["poi"].value_counts()[site_df["poi"]]
{k: "toto" for k in s.index}

In [None]:
# SITE_DF: cr√©ation d'un dictionnaire avec les cat√©gories associ√©es aux valeurs de la colonne POI

category_dict = {'1': 'Patrimoine',
 '2': 'Patrimoine',
 'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [None]:
# SITE_DF: cr√©ation de la colonne "cat√©gorie"
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

In [None]:
# Fusionner les donn√©es des sites avec les informations de g√©olocalisation pour obtenir les d√©partements
site_with_dep = site_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer la r√©partition des cat√©gories par d√©partement
tourism_category_per_department = site_with_dep.groupby(['department_code', 'department_name', 'Category']).size().unstack(fill_value=0).reset_index()

# Afficher les premi√®res lignes de la r√©partition des cat√©gories par d√©partement
tourism_category_per_department.head()

In [None]:
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

In [None]:
poi_df.head(1)

In [None]:
site_df.head(1) 

In [None]:
salary_df.head(1)

In [None]:
georef_df.head(1) 

In [None]:
stock_df.head(1) 

In [None]:
sales_df.head(1)

In [None]:
population_df.head(1) 

In [None]:
poverty_df.head(1)

In [None]:
real_estate_df.head(1)

# üß™ **DATA TRANSFORMATION**

### KPIS AGGREGATION BY DEPARTMENT

##### 1. TOURISM MIKE (not used for the scoring)

In [None]:
# MIKE 1.1 Nombre de sites touristiques par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
site_dep_df = site_df.merge(georef_df[['municipality_code', 'department_code']], on='municipality_code')

# Calculer le nombre de sites touristiques par d√©partement
num_sites_per_department = site_dep_df.groupby('department_code')['poi'].count().reset_index()
num_sites_per_department.rename(columns={'poi': 'num_sites'}, inplace=True)
num_sites_per_department

In [None]:
# MIKE 1.2 Importance moyenne des sites par d√©partement
# Calculer l'importance moyenne des sites touristiques par d√©partement
avg_site_importance_per_department = site_dep_df.groupby('department_code')['importance'].mean().reset_index() 
avg_site_importance_per_department.rename(columns={'importance': 'avg_site_importance'}, inplace=True)
avg_site_importance_per_department

In [None]:
# MIKE 1.3 Stock de logement par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
stock_dep_df = stock_df.merge(georef_df[['municipality_code', 'department_code']], on='municipality_code')

# Calculer le stock de logement par d√©partement (nombre total de logements)
total_stock_per_department = stock_dep_df.groupby('department_code')['nb_tot_housing'].sum().reset_index()
total_stock_per_department.rename(columns={'nb_tot_housing': 'total_stock'}, inplace=True)
total_stock_per_department

In [None]:
# MIKE 1.4 √©partition des cat√©gories par d√©partement
# Fusionner les donn√©es des sites avec les informations de g√©olocalisation pour obtenir les d√©partements
site_with_dep = site_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer la r√©partition des cat√©gories par d√©partement
tourism_category_per_department = site_with_dep.groupby(['department_code', 'department_name', 'Category']).size().unstack(fill_value=0).reset_index()

# Afficher les premi√®res lignes de la r√©partition des cat√©gories par d√©partement
tourism_category_per_department.head()

##### 2. REAL ESTATE MIKE (not used for the scoring)

In [123]:
# 2.1 Rentabilit√© locative au m¬≤ par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
real_estate_dep_df = real_estate_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer la rentabilit√© locative moyenne au m¬≤ par d√©partement
real_estate_dep_df['avg_rental_yield'] = (real_estate_dep_df['rental_max_all'] + real_estate_dep_df['rental_min_all']) / 2
rental_yield_per_department = real_estate_dep_df.groupby(['department_code', 'department_name'])['avg_rental_yield'].mean().reset_index()
rental_yield_per_department.rename(columns={'avg_rental_yield': 'avg_rental_yield'}, inplace=True)
rental_yield_per_department

Unnamed: 0,department_code,department_name,avg_rental_yield
0,01,Ain,12.253130
1,02,Aisne,9.627683
2,03,Allier,9.244937
3,04,Alpes-de-Haute-Provence,11.311378
4,05,Hautes-Alpes,11.714241
...,...,...,...
91,91,Essonne,16.469145
92,92,Hauts-de-Seine,25.354306
93,93,Seine-Saint-Denis,21.065125
94,94,Val-de-Marne,22.158404


In [124]:
# 2.2 Tension immobili√®re par d√©partement
# Calculer la tension immobili√®re par d√©partement
housing_tension_per_department = real_estate_dep_df.groupby('department_code')['intensite_tension_immo'].mean().reset_index()
housing_tension_per_department.rename(columns={'intensite_tension_immo': 'avg_housing_tension'}, inplace=True)

In [125]:
# 2.3 Part de maisons secondaires par d√©partement
# Calculer la part de maisons secondaires par d√©partement
secondary_home_rate_per_department = stock_dep_df.groupby('department_code')['secondary_home_rate'].mean().reset_index()
secondary_home_rate_per_department.rename(columns={'secondary_home_rate': 'avg_secondary_home_rate'}, inplace=True)
secondary_home_rate_per_department

Unnamed: 0,department_code,avg_secondary_home_rate
0,01,0.138884
1,02,0.102122
2,03,0.132708
3,04,0.397630
4,05,0.400697
...,...,...
95,95,0.071613
96,971,0.076645
97,972,0.068388
98,973,0.062976


In [126]:
# 2.4 √âvolution du prix au m¬≤ par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
sales_dep_df = sales_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer l'√©volution du prix au m¬≤ par d√©partement
price_evolution = sales_dep_df.groupby(['department_code', 'department_name', 'sales_date'])['sales_price_m2'].mean().unstack().reset_index()
price_evolution['price_evolution'] = (price_evolution[price_evolution.columns[-1]] - price_evolution[price_evolution.columns[-2]]) / price_evolution[price_evolution.columns[-2]] * 100
price_evolution = price_evolution[['department_code', 'department_name', 'price_evolution']]
price_evolution.head()


sales_date,department_code,department_name,price_evolution
0,1,Ain,-26.355387
1,2,Aisne,-7.993162
2,3,Allier,-12.084001
3,4,Alpes-de-Haute-Provence,
4,5,Hautes-Alpes,19.86818


In [129]:
# Calculer le prix moyen au m¬≤ des ventes immobili√®res par d√©partement
avg_price_per_m2_per_department = sales_dep_df.groupby(['department_code', 'department_name'])['sales_price_m2'].mean().reset_index()
avg_price_per_m2_per_department.rename(columns={'sales_price_m2': 'avg_sales_price_m2'}, inplace=True)
avg_price_per_m2_per_department['avg_sales_price_m2'] = avg_price_per_m2_per_department['avg_sales_price_m2'].round()

avg_price_per_m2_per_department

Unnamed: 0,department_code,department_name,avg_sales_price_m2
0,01,Ain,2667.0
1,02,Aisne,1622.0
2,03,Allier,1624.0
3,04,Alpes-de-Haute-Provence,2314.0
4,05,Hautes-Alpes,2626.0
...,...,...,...
92,95,Val-d'Oise,3561.0
93,971,Guadeloupe,3597.0
94,972,Martinique,3618.0
95,973,Guyane,3868.0


##### POPULATION (not used for the scoring)

In [None]:
# 1.1 Salaire moyen par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
salary_dep_df = salary_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le salaire moyen par d√©partement
avg_salary_per_department = salary_dep_df.groupby(['department_code', 'department_name'])['avg_net_salary'].mean().reset_index().round()
avg_salary_per_department.rename(columns={'avg_net_salary': 'avg_salary'}, inplace=True)
avg_salary_per_department.head()

In [None]:
# 1.2 √âvolution de la population par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
population_dep_df = population_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer l'√©volution de la population par d√©partement (diff√©rence entre les ann√©es)
pop_evolution = population_dep_df.groupby(['department_code', 'department_name', 'year'])['population'].sum().unstack().reset_index()
pop_evolution['evolution'] = (pop_evolution[pop_evolution.columns[-1]] - pop_evolution[pop_evolution.columns[-2]]) / pop_evolution[pop_evolution.columns[-2]] * 100
pop_evolution = pop_evolution[['department_code', 'department_name', 'evolution']]

pop_evolution.head()

In [None]:
# 1.3 Taux de pauvret√© par d√©partement
# Joindre les donn√©es de population
# poverty_df = poverty_df.merge(population_df[['municipality_code', 'population']], on='municipality_code', suffixes=('_poverty', '_population'))

# Merge avec georef pour avoir le department_name
# poverty_df = poverty_df.merge(georef_df[['municipality_code', 'department_name']], on='municipality_code', how='left')

# Groupe par department_name pour calculer le taux de pauvret√© par d√©partement
# poverty_by_department = poverty_df.groupby('department_name').agg({
    #'population_poverty': 'sum',
    #'population_population': 'sum'
#}).reset_index()

# Calculer le poverty_rate pour chaque d√©partement
#poverty_by_department['poverty_rate'] = (poverty_by_department['population_poverty'] / poverty_by_department['population']) * 100

# Afficher le r√©sultat
#poverty_by_department.head()

# calcul = poverty_rate = poverty_population / total_population * 100
# il faut ensuite le DF qui a poverty_rate √† georef pour r√©cup√©rer le department_name

In [None]:
# 2.1 Nombre de sites touristiques par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
site_dep_df = site_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le nombre de sites touristiques par d√©partement
num_sites_per_department = site_dep_df.groupby(['department_code', 'department_name'])['poi'].count().reset_index()
num_sites_per_department.rename(columns={'poi': 'nb_sites'}, inplace=True)
num_sites_per_department.head()

In [None]:
# 2.2 Importance moyenne des sites par d√©partement
# Calculer l'importance moyenne des sites touristiques par d√©partement
avg_site_importance_per_department = site_dep_df.groupby(['department_code', 'department_name'])['importance'].mean().reset_index()
avg_site_importance_per_department.rename(columns={'importance': 'avg_site_importance'}, inplace=True)
avg_site_importance_per_department.head()

In [None]:
# 2.3 Stock de logement par d√©partement
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
stock_dep_df = stock_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le stock de logement par d√©partement (nombre total de logements)
total_stock_per_department = stock_dep_df.groupby(['department_code', 'department_name'])['nb_tot_housing'].sum().reset_index()
total_stock_per_department.rename(columns={'nb_tot_housing': 'total_stock'}, inplace=True)
total_stock_per_department.head()

##### 1. TOURISM ELIOTT

In [None]:
# 1.1 cr√©ation de tables permettant de scorer le potentiel touristique de chaque d√©partement
site_df = site_df.merge (georef_df, on=["municipality_code"])
site_df.head(5)

In [None]:
# 1.2 s√©lection des colonnes dont on aura besoin pour le calcul
site_df_department = site_df[["poi", "name", "municipality_code", "importance", "name_reprocessed", "department_name"]]
site_df_department

In [None]:
# 1.3 groupement par d√©partement, puis classement par le d√©partement ayant le + d'atouts touristiques
group_site = site_df_department.groupby("department_name")[["importance"]].sum()
group_site
group_site.sort_values("importance", ascending =False)

In [None]:
# 1.4 m√™me calcul que pr√©c√©demment, mais pour la partie concernant les logements/lieux de vill√©giature
poi_df = poi_df.merge (georef_df, on=["municipality_code"])
poi_df.head(5)

In [None]:
# 1.5 s√©lection des colonnes dont on aura besoin pour le calcul
poi_df_department = poi_df[["poi", "municipality_code", "importance", "department_name"]]
poi_df_department

In [None]:
# 1.6 groupement par d√©partement, puis classement par le d√©partement ayant le + de logements/lieux de vill√©giature
group_poi = poi_df_department.groupby("department_name")[["importance"]].sum()
group_poi
group_poi.sort_values("importance", ascending =False)

In [None]:
# 1.7 ajout des 2 calculs d'importance
department_merged_df = group_poi.merge (group_site, on=["department_name"])
department_merged_df["somme_importance"]=department_merged_df["importance_x"]+department_merged_df["importance_y"]
department_merged_df = department_merged_df.drop(columns=["importance_x", "importance_y"])
department_merged_df
department_merged_df.sort_values("somme_importance", ascending =False)

##### 2. REAL ESTATE ELIOTT

In [None]:
# 2.1 calcul du loyer au m2 m√©dian par municipality_code
rental_med = real_estate_df [["municipality_code", "rental_med_all"]]
rental_med

In [119]:
# 2.2 calcul du prix d'achat au m2 m√©dian par municipality_code
sales_df
sales_df_grouped = sales_df.groupby(["municipality_code"])[["sales_amount", "surface", "premise_type"]].agg({"sales_amount": "sum", "surface": "sum", "premise_type": "count"})
sales_df_grouped = pd.DataFrame (sales_df_grouped)
sales_df_grouped

Unnamed: 0_level_0,sales_amount,surface,premise_type
municipality_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01001,1.362286e+07,6781.0,60
01002,4.067389e+06,1901.0,17
01004,1.852356e+08,87209.0,1062
01005,3.579244e+07,13876.0,138
01006,2.575955e+06,1425.0,13
...,...,...,...
97420,5.335366e+07,16160.0,188
97421,3.143598e+06,1872.0,24
97422,2.541138e+08,99088.0,1237
97423,1.326098e+07,3780.0,54


In [120]:
# 2.3 jointure pour rajouter dans cette table le loyer m√©dian par municipality_code
sales_df
real_estate_grouped = sales_df_grouped.merge(rental_med, on="municipality_code")
real_estate_grouped

Unnamed: 0,municipality_code,sales_amount,surface,premise_type,rental_med_all
0,01001,1.362286e+07,6781.0,60,10.66
1,01002,4.067389e+06,1901.0,17,10.16
2,01004,1.852356e+08,87209.0,1062,11.25
3,01005,3.579244e+07,13876.0,138,13.28
4,01006,2.575955e+06,1425.0,13,12.70
...,...,...,...,...,...
31892,95676,5.974304e+06,2486.0,23,12.29
31893,95678,1.222182e+07,3568.0,38,18.53
31894,95680,8.569815e+07,32626.0,471,16.22
31895,95682,1.463606e+06,533.0,5,16.57


In [116]:
# 2.4 ajout du nom du d√©partement correspondant √† chaque municipality code
real_estate_department = real_estate_grouped.merge(georef_df, on="municipality_code")
real_estate_department
real_estate_department = real_estate_department [["municipality_code", "sales_amount", "surface", "rental_med_all", "department_name", "premise_type"]]
real_estate_department

Unnamed: 0,municipality_code,sales_amount,surface,rental_med_all,department_name,premise_type
0,01001,1.362286e+07,6781.0,10.66,Ain,60
1,01002,4.067389e+06,1901.0,10.16,Ain,17
2,01004,1.852356e+08,87209.0,11.25,Ain,1062
3,01005,3.579244e+07,13876.0,13.28,Ain,138
4,01006,2.575955e+06,1425.0,12.70,Ain,13
...,...,...,...,...,...,...
31892,95676,5.974304e+06,2486.0,12.29,Val-d'Oise,23
31893,95678,1.222182e+07,3568.0,18.53,Val-d'Oise,38
31894,95680,8.569815e+07,32626.0,16.22,Val-d'Oise,471
31895,95682,1.463606e+06,533.0,16.57,Val-d'Oise,5


In [121]:
# 2.5 calcul du prix au m2 par d√©partement
average_price_per_m2 = real_estate_department.groupby(["department_name"])[["sales_amount", "surface"]].agg({"sales_amount": "sum", "surface": "sum"})
average_price_per_m2["average_price_per_m2"] = average_price_per_m2["sales_amount"]/average_price_per_m2["surface"]

# Arrondir la colonne average_price_per_m2
average_price_per_m2["average_price_per_m2"] = average_price_per_m2["average_price_per_m2"].round()

# R√©initialiser l'index pour que department_name devienne une colonne
average_price_per_m2.reset_index(inplace=True)
average_price_per_m2

Unnamed: 0,department_name,sales_amount,surface,average_price_per_m2
0,Ain,9.607929e+09,3645188.0,2636.0
1,Aisne,2.840598e+09,1819614.0,1561.0
2,Allier,2.057614e+09,1304360.0,1577.0
3,Alpes-Maritimes,3.837944e+10,7959808.0,4822.0
4,Alpes-de-Haute-Provence,2.097409e+09,919312.0,2281.0
...,...,...,...,...
88,Vend√©e,6.771054e+09,2819045.0,2402.0
89,Vienne,2.024136e+09,1155284.0,1752.0
90,Vosges,1.482116e+09,865201.0,1713.0
91,Yonne,1.533758e+09,953987.0,1608.0


In [None]:
paris_price_per_m2 = average_price_per_m2[average_price_per_m2["department_name"] == "Paris"]
paris_price_per_m2

In [None]:
# 2.6 calcul du loyer m√©dian par d√©partment
real_estate_department["intermediate_sum"]=real_estate_department["rental_med_all"]*real_estate_department["premise_type"]
real_estate_department
average_rental = real_estate_department.groupby(["department_name"])[["intermediate_sum", "premise_type"]].agg({"intermediate_sum": "sum", "premise_type": "sum"})
average_rental
average_rental["average_rental"]= average_rental["intermediate_sum"]/average_rental["premise_type"]
average_rental

In [None]:
# 2.7 regroupement des colonnes avec le loyer moyen au m2 par d√©partement et le prix d'achat au m2 moyen par d√©partement
yield_calculation = average_price_per_m2.merge(average_rental, on="department_name")
yield_calculation
yield_calculation = yield_calculation.drop(columns=["sales_amount", "surface", "intermediate_sum", "premise_type"])
yield_calculation["yield_rate"]=yield_calculation["average_rental"]*12/yield_calculation["average_price_per_m2"]*100
yield_calculation.sort_values("yield_rate", ascending=True)

In [None]:
# 2.8 Informations sur la rentabilit√© locative
yield_calculation

In [None]:
# 2.9 calcul de la variation entre 2018 et 2021

#ajout d'une colonne "year"
sales_df.info()
sales_df["year"]=sales_df["sales_date"].dt.year

In [None]:
# 2.10 merge pour rajouter le d√©partement
sales_info_per_department = sales_df.merge (georef_df, on=["municipality_code"])
sales_info_per_department

In [None]:
# 2.11 filtre uniquement sur les ann√©es 2020 et 2021 (car ce sont les seules ann√©es o√π nous avons toutes les informations)
sales_info_per_department = sales_info_per_department[sales_info_per_department['year'].isin([2020, 2021])]
sales_info_per_department

In [None]:
# 2.12 groupement par ann√©e et par d√©partement
sales_df_per_year = sales_info_per_department.groupby(["department_name", "year"])[["sales_amount", "surface"]].agg({"sales_amount": "sum", "surface": "sum"})
sales_df_per_year

In [None]:
# 2.13 calcul du prix moyen au m2
sales_df_per_year["average_price_m2"]=sales_df_per_year["sales_amount"]/sales_df_per_year["surface"]
sales_df_per_year
sales_df_per_year.head(50)

In [None]:
# 2.14 calcul de l'√©volution entre 2018 et 2021
sales_df_per_year['price_m2_growth'] = sales_df_per_year.groupby('department_name')['average_price_m2'].pct_change()
sales_df_per_year

In [None]:
# 2.15 calcul final de l'√©volution
sales_df_per_year = sales_df_per_year.dropna()
sales_df_per_year.drop (columns=["sales_amount", "surface"])
sales_df_per_year.sort_values ("price_m2_growth", ascending=False)

In [None]:
# 2.16 calcul du nb de maisons vacantes en 2019
stock_df_2018 = stock_df[stock_df['year'].isin([2018])]
stock_df_2018
stock_df_2018 = stock_df_2018.merge (georef_df, on=["municipality_code"])
vacants_housing_per_department = stock_df_2018.groupby("department_name")["nb_vacants_housing"].sum()
vacants_housing_per_department = pd.DataFrame(vacants_housing_per_department)
# R√©initialiser l'index pour que department_name devienne une colonne
vacants_housing_per_department.reset_index(inplace=True)
vacants_housing_per_department

In [None]:
stock_df_2018

In [None]:
# 2.17 taxe d'habitation sur les maisons secondaires par d√©partement

TAX_FILENAME = 'taxe_habitation.xlsx'
tax_df = pd.read_excel(os.path.join(DATA_PATH, TAX_FILENAME))

# Renommer les colonnes
tax_df.rename(columns={
    'R√âGIONS': 'department_name',
    'Taxe d\'habitation moyenne en 2023': 'taxe_habitation_2023',
    'Nombre d\'avis d\'imp√¥t': 'nb_avis_impot'
}, inplace=True)

tax_df.head()

##### 3. SECONDARY HOME MIKE

In [118]:
# 3.1 Superficie moyenne des logements vendus par d√©partements
# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
real_estate_sales_dep = sales_df.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer la surface moyenne des logements vendus par d√©partement
average_surface_per_department = real_estate_sales_dep.groupby(
    ['department_code', 'department_name']
)['surface'].mean().reset_index()

# Renommer la colonne r√©sultante
average_surface_per_department.rename(columns={'surface': 'avg_surface'}, inplace=True)
average_surface_per_department

Unnamed: 0,department_code,department_name,avg_surface
0,01,Ain,95.491503
1,02,Aisne,91.956053
2,03,Allier,85.308972
3,04,Alpes-de-Haute-Provence,73.283742
4,05,Hautes-Alpes,63.986778
...,...,...,...
92,95,Val-d'Oise,79.078816
93,971,Guadeloupe,76.678240
94,972,Martinique,74.256095
95,973,Guyane,78.392100


In [None]:
# Filtre sur un departement en particulier (exemple paris 75)
print(average_surface_per_department[average_surface_per_department['department_code'] == '75'])

In [None]:
# 3.2 √âvolution du % des maisons secondaires par d√©partement

# Filtrer les donn√©es pour les ann√©es 2008 et 2018
housing_2008 = stock_df[stock_df['year'] == 2008]
housing_2018 = stock_df[stock_df['year'] == 2018]

# Renommer les colonnes pour les ann√©es sp√©cifiques
housing_2008 = housing_2008[['municipality_code', 'secondary_home_rate']].rename(columns={'secondary_home_rate': 'secondary_home_rate_2008'})
housing_2018 = housing_2018[['municipality_code', 'secondary_home_rate']].rename(columns={'secondary_home_rate': 'secondary_home_rate_2018'})

# Joindre les donn√©es pour les ann√©es 2008 et 2018 sur le code de municipalit√©
secondary_home_rate_comparison = housing_2008.merge(housing_2018, on='municipality_code')

# Remplacer les valeurs 0 de 2008 pour √©viter la division par z√©ro
secondary_home_rate_comparison = secondary_home_rate_comparison.replace({'secondary_home_rate_2008': {0: np.nan}})

# Calculer l'√©volution du pourcentage de maisons secondaires par municipalit√©
secondary_home_rate_comparison['secondary_home_rate_evolution'] = (
    (secondary_home_rate_comparison['secondary_home_rate_2018'] - secondary_home_rate_comparison['secondary_home_rate_2008']) / 
    secondary_home_rate_comparison['secondary_home_rate_2008']
) * 100

# Remplacer les valeurs infinies et NaN par 0
secondary_home_rate_comparison = secondary_home_rate_comparison.replace({'secondary_home_rate_evolution': {np.inf: np.nan, -np.inf: np.nan}})
secondary_home_rate_comparison['secondary_home_rate_evolution'] = secondary_home_rate_comparison['secondary_home_rate_evolution'].fillna(0)

# Joindre les informations de g√©olocalisation pour obtenir les d√©partements
secondary_home_rate_comparison = secondary_home_rate_comparison.merge(
    georef_df[['municipality_code', 'department_code', 'department_name']],
    on='municipality_code'
)

# Calculer l'√©volution moyenne du pourcentage de maisons secondaires par d√©partement
secondary_home_rate_evolution_department = secondary_home_rate_comparison.groupby(['department_code', 'department_name'])['secondary_home_rate_evolution'].mean().reset_index()
secondary_home_rate_evolution_department.rename(columns={'secondary_home_rate_evolution': 'secondary_home_rate_evolution'}, inplace=True)

secondary_home_rate_evolution_department.head(10)

In [None]:
# Calculer le taux moyen de maisons secondaires par d√©partement
secondary_home_rate_per_department = stock_dep_df.groupby('department_code')['secondary_home_rate'].mean().reset_index()
secondary_home_rate_per_department.rename(columns={'secondary_home_rate': 'avg_secondary_home_rate'}, inplace=True)

# Fusionner avec georef_df pour ajouter le nom des d√©partements
secondary_home_rate_per_department = secondary_home_rate_per_department.merge(
    georef_df[['department_code', 'department_name']].drop_duplicates(),
    on='department_code',
    how='left'
)

# Afficher les r√©sultats
secondary_home_rate_per_department.head()

##### 4. LIFE QUALITY MIKE

In [None]:
# 4.1 Professionnels de sant√© pour 100 000 habitants par d√©partements en 2023
DATA_PATH = '../data/cleaned'
HEALTH_FILENAME = 'health_df_cleaned.csv'

health_df = pd.read_csv(os.path.join(DATA_PATH, HEALTH_FILENAME))
health_df

In [None]:
# 4.2 Taux de criminalit√© pour 1000 habitants par d√©partements en 2020

CRIMINALITY_FILENAME = 'criminality_df_cleaned.csv'

criminality_df = pd.read_csv(os.path.join(DATA_PATH, CRIMINALITY_FILENAME))

# Convertir criminality_per_1000 en type num√©rique (si n√©cessaire)
criminality_df['criminality_per_1000'] = pd.to_numeric(criminality_df['criminality_per_1000'].str.replace(',', '.'))

# Agr√©ger georef_df par d√©partement_name pour obtenir une seule ligne par d√©partement
georef_aggregated = georef_df.groupby('department_name').first().reset_index()

# Effectuer une fusion (merge) pour ajouter department_code √† criminality_aggregated en utilisant department_name comme cl√©
criminality_aggregated = criminality_df.groupby('department_name')['criminality_per_1000'].mean().reset_index()
criminality_per_department = criminality_aggregated.merge(georef_aggregated[['department_name', 'department_code']], on='department_name')

# Afficher les premi√®res lignes du dataframe mis √† jour
criminality_per_department

In [None]:
# 4.3 Nombre de jours de soleil par an par d√©partements - ATTENTION: IL MANQUE 10 DEPARTEMENT

SUNNY_FILENAME = 'sunny_df_cleaned.csv'
sunny_df = pd.read_csv(os.path.join(DATA_PATH, SUNNY_FILENAME))

# Agr√©ger georef_df par department_name pour obtenir une seule ligne par d√©partement
georef_agg = georef_df.groupby('department_name').first().reset_index()

# Remplacer les espaces par des tirets dans georef_agg
georef_agg['department_name'] = georef_agg['department_name'].str.replace(' ', '-')

# Fusionner sunny_df avec georef_agg pour ajouter department_code
sunny_df_per_department = sunny_df.merge(georef_agg[['department_name', 'department_code']], on='department_name')

# Afficher le DataFrame mis √† jour
sunny_df_per_department

In [None]:
# MERGE DES 3 DF CI-DESSUS
life_quality_df = sunny_df_per_department.merge(criminality_per_department, on='department_name', how='inner')
life_quality_df = life_quality_df.merge(health_df, on='department_name', how='outer')


# Remplacer les NaN par des valeurs nulles
life_quality_df = life_quality_df.fillna(0)  # Vous pouvez remplacer 0 par d'autres valeurs par d√©faut si n√©cessaire

# Supprimer les colonnes redondantes department_code_x et department_code_y
life_quality_df.drop(columns=['department_code_x', 'department_code_y'], inplace=True)

# R√©organiser les colonnes pour mettre 'department_code' en deuxi√®me position
columns_ordered = ['department_name', 'department_code', 'sunny_days_per_year', 'criminality_per_1000',
                   'ensemble des m√©decins', 'ensemble des m√©decins.1', 'dont g√©n√©ralistes', 
                   'dont sp√©cialistes', 'chirurg. dentistes', 'pharm.']
life_quality_df = life_quality_df.reindex(columns=columns_ordered)

# Afficher le DataFrame final
life_quality_df.head()

In [None]:
# Tri du DataFrame life_quality_df par sunny_days_per_year croissant
life_quality_df_sorted = life_quality_df.sort_values(by='sunny_days_per_year')
life_quality_df_sorted.head(50)

# üöÄ ENRICHED EXPORT

In [None]:
# Chemin du dossier o√π les fichiers seront enregistr√©s
output_folder = "../data/enriched"

# Assurez-vous que le dossier existe
os.makedirs(output_folder, exist_ok=True)

In [None]:
###____TOURISM (2 KPIS)____###
# Nombre de sites touristiques par d√©partements : num_sites_per_department
# R√©partition des cat√©gories touristiques par d√©partements :tourism_category_per_department


###____REAL ESTATE & SECONDARY HOME (5 KPIS)____###
# Prix moyen du m2 par d√©partement : average_price_per_m2
# Stock de biens par d√©partements : total_stock_per_department
# Superficie moyenne des logements vendus par d√©partements :average_surface_per_department
# Taux de r√©partition des maisons secondaires par d√©partements : secondary_home_rate_per_department
# √âvolution du % des maisons secondaires par d√©partement (entre 2008 et 2018) : secondary_home_rate_evolution_department
# Nombre de maisons vacantes (en 2019) : vacants_housing_per_department
# Taxe d'habitation (valeur et nombre) en 2023 par d√©partement : tax_df


###___LIFE QUALITY (4 KPIS)____###
# Salaire moyen par d√©partement : avg_salary_per_department
# Nombre de professionnels de sant√© pour 100 000 habitants par d√©partements (en 2023) : health_df
# Taux de criminalit√© pour 1000 habitants par d√©partements (en 2020) : criminality_per_department
# Nombre de jours de soleil par an par d√©partements : sunny_df_per_department
# Fusion de tous DF Life Quality  par d√©partements (POUR NORMALISATIN AU SCORING) : life_quality_df

In [130]:
# Liste des DataFrames et leurs noms
dataframes = {
    "num_sites_per_department": num_sites_per_department,
    "tourism_category_per_department": tourism_category_per_department,    
    "average_price_per_m2_per_department": avg_price_per_m2_per_department,
    "total_stock_per_department": total_stock_per_department,    
    "average_surface_per_department": average_surface_per_department,
    "secondary_home_rate_per_department": secondary_home_rate_per_department,
    "secondary_home_rate_evolution_department": secondary_home_rate_evolution_department,
    "vacants_housing_per_department": vacants_housing_per_department,
    "avg_salary_per_department": avg_salary_per_department,
    "health_df_per_derpartment": health_df,
    "criminality_per_department": criminality_per_department,
    "sunny_df_per_department": sunny_df_per_department,
    "life_quality_df": life_quality_df
    "taxe_habitation_per_department": tax_df,
}

# Exportation des DataFrames en CSV
for name, df in dataframes.items():
    output_path = os.path.join(output_folder, f"{name}_enriched.csv")
    df.to_csv(output_path, index=False)
    print(f"DataFrame {name} export√© vers {output_path}")

DataFrame average_price_per_m2_per_department export√© vers ../data/enriched/average_price_per_m2_per_department_enriched.csv


# SCORING

##### TOURISM