# ⚙️ **DATA IMPORT**

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import geopandas as gpd
import geoviews as gv
import numpy as np


DATA_PATH = '../data'

POI_FILENAME = 'POI_tourist_establishments.csv'
SITE_FILENAME = 'POI_touristic_sites_by_municipality.csv'
SALARY_FILENAME = 'average_salary_by_municipality.csv'
GEOREF_FILENAME = 'geographical_referential.csv'
STOCK_FILENAME = 'housing_stock.csv'
SALES_FILENAME = 'notary_real_estate_sales.csv'
POPULATION_FILENAME = 'population_by_municipality.csv'
POVERTY_FILENAME = 'poverty_population_by_municipality.csv'
REAL_ESTATE_FILENAME = 'real_estate_info_by_municipality.csv'
FRANCE_DEPARTMENT = 'departements-version-simplifiee.geojson'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME),usecols=lambda column: column != 'name')
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME),usecols=lambda column: column != 'country_code')
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME),usecols=lambda column: column != 'country_code')
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME),usecols=lambda column: column not in ['int64_field_0', 'country_code'])
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME),usecols=lambda column: column != 'country_code')
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME),usecols=lambda column: column != 'country_code')
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))
file_path_json = os.path.join(DATA_PATH, FRANCE_DEPARTMENT)

In [3]:
#checker si on trouve bien le fichier json
file_path_json = os.path.join(DATA_PATH, FRANCE_DEPARTMENT)

if not os.path.exists(file_path_json):
    raise FileNotFoundError(f"File not found: {file_path_json}")
else:
    print(f"File found: {file_path_json}")

File found: ../data\departements-version-simplifiee.geojson


# 🔭 **DATA EXPLORATION**




#### DF POI

In [3]:
poi_df.head() #DATA CLEAN
# importance : poids interne pour évaluer l'importance

Unnamed: 0,poi,latitude,longitude,municipality_code,importance,name_reprocessed
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands Cépages
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances
4,hotel,43.54105,3.752852,34095,0.077542,Le Domaine du Golf


In [4]:
poi_df

Unnamed: 0,poi,latitude,longitude,municipality_code,importance,name_reprocessed
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands Cépages
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances
4,hotel,43.541050,3.752852,34095,0.077542,Le Domaine du Golf
...,...,...,...,...,...,...
26211,camp_site,45.701000,2.041247,19164,0.040000,Camping municipal
26212,camp_site,46.491559,3.020672,03238,0.040000,Camping municipal
26213,camp_site,45.694389,2.120544,19241,0.040000,Camping municipal
26214,camp_site,45.915488,2.487122,23131,0.040000,Camping municipal


In [5]:
# 0 DOUBLONS !
poi_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [6]:
poi_df.info()
poi_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26216 entries, 0 to 26215
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                26216 non-null  object 
 1   latitude           26216 non-null  float64
 2   longitude          26216 non-null  float64
 3   municipality_code  26216 non-null  object 
 4   importance         26216 non-null  float64
 5   name_reprocessed   26216 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.2+ MB


poi                  0
latitude             0
longitude            0
municipality_code    0
importance           0
name_reprocessed     0
dtype: int64

#### DF SITE

In [7]:
site_df.head() # DATA CLEAN
# importance : poids interne pour évaluer l'importance

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed
0,1,Fortifications de Vauban (Patrimoine mondial),46.39616,1.4726,36134,0.139527,Fortifications de Vauban
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.21989,34032,0.129531,Canal du Midi
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.12717,Bassin minier du Nord-Pas de Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.34621,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ..."


In [8]:
# 0 DOUBLONS !
site_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [9]:
import re

site_df['type'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed,type
0,1,Fortifications de Vauban (Patrimoine mondial),46.396160,1.472600,36134,0.139527,Fortifications de Vauban,Patrimoine mondial
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.219890,34032,0.129531,Canal du Midi,Patrimoine mondial
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.127170,Bassin minier du Nord-Pas de Calais,Patrimoine mondial
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.346210,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Patrimoine mondial
...,...,...,...,...,...,...,...,...
31029,viewpoint,Tour Eiffel 3e étage (Point de vue),48.858262,2.294497,75056,0.053782,Tour Eiffel 3e étage,Point de vue
31030,theme_park,Jardin d'Acclimatation (Parc à thème),48.878145,2.264686,75056,0.087097,Jardin d'Acclimatation,Parc à thème
31031,theme_park,Foire du Trône (Parc à thème),48.832003,2.404337,75056,0.060000,Foire du Trône,Parc à thème
31032,golf_course,Golf du Bois de Boulogne (Terrain de golf),48.857776,2.232651,75056,0.060073,Golf du Bois de Boulogne,Terrain de golf


In [10]:
site_df.info()
site_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31034 entries, 0 to 31033
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                31034 non-null  object 
 1   name               31034 non-null  object 
 2   latitude           31034 non-null  float64
 3   longitude          31034 non-null  float64
 4   municipality_code  31034 non-null  object 
 5   importance         31034 non-null  float64
 6   name_reprocessed   31034 non-null  object 
 7   type               31034 non-null  object 
dtypes: float64(3), object(5)
memory usage: 1.9+ MB


poi                  0
name                 0
latitude             0
longitude            0
municipality_code    0
importance           0
name_reprocessed     0
type                 0
dtype: int64

#### DF SALARY

In [11]:
salary_df.head(
) # DATA CLEAN

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.026727,2019
1,1007,26341.353419,2019
2,1014,25897.497842,2019
3,1024,25695.240341,2019
4,1025,26054.712323,2019


In [12]:
# 0 DOUBLONS !
salary_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [13]:
salary_df.info()
salary_df.isnull().sum()
print (salary_df["country_code"].nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26675 entries, 0 to 26674
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   municipality_code  26675 non-null  object 
 1   avg_net_salary     26675 non-null  float64
 2   year               26675 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 625.3+ KB


KeyError: 'country_code'

#### DF GEO REF

In [None]:
georef_df.head()

In [None]:
# 0 DOUBLONS !
georef_df.duplicated().sum()#.drop_duplicates()

In [None]:
georef_df.info()
georef_df.isnull().sum()

#### DF STOCK

In [None]:
stock_df.head() #il va falloir drop la colonne int64_field_0

In [None]:
# 0 DOUBLONS !
stock_df.duplicated().sum()#.drop_duplicates()

In [None]:
stock_df.info() # supprimer la colonne int64_field-0
stock_df.isnull().sum()

In [None]:
stock_df

#### DF SALES

In [None]:
sales_df.head()

In [None]:
fig = px.histogram(sales_df, x="sales_price_m2", nbins=20, title="Distribution de sales_price_m2")
fig.show()

In [None]:
# 510 211 DOUBLONS !
sales_df.duplicated().sum()#.drop_duplicates()

In [None]:
sales_df.info(), #il manque des latitude et longitude
sales_df.isnull().sum()

#### DF POPULATION

In [None]:
population_df.head()

In [None]:
# ??? DOUBLONS !
population_df.duplicated().sum

In [None]:
population_df.info() #colonne YEAR en format INT64
population_df.isnull().sum()

#### DF POVERTY

In [None]:
poverty_df.head()

In [None]:
# 0 DOUBLONS !
poverty_df.duplicated().sum()

In [None]:
poverty_df.info() # YEAR est en type INT64 et non DATE
poverty_df.isnull().sum()

#### DF REAL ESTATE

In [None]:
real_estate_df.head()

In [None]:
# 0 DOUBLONS !
real_estate_df.duplicated().sum()

In [None]:
real_estate_df.info()
real_estate_df.isnull().sum()

# 🧪 **DATA TRANSFORMATION**

#### CLEANING

In [None]:
#tri avec les données entre parenthèses de la colonne "name" inclues

import re

site_df['data_inside_parenthesis'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

#suppression de la colonne "name" dans un second temps

site_df.drop(columns=["name"])

#check pour savoir les informations présentes dans la colonne "poi", et si elles correspondent aux valeurs présentes dans la colonne "type"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

#faire un mapping des colonnes poi, qui sont en fait plus pertinentes que celles de la colonne "type"










In [None]:
#création d'un dictionnaire intégrant toutes les différentes valeurs inclues dans la colonne "poi"
s = site_df["poi"].value_counts()[site_df["poi"]]
{k: "toto" for k in s.index}

In [None]:
#création d'un dictionnaire avec les catégories associées aux valeurs de la colonne POI

category_dict = {'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [None]:
#création de la colonne "catégorie"
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

In [None]:
{'Monument historique': 'patrimoine',
 'Eau': 'aquatique',
 'Musée': 'culture',
 'Forêt': 'toto',
 'Théâtre': 'toto',
 'Château': 'toto',
 'Zone protégée': 'toto',
 'Zone humide': 'toto',
 'Vignoble': 'toto',
 'Terrain de golf': 'toto',
 'Cinéma': 'toto',
 'Parc à thème': 'toto',
 'Falaise': 'toto',
 'Patrimoine mondial': 'toto',
 'Parc aquatique': 'toto',
 'Port de plaisance': 'toto',
 'Plage': 'toto',
 'Vallée': 'toto',
 'Zoo': 'toto',
 'Îlot': 'toto',
 'Volcan': 'toto',
 'Monument': 'toto',
 'Crête': 'toto',
 'Bocage': 'toto',
 'Parc': 'toto',
 'Sable': 'toto',
 'ancien': 'toto',
 'Point de vue': 'toto',
 'Casino': 'toto',
 'ancienne': 'toto',
 'Entrée de grotte': 'toto'}

#### AGGREGATION

In [5]:
#création de tables permettant de scorer le potentiel touristique de chaque département
site_df = site_df.merge (georef_df, on=["municipality_code"])
site_df.head(5)

Unnamed: 0,poi,name,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,1,Fortifications de Vauban (Patrimoine mondial),46.39616,1.4726,36134,0.139527,Fortifications de Vauban,Mouhet,MOUHET,municipality,46.389251,1.442651,36,200035137.0,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Vareilles,VAREILLES,municipality,46.305016,1.456031,23,242300135.0,Creuse
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.21989,34032,0.129531,Canal du Midi,Béziers,BEZIERS,municipality,43.347588,3.230768,34,243400769.0,Hérault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.12717,Bassin minier du Nord-Pas de Calais,Hénin-Beaumont,HENIN BEAUMONT,municipality,50.409234,2.958997,62,246200299.0,Pas-de-Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.34621,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Bédouès-Cocurès,BEDOUES COCURES,municipality,44.353946,3.61956,48,200069151.0,Lozère


In [6]:
#sélection des colonnes dont on aura besoin pour le calcul
site_df_department = site_df[["poi", "name", "municipality_code", "importance", "name_reprocessed", "department_name"]]
site_df_department

Unnamed: 0,poi,name,municipality_code,importance,name_reprocessed,department_name
0,1,Fortifications de Vauban (Patrimoine mondial),36134,0.139527,Fortifications de Vauban,Indre
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Creuse
2,1,Canal du Midi (Patrimoine mondial),34032,0.129531,Canal du Midi,Hérault
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,62427,0.127170,Bassin minier du Nord-Pas de Calais,Pas-de-Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Lozère
...,...,...,...,...,...,...
31019,viewpoint,Tour Eiffel 3e étage (Point de vue),75056,0.053782,Tour Eiffel 3e étage,Paris
31020,theme_park,Jardin d'Acclimatation (Parc à thème),75056,0.087097,Jardin d'Acclimatation,Paris
31021,theme_park,Foire du Trône (Parc à thème),75056,0.060000,Foire du Trône,Paris
31022,golf_course,Golf du Bois de Boulogne (Terrain de golf),75056,0.060073,Golf du Bois de Boulogne,Paris


In [7]:
#groupement par département, puis classement par le département ayant le + d'atouts touristiques
group_site = site_df_department.groupby("department_name")[["importance"]].sum()
group_site
group_site.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Gironde,52.539958
Bouches-du-Rhône,47.068588
Finistère,46.685274
Isère,44.230787
Loire-Atlantique,43.525109
...,...
Haute-Marne,7.416165
Val-de-Marne,6.699159
Lozère,6.230406
Seine-Saint-Denis,5.718311


In [8]:
#même calcul que précédemment, mais pour la partie concernant les logements/lieux de villégiature
poi_df = poi_df.merge (georef_df, on=["municipality_code"])
poi_df.head(5)

Unnamed: 0,poi,latitude_x,longitude_x,municipality_code,importance,name_reprocessed,city_name,city_name_normalized,municipality_type,latitude_y,longitude_y,department_code,epci_code,department_name
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med,Les Mathes,MATHES,municipality,45.705988,-1.170867,17,241700640.0,Charente-Maritime
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands Cépages,Sorgues,SORGUES,municipality,44.014576,4.867405,84,248400293.0,Vaucluse
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances,Fort-Mahon-Plage,FORT MAHON PLAGE,municipality,50.345059,1.577068,80,200070936.0,Somme
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances,Grimaud,GRIMAUD,municipality,43.282028,6.533032,83,200036077.0,Var
4,hotel,43.54105,3.752852,34095,0.077542,Le Domaine du Golf,Fabrègues,FABREGUES,municipality,43.534477,3.77193,34,243400017.0,Hérault


In [9]:
#sélection des colonnes dont on aura besoin pour le calcul
poi_df_department = poi_df[["poi", "municipality_code", "importance", "department_name"]]
poi_df_department

Unnamed: 0,poi,municipality_code,importance,department_name
0,hotel,17225,0.078556,Charente-Maritime
1,hotel,84129,0.078419,Vaucluse
2,hotel,80333,0.077999,Somme
3,hotel,83068,0.077702,Var
4,hotel,34095,0.077542,Hérault
...,...,...,...,...
26202,camp_site,19164,0.040000,Corrèze
26203,camp_site,03238,0.040000,Allier
26204,camp_site,19241,0.040000,Corrèze
26205,camp_site,23131,0.040000,Creuse


In [10]:
#groupement par département, puis classement par le département ayant le + de logements/lieux de villégiature
group_poi = poi_df_department.groupby("department_name")[["importance"]].sum()
group_poi
group_poi.sort_values("importance", ascending =False)

Unnamed: 0_level_0,importance
department_name,Unnamed: 1_level_1
Paris,70.275305
Savoie,37.401407
Haute-Savoie,35.158395
Hérault,33.793973
Alpes-Maritimes,32.802552
...,...
Eure-et-Loir,3.871754
Haute-Marne,3.670584
Ardennes,3.541133
Mayenne,3.154595


In [11]:
#ajout des 2 calculs d'importance
department_merged_df = group_poi.merge (group_site, on=["department_name"])
department_merged_df["somme_importance"]=department_merged_df["importance_x"]+department_merged_df["importance_y"]
department_merged_df = department_merged_df.drop(columns=["importance_x", "importance_y"])
department_merged_df
department_merged_df.sort_values("somme_importance", ascending =False)

Unnamed: 0_level_0,somme_importance
department_name,Unnamed: 1_level_1
Paris,110.051777
Gironde,77.287832
Savoie,76.393945
Finistère,75.977072
Bouches-du-Rhône,71.561051
...,...
Aube,13.061510
Seine-Saint-Denis,11.649267
Haute-Marne,11.086749
Ardennes,11.006796


In [12]:
#cartographie
sf = gpd.read_file(file_path_json)
sf.head()

from geoviews import dim
import hvplot.pandas

# Supposons que 'department_name' dans GeoDataFrame est 'nom'
sf = sf.rename(columns={'nom': 'department_name'})
# Fusionner les deux dataframes sur 'department_name'
merged_dataframe_json = sf.merge(department_merged_df, on='department_name', how='left')

deps = gv.Polygons(merged_dataframe_json)
deps.opts(width=600, height=600, toolbar='above', color=dim('somme_importance'),
          colorbar=True, tools=['hover'], aspect='equal')



:Polygons   [Longitude,Latitude]   (code,department_name,somme_importance)

In [22]:
import bokeh
gv.extension ("bokeh")
deps



:Polygons   [Longitude,Latitude]   (code,department_name,somme_importance)

In [16]:
#test version numpy et bokeh
print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)
print("geopandas version:", gpd.__version__)
print("geoviews version:", gv.__version__)
print("bokeh version:", bokeh.__version__)

numpy version: 2.0.0
pandas version: 2.2.2
geopandas version: 0.14.4
geoviews version: 1.12.0
bokeh version: 3.4.1


In [17]:
pip install --upgrade numpy pandas geopandas geoviews bokeh


Note: you may need to restart the kernel to use updated packages.


SyntaxError: invalid syntax (3209003490.py, line 1)

# 📈 **DATA VIZUALISATION**