# ⚙️ **DATA IMPORT**

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import os
from sklearn.preprocessing import MinMaxScaler
import plotly.io as pio
pio.renderers.default = 'iframe'

DATA_PATH = '../data'

POI_FILENAME = 'POI_tourist_establishments.csv'
SITE_FILENAME = 'POI_touristic_sites_by_municipality.csv'
SALARY_FILENAME = 'average_salary_by_municipality.csv'
GEOREF_FILENAME = 'geographical_referential.csv'
STOCK_FILENAME = 'housing_stock.csv'
SALES_FILENAME = 'notary_real_estate_sales.csv'
POPULATION_FILENAME = 'population_by_municipality.csv'
POVERTY_FILENAME = 'poverty_population_by_municipality.csv'
REAL_ESTATE_FILENAME = 'real_estate_info_by_municipality.csv'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME),usecols=lambda column: column != 'name')
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME),usecols=lambda column: column != 'country_code')
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME),usecols=lambda column: column != 'country_code')
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME),usecols=lambda column: column not in ['int64_field_0', 'country_code'])
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME),usecols=lambda column: column != 'country_code')
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME),usecols=lambda column: column != 'country_code')
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))

# 🔭 **DATA EXPLORATION**




#### DF POI

In [2]:
poi_df.head() #DATA CLEAN
# importance : poids interne pour évaluer l'importance

Unnamed: 0,poi,latitude,longitude,municipality_code,importance,name_reprocessed
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands Cépages
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances
4,hotel,43.54105,3.752852,34095,0.077542,Le Domaine du Golf


In [3]:
# Création de la carte avec Plotly Express
fig = px.scatter_mapbox(poi_df, lat='latitude', lon='longitude', hover_name='name_reprocessed',
                        color='poi', size='importance', zoom=5, height=990)

# Personnalisation du titre et du style de la carte
fig.update_layout(
    title='Répartition des points d\'intérêt',
    mapbox_style='open-street-map'  # Vous pouvez choisir parmi d'autres styles de carte, par exemple 'carto-positron'
)

# Réduction de la taille des points
fig.update_traces(marker=dict(size=4))

# Affichage du graphique
fig.show()

In [4]:
poi_count = poi_df['poi'].value_counts()

# Création du diagramme circulaire avec Plotly Express
fig = px.pie(names=poi_count.index, values=poi_count.values, title='Répartition des points d\'intérêt (POI)')

# Affichage du graphique
fig.show()

In [5]:
# 0 DOUBLONS !
poi_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [6]:
poi_df.info()
poi_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26216 entries, 0 to 26215
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                26216 non-null  object 
 1   latitude           26216 non-null  float64
 2   longitude          26216 non-null  float64
 3   municipality_code  26216 non-null  object 
 4   importance         26216 non-null  float64
 5   name_reprocessed   26216 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.2+ MB


poi                  0
latitude             0
longitude            0
municipality_code    0
importance           0
name_reprocessed     0
dtype: int64

#### DF SITE

In [7]:
site_df.head() # DATA CLEAN
# importance : poids interne pour évaluer l'importance

Unnamed: 0,poi,latitude,longitude,municipality_code,importance,name_reprocessed
0,1,46.39616,1.4726,36134,0.139527,Fortifications de Vauban
1,1,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...
2,1,43.332709,3.21989,34032,0.129531,Canal du Midi
3,1,50.455895,2.965034,62427,0.12717,Bassin minier du Nord-Pas de Calais
4,1,44.34621,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ..."


In [8]:
# Création de la carte avec Plotly Express
fig = px.scatter_mapbox(site_df, lat='latitude', lon='longitude', hover_name='name_reprocessed',
                        color='poi', size='importance', zoom=5, height=990)

# Personnalisation du titre et du style de la carte
fig.update_layout(
    title='Répartition des site touristiques',
    mapbox_style='open-street-map'  # Vous pouvez choisir parmi d'autres styles de carte, par exemple 'carto-positron'
)

# Réduction de la taille des points
fig.update_traces(marker=dict(size=4))

# Affichage du graphique
fig.show()

In [9]:
poi_count = site_df['poi'].value_counts()

# Création du diagramme circulaire avec Plotly Express
fig = px.pie(names=poi_count.index, values=poi_count.values, title='Répartition des site touristiques')

# Affichage du graphique
fig.show()

In [10]:
# 0 DOUBLONS !
site_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [11]:
site_df.info()
site_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31034 entries, 0 to 31033
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                31034 non-null  object 
 1   latitude           31034 non-null  float64
 2   longitude          31034 non-null  float64
 3   municipality_code  31034 non-null  object 
 4   importance         31034 non-null  float64
 5   name_reprocessed   31034 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.4+ MB


poi                  0
latitude             0
longitude            0
municipality_code    0
importance           0
name_reprocessed     0
dtype: int64

#### DF SALARY

In [12]:
salary_df.head() # DATA CLEAN

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.026727,2019
1,1007,26341.353419,2019
2,1014,25897.497842,2019
3,1024,25695.240341,2019
4,1025,26054.712323,2019


In [13]:
salary_df["year"].unique()

array([2019, 2018, 2017, 2016, 2015])

In [14]:
# 0 DOUBLONS !
salary_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [15]:
salary_df.info()
salary_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26675 entries, 0 to 26674
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   municipality_code  26675 non-null  object 
 1   avg_net_salary     26675 non-null  float64
 2   year               26675 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 625.3+ KB


municipality_code    0
avg_net_salary       0
year                 0
dtype: int64

#### DF GEO REF

In [16]:
georef_df.head()

Unnamed: 0,municipality_code,city_name,city_name_normalized,municipality_type,latitude,longitude,department_code,epci_code,department_name
0,1005,Ambérieux-en-Dombes,AMBERIEUX EN DOMBES,municipality,45.99618,4.912273,1,200042497.0,Ain
1,1021,Ars-sur-Formans,ARS SUR FORMANS,municipality,45.993461,4.821996,1,200042497.0,Ain
2,1030,Beauregard,BEAUREGARD,municipality,46.000858,4.756007,1,200042497.0,Ain
3,1105,Civrieux,CIVRIEUX,municipality,45.922467,4.886338,1,200042497.0,Ain
4,1157,Fareins,FAREINS,municipality,46.020998,4.762008,1,200042497.0,Ain


In [17]:
georef_df["municipality_type"].unique()

array(['municipality', 'arrondissement'], dtype=object)

In [18]:
# 0 DOUBLONS !
georef_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [19]:
georef_df.info()
georef_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34997 entries, 0 to 34996
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   municipality_code     34997 non-null  object 
 1   city_name             34997 non-null  object 
 2   city_name_normalized  34997 non-null  object 
 3   municipality_type     34997 non-null  object 
 4   latitude              34997 non-null  float64
 5   longitude             34997 non-null  float64
 6   department_code       34997 non-null  object 
 7   epci_code             34945 non-null  float64
 8   department_name       34997 non-null  object 
dtypes: float64(3), object(6)
memory usage: 2.4+ MB


municipality_code        0
city_name                0
city_name_normalized     0
municipality_type        0
latitude                 0
longitude                0
department_code          0
epci_code               52
department_name          0
dtype: int64

#### DF STOCK

In [20]:
stock_df.head() #drop de int64_field_0

Unnamed: 0,municipality_code,year,nb_principal_home,nb_second_home,nb_vacants_housing,nb_tot_housing,secondary_home_rate,principal_home_rate,vacants_housing_rate
0,1339,1968,109,155,0,264,0.587121,0.412879,0.0
1,2368,1968,132,56,0,188,0.297872,0.702128,0.0
2,4073,1968,134,93,0,227,0.409692,0.590308,0.0
3,4148,1968,42,93,0,135,0.688889,0.311111,0.0
4,5012,1968,70,53,0,123,0.430894,0.569106,0.0


In [21]:
stock_df["year"].unique()

array([1968, 1975, 1982, 1990, 1999, 2008, 2013, 2018])

In [22]:
# 0 DOUBLONS !
stock_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [23]:
stock_df.info() # supprimer la colonne int64_field-0
stock_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279584 entries, 0 to 279583
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   municipality_code     279584 non-null  object 
 1   year                  279584 non-null  int64  
 2   nb_principal_home     279584 non-null  int64  
 3   nb_second_home        279584 non-null  int64  
 4   nb_vacants_housing    279584 non-null  int64  
 5   nb_tot_housing        279584 non-null  int64  
 6   secondary_home_rate   279584 non-null  float64
 7   principal_home_rate   279584 non-null  float64
 8   vacants_housing_rate  279584 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 19.2+ MB


municipality_code       0
year                    0
nb_principal_home       0
nb_second_home          0
nb_vacants_housing      0
nb_tot_housing          0
secondary_home_rate     0
principal_home_rate     0
vacants_housing_rate    0
dtype: int64

#### DF SALES

In [24]:
sales_df.head()

Unnamed: 0,sales_date,sales_amount,street_number,street_code,street_name,nom_commune,municipality_code,premise_type,surface,number_of_principal_rooms,sales_price_m2,latitude,longitude
0,2018-02-06,5000000.0,63.0,0040,AV PRINCE RAINIER III,Saint-Jean-Cap-Ferrat,6121,Maison,292.0,10,17123.0,43.678892,7.330651
1,2018-11-26,93060.0,5308.0,B061,LE SERRET,Vallées-d'Antraigues-Asperjoc,7011,Maison,16.0,0,5816.0,44.714072,4.360185
2,2018-06-08,95000.0,161.0,0683,IMP COL EMILE VIGUIER,Millau,12145,Maison,21.0,0,4524.0,44.093714,3.054594
3,2018-07-17,3912000.0,690.0,1868,CHE DE MAZARGUES,Aix-en-Provence,13001,Maison,610.0,13,6413.0,43.502833,5.428194
4,2018-03-27,810000.0,5000.0,0162,CHE DES OLIVIERS,Eygalières,13034,Maison,296.0,9,2736.0,43.756881,4.957214


In [25]:
# 510 211 DOUBLONS !
sales_df.duplicated().sum()#.drop_duplicates()

np.int64(510211)

In [26]:
s = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [27]:
s.index = s.index.astype(int)
s.loc[(s.index % 10) != 0]

sales_amount
10096218     339
220623264    324
18761004     316
19090412     295
1            277
            ... 
96375         11
299999        11
200151        11
377475        11
182243        11
Name: count, Length: 534, dtype: int64

In [28]:
s.plot

<pandas.plotting._core.PlotAccessor object at 0x1305ca5a0>

In [29]:
sales_df['sales_amount'].value_counts().loc[sales_df['sales_amount'].value_counts() > 10] #[220623264]#sort_values(ascending=True).astype(int)

sales_amount
150000.0     50535
120000.0     48185
100000.0     45864
130000.0     43789
110000.0     41900
             ...  
315120.0        11
203680.0        11
601600.0        11
1151000.0       11
555700.0        11
Name: count, Length: 14213, dtype: int64

In [30]:
sales_df.info(), #il manque des latitude et longitude
sales_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4331940 entries, 0 to 4331939
Data columns (total 13 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   sales_date                 object 
 1   sales_amount               float64
 2   street_number              float64
 3   street_code                object 
 4   street_name                object 
 5   nom_commune                object 
 6   municipality_code          object 
 7   premise_type               object 
 8   surface                    float64
 9   number_of_principal_rooms  int64  
 10  sales_price_m2             float64
 11  latitude                   float64
 12  longitude                  float64
dtypes: float64(6), int64(1), object(6)
memory usage: 429.7+ MB


sales_date                       0
sales_amount                     0
street_number                26693
street_code                      0
street_name                     74
nom_commune                      0
municipality_code                0
premise_type                     0
surface                          0
number_of_principal_rooms        0
sales_price_m2                   0
latitude                     63597
longitude                    63597
dtype: int64

#### DF POPULATION

In [31]:
population_df.head()

Unnamed: 0,municipality_code,year,population
0,4213,1968,0.0
1,5137,1968,0.0
2,55039,1968,0.0
3,55050,1968,0.0
4,55239,1968,0.0


In [32]:
population_df["year"].unique()

array([1968, 1975, 1982, 1990, 1999, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [33]:
# ??? DOUBLONS !
population_df.duplicated().sum

<bound method Series.sum of 0         False
1         False
2         False
3         False
4         False
          ...  
689490    False
689491    False
689492    False
689493    False
689494    False
Length: 689495, dtype: bool>

In [34]:
population_df.info() #colonne YEAR en format INT64
population_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689495 entries, 0 to 689494
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   municipality_code  689495 non-null  object 
 1   year               689495 non-null  int64  
 2   population         689495 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 15.8+ MB


municipality_code    0
year                 0
population           0
dtype: int64

#### DF POVERTY

In [35]:
poverty_df.head()

Unnamed: 0,municipality_code,year,population
0,4213,1968,0.0
1,5137,1968,0.0
2,55039,1968,0.0
3,55050,1968,0.0
4,55239,1968,0.0


In [36]:
poverty_df["year"].unique()

array([1968, 1975, 1982, 1990, 1999, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [37]:
# 0 DOUBLONS !
poverty_df.duplicated().sum()

np.int64(0)

In [38]:
poverty_df.info() # YEAR est en type INT64 et non DATE
poverty_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689495 entries, 0 to 689494
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   municipality_code  689495 non-null  object 
 1   year               689495 non-null  int64  
 2   population         689495 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 15.8+ MB


municipality_code    0
year                 0
population           0
dtype: int64

#### DF REAL ESTATE

In [39]:
real_estate_df.head()

Unnamed: 0,municipality_code,intensite_tension_immo,rental_max_apartment,rental_min_apartment,rental_med_house,rental_max_house,rental_min_house,rental_med_all,rental_max_all,rental_min_all
0,57133,8,12.27,9.07,9.19,14.45,6.64,9.53,13.77,7.25
1,57446,8,18.22,7.69,10.92,14.16,7.77,11.09,15.97,7.73
2,77013,9,18.3,8.39,12.2,16.71,9.21,12.26,16.75,9.19
3,77026,9,15.76,6.28,9.51,13.2,7.04,9.53,13.23,7.03
4,77072,9,17.5,7.69,11.47,15.75,8.21,11.47,15.75,8.21


In [40]:
# 0 DOUBLONS !
real_estate_df.duplicated().sum()

np.int64(0)

In [41]:
real_estate_df.info()
real_estate_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34441 entries, 0 to 34440
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   municipality_code       34441 non-null  object 
 1   intensite_tension_immo  34441 non-null  int64  
 2   rental_max_apartment    34441 non-null  float64
 3   rental_min_apartment    34441 non-null  float64
 4   rental_med_house        34421 non-null  float64
 5   rental_max_house        34421 non-null  float64
 6   rental_min_house        34421 non-null  float64
 7   rental_med_all          34441 non-null  float64
 8   rental_max_all          34441 non-null  float64
 9   rental_min_all          34441 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 2.6+ MB


municipality_code          0
intensite_tension_immo     0
rental_max_apartment       0
rental_min_apartment       0
rental_med_house          20
rental_max_house          20
rental_min_house          20
rental_med_all             0
rental_max_all             0
rental_min_all             0
dtype: int64

# 🧹 **DATA CLEANING**

### CLEANING

##### DF_SALES CLEANING

In [42]:
# SALES_DF: Suppression des doublons > nous passons de 4,3M de lignes à 3,821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

(3821729, 13)

In [43]:
# SALES_DF: Check si les doublons on été enlevés : OK
sales_df.duplicated().sum()

np.int64(0)

In [44]:
# SALES_DF: Suppression des prix au m2 supérieur à 30K€ et inférieur à 1K€ > nous passons à 3,3399M de lignes
sales_df = sales_df[(sales_df['sales_price_m2'] <= 30000) & (sales_df['sales_price_m2'] >= 1000)]
sales_df.shape

(3448398, 13)

In [45]:
# SALES_DF:
s2 = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [46]:
# SALES_DF:
s2.index = s2.index.astype(int)
s2.loc[(s2.index % 10) != 0]

sales_amount
199999    45
231132    37
125581    35
161865    34
139999    34
          ..
114709    11
74729     11
142857    11
121325    11
86136     11
Name: count, Length: 150, dtype: int64

In [47]:
# SALES_DF:
sales_df = sales_df[sales_df['sales_amount'] > 1] # on enlève les 166 fois ou sales_amount = 1€
sales_df.shape

(3448398, 13)

In [48]:
# SALES_DF: changement du type sales_date en datetime
sales_df['sales_date'] = pd.to_datetime(sales_df['sales_date'])
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3448398 entries, 0 to 4331939
Data columns (total 13 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   sales_date                 datetime64[ns]
 1   sales_amount               float64       
 2   street_number              float64       
 3   street_code                object        
 4   street_name                object        
 5   nom_commune                object        
 6   municipality_code          object        
 7   premise_type               object        
 8   surface                    float64       
 9   number_of_principal_rooms  int64         
 10  sales_price_m2             float64       
 11  latitude                   float64       
 12  longitude                  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(5)
memory usage: 368.3+ MB


In [49]:
# SALES_DF: Création de l'histogramme avec Plotly Express
fig = px.histogram(sales_df, x='sales_price_m2', nbins=700, title='Distribution de sales_price_m2')

# SALES_DF: Affichage du graphique
fig.show()

In [50]:
# SALES_DF: Création de l'histogramme avec Plotly Express
fig = px.histogram(sales_df, x='sales_amount', nbins=400, title='Distribution de sales')

# SALES_DF: Affichage du graphique
fig.show()

##### DF_SALARY CLEANING

In [51]:
# DF_SALARY: ROUND avg_net_salary
salary_df['avg_net_salary'] = salary_df['avg_net_salary'].round()
salary_df.head()

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.0,2019
1,1007,26341.0,2019
2,1014,25897.0,2019
3,1024,25695.0,2019
4,1025,26055.0,2019


##### DF_REAL_ESTATE CLEANING

In [52]:
# DF_REAL_ESTATE: suppression des nulls
real_estate_df = real_estate_df.dropna(axis=1)
real_estate_df.isnull().sum()

municipality_code         0
intensite_tension_immo    0
rental_max_apartment      0
rental_min_apartment      0
rental_med_all            0
rental_max_all            0
rental_min_all            0
dtype: int64

##### DF_SITE CLEANING

In [56]:
# SITE_DF: tri avec les données entre parenthèses de la colonne "name" inclues

import re

site_df['data_inside_parenthesis'] = site_df['name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else '')
site_df

#suppression de la colonne "name" dans un second temps

site_df.drop(columns=["name"])

#check pour savoir les informations présentes dans la colonne "poi", et si elles correspondent aux valeurs présentes dans la colonne "type"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

#faire un mapping des colonnes poi, qui sont en fait plus pertinentes que celles de la colonne "type"

poi
castle            5611
2                 5207
water             4500
museum            3400
theatre           2439
forest            2233
beach             1010
protected_area     927
wetland            784
cinema             667
vineyard           641
golf_course        612
theme_park         358
cliff              337
marina             255
water_park         239
1                  236
zoo                234
valley             223
ridge              213
islet              163
casino             158
volcano            145
park               132
meadow              78
monument            76
sand                35
heritage            32
viewpoint           21
cave_entrance       15
wreck               12
rock                11
waterfall           10
attraction           8
dune                 5
national_park        4
allotments           2
geyser               1
Name: count, dtype: int64
data_inside_parenthesis
                       9899
Monument historique    5090
Eau             

In [57]:
# SITE_DF: création d'un dictionnaire intégrant toutes les différentes valeurs inclues dans la colonne "poi"
s = site_df["poi"].value_counts()[site_df["poi"]]
{k: "toto" for k in s.index}

{'1': 'toto',
 '2': 'toto',
 'zoo': 'toto',
 'dune': 'toto',
 'park': 'toto',
 'rock': 'toto',
 'sand': 'toto',
 'beach': 'toto',
 'cliff': 'toto',
 'islet': 'toto',
 'ridge': 'toto',
 'water': 'toto',
 'wreck': 'toto',
 'casino': 'toto',
 'castle': 'toto',
 'cinema': 'toto',
 'forest': 'toto',
 'geyser': 'toto',
 'marina': 'toto',
 'meadow': 'toto',
 'museum': 'toto',
 'valley': 'toto',
 'theatre': 'toto',
 'volcano': 'toto',
 'wetland': 'toto',
 'heritage': 'toto',
 'monument': 'toto',
 'vineyard': 'toto',
 'viewpoint': 'toto',
 'waterfall': 'toto',
 'allotments': 'toto',
 'attraction': 'toto',
 'theme_park': 'toto',
 'water_park': 'toto',
 'golf_course': 'toto',
 'cave_entrance': 'toto',
 'national_park': 'toto',
 'protected_area': 'toto'}

In [58]:
# SITE_DF: création d'un dictionnaire avec les catégories associées aux valeurs de la colonne POI

category_dict = {'1': 'Patrimoine',
 '2': 'Patrimoine',
 'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [59]:
# SITE_DF: création de la colonne "catégorie"
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed,data_inside_parenthesis,Category
0,1,Fortifications de Vauban (Patrimoine mondial),46.396160,1.472600,36134,0.139527,Fortifications de Vauban,Patrimoine mondial,Patrimoine
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...,Patrimoine mondial,Patrimoine
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.219890,34032,0.129531,Canal du Midi,Patrimoine mondial,Patrimoine
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.127170,Bassin minier du Nord-Pas de Calais,Patrimoine mondial,Patrimoine
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.346210,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ...",Patrimoine mondial,Patrimoine
...,...,...,...,...,...,...,...,...,...
31029,viewpoint,Tour Eiffel 3e étage (Point de vue),48.858262,2.294497,75056,0.053782,Tour Eiffel 3e étage,Point de vue,Nature
31030,theme_park,Jardin d'Acclimatation (Parc à thème),48.878145,2.264686,75056,0.087097,Jardin d'Acclimatation,Parc à thème,Entertainment
31031,theme_park,Foire du Trône (Parc à thème),48.832003,2.404337,75056,0.060000,Foire du Trône,Parc à thème,Entertainment
31032,golf_course,Golf du Bois de Boulogne (Terrain de golf),48.857776,2.232651,75056,0.060073,Golf du Bois de Boulogne,Terrain de golf,Entertainment


### CLEANED DF CHECK

In [None]:
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

In [None]:
poi_df.head(1)

In [None]:
site_df.head(1) 

In [None]:
salary_df.head(1)

In [None]:
georef_df.head(1) 

In [None]:
stock_df.head(1) 

In [None]:
sales_df.head(1)

In [None]:
population_df.head(1) 

In [None]:
poverty_df.head(1)

In [None]:
real_estate_df.head(1)

# 🧪 **DATA TRANSFORMATION**

### KPIS AGGREGATION BY DEPARTMENT

##### 1. POPULATION

In [60]:
# 1.1 Salaire moyen par département
# Joindre les informations de géolocalisation pour obtenir les départements
salary_dep_df = salary_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le salaire moyen par département
avg_salary_per_department = salary_dep_df.groupby(['department_code', 'department_name'])['avg_net_salary'].mean().reset_index().round()
avg_salary_per_department.rename(columns={'avg_net_salary': 'avg_salary'}, inplace=True)
avg_salary_per_department.head()

Unnamed: 0,department_code,department_name,avg_salary
0,1,Ain,25614.0
1,2,Aisne,22865.0
2,3,Allier,23272.0
3,4,Alpes-de-Haute-Provence,24147.0
4,5,Hautes-Alpes,22269.0


In [93]:
# 1.2 Évolution de la population par département
# Joindre les informations de géolocalisation pour obtenir les départements
population_dep_df = population_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer l'évolution de la population par département (différence entre les années)
pop_evolution = population_dep_df.groupby(['department_code', 'department_name', 'year'])['population'].sum().unstack().reset_index()
pop_evolution['evolution'] = (pop_evolution[pop_evolution.columns[-1]] - pop_evolution[pop_evolution.columns[-2]]) / pop_evolution[pop_evolution.columns[-2]] * 100
pop_evolution = pop_evolution[['department_code', 'department_name', 'evolution']]

pop_evolution.head()

year,department_code,department_name,evolution
0,1,Ain,0.740851
1,2,Aisne,-0.369817
2,3,Allier,-0.354716
3,4,Alpes-de-Haute-Provence,0.146281
4,5,Hautes-Alpes,0.371007


In [65]:
# 1.3 Taux de pauvreté par département
# Joindre les données de population
poverty_df = poverty_df.merge(population_df[['municipality_code', 'population']], on='municipality_code', suffixes=('_poverty', '_population'))

# Merge avec georef pour avoir le department_name
poverty_df = poverty_df.merge(georef_df[['municipality_code', 'department_name']], on='municipality_code', how='left')

# Groupe par department_name pour calculer le taux de pauvreté par département
poverty_by_department = poverty_df.groupby('department_name').agg({
    'population_poverty': 'sum',
    'population_population': 'sum'
}).reset_index()

# Calculer le poverty_rate pour chaque département
poverty_by_department['poverty_rate'] = (poverty_by_department['population_poverty'] / poverty_by_department['population']) * 100

# Afficher le résultat
poverty_by_department.head()

# calcul = poverty_rate = poverty_population / total_population * 100
# il faut ensuite le DF qui a poverty_rate à georef pour récupérer le department_name

KeyboardInterrupt: 

##### 2. TOURISM

In [67]:
# 2.1 Nombre de sites touristiques par département
# Joindre les informations de géolocalisation pour obtenir les départements
site_dep_df = site_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le nombre de sites touristiques par département
num_sites_per_department = site_dep_df.groupby(['department_code', 'department_name'])['poi'].count().reset_index()
num_sites_per_department.rename(columns={'poi': 'nb_sites'}, inplace=True)
num_sites_per_department.head()

Unnamed: 0,department_code,department_name,nb_sites
0,1,Ain,508
1,2,Aisne,217
2,3,Allier,323
3,4,Alpes-de-Haute-Provence,224
4,5,Hautes-Alpes,313


In [68]:
# 2.2 Importance moyenne des sites par département
# Calculer l'importance moyenne des sites touristiques par département
avg_site_importance_per_department = site_dep_df.groupby(['department_code', 'department_name'])['importance'].mean().reset_index()
avg_site_importance_per_department.rename(columns={'importance': 'avg_site_importance'}, inplace=True)
avg_site_importance_per_department.head()

Unnamed: 0,department_code,department_name,avg_site_importance
0,1,Ain,0.067051
1,2,Aisne,0.072294
2,3,Allier,0.066258
3,4,Alpes-de-Haute-Provence,0.065725
4,5,Hautes-Alpes,0.069074


In [69]:
# 2.3 Stock de logement par département
# Joindre les informations de géolocalisation pour obtenir les départements
stock_dep_df = stock_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer le stock de logement par département (nombre total de logements)
total_stock_per_department = stock_dep_df.groupby(['department_code', 'department_name'])['nb_tot_housing'].sum().reset_index()
total_stock_per_department.rename(columns={'nb_tot_housing': 'total_stock'}, inplace=True)
total_stock_per_department.head()

Unnamed: 0,department_code,department_name,total_stock
0,1,Ain,1781912
1,2,Aisne,1829844
2,3,Allier,1462151
3,4,Alpes-de-Haute-Provence,755894
4,5,Hautes-Alpes,763007


##### 3. REAL ESTATE

In [70]:
# 3.1 Rentabilité locative au m² par département
# Joindre les informations de géolocalisation pour obtenir les départements
real_estate_dep_df = real_estate_df.merge(georef_df[['municipality_code', 'department_code', 'department_name']], on='municipality_code')

# Calculer la rentabilité locative moyenne au m² par département
real_estate_dep_df['avg_rental_yield'] = (real_estate_dep_df['rental_max_all'] + real_estate_dep_df['rental_min_all']) / 2
rental_yield_per_department = real_estate_dep_df.groupby(['department_code', 'department_name'])['avg_rental_yield'].mean().reset_index()
rental_yield_per_department.rename(columns={'avg_rental_yield': 'avg_rental_yield'}, inplace=True)
rental_yield_per_department

Unnamed: 0,department_code,department_name,avg_rental_yield
0,01,Ain,12.253130
1,02,Aisne,9.627683
2,03,Allier,9.244937
3,04,Alpes-de-Haute-Provence,11.311378
4,05,Hautes-Alpes,11.714241
...,...,...,...
91,91,Essonne,16.469145
92,92,Hauts-de-Seine,25.354306
93,93,Seine-Saint-Denis,21.065125
94,94,Val-de-Marne,22.158404


In [71]:
# 3.2 Tension immobilière par département
# Calculer la tension immobilière par département
housing_tension_per_department = real_estate_dep_df.groupby(['department_code', 'department_name'])['intensite_tension_immo'].mean().reset_index()
housing_tension_per_department.rename(columns={'intensite_tension_immo': 'avg_housing_tension'}, inplace=True)
housing_tension_per_department

Unnamed: 0,department_code,department_name,avg_housing_tension
0,01,Ain,8.0
1,02,Aisne,6.0
2,03,Allier,8.0
3,04,Alpes-de-Haute-Provence,6.0
4,05,Hautes-Alpes,10.0
...,...,...,...
91,91,Essonne,11.0
92,92,Hauts-de-Seine,21.0
93,93,Seine-Saint-Denis,13.0
94,94,Val-de-Marne,17.0


In [72]:
# 3.3 Part de maisons secondaires par département
# Calculer la part de maisons secondaires par département
secondary_home_rate_per_department = stock_dep_df.groupby(['department_code', 'department_name'])['secondary_home_rate'].mean().reset_index()
secondary_home_rate_per_department.rename(columns={'secondary_home_rate': 'avg_secondary_home_rate'}, inplace=True)
secondary_home_rate_per_department

Unnamed: 0,department_code,department_name,avg_secondary_home_rate
0,01,Ain,0.138884
1,02,Aisne,0.102122
2,03,Allier,0.132708
3,04,Alpes-de-Haute-Provence,0.397630
4,05,Hautes-Alpes,0.400697
...,...,...,...
95,95,Val-d'Oise,0.071613
96,971,Guadeloupe,0.076645
97,972,Martinique,0.068388
98,973,Guyane,0.062976


sales_date,department_code,department_name,price_evolution
0,1,Ain,-22.042678
1,2,Aisne,-27.922611
2,3,Allier,-16.614434
3,4,Alpes-de-Haute-Provence,
4,5,Hautes-Alpes,19.86818


In [79]:
# Calculer le prix moyen au m² des ventes immobilières par département
avg_price_per_m2_per_department = sales_dep_df.groupby(['department_code', 'department_name'])['sales_price_m2'].mean().reset_index()
avg_price_per_m2_per_department.rename(columns={'sales_price_m2': 'avg_sales_price_m2'}, inplace=True)
avg_price_per_m2_per_department.head()

Unnamed: 0,department_code,department_name,avg_sales_price_m2
0,1,Ain,2648.283789
1,2,Aisne,1351.775633
2,3,Allier,1279.995017
3,4,Alpes-de-Haute-Provence,2283.535471
4,5,Hautes-Alpes,2602.996876


# 📈 **DATA VIZUALISATION**

# 🏆 **SCORING SYSTEM**

Population
1.1 Salaire moyen par département
1.2 Évolution de la population par département
1.3 Taux de pauvreté par département

Tourisme
2.1 Nombre de sites touristiques par département
2.2 Importance moyenne des sites par département
2.3 Stock de logement par département

Immobilier
3.1 Rentabilité locative au m² par département
3.2 Tension immobilière par département
3.3 Part de maisons secondaires par département
3.4 Évolution du prix au m² par département
3.5 Prix moyen au m² des ventes immobilières par département
creer moi un système de scoring (avec pondération) pour avoir :

un score Population
un score Tourisme
un score Immobilier
puis un Score Global grâce aux 3 précédents scores

In [96]:
# Supposons que vous avez déjà calculé les données nécessaires pour les scores Population, Tourisme et Immobilier

# 1. Définition des poids pour chaque sous-score dans chaque catégorie
weights_population = {
    'average_salary': 0.4,
    'population_growth': 0.3,
    'poverty_rate': 0.3
}

weights_tourism = {
    'num_tourism_sites': 0.4,
    'average_importance': 0.3,
    'stock_housing': 0.3
}

weights_real_estate = {
    'rental_yield_per_m2': 0.2,
    'real_estate_tension': 0.2,
    'secondary_home_rate': 0.1,
    'price_growth': 0.2,
    'average_price_per_m2': 0.3
}

# 2. Calcul des scores pour chaque catégorie

# POPULATION
# Supposons que vous avez déjà les dataframes suivants : average_salary_by_department, population_evolution_by_department, average_poverty_rate_by_department

population_scores = (
    avg_salary_per_department['avg_salary'] * weights_population['average_salary'] +
    pop_evolution['evolution'] * weights_population['population_growth'] +
    (100 - average_poverty_rate_by_department['poverty_rate']) * weights_population['poverty_rate']
)

# TOURISM
# Supposons que vous avez déjà les dataframes suivants : tourism_sites_by_department, average_importance_by_department, stock_housing_by_department

tourism_scores = (
    tourism_sites_by_department['num_tourism_sites'] * weights_tourism['num_tourism_sites'] +
    average_importance_by_department['importance'] * weights_tourism['average_importance'] +
    stock_housing_by_department['stock_housing'] * weights_tourism['stock_housing']
)

# REAL ESTATE
# Supposons que vous avez déjà les dataframes suivants : rental_yield_per_m2_by_department, real_estate_tension_by_department, second_home_rate_by_department, price_growth_by_department, average_price_per_m2_by_department

real_estate_scores = (
    rental_yield_per_m2_by_department['rental_yield_per_m2'] * weights_real_estate['rental_yield_per_m2'] +
    (100 - real_estate_tension_by_department['intensite_tension_immo']) * weights_real_estate['real_estate_tension'] +
    (100 - second_home_rate_by_department['secondary_home_rate']) * weights_real_estate['secondary_home_rate'] +
    price_growth_by_department['price_growth'] * weights_real_estate['price_growth'] +
    average_price_per_m2_by_department['average_price_per_m2'] * weights_real_estate['average_price_per_m2']
)

# 3. Calcul du score global
# Supposons que les scores sont déjà calculés pour chaque catégorie

global_score = (
    population_scores +
    tourism_scores +
    real_estate_scores
)

# Afficher ou utiliser les résultats
print("Scores Population :\n", population_scores.head())
print("\nScores Tourisme :\n", tourism_scores.head())
print("\nScores Immobilier :\n", real_estate_scores.head())
print("\nScore Global :\n", global_score.head())


SyntaxError: invalid syntax (2095781877.py, line 32)

In [82]:
# POPULATION SCORE
from sklearn.preprocessing import MinMaxScaler

# Supposons que vous avez déjà calculé les sous-scores pour la catégorie Population : average_salary_by_department, population_evolution_by_department, average_poverty_rate_by_department

# Initialisation du MinMaxScaler
scaler_population = MinMaxScaler()

# Normalisation des données
population_scores_scaled = scaler_population.fit_transform(
    population_scores[['avg_net_salary', 'population_growth', 'poverty_rate']]
)

# Calcul des scores normalisés
population_scores_normalized = (
    population_scores_scaled[:, 0] * weights_population['average_salary'] +
    population_scores_scaled[:, 1] * weights_population['population_growth'] +
    population_scores_scaled[:, 2] * weights_population['poverty_rate']
)

# Afficher ou utiliser les résultats des scores normalisés
print("Scores Population normalisés :\n", population_scores_normalized.head())


NameError: name 'population_scores' is not defined

In [84]:
# TOURISM SCORE
# Supposons que vous avez déjà calculé les sous-scores pour la catégorie Tourisme : tourism_sites_by_department, average_importance_by_department, stock_housing_by_department

# Initialisation du MinMaxScaler
scaler_tourism = MinMaxScaler()

# Normalisation des données
tourism_scores_scaled = scaler_tourism.fit_transform(
    tourism_scores[['num_tourism_sites', 'average_importance', 'stock_housing']]
)

# Calcul des scores normalisés
tourism_scores_normalized = (
    tourism_scores_scaled[:, 0] * weights_tourism['num_tourism_sites'] +
    tourism_scores_scaled[:, 1] * weights_tourism['average_importance'] +
    tourism_scores_scaled[:, 2] * weights_tourism['stock_housing']
)

# Afficher ou utiliser les résultats des scores normalisés
print("Scores Tourisme normalisés :\n", tourism_scores_normalized.head())


NameError: name 'tourism_scores' is not defined

In [85]:
# 1 REAL ESTATE SCORE
# Supposons que vous avez déjà calculé les sous-scores pour la catégorie Immobilier : rental_yield_per_m2_by_department, real_estate_tension_by_department, second_home_rate_by_department, price_growth_by_department, average_price_per_m2_by_department

# Initialisation du MinMaxScaler
scaler_real_estate = MinMaxScaler()

# Normalisation des données
real_estate_scores_scaled = scaler_real_estate.fit_transform(
    real_estate_scores[['rental_yield_per_m2', 'real_estate_tension', 'secondary_home_rate', 'price_growth', 'average_price_per_m2']]
)

# Calcul des scores normalisés
real_estate_scores_normalized = (
    real_estate_scores_scaled[:, 0] * weights_real_estate['rental_yield_per_m2'] +
    real_estate_scores_scaled[:, 1] * weights_real_estate['real_estate_tension'] +
    real_estate_scores_scaled[:, 2] * weights_real_estate['secondary_home_rate'] +
    real_estate_scores_scaled[:, 3] * weights_real_estate['price_growth'] +
    real_estate_scores_scaled[:, 4] * weights_real_estate['average_price_per_m2']
)

# Afficher ou utiliser les résultats des scores normalisés
print("Scores Immobilier normalisés :\n", real_estate_scores_normalized.head())


NameError: name 'real_estate_scores' is not defined

In [None]:
1 GLOBAL SCORE
# Supposons que vous avez déjà les scores normalisés pour chaque catégorie
# population_scores_normalized, tourism_scores_normalized, real_estate_scores_normalized

# Définition des poids pour chaque catégorie
weights = {
    'population': 0.4,
    'tourism': 0.3,
    'real_estate': 0.3
}

# Calcul du score global pondéré
global_score = (
    population_scores_normalized * weights['population'] +
    tourism_scores_normalized * weights['tourism'] +
    real_estate_scores_normalized * weights['real_estate']
)

# Afficher ou utiliser le score global
print("Score Global :\n", global_score.head())
