# ⚙️ **DATA IMPORT**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
#import seaborn as sns
import os
import plotly.io as pio
pio.renderers.default = 'iframe'

DATA_PATH = '../data'

POI_FILENAME = 'POI_tourist_establishments.csv'
SITE_FILENAME = 'POI_touristic_sites_by_municipality.csv'
SALARY_FILENAME = 'average_salary_by_municipality.csv'
GEOREF_FILENAME = 'geographical_referential.csv'
STOCK_FILENAME = 'housing_stock.csv'
SALES_FILENAME = 'notary_real_estate_sales.csv'
POPULATION_FILENAME = 'population_by_municipality.csv'
POVERTY_FILENAME = 'poverty_population_by_municipality.csv'
REAL_ESTATE_FILENAME = 'real_estate_info_by_municipality.csv'

poi_df = pd.read_csv(os.path.join(DATA_PATH, POI_FILENAME),usecols=lambda column: column != 'name')
site_df = pd.read_csv(os.path.join(DATA_PATH, SITE_FILENAME))
salary_df = pd.read_csv(os.path.join(DATA_PATH, SALARY_FILENAME),usecols=lambda column: column != 'country_code')
georef_df = pd.read_csv(os.path.join(DATA_PATH, GEOREF_FILENAME),usecols=lambda column: column != 'country_code')
stock_df = pd.read_csv(os.path.join(DATA_PATH, STOCK_FILENAME),usecols=lambda column: column not in ['int64_field_0', 'country_code'])
sales_df = pd.read_csv(os.path.join(DATA_PATH, SALES_FILENAME))
population_df = pd.read_csv(os.path.join(DATA_PATH, POPULATION_FILENAME),usecols=lambda column: column != 'country_code')
poverty_df = pd.read_csv(os.path.join(DATA_PATH, POVERTY_FILENAME),usecols=lambda column: column != 'country_code')
real_estate_df = pd.read_csv(os.path.join(DATA_PATH, REAL_ESTATE_FILENAME))

# 🔭 **DATA EXPLORATION**




#### DF POI

In [2]:
poi_df.head() #DATA CLEAN
# importance : poids interne pour évaluer l'importance

Unnamed: 0,poi,latitude,longitude,municipality_code,importance,name_reprocessed
0,hotel,45.678531,-1.163635,17225,0.078556,Club Med
1,hotel,44.026859,4.847491,84129,0.078419,Cabanes des Grands Cépages
2,hotel,50.331541,1.565677,80333,0.077999,Pierre et Vacances
3,hotel,43.285936,6.569696,83068,0.077702,Pierre & Vacances
4,hotel,43.54105,3.752852,34095,0.077542,Le Domaine du Golf


In [53]:
# Création de la carte avec Plotly Express
fig = px.scatter_mapbox(poi_df, lat='latitude', lon='longitude', hover_name='name_reprocessed',
                        color='poi', size='importance', zoom=5, height=990)

# Personnalisation du titre et du style de la carte
fig.update_layout(
    title='Répartition des points d\'intérêt',
    mapbox_style='open-street-map'  # Vous pouvez choisir parmi d'autres styles de carte, par exemple 'carto-positron'
)

# Réduction de la taille des points
fig.update_traces(marker=dict(size=4))

# Affichage du graphique
fig.show()

In [4]:
poi_count = poi_df['poi'].value_counts()

# Création du diagramme circulaire avec Plotly Express
fig = px.pie(names=poi_count.index, values=poi_count.values, title='Répartition des points d\'intérêt (POI)')

# Affichage du graphique
fig.show()

In [5]:
# 0 DOUBLONS !
poi_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [6]:
poi_df.info()
poi_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26216 entries, 0 to 26215
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                26216 non-null  object 
 1   latitude           26216 non-null  float64
 2   longitude          26216 non-null  float64
 3   municipality_code  26216 non-null  object 
 4   importance         26216 non-null  float64
 5   name_reprocessed   26216 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.2+ MB


poi                  0
latitude             0
longitude            0
municipality_code    0
importance           0
name_reprocessed     0
dtype: int64

#### DF SITE

In [7]:
site_df.head() # DATA CLEAN
# importance : poids interne pour évaluer l'importance

Unnamed: 0,poi,name,latitude,longitude,municipality_code,importance,name_reprocessed
0,1,Fortifications de Vauban (Patrimoine mondial),46.39616,1.4726,36134,0.139527,Fortifications de Vauban
1,1,Chemins de Saint-Jacques-de-Compostelle en Fra...,46.313695,1.478772,23258,0.137821,Chemins de Saint-Jacques-de-Compostelle en Fra...
2,1,Canal du Midi (Patrimoine mondial),43.332709,3.21989,34032,0.129531,Canal du Midi
3,1,Bassin minier du Nord-Pas de Calais (Patrimoin...,50.455895,2.965034,62427,0.12717,Bassin minier du Nord-Pas de Calais
4,1,"Les Causses et les Cévennes, paysage culturel ...",44.34621,3.613406,48050,0.124981,"Les Causses et les Cévennes, paysage culturel ..."


In [8]:
# Création de la carte avec Plotly Express
fig = px.scatter_mapbox(site_df, lat='latitude', lon='longitude', hover_name='name_reprocessed',
                        color='poi', size='importance', zoom=5, height=990)

# Personnalisation du titre et du style de la carte
fig.update_layout(
    title='Répartition des site touristiques',
    mapbox_style='open-street-map'  # Vous pouvez choisir parmi d'autres styles de carte, par exemple 'carto-positron'
)

# Réduction de la taille des points
fig.update_traces(marker=dict(size=4))

# Affichage du graphique
fig.show()

In [9]:
poi_count = site_df['poi'].value_counts()

# Création du diagramme circulaire avec Plotly Express
fig = px.pie(names=poi_count.index, values=poi_count.values, title='Répartition des site touristiques')

# Affichage du graphique
fig.show()

In [10]:
# 0 DOUBLONS !
site_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [11]:
site_df.info()
site_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31034 entries, 0 to 31033
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poi                31034 non-null  object 
 1   name               31034 non-null  object 
 2   latitude           31034 non-null  float64
 3   longitude          31034 non-null  float64
 4   municipality_code  31034 non-null  object 
 5   importance         31034 non-null  float64
 6   name_reprocessed   31034 non-null  object 
dtypes: float64(3), object(4)
memory usage: 1.7+ MB


poi                  0
name                 0
latitude             0
longitude            0
municipality_code    0
importance           0
name_reprocessed     0
dtype: int64

#### DF SALARY

In [12]:
salary_df.head() # DATA CLEAN

Unnamed: 0,municipality_code,avg_net_salary,year
0,1004,26471.026727,2019
1,1007,26341.353419,2019
2,1014,25897.497842,2019
3,1024,25695.240341,2019
4,1025,26054.712323,2019


In [13]:
salary_df["year"].unique()

array([2019, 2018, 2017, 2016, 2015])

In [14]:
# 0 DOUBLONS !
salary_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [15]:
salary_df.info()
salary_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26675 entries, 0 to 26674
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   municipality_code  26675 non-null  object 
 1   avg_net_salary     26675 non-null  float64
 2   year               26675 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 625.3+ KB


municipality_code    0
avg_net_salary       0
year                 0
dtype: int64

#### DF GEO REF

In [16]:
georef_df.head()

Unnamed: 0,municipality_code,city_name,city_name_normalized,municipality_type,latitude,longitude,department_code,epci_code,department_name
0,1005,Ambérieux-en-Dombes,AMBERIEUX EN DOMBES,municipality,45.99618,4.912273,1,200042497.0,Ain
1,1021,Ars-sur-Formans,ARS SUR FORMANS,municipality,45.993461,4.821996,1,200042497.0,Ain
2,1030,Beauregard,BEAUREGARD,municipality,46.000858,4.756007,1,200042497.0,Ain
3,1105,Civrieux,CIVRIEUX,municipality,45.922467,4.886338,1,200042497.0,Ain
4,1157,Fareins,FAREINS,municipality,46.020998,4.762008,1,200042497.0,Ain


In [17]:
georef_df["municipality_type"].unique()

array(['municipality', 'arrondissement'], dtype=object)

In [18]:
# 0 DOUBLONS !
georef_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [19]:
georef_df.info()
georef_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34997 entries, 0 to 34996
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   municipality_code     34997 non-null  object 
 1   city_name             34997 non-null  object 
 2   city_name_normalized  34997 non-null  object 
 3   municipality_type     34997 non-null  object 
 4   latitude              34997 non-null  float64
 5   longitude             34997 non-null  float64
 6   department_code       34997 non-null  object 
 7   epci_code             34945 non-null  float64
 8   department_name       34997 non-null  object 
dtypes: float64(3), object(6)
memory usage: 2.4+ MB


municipality_code        0
city_name                0
city_name_normalized     0
municipality_type        0
latitude                 0
longitude                0
department_code          0
epci_code               52
department_name          0
dtype: int64

#### DF STOCK

In [20]:
stock_df.head() #drop de int64_field_0

Unnamed: 0,municipality_code,year,nb_principal_home,nb_second_home,nb_vacants_housing,nb_tot_housing,secondary_home_rate,principal_home_rate,vacants_housing_rate
0,1339,1968,109,155,0,264,0.587121,0.412879,0.0
1,2368,1968,132,56,0,188,0.297872,0.702128,0.0
2,4073,1968,134,93,0,227,0.409692,0.590308,0.0
3,4148,1968,42,93,0,135,0.688889,0.311111,0.0
4,5012,1968,70,53,0,123,0.430894,0.569106,0.0


In [21]:
stock_df["year"].unique()

array([1968, 1975, 1982, 1990, 1999, 2008, 2013, 2018])

In [22]:
# 0 DOUBLONS !
stock_df.duplicated().sum()#.drop_duplicates()

np.int64(0)

In [23]:
stock_df.info() # supprimer la colonne int64_field-0
stock_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279584 entries, 0 to 279583
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   municipality_code     279584 non-null  object 
 1   year                  279584 non-null  int64  
 2   nb_principal_home     279584 non-null  int64  
 3   nb_second_home        279584 non-null  int64  
 4   nb_vacants_housing    279584 non-null  int64  
 5   nb_tot_housing        279584 non-null  int64  
 6   secondary_home_rate   279584 non-null  float64
 7   principal_home_rate   279584 non-null  float64
 8   vacants_housing_rate  279584 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 19.2+ MB


municipality_code       0
year                    0
nb_principal_home       0
nb_second_home          0
nb_vacants_housing      0
nb_tot_housing          0
secondary_home_rate     0
principal_home_rate     0
vacants_housing_rate    0
dtype: int64

#### DF SALES

In [24]:
sales_df.head()

Unnamed: 0,sales_date,sales_amount,street_number,street_code,street_name,nom_commune,municipality_code,premise_type,surface,number_of_principal_rooms,sales_price_m2,latitude,longitude
0,2018-02-06,5000000.0,63.0,0040,AV PRINCE RAINIER III,Saint-Jean-Cap-Ferrat,6121,Maison,292.0,10,17123.0,43.678892,7.330651
1,2018-11-26,93060.0,5308.0,B061,LE SERRET,Vallées-d'Antraigues-Asperjoc,7011,Maison,16.0,0,5816.0,44.714072,4.360185
2,2018-06-08,95000.0,161.0,0683,IMP COL EMILE VIGUIER,Millau,12145,Maison,21.0,0,4524.0,44.093714,3.054594
3,2018-07-17,3912000.0,690.0,1868,CHE DE MAZARGUES,Aix-en-Provence,13001,Maison,610.0,13,6413.0,43.502833,5.428194
4,2018-03-27,810000.0,5000.0,0162,CHE DES OLIVIERS,Eygalières,13034,Maison,296.0,9,2736.0,43.756881,4.957214


In [25]:
# 510 211 DOUBLONS !
sales_df.duplicated().sum()#.drop_duplicates()

np.int64(510211)

In [26]:
s = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [27]:
s.index = s.index.astype(int)
s.loc[(s.index % 10) != 0]

sales_amount
10096218     339
220623264    324
18761004     316
19090412     295
1            277
            ... 
96375         11
299999        11
200151        11
377475        11
182243        11
Name: count, Length: 534, dtype: int64

In [28]:
s.plot

<pandas.plotting._core.PlotAccessor object at 0x1299cf0b0>

In [29]:
sales_df['sales_amount'].value_counts().loc[sales_df['sales_amount'].value_counts() > 10] #[220623264]#sort_values(ascending=True).astype(int)

sales_amount
150000.0     50535
120000.0     48185
100000.0     45864
130000.0     43789
110000.0     41900
             ...  
315120.0        11
203680.0        11
601600.0        11
1151000.0       11
555700.0        11
Name: count, Length: 14213, dtype: int64

In [30]:
sales_df.info(), #il manque des latitude et longitude
sales_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4331940 entries, 0 to 4331939
Data columns (total 13 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   sales_date                 object 
 1   sales_amount               float64
 2   street_number              float64
 3   street_code                object 
 4   street_name                object 
 5   nom_commune                object 
 6   municipality_code          object 
 7   premise_type               object 
 8   surface                    float64
 9   number_of_principal_rooms  int64  
 10  sales_price_m2             float64
 11  latitude                   float64
 12  longitude                  float64
dtypes: float64(6), int64(1), object(6)
memory usage: 429.7+ MB


sales_date                       0
sales_amount                     0
street_number                26693
street_code                      0
street_name                     74
nom_commune                      0
municipality_code                0
premise_type                     0
surface                          0
number_of_principal_rooms        0
sales_price_m2                   0
latitude                     63597
longitude                    63597
dtype: int64

#### DF POPULATION

In [31]:
population_df.head()

Unnamed: 0,municipality_code,year,population
0,4213,1968,0.0
1,5137,1968,0.0
2,55039,1968,0.0
3,55050,1968,0.0
4,55239,1968,0.0


In [32]:
population_df["year"].unique()

array([1968, 1975, 1982, 1990, 1999, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [33]:
# ??? DOUBLONS !
population_df.duplicated().sum

<bound method Series.sum of 0         False
1         False
2         False
3         False
4         False
          ...  
689490    False
689491    False
689492    False
689493    False
689494    False
Length: 689495, dtype: bool>

In [34]:
population_df.info() #colonne YEAR en format INT64
population_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689495 entries, 0 to 689494
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   municipality_code  689495 non-null  object 
 1   year               689495 non-null  int64  
 2   population         689495 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 15.8+ MB


municipality_code    0
year                 0
population           0
dtype: int64

#### DF POVERTY

In [35]:
poverty_df.head()

Unnamed: 0,municipality_code,year,population
0,4213,1968,0.0
1,5137,1968,0.0
2,55039,1968,0.0
3,55050,1968,0.0
4,55239,1968,0.0


In [36]:
poverty_df["year"].unique()

array([1968, 1975, 1982, 1990, 1999, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [37]:
# 0 DOUBLONS !
poverty_df.duplicated().sum()

np.int64(0)

In [38]:
poverty_df.info() # YEAR est en type INT64 et non DATE
poverty_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689495 entries, 0 to 689494
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   municipality_code  689495 non-null  object 
 1   year               689495 non-null  int64  
 2   population         689495 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 15.8+ MB


municipality_code    0
year                 0
population           0
dtype: int64

#### DF REAL ESTATE

In [39]:
real_estate_df.head()

Unnamed: 0,municipality_code,intensite_tension_immo,rental_max_apartment,rental_min_apartment,rental_med_house,rental_max_house,rental_min_house,rental_med_all,rental_max_all,rental_min_all
0,57133,8,12.27,9.07,9.19,14.45,6.64,9.53,13.77,7.25
1,57446,8,18.22,7.69,10.92,14.16,7.77,11.09,15.97,7.73
2,77013,9,18.3,8.39,12.2,16.71,9.21,12.26,16.75,9.19
3,77026,9,15.76,6.28,9.51,13.2,7.04,9.53,13.23,7.03
4,77072,9,17.5,7.69,11.47,15.75,8.21,11.47,15.75,8.21


In [40]:
# 0 DOUBLONS !
real_estate_df.duplicated().sum()

np.int64(0)

In [41]:
real_estate_df.info()
real_estate_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34441 entries, 0 to 34440
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   municipality_code       34441 non-null  object 
 1   intensite_tension_immo  34441 non-null  int64  
 2   rental_max_apartment    34441 non-null  float64
 3   rental_min_apartment    34441 non-null  float64
 4   rental_med_house        34421 non-null  float64
 5   rental_max_house        34421 non-null  float64
 6   rental_min_house        34421 non-null  float64
 7   rental_med_all          34441 non-null  float64
 8   rental_max_all          34441 non-null  float64
 9   rental_min_all          34441 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 2.6+ MB


municipality_code          0
intensite_tension_immo     0
rental_max_apartment       0
rental_min_apartment       0
rental_med_house          20
rental_max_house          20
rental_min_house          20
rental_med_all             0
rental_max_all             0
rental_min_all             0
dtype: int64

# 🧪 **DATA TRANSFORMATION**

#### CLEANING

##### DF_SALES CLEANING

In [50]:
# SALES_DF: Suppression des doublons > nous passons de 4,3M de lignes à 3,821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

(3448398, 13)

In [43]:
# SALES_DF: Check si les doublons on été enlevés : OK
sales_df.duplicated().sum()

np.int64(0)

In [44]:
# SALES_DF: Suppression des prix au m2 supérieur à 30K€ et inférieur à 1K€ > nous passons à 3,3399M de lignes
sales_df = sales_df[(sales_df['sales_price_m2'] <= 30000) & (sales_df['sales_price_m2'] >= 1000)]
sales_df.shape

(3448398, 13)

In [45]:
# SALES_DF:
s2 = (sales_df['sales_amount']
             .value_counts()
             .loc[sales_df['sales_amount'].value_counts() > 10])

In [46]:
# SALES_DF:
s2.index = s2.index.astype(int)
s2.loc[(s2.index % 10) != 0]

sales_amount
199999    45
231132    37
125581    35
161865    34
139999    34
          ..
114709    11
74729     11
142857    11
121325    11
86136     11
Name: count, Length: 150, dtype: int64

In [47]:
# SALES_DF:
sales_df = sales_df[sales_df['sales_amount'] > 1] # on enlève les 166 fois ou sales_amount = 1€
sales_df.shape

(3448398, 13)

In [48]:
# SALES_DF: Création de l'histogramme avec Plotly Express
fig = px.histogram(sales_df, x='sales_price_m2', nbins=700, title='Distribution de sales_price_m2')

# SALES_DF: Affichage du graphique
fig.show()

In [52]:
# SALES_DF: Création de l'histogramme avec Plotly Express
fig = px.histogram(sales_df, x='sales_amount', nbins=400, title='Distribution de sales')

# SALES_DF: Affichage du graphique
fig.show()

##### DF_SITE CLEANING

#### MERGE

#### AGGREGATION

# 📈 **DATA VIZUALISATION**