# ⚙️ **DATA IMPORT**

In [1]:
import pandas as pd
import numpy as np
import os

# Folder holding the raw CSV extracts, relative to the notebook.
DATA_PATH = '../data/raw'

# File name of each raw dataset.
POI_FILENAME = 'POI_tourist_establishments.csv'
SITE_FILENAME = 'POI_touristic_sites_by_municipality.csv'
SALARY_FILENAME = 'average_salary_by_municipality.csv'
GEOREF_FILENAME = 'geographical_referential.csv'
STOCK_FILENAME = 'housing_stock.csv'
SALES_FILENAME = 'notary_real_estate_sales.csv'
POPULATION_FILENAME = 'population_by_municipality.csv'
POVERTY_FILENAME = 'poverty_population_by_municipality.csv'
REAL_ESTATE_FILENAME = 'real_estate_info_by_municipality.csv'


def _raw_csv(filename, skipped=()):
    """Load one CSV from DATA_PATH, leaving out the columns named in ``skipped``.

    With an empty ``skipped`` every column is read (``usecols=None``).
    """
    keep = (lambda col: col not in skipped) if skipped else None
    return pd.read_csv(os.path.join(DATA_PATH, filename), usecols=keep)


poi_df = _raw_csv(POI_FILENAME, skipped=('name',))
site_df = _raw_csv(SITE_FILENAME)
salary_df = _raw_csv(SALARY_FILENAME, skipped=('country_code',))
georef_df = _raw_csv(GEOREF_FILENAME, skipped=('country_code',))
stock_df = _raw_csv(STOCK_FILENAME, skipped=('int64_field_0', 'country_code'))
sales_df = _raw_csv(SALES_FILENAME)
population_df = _raw_csv(POPULATION_FILENAME, skipped=('country_code',))
poverty_df = _raw_csv(POVERTY_FILENAME, skipped=('country_code',))
real_estate_df = _raw_csv(REAL_ESTATE_FILENAME)

# 🔭 **DATA EXPLORATION**




#### DF POI

In [None]:
poi_df.head() #DATA CLEAN
# importance : poids interne pour évaluer l'importance

In [None]:
# Création de la carte avec Plotly Express
#fig = px.scatter_mapbox(poi_df, lat='latitude', lon='longitude', hover_name='name_reprocessed',
                        #color='poi', size='importance', zoom=5, height=990)

# Personnalisation du titre et du style de la carte
#fig.update_layout(
    #title='Répartition des points d\'intérêt',
    #mapbox_style='open-street-map'  # Vous pouvez choisir parmi d'autres styles de carte, par exemple 'carto-positron'
#)

# Réduction de la taille des points
#fig.update_traces(marker=dict(size=4))

# Affichage du graphique
#fig.show()

In [None]:
#poi_count = poi_df['poi'].value_counts()

# Création du diagramme circulaire avec Plotly Express
#fig = px.pie(names=poi_count.index, values=poi_count.values, title='Répartition des points d\'intérêt (POI)')

# Affichage du graphique
#fig.show()

In [None]:
# 0 DOUBLONS !
poi_df.duplicated().sum()#.drop_duplicates()

In [None]:
poi_df.info()
poi_df.isnull().sum()

#### DF SITE

In [None]:
site_df.head() # DATA CLEAN
# importance : poids interne pour évaluer l'importance

In [None]:
# Création de la carte avec Plotly Express
#fig = px.scatter_mapbox(site_df, lat='latitude', lon='longitude', hover_name='name_reprocessed',
                        #color='poi', size='importance', zoom=5, height=990)

# Personnalisation du titre et du style de la carte
#fig.update_layout(
    #title='Répartition des site touristiques',
    #mapbox_style='open-street-map'  # Vous pouvez choisir parmi d'autres styles de carte, par exemple 'carto-positron'
#)

# Réduction de la taille des points
#fig.update_traces(marker=dict(size=4))

# Affichage du graphique
#fig.show()

In [None]:
#poi_count = site_df['poi'].value_counts()

# Création du diagramme circulaire avec Plotly Express
#fig = px.pie(names=poi_count.index, values=poi_count.values, title='Répartition des site touristiques')

# Affichage du graphique
#fig.show()

In [None]:
# 0 DOUBLONS !
site_df.duplicated().sum()#.drop_duplicates()

In [None]:
site_df.info()
site_df.isnull().sum()

#### DF SALARY

In [None]:
salary_df.head() # DATA CLEAN

In [None]:
salary_df["year"].unique()

In [None]:
# 0 DOUBLONS !
salary_df.duplicated().sum()#.drop_duplicates()

In [None]:
salary_df.info()
salary_df.isnull().sum()

#### DF GEO REF

In [None]:
georef_df.head()

In [None]:
georef_df["municipality_type"].unique()

In [None]:
# 0 DOUBLONS !
georef_df.duplicated().sum()#.drop_duplicates()

In [None]:
georef_df.info()
georef_df.isnull().sum()

#### DF STOCK

In [3]:
stock_df.head() #drop de int64_field_0

Unnamed: 0,municipality_code,year,nb_principal_home,nb_second_home,nb_vacants_housing,nb_tot_housing,secondary_home_rate,principal_home_rate,vacants_housing_rate
0,1339,1968,109,155,0,264,0.587121,0.412879,0.0
1,2368,1968,132,56,0,188,0.297872,0.702128,0.0
2,4073,1968,134,93,0,227,0.409692,0.590308,0.0
3,4148,1968,42,93,0,135,0.688889,0.311111,0.0
4,5012,1968,70,53,0,123,0.430894,0.569106,0.0


In [2]:
stock_df["year"].unique()

array([1968, 1975, 1982, 1990, 1999, 2008, 2013, 2018])

In [None]:
# 0 DOUBLONS !
stock_df.duplicated().sum()#.drop_duplicates()

In [None]:
stock_df.info() # supprimer la colonne int64_field-0
stock_df.isnull().sum()

#### DF SALES

In [None]:
sales_df.head()

In [None]:
# 510 211 DOUBLONS !
sales_df.duplicated().sum()#.drop_duplicates()

In [None]:
# Occurrences of each sale amount, restricted to amounts seen more than 10 times.
# The callable selector receives the counts themselves, so value_counts() runs once.
s = sales_df['sales_amount'].value_counts().loc[lambda counts: counts > 10]

In [None]:
# Cast the index labels (the sale amounts) to integers so the modulo below works on ints
s.index = s.index.astype(int)
# Show the frequent amounts that are not a multiple of 10
s.loc[(s.index % 10) != 0]

In [None]:
# Plot the frequent sale amounts; without the call, the cell only shows the plot accessor
s.plot()

In [None]:
# Sale amounts that occur more than 10 times, with their number of occurrences
sales_df['sales_amount'].value_counts().loc[lambda counts: counts > 10]

In [None]:
# Column dtypes and non-null counts; some latitude and longitude values are missing
sales_df.info()
# Number of missing values per column
sales_df.isnull().sum()

#### DF POPULATION

In [None]:
population_df.head()

In [None]:
population_df["year"].unique()

In [None]:
# Number of fully duplicated rows (count not recorded yet).
# sum must be called: the bare attribute only displays the bound method.
population_df.duplicated().sum()

In [None]:
population_df.info() #colonne YEAR en format INT64
population_df.isnull().sum()

#### DF POVERTY

In [None]:
poverty_df.head()

In [None]:
poverty_df["year"].unique()

In [None]:
# 0 DOUBLONS !
poverty_df.duplicated().sum()

In [None]:
poverty_df.info() # YEAR est en type INT64 et non DATE
poverty_df.isnull().sum()

#### DF REAL ESTATE

In [2]:
real_estate_df.head()

Unnamed: 0,municipality_code,intensite_tension_immo,rental_max_apartment,rental_min_apartment,rental_med_house,rental_max_house,rental_min_house,rental_med_all,rental_max_all,rental_min_all
0,57133,8,12.27,9.07,9.19,14.45,6.64,9.53,13.77,7.25
1,57446,8,18.22,7.69,10.92,14.16,7.77,11.09,15.97,7.73
2,77013,9,18.3,8.39,12.2,16.71,9.21,12.26,16.75,9.19
3,77026,9,15.76,6.28,9.51,13.2,7.04,9.53,13.23,7.03
4,77072,9,17.5,7.69,11.47,15.75,8.21,11.47,15.75,8.21


In [None]:
# 0 DOUBLONS !
real_estate_df.duplicated().sum()

In [None]:
real_estate_df.info()
real_estate_df.isnull().sum()

# 🧹 **DATA CLEANING**

### CLEANING

##### DF_SALES CLEANING

In [None]:
# SALES_DF: drop the duplicated rows > we go from 4.3M rows to 3.821M
sales_df = sales_df.drop_duplicates()
sales_df.shape

In [None]:
# SALES_DF: Check si les doublons on été enlevés : OK
sales_df.duplicated().sum()

In [None]:
# SALES_DF: keep only the rows whose price per m2 lies between 1K€ and 30K€ (both bounds
# included) > we go down to 3.3399M rows
sales_df = sales_df.loc[sales_df['sales_price_m2'].between(1000, 30000)]
sales_df.shape

In [None]:
# SALES_DF: occurrences of each sale amount after cleaning, restricted to amounts
# seen more than 10 times
s2 = sales_df['sales_amount'].value_counts().loc[lambda counts: counts > 10]

In [None]:
# SALES_DF: cast the index labels (the sale amounts) to integers, then show the
# frequent amounts that are not a multiple of 10
s2.index = s2.index.astype(int)
s2.loc[(s2.index % 10) != 0]

In [None]:
# SALES_DF: keep only the sales with an amount strictly greater than 1
sales_df = sales_df[sales_df['sales_amount'] > 1] # removes the 166 rows where sales_amount = 1€
sales_df.shape

In [None]:
# SALES_DF: convert sales_date to datetime.
# sales_df comes from boolean filtering, so writing a column into it in place raises
# SettingWithCopyWarning; assign() returns a new frame instead.
sales_df = sales_df.assign(sales_date=pd.to_datetime(sales_df['sales_date']))
sales_df.info()

In [None]:
# SALES_DF: Création de l'histogramme avec Plotly Express
#fig = px.histogram(sales_df, x='sales_price_m2', nbins=700, title='Distribution de sales_price_m2')

# SALES_DF: Affichage du graphique
#fig.show()

In [None]:
# SALES_DF: Création de l'histogramme avec Plotly Express
#fig = px.histogram(sales_df, x='sales_amount', nbins=400, title='Distribution de sales')

# SALES_DF: Affichage du graphique
#fig.show()

##### DF_SALARY CLEANING

In [None]:
# DF_SALARY: round avg_net_salary (round() without argument keeps 0 decimals)
salary_df['avg_net_salary'] = salary_df['avg_net_salary'].round()
salary_df.head()

##### DF_REAL_ESTATE CLEANING

In [None]:
# DF_REAL_ESTATE: remove the nulls.
# NOTE(review): axis=1 drops every COLUMN holding at least one null; rows are all kept.
# Confirm that losing whole columns (rather than the incomplete rows) is intended.
real_estate_df = real_estate_df.dropna(axis=1)
# Check: every remaining column should now report 0 nulls
real_estate_df.isnull().sum()

##### DF_SITE CLEANING

In [None]:
# SITE_DF: extract the text found between parentheses in the "name" column

import re

_PARENTHESIS_RE = re.compile(r'\((.*?)\)')


def _inside_parenthesis(value):
    """Return the first parenthesised fragment of ``value``, or '' when there is none."""
    if not isinstance(value, str):  # a missing name is a float NaN, which re cannot search
        return ''
    match = _PARENTHESIS_RE.search(value)
    return match.group(1) if match else ''


# The guard keeps the cell re-runnable once "name" has been dropped.
if 'name' in site_df.columns:
    site_df['data_inside_parenthesis'] = site_df['name'].apply(_inside_parenthesis)
    # drop() returns a new frame: it has to be assigned back for "name" to actually go away
    site_df = site_df.drop(columns=["name"])

# Check which values the "poi" column holds and whether they match the
# parenthesised fragments extracted from "name"
print (site_df["poi"].value_counts())
print (site_df["data_inside_parenthesis"].value_counts().head(50))

# Next step: map the "poi" values to categories, since they are more relevant than
# the parenthesised fragments

In [None]:
# SITE_DF: dictionary skeleton with one key per distinct value of the "poi" column
# ("toto" is a placeholder to be replaced by the real category).
# unique() lists each value once, in order of first appearance, without building a
# row-sized Series and without overwriting the `s` used in the sales exploration.
{poi_value: "toto" for poi_value in site_df["poi"].dropna().unique()}

In [None]:
# SITE_DF: dictionary giving the category associated with each value of the "poi" column
# (keys are "poi" values, values are the category labels)

category_dict = {'1': 'Patrimoine',
 '2': 'Patrimoine',
 'zoo': 'Entertainment',
 'dune': 'Nature',
 'park': 'Nature',
 'rock': 'Nature',
 'sand': 'Nature',
 'beach': 'Nature',
 'cliff': 'Nature',
 'islet': 'Nature',
 'ridge': 'Nature',
 'water': 'Nature',
 'wreck': 'Patrimoine',
 'casino': 'Entertainment',
 'castle': 'Patrimoine',
 'cinema': 'Culture',
 'forest': 'Nature',
 'geyser': 'Nature',
 'marina': 'Nature',
 'meadow': 'Nature',
 'museum': 'Culture',
 'valley': 'Nature',
 'theatre': 'Culture',
 'volcano': 'Nature',
 'wetland': 'Nature',
 'heritage': 'Patrimoine',
 'monument': 'Patrimoine',
 'vineyard': 'Nature',
 'viewpoint': 'Nature',
 'waterfall': 'Nature',
 'allotments': 'Patrimoine',
 'attraction': 'Entertainment',
 'theme_park': 'Entertainment',
 'water_park': 'Entertainment',
 'golf_course': 'Entertainment',
 'cave_entrance': 'Culture',
 'national_park': 'Nature',
 'protected_area': 'Nature'}

In [None]:
# SITE_DF: create the "Category" column by looking each "poi" value up in category_dict.
# A "poi" value that is not a key of category_dict ends up as NaN in "Category".
site_df["Category"] = site_df["poi"].map(category_dict)
site_df

### CLEANED DF CHECK

In [None]:
poi_df.info()
site_df.info()
salary_df.info()
georef_df.info() 
stock_df.info() 
sales_df.info()
population_df.info() 
poverty_df.info()
real_estate_df.info()

In [None]:
poi_df.head(1)

In [None]:
site_df.head(1) 

In [None]:
salary_df.head(1)

In [None]:
georef_df.head(1) 

In [None]:
stock_df.head(1) 

In [None]:
sales_df.dtypes

In [None]:
population_df.head(1) 

In [None]:
poverty_df.head(1)

In [None]:
real_estate_df.head(1)

# 🚀 **EXPORT**

In [None]:
# Folder that receives the exported files
output_folder = "../data/cleaned"

# Create the folder when it does not exist yet
os.makedirs(output_folder, exist_ok=True)

# DataFrames to export, keyed by the name used as file-name prefix
dataframes = dict(
    poi_df=poi_df,
    site_df=site_df,
    salary_df=salary_df,
    georef_df=georef_df,
    stock_df=stock_df,
    sales_df=sales_df,
    population_df=population_df,
    poverty_df=poverty_df,
    real_estate_df=real_estate_df,
)

# Write each DataFrame to "<name>_cleaned.csv", without the index column
for name in dataframes:
    df = dataframes[name]
    output_path = os.path.join(output_folder, f"{name}_cleaned.csv")
    df.to_csv(output_path, index=False)
    print(f"DataFrame {name} exporté vers {output_path}")