# Install required libraries
To install the required libraries from the requirements file, use the following command:
"pip install -r requirements.txt"

# 0. Initialization

In [None]:
import numpy as np # a library for numerical computations and handling arrays
import pandas as pd #  a library for data manipulation and analysis, often used for handling tabular data

# Visualisation Library
import matplotlib.pyplot as plt # module for data visualization through plots and charts
import seaborn as sns # a data visualization library built on Matplotlib, offering advanced plotting functions and attractive visual styles
import matplotlib.ticker as ticker
# Classification continuous
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Classification Categorical
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

# Processing Classification
from sklearn.model_selection import train_test_split
from sklearn import metrics

# avoid displaying warnings
import warnings
warnings.filterwarnings("ignore")

# 1. Collect the Data

In [None]:
# Read the raw transactions CSV (path relative to the notebook) into a DataFrame
data = pd.read_csv(filepath_or_buffer="../data/G3_immobiliers.csv")

# 2. Data Cleaning and Preparation

In [None]:
# Preview the top 10 rows of the raw dataset to get a feel for its columns and values
data.head(n=10)

In [None]:
# Print a concise summary of the raw dataset: column names, dtypes and non-null counts
data.info()

In [None]:
# Compute descriptive statistics (count, mean, std, min, quartiles, max) for the numerical columns
data.describe()

In [None]:
# Count how many values are missing in each column of the raw dataset
data.isna().sum()

In [None]:
# Keep only the columns that are useful for predictive modeling; every other
# column of the raw dataset is left out.
# The explicit .copy() makes df_data an independent DataFrame. Without it, the
# column assignments done on df_data afterwards would operate on a selection of
# `data`, which is the pattern pandas reports with SettingWithCopyWarning (not
# visible here because warnings are globally silenced).
df_data = data[[
    "valeur_fonciere", "date_mutation",  # Price and transaction date
    "code_postal", "code_commune", "nom_commune", "code_departement",  # Location
    "longitude", "latitude",  # GPS coordinates
    "type_local", "surface_reelle_bati", "nombre_pieces_principales",  # Goods characteristics
    "surface_terrain", "nature_culture",  # Land characteristics
]].copy()

In [None]:
# Parse "valeur_fonciere" as a number; entries that cannot be parsed become NaN
df_data["valeur_fonciere"] = pd.to_numeric(
    df_data["valeur_fonciere"], errors="coerce"
)
# Keep only the rows that have a usable price
df_data = df_data[df_data["valeur_fonciere"].notna()]

In [None]:
# Parse "surface_reelle_bati" as a number; entries that cannot be parsed become NaN
df_data["surface_reelle_bati"] = pd.to_numeric(
    df_data["surface_reelle_bati"], errors="coerce"
)
# Keep only the rows that have a usable built surface
df_data = df_data[df_data["surface_reelle_bati"].notna()]

In [None]:
# Remove rows in which more than 50% of the columns are missing.
# `thresh` is the minimum number of non-missing values a row needs in order to be
# kept, and it is documented as an integer. Half the column count is therefore
# rounded up with integer arithmetic instead of passing a float such as 6.5.
# The kept rows are the same as with `len(df_data.columns) * 0.5`, because the
# per-row count of non-missing values is always a whole number.
df_data = df_data.dropna(thresh=(len(df_data.columns) + 1) // 2)

In [None]:
# Numerical columns: coerce each one to a numeric dtype, then impute the gaps
# (including values that could not be parsed) with that column's median
cols_num = ["surface_reelle_bati", "surface_terrain", "nombre_pieces_principales", "valeur_fonciere"]
for col in cols_num:
    numeric_column = pd.to_numeric(df_data[col], errors="coerce")  # unparsable -> NaN
    df_data[col] = numeric_column.fillna(numeric_column.median())  # NaN -> median

In [None]:
# Categorical columns: replace missing values with the placeholder label "Unknown"
cols_cat = ["type_local", "nature_culture"]
# A {column: fill value} mapping restricts the fill to the listed columns
df_data = df_data.fillna(dict.fromkeys(cols_cat, "Unknown"))

In [None]:
# One-hot encode the categorical columns into indicator columns.
# drop_first=True removes the indicator of the first category of each variable,
# so that category is represented by all of its sibling indicators being unset.
df_data = pd.get_dummies(
    data=df_data,
    columns=["type_local", "nature_culture"],
    drop_first=True,
)

In [None]:
# Preview the top 10 rows of the cleaned and encoded dataset
df_data.head(n=10)

In [None]:
# Print a concise summary (column names, dtypes, non-null counts) of the cleaned and encoded dataset
df_data.info()

# 3. Distribution des Prix Immobiliers

In [None]:
# Histogram (with density curve) of transaction prices, limited to prices of at
# most 1,000,000 so that extreme values do not flatten the plot
plt.figure(figsize=(8, 5))
prices = df_data["valeur_fonciere"]
sns.histplot(prices[prices <= 1e6], bins=50, kde=True)

# Show x-axis ticks as whole numbers with thousands separators
axis = plt.gca()
axis.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{x:,.0f}'))

plt.title("Distribution des valeurs foncières")
plt.xlabel("Prix (€)")
plt.ylabel("Nombre de transactions")
plt.show()

# 4. Visualisation des Relations entre Surface et Prix par Type de Bien

In [None]:
# Recover the property type of every row still present in df_data.
# The "type_local_*" dummy columns cannot be used for this: the one-hot encoding
# was done with drop_first=True, so the first category has no dummy column and
# would never get its own plot. The labels are read back from the raw `data`
# frame instead, using the same "Unknown" placeholder as the cleaning step.
# NOTE(review): relies on df_data still carrying data's row labels (no index reset so far).
type_labels = data.loc[df_data.index, "type_local"].fillna("Unknown")
unique_types = sorted(type_labels.unique())

# Range limits applied to every plot: surface <= 4000 and price <= 1,000,000
in_plot_range = (df_data.surface_reelle_bati <= 4000) & (df_data.valeur_fonciere <= 1e6)

# Draw a separate scatter plot for each property type
for type_name in unique_types:
    subset = df_data[(type_labels == type_name) & in_plot_range]

    # Skip types that have no transaction inside the plotted range
    if subset.empty:
        continue

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=subset.surface_reelle_bati, y=subset.valeur_fonciere)
    plt.title(f"Relation entre Surface Habitable et Prix pour {type_name}")
    plt.xlabel("Surface habitable (m²)")
    plt.ylabel("Prix (€)")
    plt.show()

In [None]:
# Bar chart of the number of transactions per property type
plt.figure(figsize=(6, 4))

# Read the property type of every row kept in df_data back from the raw `data`
# frame. Summing the "type_local_*" dummy columns would leave out one type: the
# one-hot encoding was done with drop_first=True, so the first category has no
# dummy column and its transactions would be missing from the chart.
# NOTE(review): relies on df_data still carrying data's row labels (no index reset so far).
type_labels = data.loc[df_data.index, "type_local"].fillna("Unknown")

# Count the occurrences of each type, in alphabetical order of the type name
type_counts = type_labels.value_counts().sort_index().to_dict()

# Build a two-column DataFrame (type_local, count) for plotting
df_type_counts = pd.DataFrame.from_dict(type_counts, orient="index", columns=["count"]).reset_index()
df_type_counts.rename(columns={"index": "type_local"}, inplace=True)

# Draw the chart
sns.barplot(x="type_local", y="count", data=df_type_counts)
plt.title("Répartition des types de biens")
plt.xticks(rotation=45)
plt.xlabel("Type de bien")
plt.ylabel("Nombre de transactions")
plt.show()
