In [11]:
from pathlib import Path
import json

# Chemin du fichier notebook
notebook_path = Path("01_nettoyage_et_filtrage.ipynb")
notebook_path.parent.mkdir(parents=True, exist_ok=True)

# Contenu complet du notebook avec étapes de nettoyage et filtrage
notebook_content = {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Nettoyage et Filtrage du Dataset\n",
    "Ce notebook effectue les premières étapes de traitement sur le fichier CSV original :\n",
    "- Suppression des lignes vides\n",
    "- Suppression des colonnes inutiles\n",
    "- Suppression des doublons\n",
    "- Filtrage des stations de métro\n",
    "- Séparation en train/test\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Charger le dataset brut\n",
    "df = pd.read_csv('../data/raw/qualite-de-lair-dans-le-reseau-de-transport-francilien.csv', sep=';', low_memory=False)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Supprimer les lignes vides et les doublons\n",
    "df.dropna(how='all', inplace=True)\n",
    "df.drop_duplicates(inplace=True)\n",
    "df.reset_index(drop=True, inplace=True)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Afficher les colonnes\n",
    "df.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Supprimer les colonnes jugées inutiles\n",
    "colonnes_a_supprimer = [\n",
    "    'Lien vers les mesures en direct', 'Durée des mesures', 'Mesures d’amélioration mises en place ou prévues',\n",
    "    'point_geo', 'pollution_air', 'niveau', 'actions'\n",
    "]\n",
    "df.drop(columns=colonnes_a_supprimer, inplace=True, errors='ignore')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Garder uniquement les stations de métro\n",
    "df_metro = df[df['Nom de la ligne'].str.contains('Métro', na=False)]\n",
    "df_metro.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Séparer en train (70%) et test (30%)\n",
    "train, test = train_test_split(df_metro, test_size=0.3, random_state=42)\n",
    "\n",
    "# Sauvegarder les fichiers\n",
    "train.to_csv('../data/processed/train.csv', index=False)\n",
    "test.to_csv('../data/processed/test.csv', index=False)\n",
    "print('✅ Fichiers train.csv et test.csv enregistrés.')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

# Écriture du fichier notebook
notebook_path.write_text(json.dumps(notebook_content, indent=2))


3188