# Génération du dataset pour l'analyse de données

Les données issues du scraping de l'API IDF Mobilités ne constituent pas un jeu de données en tant que tel. Le rôle de ce notebook est de passer des données brutes à un jeu de données exploitable par la suite.

In [1]:
# Optional: use cudf.pandas to speed up pandas operations on the GPU

!pip install \
  --extra-index-url=https://pypi.nvidia.com \
  cudf-cu12==24.12.* \
  dask-cudf-cu12==24.12.* \
  cuml-cu12==24.12.* \
  cugraph-cu12==24.12.*

%load_ext cudf.pandas

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==24.12.*
  Downloading https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.12.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (26.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/26.7 MB[0m [31m?[0m eta [36m-:--:--[0m

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting dask-cudf-cu12==24.12.*
  Downloading https://pypi.nvidia.com/dask-cudf-cu12/dask_cudf_cu12-24.12.0-py3-none-any.whl (67 kB)
Collecting cuml-cu12==24.12.*
  Downloading https://pypi.nvidia.com/cuml-cu12/cuml_cu12-24.12.0-cp312-cp312-manylinux_2_28_x86_64.whl (547.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.9/547.9 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting cugraph-cu12==24.12.*
  Downloading https://pypi.nvidia.com/cugraph-cu12/cugraph_cu12-24.12.0-cp312-cp312-manylinux_2_28_x86_64.whl (920.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m920.1/920.1 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting cuda-python<13.0a0,<=12.6.0,>=12.0 (from cudf-cu12==24.12.*)
  Downloading cuda_python-12.6.0-cp312-cp312-manylinux_2_17_x86_

In [2]:
# Install the dependencies listed in requirements.txt
!pip install -r requirements.txt

Collecting aiohttp==3.11.10 (from -r requirements.txt (line 3))
  Downloading aiohttp-3.11.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiosignal==1.3.1 (from -r requirements.txt (line 5))
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting attrs==24.2.0 (from -r requirements.txt (line 6))
  Downloading attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting certifi==2024.8.30 (from -r requirements.txt (line 8))
  Downloading certifi-2024.8.30-py3-none-any.whl.metadata (2.2 kB)
Collecting fsspec==2024.10.0 (from -r requirements.txt (line 11))
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Collecting python-dotenv==1.0.1 (from -r requirements.txt (line 17))
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting s3fs==2024.10.0 (from -r requirements.txt (line 19))
  Downloading s3fs-2024.10.0-py3-none-any.whl.metadata (1.7 kB)
Downloading aiohttp-3.11.10-cp312-cp312-manylinux

In [None]:

import json
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

True

Pour avoir plus de flexibilité pour mener notre analyse, nous ne travaillerons pas seulement avec un dataset combinant les données sur les perturbations et les lignes affectées, nous garderons aussi de côté les données propres aux perturbations et aux lignes séparées. Aussi, du fait de perturbations pouvant être de longue durée (travaux...), nos données brutes comportent de nombreux doublons que nous allons devoir traiter.

Nous traiterons les données de sorte à n'avoir plus qu'à créer nos dataframes à partir des objets que nous constituons ici. Créer des dataframes plus rudimentaires puis les raffiner s'avère être un processus très complexe du fait d'objets imbriqués et de listes de longueurs variables dans nos données brutes.

In [15]:
import os

import s3fs
from src.config import ROOT

# Access the SSP Cloud bucket that holds the raw scraping results.
# Requires the MINIO_KEY and MINIO_SECRET_KEY environment variables to be set.

KEY = os.environ.get("MINIO_KEY")
SECRET = os.environ.get("MINIO_SECRET_KEY")

fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"}, key=KEY, secret=SECRET)

# Keep only the raw scraping files: skip anything under an "intermediate"
# path and hidden files (basename starting with a dot).
paths = [
    p for p in fs.ls(ROOT)
    if "intermediate" not in p and not p.split("/").pop().startswith(".")
]

# Raw occurrences, duplicates included (only used to measure the deduplication).
all_results = []
all_disruptions = []
all_objects = []

# Deduplicated records; the first occurrence of each key wins.
results = {}              # lastUpdatedDate -> lastUpdatedDate
disruptions = {}          # "<disruption_id>:<begin>:<end>" -> one record per application period
objects = {}              # object_id -> object record with its line fields
objects_disruptions = {}  # "<object_id>:<disruption_id>" -> link record

for file_path in paths:
    try:
        # UTF-8 is a superset of ASCII, so ASCII-only files still load while
        # files containing accented characters no longer fail to decode.
        with fs.open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        last_updated = data.get('lastUpdatedDate')
        all_results.append(last_updated)
        results.setdefault(last_updated, last_updated)

        # "or []" also covers keys that are present but null in the JSON.
        for d in data.get('disruptions') or []:
            disruption_id = d.get('id')
            all_disruptions.append(disruption_id)

            for period in d.get('applicationPeriods') or []:
                begin = period.get('begin')
                end = period.get('end')

                key = f"{disruption_id}:{begin}:{end}"

                if key not in disruptions:
                    disruptions[key] = {
                        'disruption_id': disruption_id,
                        'begin': begin,
                        'end': end,
                        'lastUpdate': d.get('lastUpdate'),
                        'cause': d.get('cause'),
                        'severity': d.get('severity'),
                        'title': d.get('title'),
                        'message': d.get('message'),
                        'file_lastUpdatedDate': last_updated
                    }

        for line in data.get('lines') or []:
            line_id = line.get('id')

            for o in line.get('impactedObjects') or []:
                object_id = o.get('id')
                all_objects.append(object_id)

                # Fields shared by the object record and the link records;
                # a fresh dict is unpacked into each record below.
                shared_fields = {
                    'line_id': line_id,
                    'object_name': o.get('name'),
                    'object_type': o.get('type'),
                    'line_name': line.get('name'),
                    'line_shortName': line.get('shortName'),
                    'line_mode': line.get('mode'),
                    'line_networkId': line.get('networkId'),
                    'file_lastUpdatedDate': last_updated
                }

                for disruption_id in o.get('disruptionIds') or []:
                    key = f"{object_id}:{disruption_id}"

                    if key not in objects_disruptions:
                        objects_disruptions[key] = {
                            'object_id': object_id,
                            'disruption_id': disruption_id,
                            **shared_fields
                        }

                if object_id not in objects:
                    objects[object_id] = {'object_id': object_id, **shared_fields}

    except Exception:
        # Report which file failed, then re-raise with the original traceback.
        print("Error occured with file:", file_path)
        raise

Du fait de perturbations pouvant être de longue durée (travaux...), nos données brutes comportent de nombreux doublons que nous allons devoir traiter.

In [16]:
# Sizes of the raw collections (duplicates included).
for label, collection in (
    ("Total results (all):", all_results),
    ("Total disruptions (all):", all_disruptions),
    ("Total objects (all):", all_objects),
):
    print(label, len(collection))

print("#####################")

# Sizes of the deduplicated collections.
for label, collection in (
    ("Total results (no duplicates):", results),
    ("Total disruptions (no duplicates):", disruptions),
    ("Total objects (no duplicates):", objects),
    ("Total objects/disruptions tuples", objects_disruptions),
):
    print(label, len(collection))

Total results (all): 2357
Total disruptions (all): 1724401
Total objects (all): 5476570
#####################
Total results (no duplicates): 2357
Total disruptions (no duplicates): 30177
Total objects (no duplicates): 7570
Total objects/disruptions tuples 102807


In [17]:
# Each dict maps a deduplication key to one record (a dict of fields).
# orient="index" turns every record into a row with the fields as columns;
# the default orient="columns" would instead create one column per key.
df_disruptions = pd.DataFrame.from_dict(disruptions, orient="index")
df_objects = pd.DataFrame.from_dict(objects, orient="index")
df_objects_disruptions = pd.DataFrame.from_dict(objects_disruptions, orient="index")

KeyboardInterrupt: 

In [None]:
# Display the first rows of the disruptions dataframe
df_disruptions.head()

In [6]:
# Percentage by which each count shrank between its *_before and *_after values
# (these counters are defined outside this cell).
# Rounding is applied last, on the final percentage, so binary float noise
# (e.g. 98.85000000000001) does not leak into the printed value.
print("Total results (%reduction):", round((1 - results_after / results_before) * 100, 2))
print("Total disruptions (%reduction):", round((1 - disruptions_after / disruptions_before) * 100, 2))
print("Total lines (%reduction):", round((1 - lines_after / lines_before) * 100, 2))

Total results (%reduction): 0.0
Total disruptions (%reduction): 98.85000000000001
Total lines (%reduction): 99.92999999999999


On comprend ici que l'API IDF Mobilités est mise à jour plus régulièrement que notre fréquence de scraping (aucun doublon dans les résultats des appels API). Cela veut dire qu'il n'est pas impossible que nous ayons manqué des perturbations de très courte durée sur la période considérée. Nous garderons cela en tête pour l'analyse des données.

La déduplication a toutefois été très importante pour les données sur les perturbations et sur les lignes, ce qui était attendu. Avec 19757 perturbations différentes dans notre jeu de données établi sur 3 semaines, nous avons assez de données pour faire une analyse intéressante, bien que l'idéal soit de produire un outil permettant une analyse continue et automatisée des perturbations fournies par l'API. Avant tout, mettons sur S3 ce jeu de données fraîchement généré.

In [None]:
# Copy the locally generated CSV files to the "intermediate-data/" folder of the bucket.
paths = ["data/disruptions.csv", "data/lines.csv", "data/lines_unique.csv", "data/results.csv"]

for path in paths:
    remote_path = "intermediate-data/" + path.split("/").pop()
    target = ROOT + remote_path
    if fs.exists(target):
        # Already uploaded: never overwrite an existing remote file.
        continue
    # The destination must be opened through the S3 filesystem (fs.open);
    # the builtin open() would only write to the local disk.
    with open(path, "r", encoding="utf-8") as local_f, \
            fs.open(target, "w", encoding="utf-8") as remote_f:
        remote_f.write(local_f.read())

In [9]:
# Join lines with the disruptions that affect them
def merge_lines_disruptions(df_lines, df_disruptions):
    """Link every line to its disruptions and attach the disruption details.

    Args:
        df_lines: DataFrame with a 'line_id' column and an 'impactedObjects'
            column holding, for each row, a list of dicts; each dict may carry
            a 'disruptionIds' list.
        df_disruptions: DataFrame with a 'disruption_id' column.

    Returns:
        A DataFrame with one row per (line, impacted object, disruption id)
        link, inner-joined with ``df_disruptions`` on 'disruption_id'. Links
        whose id is absent from ``df_disruptions`` are therefore dropped, and a
        link is repeated once per matching row of ``df_disruptions``.
    """
    line_disruption_links = []

    # Iterating the two columns directly avoids building a Series per row,
    # which is what iterrows() does.
    for line_id, impacted_objects in zip(df_lines['line_id'], df_lines['impactedObjects']):
        # Skip missing values (None / NaN); empty lists simply yield nothing.
        if not pd.api.types.is_list_like(impacted_objects):
            continue
        for obj in impacted_objects:
            # "or []" covers a 'disruptionIds' key that is present but null.
            for d_id in obj.get('disruptionIds') or []:
                line_disruption_links.append({
                    'line_id': line_id,
                    'disruption_id': d_id
                })

    # Explicit columns keep the join key present even when no link was found.
    df_line_disruption = pd.DataFrame(
        line_disruption_links, columns=['line_id', 'disruption_id']
    )

    # merge() returns a new frame: its result must be returned, not discarded.
    return df_line_disruption.merge(df_disruptions, on='disruption_id')

In [10]:
# Build the line/disruption table, then write it to a local CSV file.
df_line_disruption = merge_lines_disruptions(df_lines=df_lines, df_disruptions=df_disruptions)

df_line_disruption.to_csv(path_or_buf="data/line_disruption.csv")

column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24


: 

In [25]:
# Number of rows in the line/disruption table
len(df_line_disruption)

column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24
column_view::get_data: Unsupported type: 24


5245