"""
ETL - Transport Urbain : Chicago & Philadelphie
================================================
Projet : Dashboard Power BI - Analyse Ridership
Auteur : Data Analyst
Date : Février 2026


"""

In [2]:
import sys

# Print the path of the Python interpreter that runs this kernel.
kernel_python = sys.executable
print(kernel_python)

c:\Users\leila\anaconda3\python.exe


In [3]:
%pip install rdflib

Collecting rdflib
  Downloading rdflib-7.5.0-py3-none-any.whl (587 kB)
Collecting isodate<1.0.0,>=0.7.2
  Downloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.7.2 rdflib-7.5.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
from scipy import stats
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import RDF, XSD


In [20]:
# Source RDF exports, in the order the cells below rely on:
#   [0] daily totals per route, [1] monthly day-type averages per route.
# Raw strings are required: the Windows paths contain "\P", "\B", "\d" and
# "\C", which are invalid escape sequences in a regular string literal
# (Python warns about them and plans to reject them in a future version).
RDF_FILES = [
    r"C:\Python_challenges\Brief_jury_blanc\data-as-files-698094fa4d54e700584170\CTA Chicago - Ridership - Bus Routes - Daily Type Averages & Totals (RDF).rdf",
    r"C:\Python_challenges\Brief_jury_blanc\data-as-files-698094fa4d54e700584170\CTA Chicago - Ridership - Bus Routes - Monthly Day-Type Averages & Totals (RDF).rdf",
]

In [29]:

# Exploratory load of the daily dataset: parse the file (format="xml") and
# report how many triples the graph holds.
# The path comes from RDF_FILES instead of being repeated here as a non-raw
# string literal, whose "\P", "\B", "\d" and "\C" sequences are invalid
# escapes that Python warns about.
g = Graph()
g.parse(RDF_FILES[0], format="xml")

print("Nombre de triplets :", len(g))


Nombre de triplets : 3500


In [30]:
# Collect every distinct predicate of the graph currently held in `g`.
predicates = {predicate for _, predicate, _ in g}

# Show at most 20 of them to get an overview of the available fields.
for p in list(predicates)[:20]:
    print(p)

https://data.cityofchicago.org/resource/jyb9-n7fm/route
https://data.cityofchicago.org/resource/jyb9-n7fm/daytype
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.socrata.com/rdf/terms#rowID
http://www.w3.org/2000/01/rdf-schema#member
https://data.cityofchicago.org/resource/jyb9-n7fm/date
https://data.cityofchicago.org/resource/jyb9-n7fm/rides


In [27]:
# Exploratory load of the monthly dataset: parse the file (format="xml") and
# report how many triples the graph holds.
# The path comes from RDF_FILES instead of being repeated here as a non-raw
# string literal, whose "\P", "\B", "\d" and "\C" sequences are invalid
# escapes that Python warns about.
g = Graph()
g.parse(RDF_FILES[1], format="xml")

print("Nombre de triplets :", len(g))

Nombre de triplets : 5000


In [28]:
# Collect every distinct predicate of the graph currently held in `g`.
predicates = {predicate for _, predicate, _ in g}

# Show at most 20 of them to get an overview of the available fields.
for p in list(predicates)[:20]:
    print(p)

https://data.cityofchicago.org/resource/bynn-gwxy/avg_saturday_rides
https://data.cityofchicago.org/resource/bynn-gwxy/routename
https://data.cityofchicago.org/resource/bynn-gwxy/avg_weekday_rides
https://data.cityofchicago.org/resource/bynn-gwxy/route
https://data.cityofchicago.org/resource/bynn-gwxy/monthtotal
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.socrata.com/rdf/terms#rowID
http://www.w3.org/2000/01/rdf-schema#member
https://data.cityofchicago.org/resource/bynn-gwxy/avg_sunday_holiday_rides
https://data.cityofchicago.org/resource/bynn-gwxy/month_beginning


In [31]:
# Predicate URIs of the jyb9-n7fm resource, keyed by a short field name.
NS1 = dict(
    route=URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/route"),
    date=URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/date"),
    day_type=URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/daytype"),
    ridership=URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/rides"),
)

# Predicate URIs of the bynn-gwxy resource, keyed by a short field name.
NS2 = dict(
    route=URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/route"),
    route_name=URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/routename"),
    avg_weekday=URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_weekday_rides"),
    avg_saturday=URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_saturday_rides"),
    avg_sunday=URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_sunday_holiday_rides"),
    month_total=URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/monthtotal"),
    month_beginning=URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/month_beginning"),
)


In [36]:
# --- Daily ridership: RDF -> cleaned DataFrame -> CSV -----------------------
RDF_DAILY = RDF_FILES[0]

# Predicate URIs of the daily dataset, keyed by the output column they feed.
NS_DAILY = {
    "route": URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/route"),
    "date": URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/date"),
    "day_type": URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/daytype"),
    "ridership": URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/rides")
}

g = Graph()
g.parse(RDF_DAILY, format="xml")
print("Triplets RDF (daily):", len(g))

# Output columns, in the order they appear in the CSV.
daily_columns = ("date", "route", "day_type", "ridership")
# Reverse lookup (predicate URI -> column) so each triple needs one dict access.
daily_column_by_predicate = {uri: column for column, uri in NS_DAILY.items()}

# One record per RDF subject; every column starts out as None.
records = {}

for s, p, o in g:
    column = daily_column_by_predicate.get(p)
    if column is None:
        # Predicate that is not exported.
        continue

    row = records.get(s)
    if row is None:
        row = records[s] = dict.fromkeys(daily_columns)

    if column == "ridership":
        try:
            row[column] = int(o)
        except (TypeError, ValueError):
            # Only conversion failures are tolerated (a bare `except:` would
            # also hide KeyboardInterrupt and genuine bugs). The row is
            # removed by the dropna() below.
            row[column] = None
    else:
        row[column] = str(o)

df_daily = pd.DataFrame.from_dict(records, orient="index")

# Cleaning
df_daily["date"] = pd.to_datetime(df_daily["date"], errors="coerce")
df_daily = df_daily.dropna(subset=["date", "route", "ridership"])
df_daily["route"] = df_daily["route"].str.upper().str.strip()
df_daily["day_type"] = df_daily["day_type"].str.capitalize()
# A single unparsable count turns the whole column into floats (None -> NaN),
# which would be exported as "123.0"; once those rows are dropped, the column
# can safely go back to integers.
df_daily["ridership"] = df_daily["ridership"].astype("int64")

# CSV export
OUTPUT_DAILY = r"C:\Python_challenges\Brief_jury_blanc\output\chicago_ridership_daily.csv"
df_daily.to_csv(OUTPUT_DAILY, index=False)

print("âœ… CSV DAILY gÃ©nÃ©rÃ© :", OUTPUT_DAILY)

Triplets RDF (daily): 3500
âœ… CSV DAILY gÃ©nÃ©rÃ© : C:\Python_challenges\Brief_jury_blanc\output\chicago_ridership_daily.csv


In [37]:
# --- Monthly ridership: RDF -> cleaned DataFrame -> CSV ---------------------
RDF_MONTHLY = RDF_FILES[1]

# Predicate URIs of the monthly dataset, keyed by a short field name.
NS_MONTHLY = {
    "route": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/route"),
    "route_name": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/routename"),
    "avg_weekday": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_weekday_rides"),
    "avg_saturday": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_saturday_rides"),
    "avg_sunday": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_sunday_holiday_rides"),
    "month_total": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/monthtotal"),
    "month_beginning": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/month_beginning")
}

g = Graph()
g.parse(RDF_MONTHLY, format="xml")
print("Triplets RDF (monthly):", len(g))

# NS_MONTHLY key -> output column, listed in the order of the CSV columns.
monthly_column_of = {
    "month_beginning": "month_beginning",
    "route": "route",
    "route_name": "route_name",
    "avg_weekday": "avg_weekday_rides",
    "avg_saturday": "avg_saturday_rides",
    "avg_sunday": "avg_sunday_holiday_rides",
    "month_total": "month_total",
}
# Columns stored as text; every other column is converted to float.
monthly_text_columns = {"month_beginning", "route", "route_name"}
# Reverse lookup (predicate URI -> column) so each triple needs one dict access.
monthly_column_by_predicate = {
    NS_MONTHLY[key]: column for key, column in monthly_column_of.items()
}

# One record per RDF subject; every column starts out as None.
records = {}

for s, p, o in g:
    column = monthly_column_by_predicate.get(p)
    if column is None:
        # Predicate that is not exported.
        continue

    row = records.get(s)
    if row is None:
        row = records[s] = dict.fromkeys(monthly_column_of.values())

    if column in monthly_text_columns:
        row[column] = str(o)
    else:
        try:
            row[column] = float(o)
        except (TypeError, ValueError):
            # Same tolerance as the daily extraction: one malformed number
            # leaves a missing value instead of aborting the whole cell.
            row[column] = None

df_monthly = pd.DataFrame.from_dict(records, orient="index")

# Cleaning
df_monthly["month_beginning"] = pd.to_datetime(df_monthly["month_beginning"], errors="coerce")
df_monthly["route"] = df_monthly["route"].str.upper().str.strip()

# CSV export
OUTPUT_MONTHLY = r"C:\Python_challenges\Brief_jury_blanc\output\chicago_ridership_monthly.csv"
df_monthly.to_csv(OUTPUT_MONTHLY, index=False)

print("âœ… CSV MONTHLY gÃ©nÃ©rÃ© :", OUTPUT_MONTHLY)

Triplets RDF (monthly): 5000
âœ… CSV MONTHLY gÃ©nÃ©rÃ© : C:\Python_challenges\Brief_jury_blanc\output\chicago_ridership_monthly.csv


In [38]:
# Quick look at the monthly frame: first rows, then its column summary.
print("ðŸ“Š RÃ©sultat global :")
print(df_monthly.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df_monthly.info()

ðŸ“Š RÃ©sultat global :
                                                   month_beginning route  \
https://data.cityofchicago.org/resource/bynn-gw...      2001-01-01    65   
https://data.cityofchicago.org/resource/bynn-gw...      2001-04-01   53A   
https://data.cityofchicago.org/resource/bynn-gw...      2001-04-01    11   
https://data.cityofchicago.org/resource/bynn-gw...      2001-03-01    29   
https://data.cityofchicago.org/resource/bynn-gw...      2001-04-01    93   

                                                          route_name  \
https://data.cityofchicago.org/resource/bynn-gw...             Grand   
https://data.cityofchicago.org/resource/bynn-gw...     South Pulaski   
https://data.cityofchicago.org/resource/bynn-gw...  Lincoln/Sedgwick   
https://data.cityofchicago.org/resource/bynn-gw...             State   
https://data.cityofchicago.org/resource/bynn-gw...  California/Dodge   

                                                    avg_weekday_rides  \
https://data.