In [None]:
# Install PySpark, then a headless OpenJDK 8 (apt output is discarded via /dev/null)
!pip install pyspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null



In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

In [None]:
# Create the Spark session named "TD2", or reuse the one already running
session_builder = SparkSession.builder.appName("TD2")
spark = session_builder.getOrCreate()

# 1 – Charger les jeux de données suivants (10 minutes) :

- Validations sur le réseau ferré
- Vacances scolaires
- Jours fériés

In [None]:
# Load the three Parquet inputs, in order: rail validations (fact table),
# public holidays, school holidays.
df_fact, df_jour_feries, df_vacances = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "fact_validations.parquet",
        "dim_jour_feries.parquet",
        "dim_vacances_scolaires.parquet",
    )
)

In [None]:
# Preview the first 5 validation rows as a pandas frame
df_fact.limit(5).toPandas()

Unnamed: 0,JOUR,LIBELLE_ARRET,CATEGORIE_TITRE,ID_REFA_LDA,CODE_STIF_RES,CODE_STIF_TRNS,NB_VALD
0,2020-06-20,GARE DE LYON,?,73626,804,800,5
1,2019-02-07,GARE DE LYON,?,73626,804,800,5
2,2019-03-13,GARE DE LYON,?,73626,804,800,14
3,2019-03-15,GARE DE LYON,?,73626,804,800,14
4,2019-06-19,GARE DE LYON,?,73626,804,800,9


In [None]:
# Preview the first 5 public-holiday rows as a pandas frame
df_jour_feries.limit(5).toPandas()

Unnamed: 0,annee,date_ferie,nom_ferie
0,2015,2015-01-01,1er janvier
1,2015,2015-04-06,Lundi de Pâques
2,2015,2015-05-01,1er mai
3,2015,2015-05-08,8 mai
4,2015,2015-05-14,Ascension


In [None]:
# Preview the first 5 school-holiday rows as a pandas frame
df_vacances.limit(5).toPandas()

Unnamed: 0,description,population,start_date,end_date,location,zones,annee_scolaire
0,Vacances de la Toussaint,-,2009-10-25 23:00:00,2009-11-08 23:00:00,Corse,Corse,2009-2010
1,Vacances de Noël,-,2009-12-20 23:00:00,2010-01-03 23:00:00,Corse,Corse,2009-2010
2,Vacances d'Hiver,-,2010-02-21 23:00:00,2010-03-07 23:00:00,Corse,Corse,2009-2010
3,Vacances de Printemps,-,2010-04-25 22:00:00,2010-05-09 22:00:00,Corse,Corse,2009-2010
4,Vacances d'Été,Enseignants,2010-07-07 22:00:00,2010-09-07 22:00:00,Corse,Corse,2009-2010


# 2 - Créer `core_df` : index temporel journalier par gare (30 min)

Créer un dataset `core_df` contenant, pour **chaque gare unique**, un **index temporel complet à granularité journalière**, couvrant la période allant de la **date minimale** à la **date maximale** observée dans les données.

Résultat attendu : **une ligne par gare et par jour**

### Exemple attendu
Supposons :
- Gares uniques : `["Gare_A", "Gare_B"]`
- `min_date = 2024-01-01`
- `max_date = 2024-01-03`

Alors `core_df` doit contenir :

| gare   | date       |
|--------|------------|
| Gare_A | 2024-01-01 |
| Gare_A | 2024-01-02 |
| Gare_A | 2024-01-03 |
| Gare_B | 2024-01-01 |
| Gare_B | 2024-01-02 |
| Gare_B | 2024-01-03 |

> On obtient donc toutes les combinaisons **(gare, date)** sur l’intervalle, même si certaines dates n’existent pas initialement dans les validations.


In [None]:
# Every distinct station label seen in the validations.
gares = df_fact.select("LIBELLE_ARRET").distinct()

# Single-row frame holding the first and the last validation day.
date_bounds = df_fact.agg(
    F.min("JOUR").alias("min_date"),
    F.max("JOUR").alias("max_date"),
)

# Daily calendar between the two bounds (both included), one row per day.
one_day = F.make_interval(days=F.lit(1))
calendar = date_bounds.select(
    F.explode(
        F.sequence(F.col("min_date"), F.col("max_date"), one_day)
    ).alias("dt_jour")
)

# One row per (day, station): the complete daily index for every station.
df_core = calendar.crossJoin(gares)

In [None]:
from pyspark.sql import functions as F

# Work done by the student

# Distinct station labels found in the validations.
gares = df_fact.select("LIBELLE_ARRET").distinct()

# One row with the earliest and the latest validation day.
bounds = df_fact.agg(
    F.min("JOUR").alias("min_date"),
    F.max("JOUR").alias("max_date"),
)

# Inclusive daily sequence between the two bounds, exploded to one row per day.
days = bounds.select(
    F.explode(
        F.sequence(
            F.col("min_date"),
            F.col("max_date"),
            F.make_interval(days=F.lit(1)),
        )
    ).alias("date")
)

# Pair every day with every station, then expose the columns as (gare, date).
core_df = days.crossJoin(gares).select(
    F.col("LIBELLE_ARRET").alias("gare"),
    "date",
)

core_df.orderBy("gare", "date").show(10, truncate=False)


+-------------+----------+
|gare         |date      |
+-------------+----------+
|GARE DE L'EST|2017-01-01|
|GARE DE L'EST|2017-01-02|
|GARE DE L'EST|2017-01-03|
|GARE DE L'EST|2017-01-04|
|GARE DE L'EST|2017-01-05|
|GARE DE L'EST|2017-01-06|
|GARE DE L'EST|2017-01-07|
|GARE DE L'EST|2017-01-08|
|GARE DE L'EST|2017-01-09|
|GARE DE L'EST|2017-01-10|
+-------------+----------+
only showing top 10 rows


## 3 - Ajouter au dataset core_df une colonne par période de vacances scolaires (45 Min)

À partir des datasets **Vacances Scolaires** et **Jours Fériés**, enrichir `core_df` en ajoutant **une colonne indicatrice (0/1)** pour **chaque période de vacances scolaires** (ex. *Toussaint*, *Noël*, *Hiver*, *Printemps*, *Été*), indiquant si la date de `core_df` appartient à cette période.

### Exemple attendu
Pour une ligne de `core_df` correspondant au **2024-12-28**, on pourrait obtenir :

| date       | gare | vacances_toussaint | vacances_noel | vacances_hiver | vacances_printemps | vacances_ete |
|------------|------|---------------------|---------------|----------------|--------------------|--------------|
| 2024-12-28 | A... | 0                   | 1             | 0              | 0                  | 0            |

> Ici, `vacances_noel = 1` car la date est incluse dans la période des vacances de Noël, sinon la valeur est `0`.


In [None]:
# Calendar features derived from the day column. Spark's dayofweek() numbers
# the days 1 (Sunday) to 7 (Saturday), hence the {1, 7} weekend test.
jour = F.col("dt_jour")

df_core = df_core.withColumns(
    {
        "dow": F.dayofweek(jour),
        "dom": F.dayofmonth(jour),
        "is_weekend": F.when(F.dayofweek(jour).isin([1, 7]), 1).otherwise(0),
        "moy": F.month(jour),
        "year": F.year(jour),
    }
)

df_core.toPandas()

Unnamed: 0,dt_jour,LIBELLE_ARRET,dow,dom,is_weekend,moy,year
0,2017-01-01,GARE DU NORD,1,1,1,1,2017
1,2017-01-02,GARE DU NORD,2,2,0,1,2017
2,2017-01-03,GARE DU NORD,3,3,0,1,2017
3,2017-01-04,GARE DU NORD,4,4,0,1,2017
4,2017-01-05,GARE DU NORD,5,5,0,1,2017
...,...,...,...,...,...,...,...
10950,2022-12-27,GARE DE LYON,3,27,0,12,2022
10951,2022-12-28,GARE DE LYON,4,28,0,12,2022
10952,2022-12-29,GARE DE LYON,5,29,0,12,2022
10953,2022-12-30,GARE DE LYON,6,30,0,12,2022


In [None]:
# Student's work

# School-holiday zone of the stations studied in this notebook (Paris stations).
# df_vacances mixes several zones (the sample rows displayed above are for
# "Corse"); without this filter a day is flagged as soon as ANY zone is on
# holiday, e.g. 2021-12-15 came out with vacances_noel = 1.
# NOTE(review): assumes the `zones` column labels the Paris academies "Zone C"
# -- confirm with df_vacances.select("zones").distinct() before relying on it.
VACANCES_ZONE = "Zone C"

# Indicator column -> keyword searched in the holiday `description`.
VACANCES_KEYWORDS = {
    "vacances_toussaint": "Toussaint",
    "vacances_noel": "Noël",
    "vacances_hiver": "Hiver",
    "vacances_printemps": "Printemps",
    "vacances_ete": "Été",
}

# Columns identifying one (station, day) row of df_core; grouping on them
# collapses the join fan-out back to a single row per station and per day.
CORE_KEYS = ["LIBELLE_ARRET", "dt_jour", "dow", "dom", "is_weekend", "moy", "year"]

c = df_core.alias("c")
v = df_vacances.filter(F.col("zones") == VACANCES_ZONE).alias("v")

# Left join: each day is paired with every holiday period that contains it;
# days outside any period keep a single row with null holiday columns.
df_core_vacances = c.join(
    v,
    F.col("c.dt_jour").between(F.col("v.start_date"), F.col("v.end_date")),
    how="left",
)

# One boolean condition per indicator (null when no period matched the day).
description = F.col("v.description")
conditions = {
    flag: description.contains(keyword)
    for flag, keyword in VACANCES_KEYWORDS.items()
}
# Summer is only counted in July and August: some summer rows run into
# September (the "Enseignants" sample row above ends on 7 September).
conditions["vacances_ete"] = conditions["vacances_ete"] & F.col("moy").isin([7, 8])

# A day gets 1 for an indicator when at least one matching period carries the
# keyword, 0 otherwise (null conditions fall through to the `otherwise`).
df_core = (
    df_core_vacances
    .groupBy(*CORE_KEYS)
    .agg(
        *[
            F.max(F.when(condition, 1).otherwise(0)).alias(flag)
            for flag, condition in conditions.items()
        ]
    )
)

df_core.toPandas()


Unnamed: 0,LIBELLE_ARRET,dt_jour,dow,dom,is_weekend,moy,year,vacances_toussaint,vacances_noel,vacances_hiver,vacances_printemps,vacances_ete
0,GARE DU NORD,2017-01-01,1,1,1,1,2017,0,1,0,0,0
1,GARE DU NORD,2017-01-02,2,2,0,1,2017,0,1,0,0,0
2,GARE DU NORD,2017-01-03,3,3,0,1,2017,0,0,0,0,0
3,GARE DU NORD,2017-01-04,4,4,0,1,2017,0,0,0,0,0
4,GARE DU NORD,2017-01-05,5,5,0,1,2017,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10950,GARE DE LYON,2020-12-24,5,24,0,12,2020,0,1,0,0,0
10951,GARE DE LYON,2021-12-15,4,15,0,12,2021,0,1,0,0,0
10952,GARE DE LYON,2021-12-29,4,29,0,12,2021,0,1,0,0,0
10953,GARE DE LYON,2022-04-05,3,5,0,4,2022,0,0,0,0,0


## 4 - Calculer les lags temporels (1 à 42 jours) (15 min)

À partir du dataset `core_df`, calculer pour **chaque gare** et **chaque jour**, les **variables de retard (lags)** allant de **1 à 42 jours** sur la variable cible (ex. nombre de validations).

Chaque lag `lag_k` correspond à la valeur observée **k jours auparavant** pour la même gare.

### Exemple attendu
Pour la gare `Gare_A`, supposons les données suivantes :

| gare   | date       | validations |
|--------|------------|-------------|
| Gare_A | 2024-01-01 | 100         |
| Gare_A | 2024-01-02 | 120         |
| Gare_A | 2024-01-03 | 130         |
| Gare_A | 2024-01-04 | 110         |

Après calcul des lags (extrait) :

| gare   | date       | validations | lag_1 | lag_2 | lag_3 |
|--------|------------|-------------|-------|-------|-------|
| Gare_A | 2024-01-01 | 100         | null  | null  | null  |
| Gare_A | 2024-01-02 | 120         | 100   | null  | null  |
| Gare_A | 2024-01-03 | 130         | 120   | 100   | null  |
| Gare_A | 2024-01-04 | 110         | 130   | 120   | 100   |

> Les lags sont calculés **par gare**, en respectant l’ordre chronologique des dates.  
> Les premières dates n’ont pas suffisamment d’historique et contiennent donc des valeurs `null`.


In [None]:
from pyspark.sql import Window

# Per-station window in chronological order. df_core holds one row per station
# and per day, so an offset of k rows is a lag of k days.
W = Window.partitionBy("c.LIBELLE_ARRET").orderBy("dt_jour")

# Total validations per day and per station
df_fact_agg = (
    df_fact
    .groupby("JOUR", "LIBELLE_ARRET")
    .agg(F.sum("NB_VALD").alias("NB_VALD"))
)

# Left join onto the calendar: (station, day) pairs without any validation
# keep a row, with NB_VALD null. The fact-side join keys are dropped afterwards.
df_lag = (
    df_core.alias("c")
    .join(
        df_fact_agg.alias("f"),
        how="left",
        on=(F.col("c.dt_jour") == F.col("f.JOUR")) & (F.col("c.LIBELLE_ARRET") == F.col("f.LIBELLE_ARRET"))
    )
    .drop(F.col("f.JOUR"), F.col("f.LIBELLE_ARRET"))
)

# ===== ADD THE 42 LAG COLUMNS (lag1 .. lag42) =====
NB_LAGS = 42
df_lag = df_lag.withColumns(
    {
        f"lag{i}": F.lag(F.col("NB_VALD"), i).over(W)
        for i in range(1, NB_LAGS + 1)
    }
)

# Preview a single station WITHOUT overwriting df_lag: later cells join and
# export df_lag, so it must keep every station.
df_lag_est = (
    df_lag
    .filter(F.col("c.LIBELLE_ARRET") == "GARE DE L'EST")
    .orderBy(F.asc("dt_jour"))
)

df_lag_est.toPandas()

Unnamed: 0,LIBELLE_ARRET,dt_jour,dow,dom,is_weekend,moy,year,vacances_toussaint,vacances_noel,vacances_hiver,...,lag33,lag34,lag35,lag36,lag37,lag38,lag39,lag40,lag41,lag42
0,GARE DE L'EST,2017-01-01,1,1,1,1,2017,0,1,0,...,,,,,,,,,,
1,GARE DE L'EST,2017-01-02,2,2,0,1,2017,0,1,0,...,,,,,,,,,,
2,GARE DE L'EST,2017-01-03,3,3,0,1,2017,0,0,0,...,,,,,,,,,,
3,GARE DE L'EST,2017-01-04,4,4,0,1,2017,0,0,0,...,,,,,,,,,,
4,GARE DE L'EST,2017-01-05,5,5,0,1,2017,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2186,GARE DE L'EST,2022-12-27,3,27,0,12,2022,0,1,0,...,38352.0,37279.0,38335.0,37312.0,9185.0,15388.0,35319.0,38126.0,37051.0,38396.0
2187,GARE DE L'EST,2022-12-28,4,28,0,12,2022,0,1,0,...,35114.0,38352.0,37279.0,38335.0,37312.0,9185.0,15388.0,35319.0,38126.0,37051.0
2188,GARE DE L'EST,2022-12-29,5,29,0,12,2022,0,1,0,...,32846.0,35114.0,38352.0,37279.0,38335.0,37312.0,9185.0,15388.0,35319.0,38126.0
2189,GARE DE L'EST,2022-12-30,6,30,0,12,2022,0,1,0,...,21626.0,32846.0,35114.0,38352.0,37279.0,38335.0,37312.0,9185.0,15388.0,35319.0


In [None]:
# Display the DataFrame repr (column names and types) of df_lag
df_lag

DataFrame[LIBELLE_ARRET: string, dt_jour: date, dow: int, dom: int, is_weekend: int, moy: int, year: int, vacances_toussaint: int, vacances_noel: int, vacances_hiver: int, vacances_printemps: int, vacances_ete: int, NB_VALD: bigint, lag1: bigint, lag2: bigint, lag3: bigint, lag4: bigint, lag5: bigint, lag6: bigint, lag7: bigint, lag8: bigint, lag9: bigint, lag10: bigint, lag11: bigint, lag12: bigint, lag13: bigint, lag14: bigint, lag15: bigint, lag16: bigint, lag17: bigint, lag18: bigint, lag19: bigint, lag20: bigint, lag21: bigint, lag22: bigint, lag23: bigint, lag24: bigint, lag25: bigint, lag26: bigint, lag27: bigint, lag28: bigint, lag29: bigint, lag30: bigint, lag31: bigint, lag32: bigint, lag33: bigint, lag34: bigint, lag35: bigint, lag36: bigint, lag37: bigint, lag38: bigint, lag39: bigint, lag40: bigint, lag41: bigint, lag42: bigint]

In [None]:
# Student's work

NB_LAGS = 42

# (Re)compute lag1..lag{NB_LAGS} of NB_VALD over the per-station window W;
# columns already present under these names are replaced.
df_lag = df_lag.withColumns(
    {
        f"lag{i}": F.lag(F.col("NB_VALD"), i).over(W)
        for i in range(1, NB_LAGS + 1)
    }
)

# Show one station without reassigning df_lag, so that this preview filter
# does not remove the other stations from the frame used by later cells.
(
    df_lag
    .filter(F.col("LIBELLE_ARRET") == "GARE DE L'EST")
    .orderBy(F.col("dt_jour"))
    .show()
)


+-------------+----------+---+---+----------+---+----+------------------+-------------+--------------+------------------+------------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|LIBELLE_ARRET|   dt_jour|dow|dom|is_weekend|moy|year|vacances_toussaint|vacances_noel|vacances_hiver|vacances_printemps|vacances_ete|NB_VALD| lag1| lag2| lag3| lag4| lag5| lag6| lag7| lag8| lag9|lag10|lag11|lag12|lag13|lag14|lag15|lag16|lag17|lag18|lag19|lag20|lag21|lag22|lag23|lag24|lag25|lag26|lag27|lag28|lag29|lag30|lag31|lag32|lag33|lag34|lag35|lag36|lag37|lag38|lag39|lag40|lag41|lag42|
+-------------+----------+---+---+----------+---+----+------------------+-------------+--------------+------------------+------------+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--

## 5 - Calculer des moyennes historiques sans data leakage (45 min)

À partir du dataset `core_df`, calculer, pour **chaque gare** et **pour chaque date** :

1. Le **nombre moyen de passagers par mois**, calculé **uniquement à partir des dates strictement antérieures** à la date courante.
2. Le **nombre moyen de passagers par jour de la semaine**, calculé **uniquement à partir des dates strictement antérieures** à la date courante.

⚠️ **Important :**  
Les moyennes doivent être calculées **sans utiliser d’informations futures**, afin d’éviter tout **data leakage**.

### Exemple attendu

| gare   | date       | passagers | avg_mois_1 | avg_mois_2 | avg_lundi | avg_mardi | avg_mercredi | avg_jeudi | avg_vendredi | avg_samedi | avg_dimanche |
|--------|------------|-----------|------------|------------|-----------|-----------|--------------|-----------|--------------|------------|--------------|
| Gare_A | 2024-01-01 | 100       | null       | null       | null      | null      | null         | null      | null         | null       | null         |
| Gare_A | 2024-01-02 | 120       | 100        | null       | null      | null      | null         | null      | null         | null       | null         |
| Gare_A | 2024-01-03 | 130       | 110        | null       | null      | null      | null         | null      | null         | null       | null         |
| Gare_A | 2024-01-04 | 110       | 116.7      | null       | null      | null      | null         | null      | null         | null       | null         |
| Gare_A | 2024-01-11 | 140       | 116.7      | null       | null      | null      | null         | 110       | null         | null       | null         |


In [None]:
# Travail de l’étudiant

from pyspark.sql import functions as F
from pyspark.sql import Window

# Frame covering every row strictly before the current one (upper bound -1),
# so an average never includes the current day or any later day.
W_past = (
    Window
    .partitionBy("LIBELLE_ARRET")
    .orderBy("dt_jour")
    .rowsBetween(Window.unboundedPreceding, -1)
)

# Output column -> Spark dayofweek() value (1 = Sunday ... 7 = Saturday).
DOW_COLUMNS = {
    "avg_lundi": 2,
    "avg_mardi": 3,
    "avg_mercredi": 4,
    "avg_jeudi": 5,
    "avg_vendredi": 6,
    "avg_samedi": 7,
    "avg_dimanche": 1,
}

# Re-partitioning W_past keeps its ordering and frame: each expression is the
# mean of NB_VALD over the earlier rows of the same station that share the
# current row's month (resp. day of week).
avg_same_month = F.avg("NB_VALD").over(W_past.partitionBy("LIBELLE_ARRET", "moy"))
avg_same_dow = F.avg("NB_VALD").over(W_past.partitionBy("LIBELLE_ARRET", "dow"))

df_avg = df_lag

# avg_mois_<m> is filled only on rows whose month is m, null elsewhere.
for m in range(1, 13):
    df_avg = df_avg.withColumn(
        f"avg_mois_{m}",
        F.when(F.col("moy") == m, avg_same_month),
    )

# avg_<day> is filled only on rows falling on that day of the week.
for avg_column, dow_value in DOW_COLUMNS.items():
    df_avg = df_avg.withColumn(
        avg_column,
        F.when(F.col("dow") == dow_value, avg_same_dow),
    )

# Preview only: df_avg itself keeps all twelve month averages and every row,
# instead of being overwritten by this narrowed, single-station view.
df_avg_preview = (
    df_avg
    .select(
        F.col("LIBELLE_ARRET").alias("gare"),
        F.col("dt_jour").alias("date"),
        F.col("NB_VALD").alias("passagers"),
        "avg_mois_1",
        "avg_mois_2",
        *DOW_COLUMNS,
    )
    .filter(F.col("gare") == "GARE DE L'EST")
    .orderBy("date")
)

df_avg_preview.show()


+-------------+----------+---------+------------------+----------+---------+---------+------------+---------+------------+----------+------------+
|         gare|      date|passagers|        avg_mois_1|avg_mois_2|avg_lundi|avg_mardi|avg_mercredi|avg_jeudi|avg_vendredi|avg_samedi|avg_dimanche|
+-------------+----------+---------+------------------+----------+---------+---------+------------+---------+------------+----------+------------+
|GARE DE L'EST|2017-01-01|     5134|              NULL|      NULL|     NULL|     NULL|        NULL|     NULL|        NULL|      NULL|        NULL|
|GARE DE L'EST|2017-01-02|    24420|            5134.0|      NULL|     NULL|     NULL|        NULL|     NULL|        NULL|      NULL|        NULL|
|GARE DE L'EST|2017-01-03|    34906|           14777.0|      NULL|     NULL|     NULL|        NULL|     NULL|        NULL|      NULL|        NULL|
|GARE DE L'EST|2017-01-04|    36378|21486.666666666668|      NULL|     NULL|     NULL|        NULL|     NULL|        N

## 6 - Ajouter les lags et la Target au dataset `core_df` et imputer les valeurs manquantes (10 min)

In [None]:
# List the column names currently held by df_core
df_core.columns

['LIBELLE_ARRET',
 'dt_jour',
 'dow',
 'dom',
 'is_weekend',
 'moy',
 'year',
 'vacances_toussaint',
 'vacances_noel',
 'vacances_hiver',
 'vacances_printemps',
 'vacances_ete']

In [None]:
# Travail de l’étudiant

from pyspark.sql import functions as F

# Names of the lag columns carried by df_lag (lag1, lag2, ...).
cols_lags = [name for name in df_lag.columns if name.startswith("lag")]

# Target variable: this section asks for the lags AND the target, so the
# aggregated validation count is joined along with the lag columns.
TARGET_COL = "NB_VALD"

df_core_final = (
    df_core
    .join(
        df_lag.select(
            "LIBELLE_ARRET",
            "dt_jour",
            TARGET_COL,
            *cols_lags
        ),
        on=["LIBELLE_ARRET", "dt_jour"],
        how="left"
    )
    # Impute the missing numeric values (no validation that day, not enough
    # history for a lag, or no matching row in df_lag) with 0.
    .fillna(0)
)

pdf = df_core_final.toPandas()
display(pdf)


Unnamed: 0,LIBELLE_ARRET,dt_jour,dow,dom,is_weekend,moy,year,vacances_toussaint,vacances_noel,vacances_hiver,...,lag33,lag34,lag35,lag36,lag37,lag38,lag39,lag40,lag41,lag42
0,GARE DU NORD,2017-01-01,1,1,1,1,2017,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,GARE DU NORD,2017-01-02,2,2,0,1,2017,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,GARE DU NORD,2017-01-03,3,3,0,1,2017,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,GARE DU NORD,2017-01-04,4,4,0,1,2017,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,GARE DU NORD,2017-01-05,5,5,0,1,2017,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10950,GARE DE LYON,2020-12-24,5,24,0,12,2020,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10951,GARE DE LYON,2021-12-15,4,15,0,12,2021,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10952,GARE DE LYON,2021-12-29,4,29,0,12,2021,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10953,GARE DE LYON,2022-04-05,3,5,0,4,2022,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Display the pandas copy of df_core_final once more
display(pdf)

Unnamed: 0,LIBELLE_ARRET,dt_jour,dow,dom,is_weekend,moy,year,vacances_toussaint,vacances_noel,vacances_hiver,...,lag33,lag34,lag35,lag36,lag37,lag38,lag39,lag40,lag41,lag42
0,GARE DU NORD,2017-01-01,1,1,1,1,2017,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,GARE DU NORD,2017-01-02,2,2,0,1,2017,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,GARE DU NORD,2017-01-03,3,3,0,1,2017,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,GARE DU NORD,2017-01-04,4,4,0,1,2017,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,GARE DU NORD,2017-01-05,5,5,0,1,2017,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10950,GARE DE LYON,2020-12-24,5,24,0,12,2020,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10951,GARE DE LYON,2021-12-15,4,15,0,12,2021,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10952,GARE DE LYON,2021-12-29,4,29,0,12,2021,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10953,GARE DE LYON,2022-04-05,3,5,0,4,2022,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 7 - Sauvegarder le dataset final sous format parquet

In [None]:
# NOTE(review): this constant is not used by any cell visible in the notebook
# (and names a CSV file while the export below is Parquet) -- confirm whether
# it can be removed.
OUTPUT_PATH_PANDAS = "sncf_features_dataset_for_ml.csv"

# Write df_lag as a single Parquet part file under the "df_lag.parquet" directory.
# "overwrite" keeps the export idempotent: with "append", every re-run of this
# cell added another full copy of the rows to the existing output.
# NOTE(review): the section heading asks for the final dataset, and
# df_core_final is built in the previous section -- confirm df_lag is the
# frame that is meant to be exported.
df_lag.coalesce(1).write.format("parquet").mode("overwrite").save("df_lag.parquet")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive in the Colab runtime.
# NOTE(review): no cell visible in this notebook reads or writes under
# /content/drive -- confirm the mount is still needed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import shutil

# Zip the contents of the Parquet output directory; make_archive appends the
# ".zip" extension to the base name and returns the path of the archive.
shutil.make_archive(
    base_name="/content/df_lag_parquet",
    format="zip",
    root_dir="/content/df_lag.parquet",
)


'/content/df_lag_parquet.zip'