In [58]:
import pandas as pd
import unicodedata
import re

def remove_accents(text: str) -> str:
    """Return ``text`` with its combining marks stripped.

    The input is first decomposed with Unicode NFD normalization; every
    character for which ``unicodedata.combining`` reports a non-zero class
    is then dropped and the remaining characters are joined back together.

    Characters that have no NFD decomposition are returned unchanged.

    Args:
        text: The string to process.

    Returns:
        The decomposed string without combining characters.
    """
    decomposed = unicodedata.normalize('NFD', text)
    # combining() returns 0 for non-combining characters; keep only those.
    kept_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(kept_chars)

def remove_punctuation(text: str) -> str:
    """Replace non-alphanumeric characters with spaces and tidy whitespace.

    Every character that is not an ASCII letter, an ASCII digit or
    whitespace is turned into a single space. Runs of whitespace are then
    collapsed to one space and the result is stripped at both ends.

    Note that the character class is ASCII-only, so any non-ASCII letter
    still present in ``text`` is replaced by a space as well.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing alphanumeric remains.
    """
    return re.sub(
        r'\s+',
        ' ',
        re.sub(r'[^A-Za-z0-9\s]', ' ', text),
    ).strip()

# Read the raw station list (semicolon-separated), keeping only the columns
# used below.
stations_df = pd.read_csv(
    '../../data/raw/liste-des-gares.csv',
    sep=';',
    usecols=['LIBELLE', 'COMMUNE', 'VOYAGEURS', 'X_WGS84', 'Y_WGS84'],
)

# Keep only the rows whose VOYAGEURS value is "O"
# (presumably stations open to passengers; confirm against the dataset docs).
stations_df = stations_df.loc[stations_df["VOYAGEURS"].eq("O")]

# Build one frame per name source (LIBELLE and COMMUNE), each with the name
# exposed under a common "raw" column next to its coordinates.
libelle_entries = (
    stations_df[["LIBELLE", "X_WGS84", "Y_WGS84"]]
    .dropna(subset=["LIBELLE"])
    .rename(columns={"LIBELLE": "raw"})
)
commune_entries = (
    stations_df[["COMMUNE", "X_WGS84", "Y_WGS84"]]
    .dropna(subset=["COMMUNE"])
    .rename(columns={"COMMUNE": "raw"})
)

# Stack both sources; LIBELLE rows come first, so they win when duplicates
# are dropped further down.
all_entries = pd.concat([libelle_entries, commune_entries], ignore_index=True)

# Normalise each raw name (lower-case, accents removed, punctuation replaced
# by spaces), discard empty results, keep the first row per normalised name
# and sort alphabetically with a fresh 0..n-1 index.
df_entries = (
    all_entries
    .assign(
        entries=lambda frame: frame["raw"].str.lower().apply(
            lambda name: remove_punctuation(remove_accents(name))
        )
    )
    .loc[lambda frame: frame["entries"].str.strip().ne("")]
    .drop_duplicates(subset=["entries"])
    .sort_values(by="entries", ignore_index=True)
)

# Expose the positional index as an explicit first column of the export.
df_entries.insert(loc=0, column="index", value=df_entries.index)

df_entries.to_csv('../../data/processed/entries.csv', index=False)

df_entries.head(10)

Unnamed: 0,index,raw,X_WGS84,Y_WGS84,entries
0,0,Abancourt,1.773588,49.684506,abancourt
1,1,Abbaretz,-1.524313,47.555462,abbaretz
2,2,Abbeville,1.825534,50.100849,abbeville
3,3,Ablon,2.417862,48.723897,ablon
4,4,ABLON-SUR-SEINE,2.417862,48.723897,ablon sur seine
5,5,ACHERES,2.077779,48.970052,acheres
6,6,Achères-Grand-Cormier,2.091973,48.955145,acheres grand cormier
7,7,Achères-Ville,2.077779,48.970052,acheres ville
8,8,Acheux-Franleu,1.657197,50.08249,acheux franleu
9,9,Achiet,2.780612,50.131968,achiet
