In [24]:
import pandas as pd
import unicodedata
import re

# Load only the two columns of interest from the raw, semicolon-separated
# station list.
stations_df = pd.read_csv(
    '../../data/raw/liste-des-gares.csv',
    sep=';',
    usecols=['LIBELLE', 'COMMUNE'],
)

# Switch to the column names used by the rest of the pipeline.
stations_df = stations_df.rename(
    columns={'LIBELLE': 'station', 'COMMUNE': 'commune'}
)

# Order rows by station name, discard rows with a missing value in either
# column, then renumber the index from 0.
stations_df = stations_df.sort_values('station')
stations_df = stations_df.dropna()
stations_df = stations_df.reset_index(drop=True)

def remove_accents(text: str) -> str:
    """
    Strips diacritical marks from a string.

    The text is decomposed with Unicode NFD normalization so that each
    accented letter becomes a base letter followed by combining marks;
    every character with a non-zero combining class is then discarded.
    Characters that have no canonical decomposition (for example the
    ligature "œ") are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [
        ch for ch in decomposed
        if unicodedata.combining(ch) == 0
    ]
    return ''.join(base_chars)

def remove_punctuation(text: str) -> str:
    """
    Replaces punctuation with spaces and tidies the whitespace.

    Every character that is not an ASCII letter, an ASCII digit or
    whitespace is turned into a space (this includes non-ASCII letters,
    not only punctuation). Runs of whitespace are then collapsed into a
    single space and leading/trailing whitespace is trimmed.
    """
    not_alnum_or_space = re.compile(r'[^A-Za-z0-9\s]')
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', not_alnum_or_space.sub(' ', text)).strip()

# Normalise both text columns: lowercase, strip accents, then replace
# punctuation with spaces and collapse whitespace.
for col in ['station', 'commune']:
    stations_df[col] = (
        stations_df[col]
            .str.lower()
            .apply(remove_accents)
            .apply(remove_punctuation)
    )

# Normalisation can make previously distinct rows identical, and it changes
# how names compare (case and accents are gone), so any ordering applied to
# the raw names no longer holds. Deduplicate, then sort on the normalised
# values; sorting on both columns makes the output order fully determined
# because each (station, commune) pair is unique after deduplication.
stations_df = (
    stations_df
        .drop_duplicates()
        .sort_values(by=['station', 'commune'])
        .reset_index(drop=True)
)

# Write the result first so a failure in the preview cannot prevent the export.
stations_df.to_csv('../../data/processed/stations.csv', index=False)

# Preview a random sample. This must be the last expression of the cell for a
# notebook to display it; the size is capped so small inputs do not raise.
stations_df.sample(min(10, len(stations_df)))