In [None]:
import pandas as pd
import re
import spacy
from nltk.corpus import stopwords

In [3]:
# Load the raw, semicolon-delimited station list, then keep only the
# 'LIBELLE' column as a one-column DataFrame for the cleaning steps.
df = pd.read_csv(
    '../../data/raw/liste-des-gares.csv',
    sep=';',
)
stations_df = df.loc[:, ['LIBELLE']]

In [8]:
# NLTK's French stopword list, stored as a set for fast membership tests.
french_stopwords = set(stopwords.words('french'))

# spaCy pipeline shared by the tokenizing and lemmatizing helpers.
nlp = spacy.load('fr_core_news_md')

def remove_punctuation(text: str) -> str:
    """Replace punctuation with spaces and normalise whitespace.

    Every character that is neither a letter, a digit nor whitespace is
    turned into a space; runs of whitespace are then collapsed to a single
    space and the result is stripped.

    Letters are matched Unicode-aware, so accented characters such as
    ``é``, ``ç`` or ``ÿ`` are preserved. An ASCII-only class would treat
    them as punctuation and split words apart ("besançon" -> "besan on").

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, possibly empty.
    """
    # ``\w`` also matches "_", which is not part of a name, so strip it
    # explicitly to keep the behaviour for ASCII input unchanged.
    cleaned_text = re.sub(r'[^\w\s]|_', ' ', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text.strip()

def lemmatize(text: str) -> str:
    """Return *text* with each spaCy token replaced by its lemma.

    The text is parsed with the module-level ``nlp`` pipeline and the
    resulting lemmas are joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

def return_tokens(text: str) -> list[str]:
    """Tokenize *text* with the module-level ``nlp`` pipeline.

    Returns the surface text of every token, in document order.
    """
    return [tok.text for tok in nlp(text)]

def remove_stopwords(text: str) -> str:
    """Drop the tokens of *text* that appear in ``french_stopwords``.

    The text is tokenized with ``return_tokens``; the comparison against the
    stopword set is exact (case-sensitive). The remaining tokens are joined
    with single spaces.
    """
    kept = (word for word in return_tokens(text) if word not in french_stopwords)
    return " ".join(kept)

# Shape the frame: one 'station' column, sorted alphabetically, renumbered
# from 0, and with missing labels removed.
stations_df = stations_df.rename(columns={'LIBELLE': 'station'})
stations_df = stations_df.sort_values(by='station', ascending=True)
stations_df = stations_df.reset_index(drop=True)
stations_df = stations_df.dropna()

# Clean the names: lowercase first, then apply each text helper in order.
cleaned = stations_df['station'].str.lower()
for step in (remove_punctuation, remove_stopwords, lemmatize):
    cleaned = cleaned.apply(step)
stations_df['station'] = cleaned

# Spot-check: draw 10 random rows (the returned frame is not stored).
stations_df.sample(10)
# Persist the cleaned names; to_csv's defaults are used, so the index is
# written as the first column.
stations_df.to_csv('../../data/processed/stations.csv')