In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from unidecode import unidecode

# Download the NLTK resources used below (only needs to be done once):
# the stopword lists, WordNet, and the Open Multilingual WordNet data.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Load the previously cleaned dataset
df = pd.read_csv("data_nettoyee_pro.csv")

# Text columns to clean: of the candidate names, keep only those that are
# actually present in the loaded dataset (order is preserved).
text_cols = [
    name
    for name in ('clinicalpresentation', 'commentary', 'description', 'diagnosis')
    if name in df.columns
]
print("Colonnes textuelles √† nettoyer :", text_cols)

# Define the NLP tools
stop_words = set(stopwords.words('french') + stopwords.words('english'))
# clean_nlp_text() removes accents with unidecode() *before* looking tokens up
# in this set, so accented stopwords would otherwise never match. Keep the
# original spellings and add the unaccented form of every stopword as well.
stop_words |= {unidecode(word) for word in stop_words}
lemmatizer = WordNetLemmatizer()

# Maps every ASCII punctuation character to a space. Built once at import time
# instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


# NLP cleaning function
def clean_nlp_text(text):
    """Normalize one raw text value into a space-separated string of lemmas.

    Steps, in order: lowercase, transliterate to ASCII with ``unidecode``
    (which drops accents), remove URLs, remove digits, turn punctuation into
    spaces, split on whitespace, drop tokens that are in ``stop_words`` or
    have two characters or fewer, and lemmatize the rest with ``lemmatizer``.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            a missing value) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when the input
        is not a string or nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase and strip accents.
    text = unidecode(text.lower())
    # Remove URLs while their punctuation is still intact. A separate
    # "https" alternative is unnecessary: "http\S+" already matches it.
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"\d+", "", text)
    # Replace punctuation with a space instead of deleting it, so that
    # "fever,cough" or "l'enfant" do not collapse into one fused token.
    text = text.translate(_PUNCT_TO_SPACE)

    # split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing blanks, so no extra normalization is needed.
    words = text.split()

    # Stopword / short-token removal, then lemmatization.
    return " ".join(
        lemmatizer.lemmatize(w)
        for w in words
        if w not in stop_words and len(w) > 2
    )

# Clean every text column, storing the result in a new '<column>_clean' column
for col in text_cols:
    print(f"üßπ Nettoyage NLP : {col}")
    df[f"{col}_clean"] = df[col].apply(clean_nlp_text)

# Drop rows whose cleaned text is empty or too short (5 characters or fewer)
# in any of the cleaned columns
for col in text_cols:
    long_enough = df[f"{col}_clean"].str.len() > 5
    df = df.loc[long_enough]

# Optional merge of all cleaned columns into a single column for global NLP
clean_cols = [f"{col}_clean" for col in text_cols]
df['full_text'] = df[clean_cols].apply(
    lambda row: ' '.join(row.dropna()),
    axis=1,
)

# Remove rows whose merged text duplicates an earlier row
df = df.drop_duplicates(subset=['full_text'])

# Final save
df.to_csv("data_nlp_ready.csv", encoding='utf-8', index=False)

print(" Nettoyage NLP termin√© !")
print("Dimensions finales :", df.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bough\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bough\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bough\AppData\Roaming\nltk_data...


Colonnes textuelles à nettoyer : ['clinicalpresentation', 'commentary', 'description', 'diagnosis']
🧹 Nettoyage NLP : clinicalpresentation
🧹 Nettoyage NLP : commentary
🧹 Nettoyage NLP : description
🧹 Nettoyage NLP : diagnosis
 Nettoyage NLP terminé !
Dimensions finales : (1236, 35)
