In [1]:
import os 
import nltk


In [2]:

# Obtenir le chemin absolu du répertoire de données NLTK
nltk_data_dir = os.path.abspath('./.venv/nltk_data')

# Définir la variable d'environnement NLTK_DATA
os.environ['NLTK_DATA'] = nltk_data_dir

# Télécharger les ressources NLTK
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)

[nltk_data] Downloading package punkt to
[nltk_data]     c:\Users\anton\Downloads\keras_demo\.venv\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\anton\Downloads\keras_demo\.venv\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     c:\Users\anton\Downloads\keras_demo\.venv\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     c:\Users\anton\Downloads\keras_demo\.venv\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

#### tokenisation

In [24]:
from nltk.tokenize import word_tokenize

text = "Demain je montrerai que l'intelligence artificielle c'est génial! 10/10"
tokens = word_tokenize(text)
print(tokens)

['Demain', 'je', 'montrerai', 'que', "l'intelligence", 'artificielle', "c'est", 'génial', '!', '10/10']


#### reg ex pour cleaner

In [25]:
import re
import regex
def clean_text(text):
    # text = re.sub(r"[^a-zA-Z0-9]+",' ', text)
    text = regex.sub(r"[^\p{L}0-9]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

clean_txt = clean_text(text)
clean_txt

tokens = word_tokenize(clean_txt)
tokens

['demain',
 'je',
 'montrerai',
 'que',
 'l',
 'intelligence',
 'artificielle',
 'c',
 'est',
 'génial',
 '10',
 '10']

#### stopwords

In [26]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('french'))
filtered = [w for w in tokens if w.lower() not in stop_words and w.isalnum()]
print(filtered)

['demain', 'montrerai', 'intelligence', 'artificielle', 'génial', '10', '10']


#### stemming

In [27]:
from nltk.stem import PorterStemmer 

stemmer = PorterStemmer()

stems = [stemmer.stem(m) for m in filtered]
print(stems)

['demain', 'montrerai', 'intellig', 'artificiel', 'génial', '10', '10']


#### lemmatisation

In [28]:
from nltk.stem import WordNetLemmatizer 

lemmatiser = WordNetLemmatizer()

lemmatized = [lemmatiser.lemmatize(m) for m in filtered]
print(lemmatized)

['demain', 'montrerai', 'intelligence', 'artificielle', 'génial', '10', '10']


In [31]:
mots = ["cars", "careful", "carefully", "caring", "care", "carol"]
lemmatized = [lemmatiser.lemmatize(m) for m in mots]
print(lemmatized)

['car', 'careful', 'carefully', 'caring', 'care', 'carol']


In [32]:
stemmer = PorterStemmer()

stems = [stemmer.stem(m) for m in mots]
print(stems)

['car', 'care', 'care', 'care', 'care', 'carol']


#### Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 
import pandas as pd 

mon_text = [
    "le chat mange la souris",
    "le chien mange un os", 
    "le poisson regarde le chat",
    "le chien regarde le chat"
] 
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(mon_text)
df = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())
print(df)

   chat  chien  la  le  mange  os  poisson  regarde  souris  un
0     1      0   1   1      1   0        0        0       1   0
1     0      1   0   1      1   1        0        0       0   1
2     1      0   0   2      0   0        1        1       0   0
3     1      1   0   2      0   0        0        1       0   0


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
bow = vectorizer.fit_transform(mon_text)
df = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())
print(df)

       chat     chien        la        le     mange        os   poisson  \
0  0.351295  0.000000  0.550372  0.287207  0.433919  0.000000  0.000000   
1  0.000000  0.420493  0.000000  0.278320  0.420493  0.533343  0.000000   
2  0.361459  0.000000  0.000000  0.591032  0.000000  0.000000  0.566295   
3  0.385612  0.476308  0.000000  0.630527  0.000000  0.000000  0.000000   

    regarde    souris        un  
0  0.000000  0.550372  0.000000  
1  0.000000  0.000000  0.533343  
2  0.446473  0.000000  0.000000  
3  0.476308  0.000000  0.000000  


In [39]:
from sklearn.cluster import KMeans

mon_text=["ce film est super nulle",
          "mon telephone est cassé",
          "la fibre c'est génial",
          "le scénario est plat et les acteurs sont mauvais",
          "la ram ca coute très cher",
          "le processeur est rapide"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(mon_text)

kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X)

for i, label in enumerate(kmeans.labels_):
    print(f"Groupe {label} : {mon_text[i]}")


Groupe 0 : ce film est super nulle
Groupe 0 : mon telephone est cassé
Groupe 0 : la fibre c'est génial
Groupe 1 : le scénario est plat et les acteurs sont mauvais
Groupe 0 : la ram ca coute très cher
Groupe 1 : le processeur est rapide
