# Apprentissage

## Stemming, lemming, tokenizing

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Download required NLTK data (run once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Sample text
text = "The runners were running quickly through the beautiful gardens, carrying heavy boxes."

print("Original text:", text)
print("\n" + "="*60 + "\n")

# Step 1: TOKENIZATION - Split text into individual words
tokens = word_tokenize(text.lower())
print("1. TOKENIZATION:")
print(tokens)
print("\n" + "="*60 + "\n")

# Step 2a: STEMMING - Crude chopping to get word root
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens]
print("2a. STEMMING (Porter Stemmer):")
print(stemmed_words)
print("\nExamples:")
print(f"  running → {stemmer.stem('running')}")
print(f"  runner → {stemmer.stem('runner')}")
print(f"  beautiful → {stemmer.stem('beautiful')}")
print("\n" + "="*60 + "\n")

# Step 2b: LEMMATIZATION - Intelligent reduction to dictionary form
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in tokens]
print("2b. LEMMATIZATION:")
print(lemmatized_words)
print("\nExamples:")
print(f"  running → {lemmatizer.lemmatize('running', pos='v')}")
print(f"  runner → {lemmatizer.lemmatize('runner', pos='n')}")
print(f"  beautiful → {lemmatizer.lemmatize('beautiful', pos='a')}")
print("\n" + "="*60 + "\n")

# COMPARISON TABLE
print("COMPARISON - Stemming vs Lemmatization:")
print(f"{'Word':<15} {'Stemmed':<15} {'Lemmatized':<15}")
print("-" * 45)
for token in tokens:
    if token.isalpha():  # Only process words
        stemmed = stemmer.stem(token)
        lemmatized = lemmatizer.lemmatize(token, pos='v')
        print(f"{token:<15} {stemmed:<15} {lemmatized:<15}")

[nltk_data] Downloading package punkt to /Users/j/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/j/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/j/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/j/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original text: The runners were running quickly through the beautiful gardens, carrying heavy boxes.


1. TOKENIZATION:
['the', 'runners', 'were', 'running', 'quickly', 'through', 'the', 'beautiful', 'gardens', ',', 'carrying', 'heavy', 'boxes', '.']


2a. STEMMING (Porter Stemmer):
['the', 'runner', 'were', 'run', 'quickli', 'through', 'the', 'beauti', 'garden', ',', 'carri', 'heavi', 'box', '.']

Examples:
  running → run
  runner → runner
  beautiful → beauti


2b. LEMMATIZATION:
['the', 'runners', 'be', 'run', 'quickly', 'through', 'the', 'beautiful', 'garden', ',', 'carry', 'heavy', 'box', '.']

Examples:
  running → run
  runner → runner
  beautiful → beautiful


COMPARISON - Stemming vs Lemmatization:
Word            Stemmed         Lemmatized     
---------------------------------------------
the             the             the            
runners         runner          runners        
were            were            be             
running         run             run         

## TFIDF

In [None]:
docs = [
    "chat chat chien",
    "chat souris",
    "chien souris",
    "chien souris chien"
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
print(X.toarray())

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
list_description_str = ['key feature of elegance polyester multicolor abstract eyelet door curtain floral curtain , elegance polyester multicolor abstract eyelet door curtain ( 213 cm in height , pack of 2 ) price : r . 899 this curtain enhances the look of the interiors.this curtain is made from 100 % high quality polyester fabric.it feature an eyelet style stitch with metal ring.it make the room environment romantic and loving.this curtain is ant- wrinkle and anti shrinkage and have elegant apparance.give your home a bright and modernistic appeal with these design . the surreal attention is sure to steal heart . these contemporary eyelet and valance curtain slide smoothly so when you draw them apart first thing in the morning to welcome the bright sun ray you want to wish good morning to the whole world and when you draw them close in the evening , you create the most special moment of joyous beauty given by the soothing print . bring home the elegant curtain that softly filter light in your room so that you get the right amount of sunlight. , specification of elegance polyester multicolor abstract eyelet door curtain ( 213 cm in height , pack of 2 ) general brand elegance designed for door type eyelet model name abstract polyester door curtain set of 2 model id duster25 color multicolor dimension length 213 cm in the box number of content in sale package pack of 2 sale package 2 curtain body & design material polyester']


# list_description_str doit être une liste de chaînes de texte, par exemple :
# list_description_str = ["texte 1", "texte 2", "autre texte", ...]

# 1. On crée un vectorizer TF-IDF
vectorizer = TfidfVectorizer()  # transforme du texte en vecteurs numériques pondérés TF-IDF

# 2. On apprend le vocabulaire (fit) et on transforme les textes en matrice TF-IDF (transform)
X = vectorizer.fit_transform(list_description_str)

# 3. Affiche les mots du vocabulaire (colonnes de la matrice TF-IDF)
print("Mots du vocabulaire :")
print(vectorizer.get_feature_names_out())

# 4. Affiche la matrice TF-IDF complète sous forme de tableau
#    Chaque ligne = un texte ; chaque colonne = un mot du vocabulaire
print("\nMatrice TF-IDF :")
print(X.toarray())


Mots du vocabulaire :
['100' '213' '899' 'abstract' 'amount' 'an' 'and' 'ant' 'anti' 'apart'
 'apparance' 'appeal' 'attention' 'beauty' 'body' 'box' 'brand' 'bright'
 'bring' 'by' 'close' 'cm' 'color' 'contemporary' 'content' 'create'
 'curtain' 'design' 'designed' 'dimension' 'door' 'draw' 'duster25'
 'elegance' 'elegant' 'enhances' 'environment' 'evening' 'eyelet' 'fabric'
 'feature' 'filter' 'first' 'floral' 'for' 'from' 'general' 'get' 'give'
 'given' 'good' 'have' 'heart' 'height' 'high' 'home' 'id' 'in'
 'interiors' 'is' 'it' 'joyous' 'key' 'length' 'light' 'look' 'loving'
 'made' 'make' 'material' 'metal' 'model' 'modernistic' 'moment' 'morning'
 'most' 'multicolor' 'name' 'number' 'of' 'pack' 'package' 'polyester'
 'price' 'print' 'quality' 'ray' 'right' 'ring' 'romantic' 'room' 'sale'
 'set' 'shrinkage' 'slide' 'smoothly' 'so' 'softly' 'soothing' 'special'
 'specification' 'steal' 'stitch' 'style' 'sun' 'sunlight' 'sure'
 'surreal' 'that' 'the' 'them' 'these' 'thing' 'this' 't

# Word2vec

Word embeddings = vecteurs numériques pour chaque mot, appris à partir d’un grand corpus.  
Exemple : word2vec("king") - word2vec("man") + word2vec("woman") ≈ word2vec("queen")

Problème : tes modèles classiques donnent des vecteurs mot par mot, alors toi tu veux un vecteur par phrase/document.

Solution simple : moyenne (average) ou pooling

On prend tous les vecteurs des mots de la phrase

On fait la moyenne → vecteur unique pour la phrase