# Apprentissage

## Stemming, lemming, tokenizing

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Download required NLTK data (run once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Sample text
text = "The runners were running quickly through the beautiful gardens, carrying heavy boxes."

print("Original text:", text)
print("\n" + "="*60 + "\n")

# Step 1: TOKENIZATION - Split text into individual words
tokens = word_tokenize(text.lower())
print("1. TOKENIZATION:")
print(tokens)
print("\n" + "="*60 + "\n")

# Step 2a: STEMMING - Crude chopping to get word root
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens]
print("2a. STEMMING (Porter Stemmer):")
print(stemmed_words)
print("\nExamples:")
print(f"  running → {stemmer.stem('running')}")
print(f"  runner → {stemmer.stem('runner')}")
print(f"  beautiful → {stemmer.stem('beautiful')}")
print("\n" + "="*60 + "\n")

# Step 2b: LEMMATIZATION - Intelligent reduction to dictionary form
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in tokens]
print("2b. LEMMATIZATION:")
print(lemmatized_words)
print("\nExamples:")
print(f"  running → {lemmatizer.lemmatize('running', pos='v')}")
print(f"  runner → {lemmatizer.lemmatize('runner', pos='n')}")
print(f"  beautiful → {lemmatizer.lemmatize('beautiful', pos='a')}")
print("\n" + "="*60 + "\n")

# COMPARISON TABLE
print("COMPARISON - Stemming vs Lemmatization:")
print(f"{'Word':<15} {'Stemmed':<15} {'Lemmatized':<15}")
print("-" * 45)
for token in tokens:
    if token.isalpha():  # Only process words
        stemmed = stemmer.stem(token)
        lemmatized = lemmatizer.lemmatize(token, pos='v')
        print(f"{token:<15} {stemmed:<15} {lemmatized:<15}")

[nltk_data] Downloading package punkt to /Users/j/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/j/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/j/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/j/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original text: The runners were running quickly through the beautiful gardens, carrying heavy boxes.


1. TOKENIZATION:
['the', 'runners', 'were', 'running', 'quickly', 'through', 'the', 'beautiful', 'gardens', ',', 'carrying', 'heavy', 'boxes', '.']


2a. STEMMING (Porter Stemmer):
['the', 'runner', 'were', 'run', 'quickli', 'through', 'the', 'beauti', 'garden', ',', 'carri', 'heavi', 'box', '.']

Examples:
  running → run
  runner → runner
  beautiful → beauti


2b. LEMMATIZATION:
['the', 'runners', 'be', 'run', 'quickly', 'through', 'the', 'beautiful', 'garden', ',', 'carry', 'heavy', 'box', '.']

Examples:
  running → run
  runner → runner
  beautiful → beautiful


COMPARISON - Stemming vs Lemmatization:
Word            Stemmed         Lemmatized     
---------------------------------------------
the             the             the            
runners         runner          runners        
were            were            be             
running         run             run         

## TFIDF

In [None]:
docs = [
    "chat chat chien",
    "chat souris",
    "chien souris",
    "chien souris chien"
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
print(X.toarray())