In [2]:
from nltk.stem import WordNetLemmatizer

# Demo: lemmatize a mixed bag of inflected words, treating every one as a
# verb (pos='v'). Noun and adjective forms such as 'wolves' or 'happier'
# therefore come back unchanged, while verb forms like 'running' reduce to 'run'.
lemmatizer = WordNetLemmatizer()

words = [
    "running", "better", "studies", "wolves", "mice", "children",
    "was", "ate", "swimming", "parties", "leaves", "knives",
    "happier", "studying", "played", "goes", "driving", "talked",
]

# Pair each word with its verb lemma; insertion order follows `words`.
lemmas = dict(zip(words, (lemmatizer.lemmatize(w, pos='v') for w in words)))
print(lemmas)


{'running': 'run', 'better': 'better', 'studies': 'study', 'wolves': 'wolves', 'mice': 'mice', 'children': 'children', 'was': 'be', 'ate': 'eat', 'swimming': 'swim', 'parties': 'party', 'leaves': 'leave', 'knives': 'knives', 'happier': 'happier', 'studying': 'study', 'played': 'play', 'goes': 'go', 'driving': 'drive', 'talked': 'talk'}


In [15]:
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

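# Required NLTK resources: the 'stopwords' and 'wordnet' corpora (fetch once with nltk.download(...)); the download calls are not included in this cell.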


def basic_cleaning(text):
    """
    Normalize raw text for tokenization.

    Steps, in order: lowercase the text, delete every character listed in
    string.punctuation, delete runs of digits, then collapse each run of
    whitespace to a single space and trim both ends.

    Returns the cleaned string.
    """
    # Translation table mapping each punctuation code point to None (= delete).
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))

    lowered = text.lower()
    without_punctuation = lowered.translate(punctuation_table)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    # Punctuation and digits are deleted (not replaced by a space), so any
    # gaps they leave are squeezed here in a single final pass.
    return re.sub(r'\s+', ' ', without_digits).strip()

def simple_tokenize(text):
    """
    Split a text into tokens on whitespace.

    Consecutive whitespace counts as one separator, and leading or trailing
    whitespace produces no empty tokens. Returns a list of strings.
    """
    tokens = text.split()
    return tokens

def remove_stopwords(tokens, stop_words=None, language='english'):
    """
    Remove stopwords from a sequence of tokens.

    Args:
        tokens: Iterable of token strings. Comparison is exact, so tokens
            should already be lowercased to match the NLTK lists.
        stop_words: Optional collection of words to drop. When given, the
            NLTK corpus is not read, which lets callers load the list once
            and reuse it across many calls or supply a custom list.
        language: Name of the NLTK stopwords list to load when `stop_words`
            is not given. Defaults to 'english'.

    Returns:
        A new list with the tokens that are not stopwords, in original order.
    """
    if stop_words is None:
        # Reading the corpus hits the disk; it only happens when the caller
        # did not pass a precomputed collection.
        stop_words = stopwords.words(language)

    # A set gives constant-time membership tests regardless of what
    # collection type was passed in.
    excluded = set(stop_words)
    return [token for token in tokens if token not in excluded]

def lemmatize_tokens(tokens, pos='n', lemmatizer=None):
    """
    Lemmatize a sequence of tokens with NLTK's WordNetLemmatizer.

    Args:
        tokens: Iterable of token strings.
        pos: WordNet part-of-speech tag applied to every token, e.g. 'n'
            (noun), 'v' (verb), 'a' (adjective), 'r' (adverb). Defaults to
            'n', so calls that pass only `tokens` behave as before.
        lemmatizer: Optional object exposing `lemmatize(word, pos=...)`.
            A new WordNetLemmatizer is created when omitted; pass one in to
            reuse a single instance across calls.

    Returns:
        A list with one lemma per input token, in the same order.

    Note:
        The same tag is used for all tokens. With the default noun tag,
        inflected verbs pass through unchanged ('playing' stays 'playing');
        call with pos='v' to reduce verb forms instead.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos=pos) for token in tokens]

# Sample sentences pushed through the whole preprocessing pipeline.
frases = [
    "The children were playing in the leaves yesterday.",
    "She studies computer science and is taking three courses.",
    "The wolves howled at the moon while mice scurried in the grass.",
    "He was driving faster than the cars around him.",
    "The chefs used sharp knives to prepare the tastiest dishes.",
]

# For every sentence, print the result of each stage in order:
# clean -> tokenize -> drop stopwords -> lemmatize.
for frase in frases:
    print("==============================================")

    # Stage 1: lowercase, strip punctuation/digits, squeeze whitespace.
    frase_limpa = basic_cleaning(frase)
    print("frase limpa : " + frase_limpa)

    # Stage 2: whitespace tokenization.
    frase_token = simple_tokenize(frase_limpa)
    print("Frase Token : ", frase_token)

    # Stage 3: stopword filtering.
    frase_sem_stop = remove_stopwords(frase_token)
    print("Frase remove stop: ", frase_sem_stop)

    # Stage 4: lemmatization of the remaining tokens.
    frase_lematizada = lemmatize_tokens(frase_sem_stop)
    print("frase lematizada: ", frase_lematizada)

    print("==============================================\n")

frase limpa : the children were playing in the leaves yesterday
Frase Token :  ['the', 'children', 'were', 'playing', 'in', 'the', 'leaves', 'yesterday']
Frase remove stop:  ['children', 'playing', 'leaves', 'yesterday']
frase lematizada:  ['child', 'playing', 'leaf', 'yesterday']

frase limpa : she studies computer science and is taking three courses
Frase Token :  ['she', 'studies', 'computer', 'science', 'and', 'is', 'taking', 'three', 'courses']
Frase remove stop:  ['studies', 'computer', 'science', 'taking', 'three', 'courses']
frase lematizada:  ['study', 'computer', 'science', 'taking', 'three', 'course']

frase limpa : the wolves howled at the moon while mice scurried in the grass
Frase Token :  ['the', 'wolves', 'howled', 'at', 'the', 'moon', 'while', 'mice', 'scurried', 'in', 'the', 'grass']
Frase remove stop:  ['wolves', 'howled', 'moon', 'mice', 'scurried', 'grass']
frase lematizada:  ['wolf', 'howled', 'moon', 'mouse', 'scurried', 'grass']

frase limpa : he was driving fas