In [1]:
import nltk

# 1. Tokenización

La tokenización es el proceso de dividir un texto en unidades más pequeñas llamadas tokens (por ejemplo, palabras u oraciones). NLTK proporciona distintos tokenizadores para adaptarse a diferentes tipos de texto.

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the 'punkt' package through the NLTK downloader.
# NOTE(review): presumably this is the model data the tokenizers imported
# above rely on — confirm against the NLTK docs for the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
text = "NLTK is a powerful library for natural language processing. It provides various tokenizers."

# Split the same text at both granularities: words first, then sentences.
tokens = word_tokenize(text)
sentences = sent_tokenize(text)

# Show the word tokens on the first line and the sentences on the second.
print(tokens, sentences, sep="\n")

['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '.', 'It', 'provides', 'various', 'tokenizers', '.']
['NLTK is a powerful library for natural language processing.', 'It provides various tokenizers.']


# 2. Etiquetado gramatical

El etiquetado gramatical consiste en asignar a cada palabra de un texto una etiqueta que indica su categoría gramatical (sustantivo, verbo, adjetivo, etc.) y otras características gramaticales.

In [4]:
from nltk import pos_tag

# Fetch the 'averaged_perceptron_tagger' package through the NLTK downloader.
# NOTE(review): presumably this is the model pos_tag loads — confirm against
# the NLTK docs for the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [5]:
# NOTE: this rebinds `text` and `tokens`, which the tokenization cell
# earlier in the notebook also assigns.
text = "NLTK is a powerful library for natural language processing."

# Word tokenization
tokens = word_tokenize(text)

# Part-of-speech tagging: the result printed below is a list of
# (token, tag) pairs, e.g. ('NLTK', 'NNP').
pos_tags = pos_tag(tokens)
print(pos_tags)

[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.')]


# 3. Lematización

La lematización es el proceso de convertir una palabra a su forma base o lema. Por ejemplo, la lematización convierte "corriendo" en "correr" y "ratones" en "ratón".

In [10]:
from nltk.stem import WordNetLemmatizer

# Fetch the 'wordnet' package through the NLTK downloader.
# NOTE(review): presumably this is the corpus WordNetLemmatizer looks words
# up in — confirm against the NLTK docs for the installed version.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [11]:
# Build one lemmatizer instance and reduce a single word to its lemma.
lemmatizer = WordNetLemmatizer()

word = "running"
# pos='v' asks for the word to be lemmatized as a verb; the recorded run
# prints "run".
# NOTE(review): NLTK documents the default part of speech as noun — confirm
# for the installed version before dropping the argument.
lemma = lemmatizer.lemmatize(word, pos='v')
print(lemma)

run


# Ejemplo práctico

Vamos a realizar las siguientes tareas:

 - Tokenización de palabras y oraciones.
 - Etiquetado gramatical.
 - Extracción de entidades nombradas.
 - Análisis de sentimientos.

In [12]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the required NLTK resources, in the same order as before.
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'vader_lexicon',
):
    nltk.download(resource)

# Sample text
text = "Apple Inc. is planning to open a new store in New York City. The CEO, Tim Cook, announced the news yesterday. The company is expanding its operations."

# Tokenization: split the text into sentences and into word-level tokens.
sentences = sent_tokenize(text)
tokens = word_tokenize(text)

# Part-of-speech tagging, then named-entity chunking over the tagged tokens.
pos_tags = pos_tag(tokens)
named_entities = ne_chunk(pos_tags)

# Sentiment analysis over the whole text.
sentiment_analyzer = SentimentIntensityAnalyzer()
sentiment_scores = sentiment_analyzer.polarity_scores(text)

# Print the results, one labelled line per analysis step.
for label, value in (
    ("Tokens:", tokens),
    ("Oraciones:", sentences),
    ("Etiquetas gramaticales:", pos_tags),
    ("Entidades nombradas:", named_entities),
    ("Puntuación de sentimiento:", sentiment_scores),
):
    print(label, value)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Tokens: ['Apple', 'Inc.', 'is', 'planning', 'to', 'open', 'a', 'new', 'store', 'in', 'New', 'York', 'City', '.', 'The', 'CEO', ',', 'Tim', 'Cook', ',', 'announced', 'the', 'news', 'yesterday', '.', 'The', 'company', 'is', 'expanding', 'its', 'operations', '.']
Oraciones: ['Apple Inc. is planning to open a new store in New York City.', 'The CEO, Tim Cook, announced the news yesterday.', 'The company is expanding its operations.']
Etiquetas gramaticales: [('Apple', 'NNP'), ('Inc.', 'NNP'), ('is', 'VBZ'), ('planning', 'VBG'), ('to', 'TO'), ('open', 'VB'), ('a', 'DT'), ('new', 'JJ'), ('store', 'NN'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), ('City', 'NNP'), ('.', '.'), ('The', 'DT'), ('CEO', 'NNP'), (',', ','), ('Tim', 'NNP'), ('Cook', 'NNP'), (',', ','), ('announced', 'VBD'), ('the', 'DT'), ('news', 'NN'), ('yesterday', 'NN'), ('.', '.'), ('The', 'DT'), ('company', 'NN'), ('is', 'VBZ'), ('expanding', 'VBG'), ('its', 'PRP$'), ('operations', 'NNS'), ('.', '.')]
Entidades nombradas: (S