<a href="https://colab.research.google.com/github/LCaravaggio/NLP/blob/main/02_b%C3%A1sicas/Lemmatizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WordNet

In [56]:
from nltk.stem import WordNetLemmatizer

In [61]:
import nltk
nltk.download('wordnet')

text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
text = nltk.WordPunctTokenizer().tokenize(text)
text = [nltk.stem.WordNetLemmatizer().lemmatize(token, pos='v') for token in text if len(token)>1]

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [62]:
print(text)

['When', 'Sebastian', 'Thrun', 'start', 'work', 'on', 'self', 'drive', 'cars', 'at', 'Google', 'in', '2007', 'few', 'people', 'outside', 'of', 'the', 'company', 'take', 'him', 'seriously', 'can', 'tell', 'you', 'very', 'senior', 'CEOs', 'of', 'major', 'American', 'car', 'company', 'would', 'shake', 'my', 'hand', 'and', 'turn', 'away', 'because', 'wasn', 'worth', 'talk', 'to', ',”', 'say', 'Thrun', 'in', 'an', 'interview', 'with', 'Recode', 'earlier', 'this', 'week']


# Spacy

In [49]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


In [50]:
%%capture
!python -m spacy download es_core_news_sm
nlp = spacy.load("es_core_news_sm")

In [51]:
text = ('"Las libertades de diferentes tipos pueden fortalecerse entre sí." Amartya Sen')

doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['Las libertades', 'diferentes tipos', 'sí', '" Amartya Sen']
Verbs: ['fortalecer él']
Amartya Sen PER


# Stanza

In [52]:
%%capture
!pip install stanza

In [53]:
import stanza
from stanza.models.common.doc import Document

nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma', lemma_pretagged=True, tokenize_pretokenized=True)
pp = Document([[{'id': 1, 'text': 'puppies', 'upos': 'NOUN'}]])
print("BEFORE ADDING LEMMA")
print(pp)
doc = nlp(pp)
print("AFTER ADDING LEMMA")
print(doc)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


BEFORE ADDING LEMMA
[
  [
    {
      "id": 1,
      "text": "puppies",
      "upos": "NOUN"
    }
  ]
]
AFTER ADDING LEMMA
[
  [
    {
      "id": 1,
      "text": "puppies",
      "lemma": "puppy",
      "upos": "NOUN"
    }
  ]
]


In [54]:
text = ('"Las libertades de diferentes tipos pueden fortalecerse entre sí." Amartya Sen')

nlp = stanza.Pipeline(lang='es', processors='tokenize,lemma', lemma_pretagged=True, tokenize_pretokenized=True)

doc = nlp(text)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | ancora          |
| lemma     | ancora_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


In [55]:
print(doc)

[
  [
    {
      "id": 1,
      "text": "\"Las",
      "lemma": "\"La",
      "misc": "",
      "start_char": 0,
      "end_char": 4
    },
    {
      "id": 2,
      "text": "libertades",
      "lemma": "libertad",
      "misc": "",
      "start_char": 5,
      "end_char": 15
    },
    {
      "id": 3,
      "text": "de",
      "lemma": "de",
      "misc": "",
      "start_char": 16,
      "end_char": 18
    },
    {
      "id": 4,
      "text": "diferentes",
      "lemma": "diferente",
      "misc": "",
      "start_char": 19,
      "end_char": 29
    },
    {
      "id": 5,
      "text": "tipos",
      "lemma": "tipo",
      "misc": "",
      "start_char": 30,
      "end_char": 35
    },
    {
      "id": 6,
      "text": "pueden",
      "lemma": "poder",
      "misc": "",
      "start_char": 36,
      "end_char": 42
    },
    {
      "id": 7,
      "text": "fortalecerse",
      "lemma": "fortalecerse",
      "misc": "",
      "start_char": 43,
      "end_char": 55
    },
    {
 