In [2]:
### *** Spanish lemmatization and synsets with spacy and nltk *** ###

In [4]:
### Setup: NLTK imports and the sample Spanish text ###

# NLTK's WordNet reader is imported here as `wn`; the synset cells query it
# for Spanish by passing lang='spa'.
# NOTE(review): Spanish lookups presumably need the multilingual WordNet data
# downloaded in addition to the base 'wordnet' corpus -- confirm on a fresh setup.

import nltk
from nltk.corpus import wordnet as wn

# Example political text taken from
# https://www.nytimes.com/es/2019/01/16/pena-nieto-soborno-chapo/?rref=collection%2Fsectioncollection%2Fnyt-es&action=click&contentCollection=corrupcion-en-mexico&region=stream&module=stream_unit&version=latest&contentPlacement=4&pgtype=collection

# The leading space and the indentation inside the literal are part of the
# string: the spaCy cells parse `text` as-is, and the entity start/end
# character offsets printed in the NER cell are positions within it.
text = """ ¿Un soborno de 100 millones de dólares? Los mexicanos se muestran indiferentes.
        CIUDAD DE MÉXICO — La acusación de un testigo cayó como una bomba en Estados Unidos: uno de los capos de la droga más poderosos del mundo le había pagado un soborno de 100 millones de dólares a Enrique Peña Nieto, el expresidente de México.

        Sin embargo, en México la revelación —de boca de un exaliado del narcotraficante Joaquín “el Chapo” Guzmán, en un tribunal de Nueva York— fue recibida con indiferencia.
        """
print(text)

 ¿Un soborno de 100 millones de dólares? Los mexicanos se muestran indiferentes.
        CIUDAD DE MÉXICO — La acusación de un testigo cayó como una bomba en Estados Unidos: uno de los capos de la droga más poderosos del mundo le había pagado un soborno de 100 millones de dólares a Enrique Peña Nieto, el expresidente de México.

        Sin embargo, en México la revelación —de boca de un exaliado del narcotraficante Joaquín “el Chapo” Guzmán, en un tribunal de Nueva York— fue recibida con indiferencia.
        


In [6]:
### Tokenization ###

## nltk ##
print("***nltk tokenization*** \n")

from nltk import word_tokenize
from nltk import sent_tokenize

# Both helpers already return plain lists, so they are assigned directly
# instead of being copied through an identity comprehension.
word_tokens = word_tokenize(text, language='spanish')  # tokenize by words
sent_tokens = sent_tokenize(text, language='spanish')  # tokenize by sentence
print(word_tokens, "\n")
print(sent_tokens, "\n")

# Comments:
#   - nltk handles most of the sample, but Spanish-specific punctuation stays
#     glued to the neighbouring word: the printed tokens include '¿Un', '—de'
#     and 'York—'.
#   - Consequence: any later `isalpha()` filter over `word_tokens` drops those
#     words entirely rather than just their punctuation.

## spacy ##

print("***spacy tokenization*** \n")

import spacy

nlp = spacy.load("es_core_news_sm")
# NOTE: The md model is more comprehensive but takes a bit more to load

doc = nlp(text)

# Keep only purely alphabetic tokens (punctuation, numbers and whitespace
# tokens are skipped); sentences come from spaCy's own sentence iterator.
sp_word_tokens = [token.text for token in doc if token.text.isalpha()]
sp_sent_tokens = list(doc.sents)
print(sp_word_tokens, "\n")
print(sp_sent_tokens, "\n")

# The string below is the last expression of the cell, so it is what the
# notebook displays as the cell's result.
""" Comments: 
    spacy also does the job """

## polyglot ##  (placeholder: this cell contains no polyglot code)

***nltk tokenization*** 

['¿Un', 'soborno', 'de', '100', 'millones', 'de', 'dólares', '?', 'Los', 'mexicanos', 'se', 'muestran', 'indiferentes', '.', 'CIUDAD', 'DE', 'MÉXICO', '—', 'La', 'acusación', 'de', 'un', 'testigo', 'cayó', 'como', 'una', 'bomba', 'en', 'Estados', 'Unidos', ':', 'uno', 'de', 'los', 'capos', 'de', 'la', 'droga', 'más', 'poderosos', 'del', 'mundo', 'le', 'había', 'pagado', 'un', 'soborno', 'de', '100', 'millones', 'de', 'dólares', 'a', 'Enrique', 'Peña', 'Nieto', ',', 'el', 'expresidente', 'de', 'México', '.', 'Sin', 'embargo', ',', 'en', 'México', 'la', 'revelación', '—de', 'boca', 'de', 'un', 'exaliado', 'del', 'narcotraficante', 'Joaquín', '“', 'el', 'Chapo', '”', 'Guzmán', ',', 'en', 'un', 'tribunal', 'de', 'Nueva', 'York—', 'fue', 'recibida', 'con', 'indiferencia', '.'] 

[' ¿Un soborno de 100 millones de dólares?', 'Los mexicanos se muestran indiferentes.', 'CIUDAD DE MÉXICO — La acusación de un testigo cayó como una bomba en Estados Unidos: uno de los capo

' Comments: \n    spacy also does the job '

In [11]:
### Lemmatization ###

## nltk ##
print("**nltk lemmatization***\n")

from nltk.stem.snowball import SpanishStemmer # from SnowballStemmer

stemmer = SpanishStemmer() # instantiate the stemmer

# Keep purely alphabetic tokens only. Besides punctuation and numbers this
# also discards any word that still has punctuation attached to it.
alpha_words = [w for w in word_tokens if w.isalpha()]
# (original token, Snowball stem) pairs, in text order.
nltk_sp_stems = [(word, stemmer.stem(word)) for word in alpha_words]

print(nltk_sp_stems, "\n")

# Comments:
#   - SpanishStemmer is a stemmer, not a lemmatizer: the printed pairs show
#     truncated, lower-cased stems ('soborno' -> 'soborn',
#     'mexicanos' -> 'mexican') rather than dictionary forms, so it is a poor
#     substitute for lemmatization of Spanish.
#   - An alternative could be a hand-written regex-based stemmer, but that
#     looks quite time consuming given all the irregularities of the
#     language; a dedicated module would be a better idea.

## spacy lemmatization: ##
print("***spacy lemmatization***\n")

import spacy

# This rebinds `nlp` and `doc`: every later cell that reads `doc` works on the
# parse produced by the medium model loaded here.
nlp = spacy.load("es_core_news_md") # I used the medium model for better accuracy

doc = nlp(text)
# (surface text, lemma) pairs for the purely alphabetic tokens.
doc_lemmas = [(token.text, token.lemma_) for token in doc if token.text.isalpha()]
print(doc_lemmas)

# Lemma of the first alphabetic token, shown as a quick sanity check.
print("Lemma: ", doc_lemmas[0][1])
# The string below is the cell's displayed result; it previously said "French"
# although the text and the models are Spanish.
""" Comments: 
    - Compared to nltk, spacy does an amazing job in lemmatizing Spanish. """

**nltk lemmatization***

[('soborno', 'soborn'), ('de', 'de'), ('millones', 'millon'), ('de', 'de'), ('dólares', 'dolar'), ('Los', 'los'), ('mexicanos', 'mexican'), ('se', 'se'), ('muestran', 'muestr'), ('indiferentes', 'indiferent'), ('CIUDAD', 'ciud'), ('DE', 'de'), ('MÉXICO', 'mexic'), ('La', 'la'), ('acusación', 'acus'), ('de', 'de'), ('un', 'un'), ('testigo', 'testig'), ('cayó', 'cay'), ('como', 'com'), ('una', 'una'), ('bomba', 'bomb'), ('en', 'en'), ('Estados', 'estad'), ('Unidos', 'unid'), ('uno', 'uno'), ('de', 'de'), ('los', 'los'), ('capos', 'cap'), ('de', 'de'), ('la', 'la'), ('droga', 'drog'), ('más', 'mas'), ('poderosos', 'poder'), ('del', 'del'), ('mundo', 'mund'), ('le', 'le'), ('había', 'hab'), ('pagado', 'pag'), ('un', 'un'), ('soborno', 'soborn'), ('de', 'de'), ('millones', 'millon'), ('de', 'de'), ('dólares', 'dolar'), ('a', 'a'), ('Enrique', 'enriqu'), ('Peña', 'peñ'), ('Nieto', 'niet'), ('el', 'el'), ('expresidente', 'expresident'), ('de', 'de'), ('México', 'mexic

' Comments: \n    - Compared to nltk, spacy does an amazing job in lemmatizing French. '

In [13]:
### Synsets ###

## nltk ##

print("***nltk synsets***\n")

from nltk.corpus import wordnet as wn

# The raw alphabetic tokens are looked up here rather than the Snowball stems:
# the author found the Spanish stems too imprecise and expected them to yield
# mostly empty synset lists.

# Map every token to its list of Spanish ('spa') WordNet synsets. A token that
# occurs several times in the text ends up with a single entry.
nltk_synsets_sp = dict(
    (token, wn.synsets(token, lang='spa')) for token in alpha_words
)

def print_synsets(synset, num_synsets=10):
    """Print the first ``num_synsets`` entries of a word -> synsets mapping.

    Args:
        synset: mapping (e.g. a dict) whose ``items()`` yields
            ``(word, synsets)`` pairs. Entries are shown in iteration order.
        num_synsets: maximum number of entries to print. Zero or a negative
            value prints nothing.

    Returns:
        None. For every entry shown, three lines are printed:
        ``word : <key>``, the value itself, and a blank line.
    """
    for shown, (word, synsets) in enumerate(synset.items()):
        # Stop as soon as the limit is reached instead of walking the rest of
        # the mapping without printing anything.
        if shown >= num_synsets:
            break
        print("word : {}".format(word))
        print(synsets)
        print()
    
print_synsets(nltk_synsets_sp)

# Comments:
#   Many of the raw tokens come back with an empty synset list.

## spacy + nltk ##

print("\n***spacy + nltk synsets***\n")

# Each entry is keyed by the surface token as it appears in the text, but
# WordNet is queried with that token's spaCy lemma.
spnlkt_synsest_sp = {
    surface: wn.synsets(lemma, lang='spa') for surface, lemma in doc_lemmas
}

print_synsets(spnlkt_synsest_sp)

# The string below is the cell's displayed result; "propositions" was a slip
# for "prepositions".
""" Comments: 
    - Better results but several entries are obviously wrong. 
    - We could filter things like articles and prepositions since finding synsets for these 
        might not be very useful. """

***nltk synsets***

word : soborno
[Synset('price.n.06')]

word : de
[Synset('delaware.n.04')]

word : millones
[]

word : dólares
[]

word : Los
[]

word : mexicanos
[]

word : se
[]

word : muestran
[]

word : indiferentes
[]

word : CIUDAD
[Synset('city.n.01'), Synset('township.n.01')]


***spacy + nltk synsets***

word : Un
[]

word : soborno
[Synset('suborn.v.03'), Synset('sop.v.01')]

word : de
[Synset('delaware.n.04')]

word : millones
[Synset('million.n.01')]

word : dólares
[Synset('dollar.n.04'), Synset('dollar.n.03'), Synset('dollar.n.02'), Synset('dollar.n.01')]

word : Los
[]

word : mexicanos
[Synset('mexican.a.01')]

word : se
[]

word : muestran
[Synset('show.v.10'), Synset('indicate.v.02'), Synset('express.v.01'), Synset('testify.v.02'), Synset('picture.v.02'), Synset('show.v.04'), Synset('disclose.v.02'), Synset('register.v.07')]

word : indiferentes
[Synset('unconcerned.a.01'), Synset('blase.s.03'), Synset('indifferent.s.02'), Synset('uninterested.s.02'), Synset('ina

' Comments: \n    - Better results but several entries are obviously wrong. \n    - We could filter things like articles and propositions since finding synsets for these \n        might not be very useful. '

In [14]:
##  improved spacy + nltk ##

print("***improved spacy+nltk***\n")

# Build a filtered lemma list that skips tokens whose coarse part-of-speech tag
# is a determiner (DET), adposition / preposition (ADP), punctuation (PUNCT),
# conjunction (CONJ, CCONJ), numeral (NUM), symbol (SYM) or whitespace (SPACE),
# and also skips every token that is not purely alphabetic.
# Full list can be found at https://spacy.io/api/annotation

tags = ["DET", "ADP", "PUNCT", "CONJ", "CCONJ", "NUM", "SYM", "SPACE"]
flt_doc_lemmas = [
    (token.text, token.lemma_)
    for token in doc
    if token.pos_ not in tags and token.text.isalpha()
]

print("**filtered doc lemmas**\n")
print(flt_doc_lemmas)

# Now generate the synsets from the filtered lemmas
print("\n**synsets**\n")

# Dict comprehension keyed by the lemma itself (not the surface token), so
# tokens that share a lemma collapse into one entry.
flt_synsets_sp = {
    lemma: wn.synsets(lemma, lang='spa') for _surface, lemma in flt_doc_lemmas
}

print_synsets(flt_synsets_sp)

# The string below is the last expression of the cell and is displayed as its
# result; its trailing "NOTE:" item was left empty by the author.
"""Comments: 
    - Improvement from the previous implementations. 
    - tags can be easily adjusted if one wishes to include them in the list comprehension. 
    - NOTE:  """

***improved spacy+nltk***

**filtered doc lemmas**

[('soborno', 'sobornar'), ('millones', 'millón'), ('dólares', 'dólar'), ('mexicanos', 'mexicano'), ('se', 'se'), ('muestran', 'mostrar'), ('indiferentes', 'indiferente'), ('CIUDAD', 'CIUDAD'), ('MÉXICO', 'MÉXICO'), ('acusación', 'acusación'), ('testigo', 'testigo'), ('cayó', 'caer'), ('como', 'comer'), ('bomba', 'bombo'), ('Estados', 'Estados'), ('Unidos', 'Unidos'), ('uno', 'unir'), ('capos', 'capos'), ('droga', 'drogar'), ('más', 'más'), ('poderosos', 'poderoso'), ('mundo', 'mundo'), ('le', 'le'), ('había', 'haber'), ('pagado', 'pagar'), ('soborno', 'sobornar'), ('millones', 'millón'), ('dólares', 'dólar'), ('Enrique', 'Enrique'), ('Peña', 'Peña'), ('Nieto', 'Nieto'), ('expresidente', 'expresidente'), ('México', 'México'), ('embargo', 'embargar'), ('México', 'México'), ('revelación', 'revelación'), ('boca', 'boca'), ('exaliado', 'exaliado'), ('narcotraficante', 'narcotraficante'), ('Joaquín', 'Joaquín'), ('Chapo', 'Chapo'), ('Guzmán

'Comments: \n    - Improvement from the previous implementations. \n    - tags can be easily adjusted if one wishes to include them in the list comprehension. \n    - NOTE:  '

In [15]:
### spaCy NER & visualization ###

from spacy import displacy

# One line per entity found in `doc`: its text, its start/end character
# offsets, and its label.
for ent in doc.ents:
    ent_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print("[Entity:{}] [start:{}] [end:{}] [Label:{}]".format(*ent_fields))

# Draw the highlighted-entity view of the same document inside the notebook.
displacy.render(doc, style="ent", jupyter=True)

[Entity:MÉXICO] [start:99] [end:105] [Label:LOC]
[Entity:en Estados Unidos] [start:155] [end:172] [Label:LOC]
[Entity:Enrique Peña Nieto] [start:283] [end:301] [Label:PER]
[Entity:México] [start:322] [end:328] [Label:LOC]
[Entity:México] [start:355] [end:361] [Label:LOC]
[Entity:—de] [start:376] [end:379] [Label:PER]
[Entity:Joaquín] [start:420] [end:427] [Label:PER]
[Entity:Chapo” Guzmán] [start:432] [end:445] [Label:PER]
[Entity:Nueva York—] [start:465] [end:476] [Label:LOC]
