<img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Preprocesamiento con NLTK y spaCy


In [1]:
import json
import string
import random 

import numpy as np

### Datos

In [2]:
simple_text = "if she leaves now she might miss something important!"

In [3]:
large_text = "Patients who in late middle age have smoked 20 cigarettes a day since their teens constitute an at-risk group. One thing they’re clearly at risk for is the acute sense of guilt that a clinician can incite, which immediately makes a consultation tense."

### 1 - Preprocesamiento con NLTK
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [4]:
# NLTK ships with extensive documentation that is worth consulting:
# https://www.nltk.org

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources are fetched in the same order as before; each call is a no-op
# when the package is already present locally.

# punkt_tab: added to avoid errors
nltk.download("punkt_tab")
# Punkt tokenizer
nltk.download("punkt")
# WordNet, the English semantic network. It is a large semantic network that
# can be used (among other things) for POS tagging or lemmatization.
nltk.download("wordnet")
# English stop word dictionary
nltk.download("stopwords")
# NLTK 3.6.6 or later requires OMW 1.4 (Open Multilingual WordNet)
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\julio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\julio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\julio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\julio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\julio\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
simple_text

'if she leaves now she might miss something important!'

In [6]:
# Instantiate the stemmer. NLTK provides several stemmers to choose from:
# https://www.nltk.org/api/nltk.stem.html

# Import the class by name rather than with a wildcard, so the notebook
# namespace only gains the single name that is actually used below.
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

In [7]:
# Instantiate the lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [8]:
# Extract the tokens of a document.
# These are not the final tokens: they will be processed further
# by applying lemmatization, filtering, etc.
tokens = word_tokenize(simple_text)
print("Tokens:", tokens)

Tokens: ['if', 'she', 'leaves', 'now', 'she', 'might', 'miss', 'something', 'important', '!']


In [9]:
# Stemming: map every token to its stem
nltk_stemedList = [p_stemmer.stem(token) for token in tokens]
print("Stemming:", nltk_stemedList)

Stemming: ['if', 'she', 'leav', 'now', 'she', 'might', 'miss', 'someth', 'import', '!']


In [10]:
# Lemmatization: map every token to its root word (lemma).
# No part-of-speech tag is passed to the lemmatizer here.
nltk_lemmaList = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmatization:", nltk_lemmaList)

Lemmatization: ['if', 'she', 'leaf', 'now', 'she', 'might', 'miss', 'something', 'important', '!']


In [11]:
# Let's look at the set of punctuation characters
# contained in Python's `string` library
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
# Remove punctuation tokens.
# `string.punctuation` is a str, so `w not in string.punctuation` is a
# substring test: multi-character punctuation tokens such as "..." or "''"
# are not substrings of it and would slip through. Instead, drop every token
# whose characters are all punctuation characters.
punctuation_chars = set(string.punctuation)
nltk_punctuation = [
    w for w in nltk_lemmaList
    if not all(ch in punctuation_chars for ch in w)
]
print("Punctuation filter:", nltk_punctuation)

Punctuation filter: ['if', 'she', 'leaf', 'now', 'she', 'might', 'miss', 'something', 'important']


In [13]:
# Build the set of English stop words provided by NLTK and show its size
nltk_stop_words = set(stopwords.words("english"))
len(nltk_stop_words)

179

In [14]:
# Filter stop words.
# NLTK's English stop word list is lowercase, so tokens are compared in
# lowercase; otherwise capitalized stop words ("The", "If", ...) would be
# kept. The surviving tokens retain their original casing.
nltk_stop_words = set(stopwords.words("english"))
filtered_sentence = [w for w in nltk_punctuation if w.lower() not in nltk_stop_words]
print("Stop words filter:", filtered_sentence)

Stop words filter: ['leaf', 'might', 'miss', 'something', 'important']


### 2 - Proceso completo con NLTK
Tokenization → Lemmatization → Remove stopwords → Remove punctuation

In [15]:
def nltk_process(text):
    """Tokenize, lemmatize and filter ``text`` with NLTK.

    Pipeline: tokenization -> lemmatization -> stop word removal ->
    punctuation removal. The intermediate lemma list and the final token
    list are printed as a side effect.

    Args:
        text: The string to preprocess.

    Returns:
        list[str]: The lemmas that are neither English stop words nor made up
        exclusively of ``string.punctuation`` characters. Tokens keep their
        original casing. Characters outside ``string.punctuation`` (for
        example the typographic apostrophe '’') are not treated as
        punctuation and are therefore kept.
    """
    # Tokenization
    nltk_tokenList = word_tokenize(text)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    nltk_lemmaList = [lemmatizer.lemmatize(word) for word in nltk_tokenList]

    print("Lemmatization")
    print(nltk_lemmaList)

    # Stop words: NLTK's list is lowercase, so compare in lowercase to also
    # catch capitalized stop words (e.g. at the start of a sentence).
    nltk_stop_words = set(stopwords.words("english"))
    filtered_sentence = [w for w in nltk_lemmaList if w.lower() not in nltk_stop_words]

    # Filter punctuation: `w in string.punctuation` would be a substring test
    # on a str and miss multi-character tokens such as "..." or "''", so drop
    # every token whose characters are all punctuation characters instead.
    punctuation_chars = set(string.punctuation)
    filtered_sentence = [
        w for w in filtered_sentence
        if not all(ch in punctuation_chars for ch in w)
    ]

    print(" ")
    print("Remove stopword & Punctuation")
    print(filtered_sentence)
    return filtered_sentence

In [16]:
# Run the full NLTK pipeline on the longer text and report how many tokens remain
nltk_text = nltk_process(large_text)
print("Text len:", len(nltk_text))

Lemmatization
['Patients', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoked', '20', 'cigarette', 'a', 'day', 'since', 'their', 'teen', 'constitute', 'an', 'at-risk', 'group', '.', 'One', 'thing', 'they', '’', 're', 'clearly', 'at', 'risk', 'for', 'is', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'make', 'a', 'consultation', 'tense', '.']
 
Remove stopword & Punctuation
['Patients', 'late', 'middle', 'age', 'smoked', '20', 'cigarette', 'day', 'since', 'teen', 'constitute', 'at-risk', 'group', 'One', 'thing', '’', 'clearly', 'risk', 'acute', 'sense', 'guilt', 'clinician', 'incite', 'immediately', 'make', 'consultation', 'tense']
Text len: 27


### 3 - Proceso completo con spaCy
Tokenization → Lemmatization → Remove stopwords → Remove punctuation

In [20]:
import spacy
# Load the English preprocessing pipeline
nlp = spacy.load('en_core_web_sm')

def spacy_process(text):
    """Tokenize, lemmatize and filter ``text`` with the spaCy pipeline ``nlp``.

    Pipeline: tokenization + lemmatization -> stop word removal ->
    punctuation removal. The intermediate lemma list and the final token
    list are printed as a side effect.

    Args:
        text: The string to preprocess.

    Returns:
        list[str]: The lemmas whose vocabulary entry is not flagged as a stop
        word and that are not made up exclusively of ``string.punctuation``
        characters.
    """
    doc = nlp(text)

    # Tokenization & lemmatization
    lemma_list = [token.lemma_ for token in doc]
    print("Tokenize+Lemmatize:")
    print(lemma_list)

    # Stop words.
    # Each lemma is a plain string; to get back to spaCy's information we look
    # the string up in the vocabulary, which yields the lexeme object carrying
    # the `is_stop` flag. (Stop words could also be filtered directly in the
    # lemmatization step above.)
    filtered_sentence = [word for word in lemma_list if not nlp.vocab[word].is_stop]

    # Filter punctuation: `w in string.punctuation` would be a substring test
    # on a str and miss multi-character tokens such as "..." or "--", so drop
    # every token whose characters are all punctuation characters instead.
    punctuation_chars = set(string.punctuation)
    filtered_sentence = [
        w for w in filtered_sentence
        if not all(ch in punctuation_chars for ch in w)
    ]

    print(" ")
    print("Remove stopword & punctuation: ")
    print(filtered_sentence)
    return filtered_sentence

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Run the full spaCy pipeline on the longer text and report how many tokens
# remain in the spaCy result.
spacy_text = spacy_process(large_text)
print("Text len:", len(spacy_text))

### 4 - Conclusiones
- NLTK no pasa a minúsculas el texto por su cuenta
- spaCy reemplaza algunas palabras por su tag (como "'")
- spaCy descompone algunas palabras en varios tokens

In [None]:
from itertools import zip_longest

from prettytable import PrettyTable

# Side-by-side comparison of the tokens produced by each pipeline.
# The two lists need not have the same length: plain zip() stops at the
# shorter one and silently drops the remaining tokens, so zip_longest is used
# to pad the shorter column with empty cells instead.
table = PrettyTable(['NLTK', 'spaCy'])
for nltk_word, spacy_word in zip_longest(nltk_text, spacy_text, fillvalue=""):
    table.add_row([nltk_word, spacy_word])
print(table)