# 5.1- NLP

El NLP (procesamiento del lenguaje natural) trata de aplicaciones que entiendan nuestro idioma: reconocimiento de voz, traducción, comprensión semántica, análisis de sentimiento...

**Usos**

+ Motores de búsqueda
+ Feed de redes sociales
+ Asistentes de voz 
+ Filtros de spam
+ Chatbots

**Librerías**

+ NLTK
+ Spacy
+ scikit-learn (TF-IDF)
+ OpenNLP

La dificultad del NLP está en varios niveles:

+ Ambigüedad:

  * Nivel léxico: por ejemplo, varios significados
  * Nivel referencial: anáforas, metáforas, etc...
  * Nivel estructural: la semántica es necesaria para entender la estructura de una oración
  * Nivel pragmático: dobles sentidos, ironía, humor
  
+ Detección de espacios
+ Recepción imperfecta: acentos, -ismos, OCR

El proceso es similar al de USL (aprendizaje no supervisado): primero se vectorizan las palabras y después se miden sus distancias/similitudes. 

In [1]:
# lista de 100 peliculas

# Read the title list (one title per line) and keep the first 100 entries.
# The context manager closes the file handle as soon as the text is read.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
with open('../data/title_list.txt', encoding='utf-8') as fh:
    titles=fh.read().split('\n')[:100]

titles[:10]

['The Godfather',
 'The Shawshank Redemption',
 "Schindler's List",
 'Raging Bull',
 'Casablanca',
 "One Flew Over the Cuckoo's Nest",
 'Gone with the Wind',
 'Citizen Kane',
 'The Wizard of Oz',
 'Titanic']

In [2]:
# Read the synopses file, split it on the '\n BREAKS HERE' separator and keep
# the first 100 entries. The context manager closes the file handle as soon as
# the text is read.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
with open('../data/synopses_list.txt', encoding='utf-8') as fh:
    synopsis=fh.read().split('\n BREAKS HERE')[:100]

synopsis[0][:100]

" Plot  [edit]  [  [  edit  edit  ]  ]  \n  On the day of his only daughter's wedding, Vito Corleone h"

### Limpieza

In [3]:
#%pip install spacy

In [4]:
import string

import spacy

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English


import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

import re

[nltk_data] Downloading package stopwords to /Users/iudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#%python -m spacy download en_core_web_sm
#%pip install spacy-lookups-data

In [6]:
# English pipeline used to lemmatise tokens.
nlp = spacy.load('en_core_web_sm')

# Union of the NLTK English stop words, spaCy's STOP_WORDS, the loaded
# pipeline's default stop words, and the extra words 'edit' and 'plot'
# (both appear as leading noise in the first synopsis).
stop_words_en = (
    set(stopwords.words('english'))
    | set(STOP_WORDS)
    | set(nlp.Defaults.stop_words)
    | {'edit', 'plot'}
)

stop_words = stop_words_en

# Separate English() instance used only to split text into tokens.
parser = English()

In [7]:
def spacy_tokenizer(sentence):
    """Tokenise *sentence* and return its cleaned lemmas as a list of strings.

    Every token produced by the module-level ``parser`` is lemmatised on its
    own with the module-level ``nlp`` pipeline, then lower-cased and stripped.
    A lemma is kept only when it is not in ``stop_words`` and consists solely
    of ASCII letters.
    """
    only_letters = '^[a-zA-Z]+$'

    def _lemma(token):
        # Each token is run through the pipeline in isolation; the first
        # token of that one-word document carries the lemma.
        return nlp(str(token))[0].lemma_.lower().strip()

    candidates = (_lemma(tok) for tok in parser(sentence))

    return [lem for lem in candidates
            if lem not in stop_words and re.search(only_letters, lem)]

In [8]:
%%time

spacy_tokenizer(synopsis[0][:200])

CPU times: user 143 ms, sys: 4.76 ms, total: 147 ms
Wall time: 157 ms


['day',
 'daughter',
 'wedding',
 'vito',
 'corleone',
 'hear',
 'request',
 'role',
 'godfather',
 'new',
 'york',
 'crime',
 'family',
 'vito',
 'young',
 'son']

### TFIDF (term frequency inverse document frequency)

In [9]:
type(synopsis[0])

str

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tfidf=TfidfVectorizer(min_df=0.15, tokenizer=spacy_tokenizer)

In [None]:
%%time

tfidf_matrix=tfidf.fit_transform(synopsis)

In [None]:
tfidf_matrix.shape, len(synopsis)

In [None]:
(str(tfidf_matrix[0]).split('\n'))

In [None]:
import pandas as pd

df=pd.DataFrame(tfidf_matrix)

In [None]:
df.head()

In [None]:
terms=tfidf.get_feature_names_out()

terms[:15]

In [None]:
# os dejo la kata

tfidf_matrix

### Distancias

In [None]:
from sklearn.metrics.pairwise import cosine_similarity as cos

In [None]:
distancias=1-cos(tfidf_matrix)

distancias.shape

In [None]:
pd.DataFrame(distancias).head()

### Clustering

In [None]:
import warnings
warnings.simplefilter('ignore')

import pylab as plt
from IPython.display import set_matplotlib_formats

%matplotlib inline
set_matplotlib_formats('svg')

import numpy as np

In [None]:
from umap import UMAP

In [None]:
# Project the documents to a low-dimensional embedding with UMAP.
# NOTE(review): `distancias` is a pairwise cosine-distance matrix, but UMAP is
# created without metric='precomputed', so each row of the matrix is treated
# as a feature vector rather than as precomputed distances — confirm intended.
umap=UMAP(n_neighbors=5, random_state=42)

emb=umap.fit_transform(distancias)

In [None]:
emb[:5]

In [None]:
emb.shape

In [None]:
plt.scatter(emb[:, 0], emb[:, 1]);

In [None]:
# DBSCAN

from sklearn.cluster import DBSCAN

In [None]:
dbscan=DBSCAN(eps=0.8, min_samples=7)

In [None]:
dbscan.fit(emb)

In [None]:
dbscan.labels_

In [None]:
# HDBSCAN

from hdbscan import HDBSCAN

In [None]:
#help(HDBSCAN)

In [None]:
hdbscan=HDBSCAN(min_cluster_size=5)

clusters=hdbscan.fit_predict(emb)

In [None]:
np.unique(clusters)

In [None]:
plt.scatter(emb[:, 0], emb[:, 1], c=clusters);

### titulos de los clusters

In [None]:
def get_titles_from_cluster(c):
    """Return, as a pandas Series, the titles whose cluster label equals *c*.

    Uses the module-level ``titles`` list and ``clusters`` label array; the
    boolean selection assumes both have the same length and ordering.
    """
    all_titles = pd.Series(titles)
    in_cluster = clusters == c
    return all_titles[in_cluster]

In [None]:
get_titles_from_cluster(-1)

In [None]:
tfidf_df=pd.DataFrame(tfidf_matrix.toarray(), columns=terms)

tfidf_df.head()

In [None]:
def get_df_from_cluster(c):
    """Return the rows of the module-level ``tfidf_df`` whose cluster label is *c*.

    Rows are selected with a boolean mask built from the module-level
    ``clusters`` label array.
    """
    row_mask = clusters == c
    return tfidf_df[row_mask]

In [None]:
get_df_from_cluster(2).describe()

In [None]:
get_df_from_cluster(2).describe().T

In [None]:
top_words=get_df_from_cluster(-1).T.sum(axis=1).sort_values(ascending=False)

top_words.head()

In [None]:
get_titles_from_cluster(3)

## NLP_es 

In [None]:
#!python -m spacy download es_core_news_md

In [None]:
nlp=spacy.load('es_core_news_md')

In [None]:
doc2vec=nlp('hola me llamo pepito').vector.sum()

doc2vec

In [None]:
word2vec=(nlp('hola').vector + nlp('me').vector + nlp('llamo').vector + nlp('pepito').vector).sum()/4

word2vec

##### similitud

In [None]:
nlp('quiero saludar a todos los alumnos de Ironhack, que pasa alegres').similarity(nlp('hola a todos los alumnos'))

In [None]:
token_1=spacy_tokenizer('quiero saludar a todos los alumnos de Ironhack, que pasa alegres')
token_2=spacy_tokenizer('hola a todos los alumnos')

token_1

In [None]:
def get_simil(t1, t2):
    """Return the spaCy similarity between two lists of tokens.

    Each list is joined with single spaces into one string and parsed with the
    module-level ``nlp`` pipeline; the two resulting documents are compared
    with ``similarity``. The score is a cosine value, so it lies in [-1, 1].
    """
    doc_a = nlp(' '.join(t1))
    doc_b = nlp(' '.join(t2))
    return doc_a.similarity(doc_b)

In [None]:
get_simil(token_1, token_2)

# WordClouds

In [None]:
#!pip install langdetect
#!pip install wordcloud

In [None]:
# librerias

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from spacy.lang.es import Spanish



import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words_sp=set(stopwords.words('spanish') + ['haber'])
stop_words_en=set(stopwords.words('english'))

stop_words=stop_words_sp | stop_words_en 


import re

from langdetect import detect

from wordcloud import WordCloud

In [None]:
# Loaded (pipeline, tokenizer) pairs keyed by language code, so each spaCy
# model is read from disk once instead of on every call.
_TOKENIZER_MODELS = {}

# Lower-cased lemmas made only of letters, including Spanish accented vowels,
# 'ü' and 'ñ' (an ASCII-only pattern would discard words such as "niño").
_TOKENIZER_WORD = re.compile(r'^[a-záéíóúüñ]+$')


def _load_language(lang):
    """Return the cached ``(nlp, parser)`` pair for *lang* ('en' or 'es')."""
    if lang not in _TOKENIZER_MODELS:
        if lang == 'en':
            _TOKENIZER_MODELS[lang] = (spacy.load('en_core_web_sm'), English())
        else:
            _TOKENIZER_MODELS[lang] = (spacy.load('es_core_news_md'), Spanish())
    return _TOKENIZER_MODELS[lang]


def tokenizer(frase):
    """Clean an English or Spanish text and return its lemmas as one string.

    The language is detected with ``langdetect.detect``. The text is split
    into tokens, each token is lemmatised on its own, lower-cased and
    stripped, and a lemma is kept only when it is not in the module-level
    ``stop_words`` and consists solely of letters.

    Parameters
    ----------
    frase : str
        Text to clean.

    Returns
    -------
    str
        The kept lemmas joined by single spaces, or the message
        'No es ni castellano ni ingles..' when the detected language is
        neither English nor Spanish.
    """
    # Detect once: langdetect is non-deterministic by default, so two separate
    # calls on the same text are not guaranteed to agree.
    idioma = detect(frase)

    if idioma not in ('en', 'es'):
        return 'No es ni castellano ni ingles..'

    nlp, parser = _load_language(idioma)

    clean_tokens = []

    for e in parser(frase):
        lema = nlp(str(e))[0].lemma_.lower().strip()

        if lema not in stop_words and _TOKENIZER_WORD.search(lema):
            clean_tokens.append(lema)

    return ' '.join(clean_tokens)

In [None]:
def wordcloud(df, col, out_path='images/wordcloud.png'):
    """Build a word cloud from a text column, save it as an image and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the texts.
    col : str
        Name of the column whose strings are joined (with single spaces) into
        the text the cloud is generated from.
    out_path : str, optional
        Where the figure is saved. Defaults to 'images/wordcloud.png'; pass a
        different path to avoid overwriting the image of a previous call.

    Returns
    -------
    None
    """
    from pathlib import Path

    texto = ' '.join(df[col])

    nube = WordCloud(width=1600,
                     height=400,
                     stopwords=stop_words,
                     colormap='Spectral').generate(texto)

    # Create the target folder first so saving does not fail when it is missing.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    plt.figure(figsize=(15, 10), facecolor='k')
    plt.imshow(nube)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.savefig(out_path, facecolor='k', bbox_inches='tight')
    plt.show()

In [None]:
%%time

df=pd.DataFrame(synopsis, columns=['text'])

df.text=df.text.apply(tokenizer)

df.head()

In [None]:
wordcloud(df, 'text')

In [None]:
%%time

# Read the text file, split it on the '\n BREAKS HERE' separator and keep the
# first 100 entries. The context manager closes the file handle as soon as the
# text is read.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
with open('../data/conde.txt', encoding='utf-8') as fh:
    txt=fh.read().split('\n BREAKS HERE')[:100]

df2=pd.DataFrame(txt, columns=['text'])

df2.text=df2.text.apply(tokenizer)

df2.head()

In [None]:
wordcloud(df2, 'text')

In [None]:
#help(WordCloud)

#### Mascara

In [None]:
# imagen con mascara

Image.open('images/vino.png')

In [None]:
vino_mask=np.array(Image.open('images/vino.png'))

vino_mask.shape

In [None]:
# transformacion de la mascara

def transformacion(val):
    """Return 255 when *val* equals 0; otherwise return *val* unchanged."""
    return 255 if val == 0 else val

In [None]:
# Vectorised mask transformation: every 0 in `vino_mask` becomes 255 and every
# other value is kept, with the result stored as int32. This applies the same
# rule as `transformacion` to the whole array at once instead of filling an
# uninitialised array row by row.
# NOTE(review): presumably 0 marks the image background and 255 is what
# WordCloud treats as "masked out" — confirm against the mask image.
t_vino_mask = np.where(vino_mask == 0, 255, vino_mask).astype(np.int32)

In [None]:
w=WordCloud(background_color='white',
                   max_words=1000,
                   mask=t_vino_mask,
                   stopwords=stop_words,
                   contour_width=3,
                   contour_color='firebrick').generate(' '.join([e for e in df.text]))


w.to_file('images/copa&botella.png')


plt.figure(figsize=(15, 10), facecolor='k')
plt.imshow(w)
plt.axis('off')
plt.tight_layout(pad=0)


### ejemplo con todo

In [None]:
def tokenizer(lst):  # now takes a list of texts
    """Clean a list of English/Spanish texts and gather the lemmas per language.

    Each element is converted to ``str`` and its language is detected with
    ``langdetect.detect``. English and Spanish texts are split into tokens;
    each token is lemmatised on its own, lower-cased and stripped, and kept
    when it is not a stop word of that language and consists solely of
    letters. Lemmas are accumulated across all texts of the same language.

    Parameters
    ----------
    lst : iterable
        Texts to process.

    Returns
    -------
    tuple
        ``(english_text, spanish_text, en, es)``: the kept English lemmas and
        the kept Spanish lemmas, each joined by single spaces (an empty string
        when no text of that language was processed), followed by the number
        of texts detected as English and as Spanish.
    """
    # Model name and tokenizer class for each supported language.
    idiomas = {'en': ('en_core_web_sm', English),
               'es': ('es_core_news_md', Spanish)}

    cargados = {}                  # language -> (nlp, parser), loaded once
    lemas = {'en': [], 'es': []}   # accumulated over every text
    cuenta = {'en': 0, 'es': 0}

    # Lemmas are already lower-cased; accented vowels, 'ü' and 'ñ' are allowed
    # so that Spanish words are not discarded.
    solo_letras = re.compile(r'^[a-záéíóúüñ]+$')

    for txt in lst:
        try:
            txt = str(txt)

            # Detect once: langdetect is non-deterministic by default, so two
            # separate calls on the same text are not guaranteed to agree.
            idioma = detect(txt)

            if idioma not in idiomas:
                print ('No se reconoce idioma (EN / ES)...')
                continue

            cuenta[idioma] += 1

            if idioma not in cargados:
                modelo, clase = idiomas[idioma]
                cargados[idioma] = (spacy.load(modelo), clase())

            nlp, parser = cargados[idioma]

            # Stop words of the detected language, taken from its pipeline.
            vetadas = nlp.Defaults.stop_words

            for word in parser(txt):
                lemma = nlp(str(word))[0].lemma_.lower().strip()
                if lemma not in vetadas and solo_letras.search(lemma):
                    lemas[idioma].append(lemma)

        except Exception as exc:
            # Best effort: report the failure and move on to the next text.
            print ('ERROR...', repr(exc))
            continue

    return ' '.join(lemas['en']), ' '.join(lemas['es']), cuenta['en'], cuenta['es']

In [None]:
%%time

mix_txt=synopsis+txt

tokens=tokenizer(mix_txt)

In [None]:
# wordcloud en ingles

serie_en=pd.DataFrame({'en': tokens[0]}, index=[0])

wordcloud(serie_en, 'en')

In [None]:
# wordcloud en castellano

serie_es=pd.DataFrame({'es': tokens[1]}, index=[0])

wordcloud(serie_es, 'es')

In [None]:
tokens[2]

In [None]:
tokens[3]

## NER

In [None]:
news=pd.read_csv('data/noticias.csv')

news.head()

In [None]:
news.tail(2).T

In [None]:
import spacy
from spacy import displacy

In [None]:
#!python -m spacy download es_core_news_lg

In [None]:
#!python -m spacy download en_core_web_lg

In [None]:
spacy_core_es='es_core_news_md'

spacy_core_en='en_core_web_sm'

In [None]:
def ner(spacy_core, data):
    """Render the named entities found in *data* with displaCy.

    Parameters
    ----------
    spacy_core : str
        Name of the spaCy model to load (e.g. 'es_core_news_md').
    data : str
        Text to analyse.

    Returns
    -------
    Whatever ``displacy.render(..., style='ent')`` returns for the parsed
    document.
    """
    nlp = spacy.load(spacy_core)

    # Parse the text once and render that document directly. Re-parsing the
    # string form of a list of sentences would add the list's brackets and
    # commas to the rendered text.
    doc = nlp(data)

    entidades = displacy.render(doc, style='ent')

    return entidades

In [None]:
ner(spacy_core_es, news.text[150])

In [None]:
ner(spacy_core_en, synopsis[3][:1000])

### Transformers (creacion de texto)

In [None]:
#!pip install transformers
#!pip install tensorflow
#!pip install torch

In [None]:
from transformers import pipeline

generador=pipeline('text-generation', 
                   model='EleutherAI/gpt-neo-125M')

In [None]:
def crea_texto(generador, texto, min_long=20):
    """Generate text from a prompt and return the generated string.

    ``generador`` is called with the prompt *texto*, ``do_sample=True`` and
    ``min_length=min_long``; the 'generated_text' entry of the first result
    is returned.
    """
    salidas = generador(texto, do_sample=True, min_length=min_long)
    primera = salidas[0]
    return primera['generated_text']

In [None]:
crea_texto(generador, 'data science is')

In [None]:
crea_texto(generador, 'what planet is this')

In [None]:
crea_texto(generador, 'humans are')

In [None]:
crea_texto(generador, 'el monstruo de las galletas')

In [None]:
crea_texto(generador, 'molt be noi')

In [None]:
crea_texto(generador, 'rellampagos asgaya')

In [None]:
crea_texto(generador, 'humans are')

In [None]:
#help(generador)

In [None]:
crea_texto(generador, 'what the fuck asshole')