# Práctica del módulo NLP

In [24]:
import pandas as pd
import matplotlib.pyplot as plt

import re
from collections import Counter

from spacy.lang.es.stop_words import STOP_WORDS

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

import spacy
import lemminflect
spacy_lemmas = spacy.load('en_core_web_sm')

In [2]:
aiv5 = pd.read_csv("Amazon_Instant_Video_5.csv")
a5 = pd.read_csv("Automotive_5.csv")
dm5 = pd.read_csv("Digital_Music_5.csv")
mi5 = pd.read_csv("Musical_Instruments_5.csv")
plg5 = pd.read_csv("Patio_Lawn_and_Garden_5.csv")

reviews = pd.concat([aiv5,a5,dm5,mi5,plg5], ignore_index=True).dropna(subset=['reviewText']).sample(frac=1)
reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
7904,ADSWHY3431SD7,B004263SAM,Sadcelticlady,"[0, 0]",Helen murres is a fine actress ! This series n...,5,Great series !,1388448000,"12 31, 2013"
9927,A354ZN03NP2ZK8,B00DN15JWW,"William J. Meegan ""Never let another do for y...","[0, 0]",Series like this I find educational as well as...,5,Amazing series discussing very interesting psy...,1388793600,"01 4, 2014"
12174,A5SBPZ1KHWX5Y,B0050SFZBG,jwm1941,"[0, 0]",This tool is excellent at performing the job f...,4,VERY USEFUL TOOL,1392595200,"02 17, 2014"
21935,A3AL8GQ69QE7WN,B000000OQW,Jason Stein,"[9, 12]",Of the two Edie Brickell and the New Bohemians...,5,Great College Rock.,953251200,"03 17, 2000"
592,A3E2158X66LFMI,B00F55HIRI,Mery,"[1, 2]",The show still has plenty with the hot origina...,5,Only thing I didn't like was that Rebecca left...,1404172800,"07 1, 2014"


# 2. Análisis de sentimiento

Me traigo mi función de preprocesado de textos y la amplío con nuevas funciones (lemmatizers, ngrams...)

In [32]:
def preprocess_texts(texts, **kwargs):
    
    processed_texts = [] # Aquí me guardaré los textos procesados en forma de lista
    
    # Para el autocorrector que se aplica más abajo utilizaré todo el texto de mi corpus
    if 'auto_correct' in kwargs and kwargs['auto_correct']:
        full_text = " ".join([text for text in texts])
    
    for text in texts:
        
        tokens = [] # Aquí se guardan los tokens que conforman cada texto
        
        # Comenzando con el preprocesado completo de gensim.utils.simple_preprocess()
        if 'simple_gensim' in kwargs and kwargs['simple_gensim']:
            rawtokens = [token for token in gensim.utils.simple_preprocess(text)]
            
        # Si decido aplicar un lemmatizer como split inicial
        elif 'lemmatizer' in kwargs:
            lemmas = kwargs['lemmatizer'](text)
            rawtokens = [token._.lemma() for token in lemmas]
            
        # Hago un split y utilizo tan solo mi preprocesado adicional
        else: rawtokens = text.split()
        
        # Aplico reglas adicionales de preprocesado.
        for token in rawtokens:
            if 'lower' in kwargs and kwargs['lower']: token = token.lower()
            if 'only_alpha' in kwargs and kwargs['only_alpha']: token = re.sub('[^a-zA-Z]+', '', token)
            if 'stop_words' in kwargs and token in kwargs.get('stop_words'): token = None

            if token: tokens.append(token)
                
        # Aplico mi corrector ortográfico al texto preprocesado.
        if 'auto_correct' in kwargs and kwargs['auto_correct']:
            for ix in range(len(tokens)):
                tokens[ix] = correct(tokens[ix], full_text, kwargs['auto_correct'])
                
        # Genero N-Gramas
        if 'ngrams' in kwargs:
            
            ngrams = [] # Aquí guardaré los ngramas que reemplazarán a la lista de tokens
            
            # Se puede utilizar una lista de varios ngramas
            if type(kwargs['ngrams']) is list or type(kwargs['ngrams']) is tuple:
                for ngram in kwargs['ngrams']:
                    for i in range(len(tokens)-ngram):
                        ngrams.append(" ".join(tokens[i:i+ngram]))
            
            # También se puede indicar un valor entero como única medida para el ngrama
            else:
                for i in range(len(tokens)-kwargs['ngrams']):
                    ngrams.append(" ".join(tokens[i:i+kwargs['ngrams']]))
                
            tokens = ngrams
        
        processed_texts.append(tokens)
        
    return processed_texts

In [4]:
# Con steps_away indico el número máximo de modificaciones que se realizarán para buscar una corrección.
def correct(word, text, steps_away=1):
    counts = Counter(text)
    
    candidates = [w for w in edits(word, steps=steps_away) if w in text or w is word]
    
    return max(candidates, key=lambda w: counts[w] / len(text))

In [5]:
def edits(word, steps=1):
    
    words = [word]
    
    for step in range(steps):
        
        new_words = []
        
        for word in words:
            letters    = 'abcdefghijklmnopqrstuvwxyz'
            splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
            deletes    = [L + R[1:]                 for L, R in splits if R]
            transposes = [L + R[1] + R[0] + R[2:]   for L, R in splits if len(R)>1]
            replaces   = [L + c + R[1:]             for L, R in splits if R for c in letters]
            inserts    = [L + c + R                 for L, R in splits for c in letters]
            
            new_words += list(set(deletes + transposes + replaces + inserts))

        words += list(set(new_words))
    
    return words

In [33]:
preprocess_texts(reviews.reviewText, lemmatizer=spacy_lemmas)

[['Helen',
  'murre',
  'be',
  'a',
  'fine',
  'actress',
  '!',
  'This',
  'series',
  'never',
  'grow',
  'old',
  '!',
  'Have',
  'enjoy',
  'it',
  'over',
  'and',
  'over',
  'again',
  '.'],
 ['Series',
  'like',
  'this',
  'I',
  'find',
  'educational',
  'as',
  'well',
  'as',
  'be',
  'entertaining',
  '.',
  ' ',
  'I',
  'have',
  'not',
  'find',
  'one',
  'episode',
  'to',
  'be',
  'boring',
  '.',
  ' ',
  'The',
  'idea',
  'be',
  'interestingly',
  'work',
  'out',
  'each',
  'week',
  'in',
  'a',
  'mere',
  '43',
  'minute',
  '.'],
 ['This',
  'tool',
  'be',
  'excellent',
  'at',
  'perform',
  'the',
  'job',
  'for',
  'which',
  'it',
  'be',
  'design',
  ':',
  'remove',
  'hose',
  'connection',
  '.',
  'I',
  'would',
  'only',
  'caution',
  'that',
  'you',
  'take',
  'care',
  'to',
  'observe',
  'the',
  'extremely',
  'SHARP',
  'point',
  '.',
  'However',
  ',',
  'the',
  '&',
  '#',
  '34;sharp',
  'point&e',
  ';',
  'be',
  'one

Probaré diferentes modelos con diferentes tipos de preprocesado y al final compararé mis resultados.

Comenzaré construyéndome un corpus que contenga tan solo el texto y una categoría (positivo, negativo y neutro).

In [None]:
reviews = reviews[['reviewText','overall']]

for ix, processed_text in zip(range(len(reviews)), preprocess_texts(reviews.reviewText)):
    reviews.reviewText[ix] = " ".join(processed_text)
    
reviews.overall = reviews.overall.apply(lambda x: 'pos' if x > 3 else 'neg' if x < 3 else 'neu')

Ahora me interesa hacerun split en train y test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    reviews.reviewText,
    reviews.overall,
    train_size=0.80,
    test_size=0.20,
    shuffle=True
)

Ahora utilizaré el tf-idf de mi corpus de entrenamiento para obtener las palabras más relevantes que detectan el sentimiento y comprobar si tiene sentido.

In [None]:
cv = TfidfVectorizer(
    max_df=0.95,
    min_df=5,
    max_features=2500,
    strip_accents='ascii',
    ngram_range=(2, 3)
)
cv.fit(X_train)

In [None]:
# Compute the chi-squared score for each word in the training set and show the values
i = 15

chi2score = chi2(X_train_, y_train)[0]
scores = list(zip(cv.get_feature_names(), chi2score))
sorted_scores = sorted(scores, key=lambda x:x[1])
topchi2 = list(zip(*sorted_scores[-i:]))
x = range(len(topchi2[1]))
labels = topchi2[0]

plt.figure(figsize=(12, 8))
plt.barh(x,topchi2[1], align='center', alpha=0.5)
plt.plot(topchi2[1], x, '-o', markersize=5, alpha=0.8)
plt.yticks(x, labels, fontsize=12)
plt.xlabel('$\chi^2$', fontsize=26)
plt.ylabel('word', fontsize=16)
plt.title('Top {} $\chi^2$ score for each word in the training set'.format(i), fontsize=20)
plt.grid()
plt.tight_layout()
plt.show();