## Flujo de Trabajo en NLP

<img src="../_src/PLN_Flujo_Trabajo.jpg" height="300"><br>

In [1]:
import re
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [27]:
# Download the NLTK resources this notebook needs (uncomment on the first run)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')  # needed by nltk.corpus.stopwords.words('english') further down

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hernán\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# Load the cleaned reviews.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (the previous '..\\' form only resolves on Windows).
df_reviews = pd.read_parquet('../datasets/2. Depurado/user_reviews_depurado.parquet')
# df_reviews = df_reviews.iloc[:1000]  # uncomment to work on a small subset while testing

- Se cargan las Stopwords en Inglés y se instancia el lemmatizer.

In [26]:
# English stopwords, stored as a frozenset so that the per-token membership
# test in process_review is a hash lookup instead of a linear scan of a list.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))
# Single shared lemmatizer instance, reused for every review.
wordnet_lemmatizer = WordNetLemmatizer()

- Se elabora la función para mapeo de tipo de palabra para Lematización

In [28]:
# The tagger labels each word with a part-of-speech tag. This helper translates
# the first letter of that tag into the POS constants 'wordnet_lemmatizer' accepts.
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Map the NLTK part-of-speech tag of a single word to a WordNet POS constant.

    The word is tagged on its own (without sentence context), so the result
    depends only on ``word``. That makes it safe to memoize: the same token
    appears in many reviews and the tagger is only run once per distinct word.

    Parameters
    ----------
    word : str
        Token to tag.

    Returns
    -------
    One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
    ``wordnet.ADV``; ``wordnet.NOUN`` when the tag's first letter is not
    J, N, V or R.
    """
    # pos_tag returns [(word, tag)]; keep only the first letter of the tag.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

- Se realiza el proceso de limpieza con expresiones regulares, tokenización, eliminación de stopwords y lematización

In [29]:
def process_review(review):
    """Clean, tokenize and lemmatize a single review.

    Steps, in order: strip URLs, replace every character that is not an ASCII
    letter with a space, lowercase, tokenize, drop stopwords, and lemmatize
    each remaining token using the POS returned by ``get_wordnet_pos``.

    Parameters
    ----------
    review : str or missing value
        Raw review text. Non-string values are converted with ``str``.

    Returns
    -------
    list of str
        Lemmatized tokens. An empty list when the review is missing.
    """
    if pd.isna(review):
        return []  # Missing review

    # Remove URLs. This must run before the letters-only filter below,
    # otherwise 'http', 'www', 'com', ... would survive as tokens.
    # (The original line read and assigned an undefined name `text`, which
    # raised UnboundLocalError and never fed the result into the next steps.)
    review = re.sub(r'http\S+', '', str(review))

    # Keep letters only and lowercase
    review = re.sub("[^a-zA-Z]", " ", review).lower()

    # Tokenize the cleaned text
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(review)

    # Drop stopwords
    tokens = [word for word in tokens if word not in stopwords]

    # Lemmatize using the part of speech detected for each token
    return [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]


# Run process_review on every value of the "review" column and store the
# resulting token lists in a new "processed_reviews" column
df_reviews["processed_reviews"] = df_reviews["review"].apply(process_review)


- Utilizamos la librería TextBlob para la categorización de las reviews.

In [19]:
from textblob import TextBlob

In [20]:
# Sentiment classification helper
def classify_sentiment(review):
    """Label a tokenized review as 'Positivo', 'Negativo' or 'Neutro'.

    Parameters
    ----------
    review : iterable of str
        Tokens of one review; they are joined with single spaces before
        being handed to TextBlob.

    Returns
    -------
    str
        'Positivo' when the TextBlob polarity is above zero, 'Negativo' when
        it is below zero, and 'Neutro' otherwise (including when the joined
        text is empty).
    """
    joined = " ".join(review)

    # Nothing to analyze: treat as neutral without invoking TextBlob
    if len(joined) == 0:
        return 'Neutro'

    score = TextBlob(joined).sentiment.polarity

    if score > 0:
        return 'Positivo'
    if score < 0:
        return 'Negativo'
    return 'Neutro'

# Classify every review from its processed tokens (the "processed_reviews" column)
df_reviews['sentiment'] = df_reviews['processed_reviews'].apply(classify_sentiment)

# Encode the label as '0' / '1' / '2' (note: the values are stored as strings)
df_reviews['sentiment_value'] = df_reviews['sentiment'].map({'Negativo': '0', 'Neutro': '1', 'Positivo':'2'})

# Save the result for later use.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (the previous '..\\' form only resolves on Windows).
df_reviews.to_parquet('../datasets/2. Depurado/user_reviews_NLP_TextBlob.parquet', index=False)


- Breve análisis de resultados

In [2]:
# Reload the reviews together with their sentiment labels.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (the previous '..\\' form only resolves on Windows).
df_reviews = pd.read_parquet('../datasets/2. Depurado/user_reviews_NLP_TextBlob.parquet')

In [25]:
# First 10 reviews whose "recommend" flag equals False
df_reviews.loc[df_reviews['recommend'].eq(False)].head(10)

Unnamed: 0,user_id,user_url,item_id,posted,helpful,recommend,funny,review,processed_reviews,sentiment,sentiment_value
48,76561198043472122,http://steamcommunity.com/profiles/76561198043...,33440,"Posted December 19, 2014.",1 of 3 people (33%) found this review helpful,False,,This Game Doesn't Work,"[game, work]",Negativo,0
62,76561198066046412,http://steamcommunity.com/profiles/76561198066...,359320,"Posted December 28, 2015.",4 of 5 people (80%) found this review helpful,False,,"♥♥♥♥♥ charged me 80 now its 15 dollars, got bo...","[charge, dollar, get, boring, hour]",Negativo,0
68,76561198070565427,http://steamcommunity.com/profiles/76561198070...,570,"Posted June 27, 2014.",No ratings yet,False,,"w,",[w],Neutro,1
84,boydeer,http://steamcommunity.com/id/boydeer,383080,"Posted August 24, 2015.",4 of 7 people (57%) found this review helpful,False,,เกมเเดกเงินดีๆนี้เอง,[],Neutro,1
122,sandwiches1,http://steamcommunity.com/id/sandwiches1,417860,"Posted November 23, 2015.",No ratings yet,False,,Emily is a thot,"[emily, thot]",Neutro,1
132,iamthekingofbrowntown,http://steamcommunity.com/id/iamthekingofbrown...,344760,"Posted September 12, 2015.",3 of 4 people (75%) found this review helpful,False,,Spent 3 days making a base. Man teleported in ...,"[spent, day, make, base, man, teleport, base, ...",Negativo,0
156,76561198010674657,http://steamcommunity.com/profiles/76561198010...,377160,Posted March 27.,3 of 4 people (75%) found this review helpful,False,,Where do I begin...I trusted you Bethesda. Fro...,"[begin, trust, bethesda, time, spent, fo, neve...",Positivo,2
183,Nozomikat,http://steamcommunity.com/id/Nozomikat,437220,Posted August 19.,7 of 13 people (54%) found this review helpful,False,1 person found this review funny,Very harsh to new players. Combat system is st...,"[harsh, new, player, combat, system, strange, ...",Negativo,0
186,xfluttersx,http://steamcommunity.com/id/xfluttersx,202530,Posted September 14.,1 of 1 people (100%) found this review helpful,False,,This has got to be worse than sonic 06'!I play...,"[get, bad, sonic, played, many, sonic, game, k...",Negativo,0
193,ii_voltage_ii,http://steamcommunity.com/id/ii_voltage_ii,253710,"Posted June 29, 2014.",1 of 2 people (50%) found this review helpful,False,,Is this game actually Free to Play ? I don't t...,"[game, actually, free, play, think, hunt, one,...",Negativo,0


In [26]:
# Print the full original text of the review at position 156
print(df_reviews['review'].iloc[156])
# Print the processed tokens of that same review
print(df_reviews['processed_reviews'].iloc[156])


['begin' 'trust' 'bethesda' 'time' 'spent' 'fo' 'never' 'could' 'imagine'
 'garbage' 'spew' 'depth' 'whatever' 'creative' 'hell' 'come' 'thought'
 'would' 'learn' 'obsidian' 'new' 'vega' 'depth' 'realism' 'map' 'small'
 'skyrim' 'todd' 'length' 'width' 'sense' 'little' 'thing' 'miss'
 'special' 'touch' 'charm' 'cusp' 'every' 'corner' 'turn' 'previous'
 'fallout' 'title' 'miss' 'environment' 'feel' 'hollow' 'vibrant' 'smooth'
 'fallout' 'theme' 'grow' 'crave' 'felt' 'wonder' 'trade' 'something'
 'original' 'something' 'special' 'wonky' 'shooter' 'mechanic' 'skill'
 'bar' 'karma' 'system' 'well' 'fps' 'mechanic' 'console' 'friendly'
 'interaction' 'system' 'would' 'suggest' 'digital' 'copy' 'wipe'
 'physical' 'copy' 'pile' 'garbage' 'dump' 'like' 'atari' 'e' 'extra'
 'terrestrial' 'fail' 'digress' 'game' 'serve' 'warn' 'find' 'ip'
 'significant' 'fallout' 'universe' 'like' 'say' 'broke' 'fix' 'favour'
 'buy' 'fallout' 'new' 'vega' 'instead' 'squirm' 'feel' 'regret'
 'hopelessness']
