In [4]:
!pip install num2words



In [5]:
import pandas as pd
import numpy as np
import unicodedata
import nltk
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from num2words import num2words

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
df = pd.read_csv('binomial_data.csv')
print(len(df))
df.head()

40000


Unnamed: 0,overall,reviewText,overall_label
0,1.0,Deepak Chopra would like to introduce you to J...,1
1,1.0,this games sucks. spend your time on somthing ...,1
2,1.0,Don't waste your money. Nothing here you can't...,1
3,1.0,How I wish Amazon would make their own content...,1
4,1.0,"Moderately interesting plot, but extremely poo...",1


In [7]:
def nltk_cleaner(text, tokenizer, sw_list, lemmatizer):
    """
    Limpia y normaliza un texto utilizando una serie de pasos de procesamiento:

    Parámetros:
    - text (str): El texto a limpiar y normalizar.
    - tokenizer: Un objeto tokenizer para separar palabras y eliminar signos de puntuación.
    - sw_list: Una lista de palabras de parada (stop words) que se deben eliminar del texto.
    - lemmatizer: Un lematizador para reducir las palabras a su forma base.

    Retorna:
    - str: El texto limpio y normalizado.
    """

    clean_text = list()

    # Eliminar acentos, etc
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Separar palabras eliminando signos de puntuación
    for word in tokenizer.tokenize(text):

        # Eliminar stop words
        if word not in sw_list:

        # Eliminar espacios sobrantes, convertir a minúsculas y lematizar
            clean_word = lemmatizer.lemmatize(word).lower().strip()

        # Convertir dígitos a palabras
            if clean_word.isdigit():
                clean_word = num2words(clean_word, lang='en')

            clean_text.append(clean_word)

    return ' '.join(clean_text)

In [8]:
# Se define un tokenizador, una lista de palabras de parada y un lematizador que serán utilizados en el proceso de limpieza y normalización de texto
tokenizer = RegexpTokenizer(r'\w+')
sw_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [9]:
# Se pasan todos los caracteres a str y se llama a la función de limpieza
df['reviewText'] = df['reviewText'].astype(str)
processed_reviews = df['reviewText'].apply(nltk_cleaner, tokenizer=tokenizer, sw_list=sw_list, lemmatizer=lemmatizer)


In [10]:
print('Review original: {}'.format(df['reviewText'].values[0]))
print('Review procesada: {}'.format(processed_reviews[0]))

Review original: Deepak Chopra would like to introduce you to Jesus. Not the Jesus you might know from Sunday School, or the pulpit, or from conversations with Christian friends. Chopra has in mind a different Jesus, one who embodied the highest level of enlightenment and invites his followers into union with God and indeed with all Creation. He calls his book after his name for this character: "The Third Jesus."

A practicing Christian might well object to this premise on doctrinal grounds, but you don't have to be Christian to be wary of this book. I would argue that Chopra fails, and fails repeatedly, on a more fundamental level than orthodoxy. The title of the book is a signal example. Early on, Chopra explains that there are three Jesuses: one is the historical Jesus, about whom, Chopra says, we know next to nothing. The second Jesus is the one built up by the Church for worship, including theologizing about the Trinity and the Holy Spirit. Chopras Third Jesus is the Jesus who tau

In [11]:
# Se añade la fila de reviews procesadas al df
df.loc[:, 'processedReview'] = processed_reviews

In [12]:
# Se sustituyen las columnas vacias por NaN
df['processedReview'] = df['processedReview'].replace('', np.nan)

# Se elimina los valores NaN de la columna
df = df.dropna(subset=['processedReview'])

In [13]:
print(len(df))
df.head()

# A pesaar de los NaN que se eliminan, no son suficientes para desvalancear el dataset

39983


Unnamed: 0,overall,reviewText,overall_label,processedReview
0,1.0,Deepak Chopra would like to introduce you to J...,1,deepak chopra would like introduce jesus not j...
1,1.0,this games sucks. spend your time on somthing ...,1,game suck spend time somthing else dont buy do...
2,1.0,Don't waste your money. Nothing here you can't...,1,don waste money nothing get free listening rai...
3,1.0,How I wish Amazon would make their own content...,1,how i wish amazon would make content compatibl...
4,1.0,"Moderately interesting plot, but extremely poo...",1,moderately interesting plot extremely poor cop...


In [14]:
# Guardar el DataFrame preprocesados en un archivo CSV
df.to_csv('preprocessing_data.csv', index=False)

In [15]:
# Se comprueba que se ha guardado correctamente
df = pd.read_csv('preprocessing_data.csv')

print(len(df))
df.head()

39983


Unnamed: 0,overall,reviewText,overall_label,processedReview
0,1.0,Deepak Chopra would like to introduce you to J...,1,deepak chopra would like introduce jesus not j...
1,1.0,this games sucks. spend your time on somthing ...,1,game suck spend time somthing else dont buy do...
2,1.0,Don't waste your money. Nothing here you can't...,1,don waste money nothing get free listening rai...
3,1.0,How I wish Amazon would make their own content...,1,how i wish amazon would make content compatibl...
4,1.0,"Moderately interesting plot, but extremely poo...",1,moderately interesting plot extremely poor cop...
