In [1]:
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stopword corpus (words that carry no meaning on their own).
nltk.download('stopwords')

# Shared helpers used by clean_text(): a Porter stemmer instance and the
# English stopwords held in a set so membership checks are cheap.
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

def clean_text(text, stopword_set=None, word_stemmer=None):
    """Normalize a raw review into a space-separated string of stemmed tokens.

    Steps: strip HTML tags, keep only ASCII letters, lowercase, split on
    whitespace, drop stopwords and stem whatever remains.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            the ``None``/``NaN`` pandas uses for missing values) is treated
            as an empty review.
        stopword_set: Optional collection of lowercase words to discard.
            Defaults to the module-level ``stop_words``.
        word_stemmer: Optional object exposing ``stem(word)``.
            Defaults to the module-level ``stemmer``.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when nothing
        survives the cleaning.
    """
    # Missing values are not strings and would make re.sub raise TypeError.
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words
    if word_stemmer is None:
        word_stemmer = stemmer

    # 1. Replace HTML tags (such as <br />) with a space, not an empty string,
    #    so "end<br />start" does not collapse into the single token "endstart".
    text = re.sub(r'<.*?>', ' ', text)
    # 2. Replace every character that is not an ASCII letter with a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # 3. Lowercase so the stopword lookup is case-insensitive.
    text = text.lower()
    # 4. Tokenize on whitespace.
    words = text.split()
    # 5. Drop stopwords and stem the rest, e.g. "running" -> "run".
    words = [word_stemmer.stem(w) for w in words if w not in stopword_set]

    return " ".join(words)

# Sanity-check the function on a sample sentence containing HTML tags,
# punctuation, digits and stopwords.
ejemplo = "I am <b>loving</b> this movie! It's better than running 10 miles."
print(f"Original: {ejemplo}")
ejemplo_limpio = clean_text(ejemplo)
print(f"Limpio: {ejemplo_limpio}")

Original: I am <b>loving</b> this movie! It's better than running 10 miles.
Limpio: love movi better run mile


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivanb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
# Load the raw dataset.
df = pd.read_csv('../data/raw/imdb_reviews.csv')

# Run clean_text over every review and store the result in a new column,
# leaving the original 'review' column untouched.
print("Iniciando limpieza... toma un café, esto tardará unos 2-3 minutos.")
cleaned_reviews = df['review'].apply(clean_text)
df['review_cleansed'] = cleaned_reviews
print("¡Limpieza terminada!")

# Show the raw and cleaned text side by side for the first rows.
preview = df[['review', 'review_cleansed']].head()
print(preview)

Iniciando limpieza... toma un café, esto tardará unos 2-3 minutos.
¡Limpieza terminada!
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                     review_cleansed  
0  one review mention watch oz episod hook right ...  
1  wonder littl product film techniqu unassum old...  
2  thought wonder way spend time hot summer weeke...  
3  basic famili littl boy jake think zombi closet...  
4  petter mattei love time money visual stun film...  


In [3]:
# Encode the sentiment labels as integers (positive -> 1, negative -> 0).
# Series.map() turns every value missing from the mapping into NaN, so running
# this cell a second time on the already-encoded column would erase all labels.
# Only encode while the column still holds the original text labels.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print(df['sentiment'].value_counts())

sentiment
1    25000
0    25000
Name: count, dtype: int64


In [4]:
# Persist the cleaned dataset. to_csv does not create missing parent folders,
# so make sure the output directory exists before writing.
processed_path = Path('../data/processed/imdb_limpio.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
print("Dataset limpio guardado en data/processed/")

Dataset limpio guardado en data/processed/
