# Baseline - preparación de los datos

In [1]:
import pandas as pd
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Location of the raw comment export, relative to this notebook.
COMMENTS_CSV = "../data/comments.csv"
comments = pd.read_csv(COMMENTS_CSV)

In [3]:
# Report the frame's (rows, columns) and preview the first three records.
row_count, column_count = comments.shape
print((row_count, column_count))
comments.head(n=3)

(11113, 10)


Unnamed: 0,post_id,comment_id,author,created,score,body,vader_compound,sentiment_label,agrees,disagrees
0,1nvfumo,nh892fi,u/MuptonBossman,2025-10-01T18:22:39+00:00,2836,I remember watching Jane Goodall documentaries...,0.8748,pos,0,0
1,1nvfumo,nh87tvb,u/clownus,2025-10-01T18:16:40+00:00,3594,Highly suggest people check out podcast or tal...,0.6335,pos,0,0
2,1nvfumo,nh89xsl,u/Renegadeforever2024,2025-10-01T18:26:49+00:00,432,One of one,0.0,neu,0,0


In [4]:
def clean_text(text):
    """Normalize a raw comment body into lowercase plain text.

    Steps, in order: lowercase; remove URLs (``http...`` / ``www...``);
    remove ``u/<name>`` and ``r/<name>`` mentions; replace newline runs with a
    single space; delete every character that is not an ASCII letter, a digit,
    whitespace, or one of ``áéíóúüñ``; collapse whitespace runs and trim.

    Parameters
    ----------
    text : object
        Raw comment body. Any value that is not a ``str`` (for example a
        missing value) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    # The \b anchor requires the "u"/"r" prefix to start a word. Without it the
    # patterns also matched inside ordinary words followed by a slash
    # ("either/or" -> "eithe", "menu/help" -> "men"). "/u/name" and "/r/name"
    # still match because "/" is not a word character.
    text = re.sub(r"\bu/\w+", "", text)
    text = re.sub(r"\br/\w+", "", text)
    text = re.sub(r"\n+", " ", text)
    # Punctuation is deleted (not replaced by a space), so "don't" -> "dont".
    text = re.sub(r"[^a-zA-Záéíóúüñ0-9\s]", "", text)
    text = re.sub(r"\s{2,}", " ", text).strip()
    return text


In [5]:
# Run every raw body through clean_text and store the result next to it.
comments["clean_body"] = comments["body"].map(clean_text)

# Side-by-side preview of the raw and cleaned text for the first three rows.
comments.loc[:, ["body", "clean_body"]].head(n=3)

Unnamed: 0,body,clean_body
0,I remember watching Jane Goodall documentaries...,i remember watching jane goodall documentaries...
1,Highly suggest people check out podcast or tal...,highly suggest people check out podcast or tal...
2,One of one,one of one


In [6]:
# Drop rows without usable text or label first, and de-duplicate last.
# drop_duplicates keeps only the first copy of each clean_body; running it
# before dropna would lose a text entirely whenever that first copy has no
# sentiment_label, even if a later copy is labelled.
comments = (
    comments
    .dropna(subset=["clean_body", "sentiment_label"])
    # Keep only bodies longer than 3 characters after cleaning.
    .loc[lambda frame: frame["clean_body"].str.len() > 3]
    .drop_duplicates(subset=["clean_body"])
    # Explicit copy: columns are added to this frame later, and writing to a
    # filtered selection can raise SettingWithCopyWarning.
    .copy()
)

print(comments.shape)

(10794, 11)


In [None]:
# Download the NLTK resources this notebook relies on. NLTK reports and skips
# packages that are already up to date.
for nltk_resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()
# Stored as a set so the membership test in preprocess_tokens is cheap.
# NOTE(review): clean_text deletes apostrophes, so any stop word that contains
# one (e.g. contractions, if the NLTK list includes them) cannot match a
# cleaned token -- confirm whether that matters for the baseline.
stop_words = set(stopwords.words("english"))

def preprocess_tokens(text):
    """Remove stop words from ``text`` and lemmatize the remaining tokens.

    The text is split on whitespace; tokens found in the module-level
    ``stop_words`` set are dropped, and every other token is passed through
    the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Text to process (must support ``.split()``).

    Returns
    -------
    str
        The surviving, lemmatized tokens joined by single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Apply stop-word removal and lemmatization to every cleaned comment, then
# preview the cleaned and processed text for the first three rows.
comments["processed_text"] = comments["clean_body"].map(preprocess_tokens)
comments.loc[:, ["clean_body", "processed_text"]].head(n=3)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/melissa/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/melissa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/melissa/nltk_data...


Unnamed: 0,clean_body,processed_text
0,i remember watching jane goodall documentaries...,remember watching jane goodall documentary ele...
1,highly suggest people check out podcast or tal...,highly suggest people check podcast talk one i...
2,one of one,one one
