In [42]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np
import os

In [43]:
# Cargar datos
posts = pd.read_csv(".\\reddit_api_output\\posts.csv")
comments = pd.read_csv(".\\reddit_api_output\\comments.csv")

# Unificar columnas relevantes
df_posts = posts[['post_id', 'author', 'selftext', 'title']]
df_posts['text'] = df_posts['title'].fillna('') + ' ' + df_posts['selftext'].fillna('')

df_comments = comments[['post_id', 'author', 'body']]
df_comments = df_comments.rename(columns={'body': 'text'})

# Unificar posts + comentarios
df_all = pd.concat([df_posts[['post_id', 'author', 'text']],
                    df_comments[['post_id', 'author', 'text']]], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts['text'] = df_posts['title'].fillna('') + ' ' + df_posts['selftext'].fillna('')


In [44]:
def clean_text(text):
    text = re.sub(r"http\S+", "", str(text))         
    text = re.sub(r"[^a-zA-ZáéíóúñüÁÉÍÓÚÑÜ\s]", "", text)  
    text = re.sub(r"\s+", " ", text).strip()          
    return text.lower()

In [45]:
df_all['clean_text'] = df_all['text'].apply(clean_text)
df_all.head()

Unnamed: 0,post_id,author,text,clean_text
0,1lcftjz,u/SuperiorT,Got my mom her green card by enlisting in the ...,got my mom her green card by enlisting in the ...
1,1grmeq4,u/adepojus,Today I became a US citizen I came into United...,today i became a us citizen i came into united...
2,1gkfbph,u/Asteroids19_9,Today I became a US citizen I am a 19 year old...,today i became a us citizen i am a year old st...
3,1glflxy,u/Honest-Grape-9352,"So, what now? An immigration attorney perspect...",so what now an immigration attorney perspectiv...
4,1ltlanr,u/Ajax4557,Became a Citizen after 26 years!!,became a citizen after years


In [46]:
analyzer = SentimentIntensityAnalyzer()

df_all[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']] = df_all['clean_text'].apply(
    lambda x: pd.Series(analyzer.polarity_scores(x))
)

In [47]:
bad_words = ["idiot", "stupid", "moron", "hate", "trash", "kill", "fuck", "dumb", "loser", "shut up", "disgusting", "ugly",
             "worthless", "pathetic", "suck", "annoying", "nonsense", "fool", "jerk", "bastard", "crap", "damn"]



def pseudo_label(text):
    if any(bad in text for bad in bad_words):
        return 1
    return 0

df_all["target"] = df_all["clean_text"].apply(pseudo_label)


In [48]:
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(df_all['clean_text'])

# Concatenar features numéricos de VADER
import numpy as np
X = np.hstack((X_tfidf.toarray(), df_all[['vader_neg','vader_neu','vader_pos','vader_compound']].values))

y = df_all['target']

# Entrenamiento
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2951
           1       1.00      0.27      0.42       108

    accuracy                           0.97      3059
   macro avg       0.99      0.63      0.71      3059
weighted avg       0.97      0.97      0.97      3059

