In [1]:
# Importar librerías necesarias
import json
import pandas as pd
import numpy as np
import spacy
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import statsmodels.api as sm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator


import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')

# Configurar visualización
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\mathi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading corpus: Package 'corpus' not found in index


In [2]:
#Cargamos los csv EDA
subreddit_data = pd.read_csv('../data/subreddit_data.csv')
posts_data = pd.read_csv('../data/posts_data.csv')

In [3]:

class RedditTextProcessor:
    def __init__(self):
        # Inicializar el lematizador
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.columns_to_process = ["titlePost", "tittleSubreddit", "descriptionReddit"]

    def preprocess_text(self, text):
        # Tokenización
        text = str(text)
        tokens = word_tokenize(text.lower())
        # Lematización y filtrado de stopwords
        tokens = [
            self.lemmatizer.lemmatize(token) for token in tokens 
            if token.isalpha() and token not in self.stop_words
        ]
        # Se devuelve una cadena y no una lista de tokens para los modelos de aprendizaje
        return " ".join(tokens)
    
    def process_dataframe(self, df):
        for column in df.columns:
            if df[column].dtype == object and column  in self.columns_to_process:
                df[column] = df[column].apply(self.preprocess_text)
        return df

In [4]:
textProcessor = RedditTextProcessor()
processed_subreddit = textProcessor.process_dataframe(subreddit_data)
processed_posts = textProcessor.process_dataframe(posts_data)

In [5]:
processed_subreddit.head()

Unnamed: 0,subreddit_id,tittleSubreddit,subscribersReddit,descriptionReddit,createdReddit
0,0,home,243743,,2009-01-25 02:25:57
1,1,askreddit,48481312,place ask answer question,2008-01-25 03:52:15
2,2,nostupidquestions,4980890,ask away disclaimer anonymous forum answer may...,2013-02-02 08:52:24
3,3,,2364982,community baldur gate iii video game larian st...,2019-05-30 14:35:38
4,4,facepalm,8265075,please sir,2009-08-28 08:49:50


In [6]:
processed_posts.head()

Unnamed: 0,subreddit_id,titlePost,createdPost,scorePost,upVotedRatio,upVotes,commentsPost
0,0,price range house like,2023-08-18 10:17:43,11633,0.92,11633,1727
1,0,door garage,2023-05-18 16:41:57,6026,0.96,6026,1700
2,0,fix garage door torsion spring,2023-08-26 19:44:33,5581,0.86,5581,3644
3,0,call type design,2023-08-21 17:37:07,2824,0.96,2824,579
4,0,inspector said termite damage year old house m...,2023-07-28 17:40:05,2692,0.94,2692,952
