In [1]:
import pandas as pd

In [2]:
csv_file = "data/posts/etiquetado.csv"
df_posts = pd.read_csv(csv_file)
df_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124 entries, 0 to 123
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   titulo       124 non-null    object
 1   cuerpo       124 non-null    object
 2   sentimiento  124 non-null    object
dtypes: object(3)
memory usage: 3.0+ KB


In [3]:
csv_file = "data/comentarios/etiquetado.csv"
df_comments = pd.read_csv(csv_file)
df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   208 non-null    int64 
 1   titulo       208 non-null    object
 2   cuerpo       208 non-null    object
 3   sentimiento  208 non-null    object
dtypes: int64(1), object(3)
memory usage: 6.6+ KB


In [4]:
df_comments = df_comments.drop(columns="Unnamed: 0")

<hr/>

In [5]:
# Unir los dataframes de posts y comentarios
df = pd.concat([df_posts, df_comments], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   titulo       332 non-null    object
 1   cuerpo       332 non-null    object
 2   sentimiento  332 non-null    object
dtypes: object(3)
memory usage: 7.9+ KB


In [6]:
df["sentimiento"] = df["sentimiento"].str.strip()

<hr/>

In [7]:
import pandas as pd
import numpy as np
import nltk
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import re


nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [9]:
nlp = spacy.load("es_core_news_sm")
stopwords_es = set(stopwords.words("spanish"))

In [10]:
df["texto"] = df["titulo"].fillna("") + " " + df["cuerpo"].fillna("")

In [11]:
df["sentimiento"].value_counts()

sentimiento
NEUTRAL     189
NEGATIVO    104
POSITIVO     39
Name: count, dtype: int64

In [12]:
def limpiar_texto(texto):
    texto = texto.lower()
    texto = re.sub(r'\W+', ' ', texto) # eliminar puntuación
    doc = nlp(texto)
    tokens = [token.lemma_ for token in doc if token.lemma_ not in stopwords_es and not token.is_punct and not token.is_space]
    return " ".join(tokens)

In [13]:
df["texto_limpio"] = df["texto"].apply(limpiar_texto)

In [14]:
from sklearn.utils import resample

df_min = df[df.sentimiento == "POSITIVO"]
df_med = df[df.sentimiento == "NEGATIVO"]
df_maj = df[df.sentimiento == "NEUTRAL"]

df_min_upsampled = resample(df_min, replace=True, n_samples=100, random_state=42)
df_med_upsampled = resample(df_med, replace=True, n_samples=100, random_state=42)

df_balanceado = pd.concat([df_maj, df_med_upsampled, df_min_upsampled])


X = df_balanceado["texto_limpio"]
y = df_balanceado["sentimiento"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [15]:
from sklearn.linear_model import LogisticRegression
modelo = Pipeline([
("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=5000)),
("clf", LogisticRegression(class_weight="balanced", max_iter=1000))
])
modelo.fit(X_train, y_train) 

In [16]:
y_pred = modelo.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    NEGATIVO       0.81      0.65      0.72        20
     NEUTRAL       0.75      0.87      0.80        38
    POSITIVO       0.89      0.80      0.84        20

    accuracy                           0.79        78
   macro avg       0.82      0.77      0.79        78
weighted avg       0.80      0.79      0.79        78

