In [1]:
import pandas as pd
import nltk
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

In [33]:
# Paths to the training and test CSV files.
train_path = "FinancES_train_kaggle.csv"
test_path = "FinancES_test_kaggle.csv"

# Read both CSV files into DataFrames.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Summarise what was loaded. DataFrame.info() writes its report to stdout and
# returns None, so it is called directly instead of being wrapped in print(),
# which would append a stray "None" line to the output.
print("Train Dataset:")
train_df.info()
print(train_df.head())

print("\nTest Dataset:")
test_df.info()
print(test_df.head())

Train Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6359 entries, 0 to 6358
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6359 non-null   int64 
 1   text    6359 non-null   object
 2   label   6359 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 149.2+ KB
None
   id                                               text  label
0   0  Renfe afronta mañana un nuevo día de paros par...      2
1   1       Presupuesto populista con cimientos frágiles      2
2   2  Biden no cree que la OPEP+ vaya a ayudar con l...      2
3   3  La deuda de las familias cae en 25.000 millone...      0
4   4  Bestinver: no hay "momento más inoportuno" par...      2

Test Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621 entries, 0 to 1620
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1621 non-null   int64 
 1   text    1621 non-null   

In [34]:
# Normalisation: lower-case the "text" column of both splits
# (training split first, then the test split).
for frame in (train_df, test_df):
    frame["text"] = frame["text"].str.lower()

In [35]:
from nltk.stem import SnowballStemmer
from collections import Counter

# Fetch the NLTK resources this cell relies on, in a fixed order.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Corpus-wide term counts over whitespace-separated tokens (kept for inspection).
all_words = " ".join(train_df["text"]).split()
word_freq = Counter(all_words)

# Document frequency: each word is counted at most once per document, so the
# threshold below really means "appears in more than 80% of the documents".
# Comparing the total number of occurrences against the number of documents
# would also flag words that are merely repeated inside fewer documents.
max_doc_fraction = 0.8
doc_freq = Counter()
for document in train_df["text"]:
    doc_freq.update(set(document.split()))

# Words present in more than `max_doc_fraction` of the training documents.
doc_threshold = max_doc_fraction * len(train_df)
frequent_words = {word for word, n_docs in doc_freq.items() if n_docs > doc_threshold}

# Snowball stemmer for Spanish, used by the preprocessing function.
stemmer = SnowballStemmer("spanish")

# Base stop-word list: NLTK's Spanish stop words. The 'stopwords' corpus is
# already downloaded at the top of this cell, so it is not fetched a second time.
custom_stopwords = set(stopwords.words('spanish'))

# Extra words treated as uninformative for financial headlines.
custom_stopwords.update({"día", "años", "mes", "nuevo", "euros"})

# Also drop the words flagged as too frequent in the training set.
custom_stopwords.update(frequent_words)

# Preprocessing function: tokenise, strip punctuation, drop stop words, stem.
def preprocess_text(text):
    """Return `text` as a space-separated string of stemmed tokens.

    Steps, in order:
      1. Tokenise with NLTK using the Spanish Punkt model.
      2. Remove punctuation/special characters from each token, except when
         the character is adjacent to a digit (so "25.000" is kept intact).
      3. Discard tokens that became empty and tokens in `custom_stopwords`.
      4. Stem the remaining tokens with `stemmer`.

    Args:
        text: The string to preprocess.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # The default tokenizer language is English; the texts are Spanish.
    tokens = word_tokenize(text, language="spanish")

    # Strip punctuation that is not attached to a digit on either side.
    cleaned = (re.sub(r"(?<!\d)[^\w\s](?!\d)", "", token) for token in tokens)

    # Pure-punctuation tokens are now empty strings; dropping them here avoids
    # runs of consecutive spaces in the joined result.
    kept = [token for token in cleaned if token and token not in custom_stopwords]

    return " ".join(stemmer.stem(token) for token in kept)

# Run the preprocessing over the "text" column of both splits, overwriting it
# in place (training split first, then the test split).
for frame in (train_df, test_df):
    frame["text"] = frame["text"].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Preview the first rows of the training frame under a heading.
print("\nTexto preprocesado:", train_df.head(), sep="\n")


Texto preprocesado:
   id                                               text  label
0   0               renf afront mañan par parcial maquin      2
1   1                    presupuest popul cimient fragil      2
2   2                bid cre opep vay ayud preci petrole      2
3   3  deud famili cae 25.000 millon 2015 marc nivel ...      0
4   4                 bestinv   moment inoportun  brexit      2


## FastText


In [8]:
! pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313470 sha256=454140a7f9063565d828bd56792ad911247a472979203108fcf70ba3ea7db925
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [36]:
# Convert the sentiment labels to fastText's "__label__<class>" format.
# Any prefix left by a previous run of this cell is stripped first, so running
# the cell again yields the same labels instead of mapping them all to NaN.
label_prefix = "__label__"
bare_labels = train_df['label'].astype(str).str.replace(label_prefix, "", regex=False)
train_df['label'] = label_prefix + bare_labels

print(train_df.head())

# Hold out 20% of the labelled rows for evaluation (fixed seed for a
# reproducible split); the rows not sampled form the evaluation set.
train_frac = 0.8
train_data = train_df.sample(frac=train_frac, random_state=42)
test_data = train_df.drop(train_data.index)

# Write both splits as "<label>\t<text>" lines, without header, index or quoting
# (quoting=3 is csv.QUOTE_NONE).
train_data[['label','text']].to_csv("train.txt", index=False, sep='\t', header=False, quoting=3, escapechar='\\')
test_data[['label','text']].to_csv("test.txt", index=False, sep='\t', header=False, quoting=3, escapechar='\\')

   id                                               text       label
0   0               renf afront mañan par parcial maquin  __label__2
1   1                    presupuest popul cimient fragil  __label__2
2   2                bid cre opep vay ayud preci petrole  __label__2
3   3  deud famili cae 25.000 millon 2015 marc nivel ...  __label__0
4   4                 bestinv   moment inoportun  brexit  __label__2


In [37]:
import fasttext

# Hyper-parameters for the supervised fastText classifier.
train_params = dict(
    input="train.txt",
    epoch=100,
    lr=0.5,
    wordNgrams=2,
    verbose=2,
    minCount=1,
)

# Train the model on the exported training file.
model = fasttext.train_supervised(**train_params)

# Persist the trained model so it can be reloaded later.
model.save_model("fasttext_sentiment.bin")


In [38]:
# Evaluate the model on the held-out file; the second element of the returned
# tuple is reported as the precision, in percent.
result = model.test("test.txt")
precision = result[1]
print(f"Precisión: {precision * 100:.2f}%")

Precisión: 67.30%


In [39]:
from sklearn.metrics import f1_score


# Reload the trained model from disk.
model = fasttext.load_model("fasttext_sentiment.bin")

# Read the held-out split back in. dtype=str together with keep_default_na=False
# keeps every text as a string: by default pandas converts empty fields and
# tokens such as "null", "nan" or "NA" to float NaN, which is not a valid input
# for model.predict. quoting=3 is csv.QUOTE_NONE, matching how the file was written.
test_data = pd.read_csv(
    "test.txt",
    sep='\t',
    header=None,
    names=["label", "text"],
    quoting=3,
    dtype=str,
    keep_default_na=False,
)

# Strip the "__label__" prefix from the true labels and make them numeric.
test_data['label'] = (
    test_data['label'].str.replace("__label__", "", regex=False).astype(int)
)

# Predict the top label for each text and convert it to an integer class id.
predictions = [
    int(model.predict(text)[0][0].replace("__label__", ""))
    for text in test_data["text"]
]

# Macro-averaged F1 over the classes.
f1 = f1_score(test_data['label'], predictions, average='macro')
print(f"F1-score: {f1:.4f}")


F1-score: 0.6075
