# **PREPROCESAMIENTO DE TEXTOS**

## **PARA EMBEDDINGS NO CONTEXTUALES**

In [9]:
import re
import pandas as pd
from tqdm import tqdm
import spacy

csv_path = r"..\..\finnhubAPI\data\porEmpresas\definitivos\INDEX_ALL_scrapped_filtrado.csv"
df = pd.read_csv(csv_path)

Lo primero es limpiar símbolos y distintas cosas que el modelo pueda malinterpretar. Observando los textos de los articulos, hemos detectado que se cuelan algunos simbolos como ">" "<" o comillas,...Hemos tomado la decisión de eliminarlas para un mejor funcionamiento

In [10]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "textcat"])
tqdm.pandas()

STOPWORDS = set(nlp.Defaults.stop_words)
PLACEHOLDER_RE = re.compile(r"__\w+__")

URL_RE = re.compile(r"https?://\S+|www\.\S+")
HTML_TAG_RE = re.compile(r"<[^>]+>")
BOILERPLATE_RE = re.compile(
    r"(^read more:.*$|^story continues.*$|copyright\s*©.*$)",
    flags=re.IGNORECASE | re.MULTILINE,
)
CASHTAG_RE = re.compile(r"\$([A-Za-z]{1,10})\b")

In [11]:
def pre_rules(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return ""
    text = (text.replace("’", "'").replace("“", '"').replace("”", '"')
                 .replace("–", "-").replace("—", "-")).lower()
    t = BOILERPLATE_RE.sub("", text)
    t = HTML_TAG_RE.sub(" ", t)
    t = URL_RE.sub(" ", t)
    t = re.sub(CASHTAG_RE, "__TICKER__", t)

    # placeholders financieros
    t = re.sub(r'\bQ([1-4])\b', r'__QTR\1__', t, flags=re.IGNORECASE)
    t = re.sub(r'\b[+-]?\d[\d,\.]*\s*%(?=\W|$)', '__PERCENT__', t)
    t = re.sub(r'\b[+-]?\d[\d,\.]*\s*percent(?=\W|$)', '__PERCENT__', t, flags=re.IGNORECASE)
    t = re.sub(r'\b[+-]?\d[\d,\.]*\s*per\s+cent(?=\W|$)', '__PERCENT__', t, flags=re.IGNORECASE)
    t = re.sub(r'(\$|€|£)\s*\d[\d,\.]*\s*(?:bn|b|m|k)?\b', '__MONEY__', t, flags=re.IGNORECASE)
    t = re.sub(r'\b\d[\d,\.]*\s*(million|billion|trillion|bn|m)\b', '__AMOUNT__', t, flags=re.IGNORECASE)
    t = re.sub(r'\b(19|20)\d{2}\b', '__YEAR__', t)
    t = re.sub(r'\b((jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+\d{1,2})\b',
               '__DATE__', t, flags=re.IGNORECASE)
    t = re.sub(r'\b(\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*)\b',
               '__DATE__', t, flags=re.IGNORECASE)
    # números genéricos al final
    t = re.sub(r'(?<![A-Za-z_])\b\d[\d,\.]*\b(?![A-Za-z_])', '__NUM__', t)
    t = re.sub(r'\s+', ' ', t).strip()
    return t


In [12]:

def spacy_clean_strong(doc) -> str:
    out = []
    for tok in doc:
        if tok.is_punct or tok.is_space:
            continue
        # conservar placeholders tal cual
        if PLACEHOLDER_RE.fullmatch(tok.text):
            out.append(tok.text); continue
        if tok.text == "%":
            continue
        lemma = tok.lemma_.lower()
        if lemma == "percent":
            out.append("__PERCENT__"); continue
        if tok.text.lower() == "data" and lemma == "datum":
            lemma = "data"
        if lemma in STOPWORDS:
            continue
        if lemma.isalpha() and len(lemma) > 2:
            out.append(lemma)
    return " ".join(out)

In [13]:
base_col = "article_text"
assert base_col in df.columns, f"No existe la columna {base_col}"

df["text_nc_step1"] = df[base_col].apply(pre_rules)
df["text_nc"] = df["text_nc_step1"].progress_apply(lambda x: spacy_clean_strong(nlp(x)))

print(df[["article_text", "text_nc"]].head(3).to_string(max_colwidth=80))
print("\nLongitud media (tokens aprox.):",
      df["text_nc"].str.split().map(len).replace(0, pd.NA).mean())

out_path = "processData.csv"
df.to_csv(out_path, index=False)
print("Guardado:", out_path)

100%|██████████| 5160/5160 [01:59<00:00, 43.30it/s]


                                                                      article_text                                                                          text_nc
0  The UK jobs market continues to show signs of weakness, with pay growth slow...  job market continue sign weakness pay growth slowing unemployment edge highe...
1  New data from the Department from Work and Pensions on the working patterns ...  new data department work pension work pattern people age num work life long ...
2  Asking for workplace accommodations is often easier said than done. Many peo...  ask workplace accommodation easy people worry needy incompetent result conti...

Longitud media (tokens aprox.): 235.75348837209302
Guardado: processData.csv


## **PARA EMBEDDINGS CONTEXTUALES**