# PREPROCESAMIENTO DE TODO

## PARA MÉTODOS TRADICIONALES(TF-IDF)

### PRUEBA CON SPACY

In [None]:
import spacy
from tqdm import tqdm
import re
import en_core_web_sm
from spacy.lang.en import English
import pandas as pd

In [3]:
csv_path = r"..\..\finnhubAPI\data\porEmpresas\definitivos\AAPL_scrapped_filtrado.csv"
df = pd.read_csv(csv_path)

texto = df["article_text"].dropna().iloc[0]
print(texto)


The UK jobs market continues to show signs of weakness, with pay growth slowing and unemployment edging higher ahead of the autumn budget next month.
The latest data from the Office for National Statistics (ONS), released on Tuesday, showed that annual wage growth excluding bonuses in the three months to August was 4.7%, down slightly from 4.8% between May and July.
The unemployment rate came in at 4.8% for the period, slightly higher than the 4.7% recorded for the previous three months.
The number of employees on the payroll in the year to August was estimated to have fallen by 93,000, though it increased by 10,000 between July and August.
Early estimates for the number of payrolled employees in September suggested a fall of 100,000 on the year and 10,000 on a monthly basis, though the ONS said this was likely to be revised when more data is received next month.
The estimated number of job vacancies fell by 9,000 from the previous three months to 717,000 in July to September.
Read mor

In [4]:
nlp = spacy.load("en_core_web_sm")
english_stop_words = nlp.Defaults.stop_words

df['processed_text'] = ''

### En nuestro caso, vamos a aplicar normalización cuantitativa, visto en papers distintos

pillar valores como, $, million, trillion,... Como marcas de dinero. Es decir como MoneyValue

Lo mismo para years, vamos a coger cifras de 4 en cuatro.

In [None]:
def procesarDatos(text):
    text = re.sub(
    r'(\$|€|£)\s*\d[\d,\.]*\s*(bn|b|m|k)?'
    r'|(\d[\d,\.]*\s*(million|billion|trillion|m|bn))'
    r'|(\d[\d,\.]*\s*(percent|%))',
    flags=re.IGNORECASE
    )   
    
    text = re.sub(r'\b\d{4}\b', '<YEAR>', text)
    doc = nlp(text)

    tokens = []


    for token in doc:
        lemma = token.lemma_.lower()

        if not token.is_punct and not token.is_space:
            if lemma not in english_stop_words:
                tokens.append(lemma)
    return " ".join(tokens)


### Aplicarlo al dataset (Prueba)

In [None]:
import re
import pandas as pd
from tqdm import tqdm
import spacy

nlp = spacy.load("en_core_web_sm")
tqdm.pandas()

csv_path = r"..\..\finnhubAPI\data\porEmpresas\definitivos\INDEX_ALL_scrapped_filtrado.csv"
df = pd.read_csv(csv_path)

OUT_COL = "preprocessed_text"
if OUT_COL not in df.columns:
    df[OUT_COL] = ""


In [3]:
english_stop_words = nlp.Defaults.stop_words


BOILERPLATE_PATTERNS = [
    r'^read more:.*$',              # líneas de "Read more:"
    r'^story continues.*$',         # otros ganchos
    r'copyright\s+©.*$',
]
BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), flags=re.IGNORECASE | re.MULTILINE)

def procesarDatos(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return ""

    # 1) Limpia boilerplate
    text = BOILERPLATE_RE.sub("", text)

    text = re.sub(r'\bQ([1-4])\b', r'__QTR\1__', text, flags=re.IGNORECASE)

    text = re.sub(r'\b\d[\d,\.]*\s*%\b', '__PERCENT__', text)
    text = re.sub(r'\b\d[\d,\.]*\s*percent\b', '__PERCENT__', text, flags=re.IGNORECASE)

    text = re.sub(r'(\$|€|£)\s*\d[\d,\.]*\s*(bn|b|m|k)?\b', '__MONEY__', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d[\d,\.]*\s*(million|billion|trillion|bn|m)\b', '__AMOUNT__', text, flags=re.IGNORECASE)


    text = re.sub(r'\b(19|20)\d{2}\b', '__YEAR__', text)
    text = re.sub(r'\b((jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+\d{1,2})\b','__DATE__', text)
    text = re.sub(r'\b(\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*)\b','__DATE__', text)
    text = re.sub(r'(?<![A-Za-z_])\b\d[\d,\.]*\b(?![A-Za-z_])', '__NUM__', text)

    doc = nlp(text)

    out = []
    for tok in doc:
        if tok.is_punct or tok.is_space:
            continue
        lemma = tok.lemma_.lower()

        # 3) Correcciones de lema útiles en finanzas
        if tok.text.lower() == "data" and lemma == "datum":
            lemma = "data"
        if tok.text.lower() == "percent":
            lemma = "%"

        # stopwords
        if lemma in nlp.Defaults.stop_words:
            continue

        out.append(lemma)
    return " ".join(out)


In [5]:
mask_valid = df['article_text'].notna() & df['article_text'].astype(str).str.len().gt(0)
df.loc[mask_valid, OUT_COL] = df.loc[mask_valid, 'article_text'].progress_apply(procesarDatos)


100%|██████████| 5160/5160 [04:45<00:00, 18.04it/s]


In [6]:

first_original_text = df.loc[mask_valid, "article_text"].iloc[0]
first_preprocessed_text = df.loc[mask_valid, OUT_COL].iloc[0]

print("TEXTO ORIGINAL DE LA PRIMERA NOTICIA:")
print(first_original_text)

print("TEXTO PREPROCESADO:")
print(first_preprocessed_text)

print("PRUEBA")
doc_test = nlp(first_original_text)
verification_data = [(w.text, w.pos_, w.lemma_) for w in doc_test][:10]
print(verification_data)
print("No hay artículos válidos en 'article_text'.")


TEXTO ORIGINAL DE LA PRIMERA NOTICIA:
The UK jobs market continues to show signs of weakness, with pay growth slowing and unemployment edging higher ahead of the autumn budget next month.
The latest data from the Office for National Statistics (ONS), released on Tuesday, showed that annual wage growth excluding bonuses in the three months to August was 4.7%, down slightly from 4.8% between May and July.
The unemployment rate came in at 4.8% for the period, slightly higher than the 4.7% recorded for the previous three months.
The number of employees on the payroll in the year to August was estimated to have fallen by 93,000, though it increased by 10,000 between July and August.
Early estimates for the number of payrolled employees in September suggested a fall of 100,000 on the year and 10,000 on a monthly basis, though the ONS said this was likely to be revised when more data is received next month.
The estimated number of job vacancies fell by 9,000 from the previous three months to 

# **TF-IDF**

## PREPROCESADO

In [None]:
import re
import pandas as pd
from tqdm import tqdm
import spacy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "textcat"])
tqdm.pandas()

csv_path = r"..\..\finnhubAPI\data\porEmpresas\definitivos\INDEX_ALL_scrapped_filtrado.csv"
df = pd.read_csv(csv_path)

OUT_COL = "preprocessed_text"
if OUT_COL not in df.columns:
    df[OUT_COL] = ""

STOPWORDS = set(nlp.Defaults.stop_words)
# Si quieres conservar la negación:
# STOPWORDS.discard("not")

# ======= Regex de limpieza/placeholder =======
URL_RE = re.compile(r"https?://\S+|www\.\S+")
HTML_TAG_RE = re.compile(r"<[^>]+>")
BOILERPLATE_RE = re.compile(
    r"(^read more:.*$|^story continues.*$|copyright\s*©.*$)",
    flags=re.IGNORECASE | re.MULTILINE,
)
CASHTAG_RE = re.compile(r"\$[A-Za-z]{1,10}\b")
PLACEHOLDER_RE = re.compile(r"__\w+__")  # detecta __PERCENT__, __NUM__, etc.

def pre_rules(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return ""

    # Normalizaciones unicode ligeras (evita tokens raros)
    text = (text.replace("’", "'")
                .replace("“", '"').replace("”", '"')
                .replace("–", "-").replace("—", "-"))

    t = BOILERPLATE_RE.sub("", text)
    t = HTML_TAG_RE.sub(" ", t)
    t = URL_RE.sub(" ", t)

    # Cashtags/tickers ($AAPL) -> __TICKER__
    t = CASHTAG_RE.sub("__TICKER__", t)

    # === Placeholders financieros (¡orden importa!) ===
    # Q1..Q4
    t = re.sub(r'\bQ([1-4])\b', r'__QTR\1__', t, flags=re.IGNORECASE)

    # % con posible signo y puntuación detrás (4.7%, -3.2% etc.)
    t = re.sub(r'\b[+-]?\d[\d,\.]*\s*%(?=\W|$)', '__PERCENT__', t)

    # "percent" y "per cent" (con puntuación detrás)
    t = re.sub(r'\b[+-]?\d[\d,\.]*\s*percent(?=\W|$)', '__PERCENT__', t, flags=re.IGNORECASE)
    t = re.sub(r'\b[+-]?\d[\d,\.]*\s*per\s+cent(?=\W|$)', '__PERCENT__', t, flags=re.IGNORECASE)

    # Dinero: símbolo + cantidad + sufijo (bn/b/m/k)
    t = re.sub(r'(\$|€|£)\s*\d[\d,\.]*\s*(?:bn|b|m|k)?\b', '__MONEY__', t, flags=re.IGNORECASE)

    # Cantidades con sufijo textual: million/billion/...
    t = re.sub(r'\b\d[\d,\.]*\s*(million|billion|trillion|bn|m)\b',
               '__AMOUNT__', t, flags=re.IGNORECASE)

    # Año tipo 2010…2099
    t = re.sub(r'\b(19|20)\d{2}\b', '__YEAR__', t)

    # Fechas "Nov 26" o "26 Nov"
    t = re.sub(r'\b((jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+\d{1,2})\b',
               '__DATE__', t, flags=re.IGNORECASE)
    t = re.sub(r'\b(\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*)\b',
               '__DATE__', t, flags=re.IGNORECASE)

    # Números genéricos (al final, para no pisar los anteriores)
    t = re.sub(r'(?<![A-Za-z_])\b\d[\d,\.]*\b(?![A-Za-z_])', '__NUM__', t)

    # Compacta espacios
    t = re.sub(r'\s+', ' ', t).strip()
    return t

def spacy_clean(doc) -> str:
    out = []
    for tok in doc:
        if tok.is_punct or tok.is_space:
            continue
        if PLACEHOLDER_RE.fullmatch(tok.text):
            out.append(tok.text)
            continue

        # 2) Descarta % sueltos
        if tok.text == "%":
            continue

        # 3) Lema normal
        lemma = tok.lemma_.lower()

        # 4) Correcciones semánticas
        if lemma == "percent":
            out.append("__PERCENT__")
            continue
        if tok.text.lower() == "data" and lemma == "datum":
            lemma = "data"

        # 5) Stopwords tras lematizar
        if lemma in STOPWORDS:
            continue

        out.append(lemma)
    return " ".join(out)



In [25]:
mask_valid = df['article_text'].notna() & df['article_text'].astype(str).str.len().gt(0)
texts = df.loc[mask_valid, 'article_text'].astype(str).map(pre_rules).tolist()

print(f"Procesando {len(texts)} textos con spaCy...")
processed = []
for doc in tqdm(nlp.pipe(texts, batch_size=1000, n_process=2), total=len(texts)):
    processed.append(spacy_clean(doc))

df.loc[mask_valid, OUT_COL] = processed
print("Ya está:", OUT_COL)


Procesando 5160 textos con spaCy...


100%|██████████| 5160/5160 [01:57<00:00, 44.07it/s] 


Ya está: preprocessed_text


In [26]:

if mask_valid.any():
    first_original_text = df.loc[mask_valid, "article_text"].iloc[0]
    first_preprocessed_text = df.loc[mask_valid, OUT_COL].iloc[0]

    print("\n============================") 
    print("Texto original") 
    print("============================\n")
    print(first_original_text, "...\n")
    print("\n============================") 
    print("Ver Prueba de procesado") 
    print("============================\n")
    print(first_preprocessed_text, "...\n")

    doc_test = nlp(first_original_text)
    verification_data = [(w.text, w.pos_, w.lemma_) for w in doc_test]

    print(f"{len(verification_data)}")

    print(verification_data)


Texto original

The UK jobs market continues to show signs of weakness, with pay growth slowing and unemployment edging higher ahead of the autumn budget next month.
The latest data from the Office for National Statistics (ONS), released on Tuesday, showed that annual wage growth excluding bonuses in the three months to August was 4.7%, down slightly from 4.8% between May and July.
The unemployment rate came in at 4.8% for the period, slightly higher than the 4.7% recorded for the previous three months.
The number of employees on the payroll in the year to August was estimated to have fallen by 93,000, though it increased by 10,000 between July and August.
Early estimates for the number of payrolled employees in September suggested a fall of 100,000 on the year and 10,000 on a monthly basis, though the ONS said this was likely to be revised when more data is received next month.
The estimated number of job vacancies fell by 9,000 from the previous three months to 717,000 in July to Se

### Guardar el dataset procesado para no runear todo el rato

In [27]:
save_path = r"..\..\finnhubAPI\data\procesadosINDEX_ALL_preprocessed.csv"
df.to_csv(save_path, index=False, encoding="utf-8")

## MONTAR EMBEDDING

In [30]:
# === TF-IDF: construcción y exportación de artefactos ===
import json, numpy as np, pandas as pd
from pathlib import Path
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump

TEXT_COL = "preprocessed_text"    # tu columna ya procesada
SAVE_DIR = Path("embeddings_tfidf")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

TFIDF_CFG = {
    "ngram_range": (1, 2),        # unigrams + bigrams
    "min_df": 3,
    "max_df": 0.90,
    "max_features": 120_000,      # sube/baja según RAM
    "norm": "l2",
    "use_idf": True,
    "smooth_idf": True,
    "sublinear_tf": False,
    "dtype": np.float32,
}

df_embed = df[[TEXT_COL]].dropna()
df_embed = df_embed[df_embed[TEXT_COL].astype(str).str.strip().astype(bool)]
texts = df_embed[TEXT_COL].astype(str).tolist()

vec = TfidfVectorizer(**TFIDF_CFG)
X = vec.fit_transform(texts)                # <--- ESTE es tu embedding (sparse)

print("Shape TF-IDF:", X.shape)             # (n_docs, n_features)
print("Tamaño del vocabulario:", len(vec.vocabulary_))

# Guarda matriz y artefactos
sparse.save_npz(SAVE_DIR / "tfidf_X.npz", X)
dump(vec, SAVE_DIR / "tfidf_vectorizer.joblib")

# Vocabulario e IDF (útil para inspección/reproducibilidad)
terms = vec.get_feature_names_out()
idf = vec.idf_
idf_df = pd.DataFrame({"term": terms, "idf": idf}).sort_values("idf", ascending=False)
idf_df.to_csv(SAVE_DIR / "tfidf_idf.csv", index=False)

# ⚙️ Crear versión JSON serializable del config
TFIDF_CFG_JSON = dict(TFIDF_CFG)
TFIDF_CFG_JSON["dtype"] = np.dtype(TFIDF_CFG["dtype"]).name  # convierte float32 en "float32"
TFIDF_CFG_JSON["ngram_range"] = list(TFIDF_CFG["ngram_range"])  # convierte tuple → list

with open(SAVE_DIR / "tfidf_config.json", "w", encoding="utf-8") as f:
    json.dump(TFIDF_CFG_JSON, f, indent=2)

print("✅ Guardado en:", SAVE_DIR.resolve())



Shape TF-IDF: (5160, 104149)
Tamaño del vocabulario: 104149
✅ Guardado en: C:\Users\mpsua\OneDrive\Escritorio\ud\CUARTO\Primer_Cuatri\PLN\Proyecto\FinTracker\data_processing\procesamiento\preprocesamiento\embeddings_tfidf


## EVALUACIÓN TRADICIONAL

In [32]:
# === Evaluación exploratoria del embedding TF-IDF ===
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

n_docs, n_features = X.shape
sparsity = X.nnz / (n_docs * n_features)  # proporción de elementos distintos de 0
print(f"📘 Shape TF-IDF: {X.shape} (docs x vocab)")
print(f"📊 Sparsity: {sparsity:.6f}  ({sparsity*100:.4f}% de celdas no nulas)")
print(f"🔠 Tamaño del vocabulario: {len(vec.vocabulary_):,}")

# --- 2️⃣ Términos más comunes / más raros ---
terms = np.array(vec.get_feature_names_out())
idf = vec.idf_

common_terms = terms[np.argsort(idf)[:10]]
rare_terms   = terms[np.argsort(-idf)[:10]]

print("\n🔥 10 términos más comunes (bajo IDF):")
print(", ".join(common_terms))
print("\n🧊 10 términos más raros (alto IDF):")
print(", ".join(rare_terms))

# --- 3️⃣ Top términos de un documento específico ---
def top_terms_for_doc(i, vec, X, n=10):
    row = X[i].toarray().ravel()
    top_idx = row.argsort()[::-1][:n]
    return list(zip(terms[top_idx], row[top_idx]))

idx_example = 0  # cambia el índice para otros documentos
top_terms = top_terms_for_doc(idx_example, vec, X, n=10)
print(f"\n📰 Top 10 términos del documento {idx_example}:")
for term, weight in top_terms:
    print(f"{term:<20} {weight:.5f}")

# --- 4️⃣ Similitud entre documentos ---
if X.shape[0] > 1:
    sim = cosine_similarity(X[0], X[1])[0,0]
    print(f"\n🤝 Similitud coseno entre documento 0 y 1: {sim:.4f}")

# --- 5️⃣ (Opcional) Guardar resumen rápido ---
with open(SAVE_DIR / "embedding_summary.txt", "w", encoding="utf-8") as f:
    f.write(f"Shape: {X.shape}\n")
    f.write(f"Sparsity: {sparsity:.6f}\n")
    f.write("Top comunes: " + ", ".join(common_terms) + "\n")
    f.write("Top raros: " + ", ".join(rare_terms) + "\n")
    f.write(f"Ejemplo doc {idx_example}: " +
            ", ".join([t for t,_ in top_terms]) + "\n")
print("\n✅ Informe de embedding guardado en:", SAVE_DIR / "embedding_summary.txt")


📘 Shape TF-IDF: (5160, 104149) (docs x vocab)
📊 Sparsity: 0.002787  (0.2787% de celdas no nulas)
🔠 Tamaño del vocabulario: 104,149

🔥 10 términos más comunes (bajo IDF):
year, company, num, ai, __percent__, market, stock, new, share, high

🧊 10 términos más raros (alto IDF):
zuckerberg year, zuckerberg widely, zuckerberg wednesday, zuckerberg time, zuckerberg talk, gpu operate, zuckerberg recognition, zuckerberg recently, zuckerberg quickly, zuckerberg pull

📰 Top 10 términos del documento 0:
autumn budget        0.19996
ons                  0.19609
autumn               0.18688
unemployment         0.17604
fall num             0.17263
estimate number      0.15747
number               0.15146
num july             0.14965
vacancy              0.14668
continue sign        0.14668

🤝 Similitud coseno entre documento 0 y 1: 0.0351

✅ Informe de embedding guardado en: embeddings_tfidf\embedding_summary.txt


# FINAL
