# **MÉTODOS DE REPRESENTACIÓN TRADICIONALES**

## **TF-IDF**

In [27]:
# Write the preprocessed DataFrame to CSV (UTF-8, index column omitted).
# NOTE(review): the path uses Windows-style separators, and there is no separator
# between "procesados" and "INDEX_ALL_preprocessed.csv" -- confirm that this file
# name (rather than a "procesados" sub-folder) is what was intended.
save_path = r"..\..\finnhubAPI\data\procesadosINDEX_ALL_preprocessed.csv"
df.to_csv(save_path, encoding="utf-8", index=False)

### PRUEBA MONTAR EMBEDDING

In [30]:
# === TF-IDF: construcción y exportación de artefactos ===
import json, numpy as np, pandas as pd
from pathlib import Path
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump

# Name of the DataFrame column that holds the already-preprocessed text.
TEXT_COL = "preprocessed_text"

# Keyword arguments forwarded verbatim to TfidfVectorizer.
TFIDF_CFG = dict(
    ngram_range=(1, 2),       # unigrams + bigrams
    min_df=3,
    max_df=0.90,
    max_features=120_000,     # raise/lower depending on available RAM
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    dtype=np.float32,
)

# Folder that receives every TF-IDF artefact; created if it does not exist yet.
SAVE_DIR = Path("embeddings_tfidf")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Keep only rows whose text is present and non-blank once whitespace is stripped.
df_embed = df.loc[:, [TEXT_COL]].dropna()
df_embed = df_embed.loc[df_embed[TEXT_COL].astype(str).str.strip().astype(bool)]

# Plain list of strings handed to the vectorizer.
texts = df_embed[TEXT_COL].astype(str).tolist()

# Fit the vocabulary/IDF weights and vectorise the corpus in one call.
# X is the sparse TF-IDF document-term matrix, i.e. the embedding itself.
vec = TfidfVectorizer(**TFIDF_CFG)
X = vec.fit_transform(texts)

# X.shape is (n_docs, n_features).
print("Shape TF-IDF:", X.shape)
print("Tamaño del vocabulario:", len(vec.vocabulary_))

# Persist the sparse document-term matrix and the fitted vectorizer.
sparse.save_npz(SAVE_DIR / "tfidf_X.npz", X)
dump(vec, SAVE_DIR / "tfidf_vectorizer.joblib")

# Persist which source row each matrix row came from. Rows with missing or blank
# text were dropped before fitting, so row i of X is not necessarily row i of
# `df`; without this mapping the saved matrix cannot be re-aligned with the
# original data (labels, dates, ...) once the kernel is gone.
pd.DataFrame(
    {"row": np.arange(X.shape[0]), "source_index": df_embed.index}
).to_csv(SAVE_DIR / "tfidf_doc_index.csv", index=False)

# Vocabulary and IDF weights (handy for inspection / reproducibility),
# sorted from the rarest term (highest IDF) to the most common one.
terms = vec.get_feature_names_out()
idf = vec.idf_
idf_df = pd.DataFrame({"term": terms, "idf": idf}).sort_values("idf", ascending=False)
idf_df.to_csv(SAVE_DIR / "tfidf_idf.csv", index=False)

# JSON-serialisable copy of the config: the numpy dtype becomes its name
# (e.g. "float32") and the n-gram tuple becomes a list.
TFIDF_CFG_JSON = dict(TFIDF_CFG)
TFIDF_CFG_JSON["dtype"] = np.dtype(TFIDF_CFG["dtype"]).name
TFIDF_CFG_JSON["ngram_range"] = list(TFIDF_CFG["ngram_range"])

with open(SAVE_DIR / "tfidf_config.json", "w", encoding="utf-8") as f:
    json.dump(TFIDF_CFG_JSON, f, indent=2)

print("✅ Guardado en:", SAVE_DIR.resolve())



Shape TF-IDF: (5160, 104149)
Tamaño del vocabulario: 104149
✅ Guardado en: C:\Users\mpsua\OneDrive\Escritorio\ud\CUARTO\Primer_Cuatri\PLN\Proyecto\FinTracker\data_processing\procesamiento\preprocesamiento\embeddings_tfidf


## EVALUACIÓN TRADICIONAL

In [32]:
# === Evaluación exploratoria del embedding TF-IDF ===
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# --- 1) Matrix dimensions and fill ratio ---
n_docs, n_features = X.shape
# NOTE(review): despite its name, `sparsity` is the fraction of NON-zero cells
# (i.e. the density); the printed text below says so explicitly.
sparsity = X.nnz / (n_docs * n_features)
print(
    f"📘 Shape TF-IDF: {X.shape} (docs x vocab)",
    f"📊 Sparsity: {sparsity:.6f}  ({sparsity*100:.4f}% de celdas no nulas)",
    f"🔠 Tamaño del vocabulario: {len(vec.vocabulary_):,}",
    sep="\n",
)

# --- 2) Most common / rarest terms ---
# Low IDF means the term occurs in many documents; high IDF means it is rare.
terms = np.array(vec.get_feature_names_out())
idf = vec.idf_

common_terms = terms[idf.argsort()[:10]]
rare_terms = terms[(-idf).argsort()[:10]]

print("\n🔥 10 términos más comunes (bajo IDF):", ", ".join(common_terms), sep="\n")
print("\n🧊 10 términos más raros (alto IDF):", ", ".join(rare_terms), sep="\n")

# --- 3️⃣ Top términos de un documento específico ---
def top_terms_for_doc(i, vec, X, n=10):
    """Return the highest-weighted terms of a single document.

    Parameters
    ----------
    i : int
        Row position of the document inside ``X``.
    vec : object
        Fitted vectorizer exposing ``get_feature_names_out()``; the feature
        names are read from it so that they always match the columns of ``X``
        (no dependency on a notebook-global ``terms`` array).
    X : scipy sparse matrix
        Document-term matrix whose columns follow ``vec``'s feature order.
    n : int, default 10
        Maximum number of terms to return.

    Returns
    -------
    list[tuple[str, float]]
        ``(term, weight)`` pairs sorted by decreasing weight. Terms whose
        weight is zero (absent from the document) are left out, so fewer than
        ``n`` pairs are returned for documents with fewer than ``n`` terms.
    """
    feature_names = np.asarray(vec.get_feature_names_out())
    # A single row is cheap to densify even for a large vocabulary.
    row = X[i].toarray().ravel()
    top_idx = row.argsort()[::-1][:n]
    # Drop padding entries: zero-weight columns are not terms of this document.
    top_idx = top_idx[row[top_idx] > 0]
    return list(zip(feature_names[top_idx], row[top_idx]))

# Document inspected as an example; change the index to look at another one.
idx_example = 0
top_terms = top_terms_for_doc(idx_example, vec, X, n=10)

print(f"\n📰 Top 10 términos del documento {idx_example}:")
for term, weight in top_terms:
    # Term left-aligned in a 20-character column, weight with 5 decimals.
    print(f"{term:<20} {weight:.5f}")

# --- 4) Similarity between documents ---
# Only meaningful when the matrix holds at least two documents.
if X.shape[0] >= 2:
    sim = cosine_similarity(X[0], X[1])[0][0]
    print(f"\n🤝 Similitud coseno entre documento 0 y 1: {sim:.4f}")

# --- 5) (Optional) Save a quick plain-text summary next to the other artefacts ---
summary_path = SAVE_DIR / "embedding_summary.txt"
with open(summary_path, "w", encoding="utf-8") as f:
    f.writelines([
        f"Shape: {X.shape}\n",
        f"Sparsity: {sparsity:.6f}\n",
        "Top comunes: " + ", ".join(common_terms) + "\n",
        "Top raros: " + ", ".join(rare_terms) + "\n",
        f"Ejemplo doc {idx_example}: " + ", ".join([t for t, _ in top_terms]) + "\n",
    ])
print("\n✅ Informe de embedding guardado en:", summary_path)


📘 Shape TF-IDF: (5160, 104149) (docs x vocab)
📊 Sparsity: 0.002787  (0.2787% de celdas no nulas)
🔠 Tamaño del vocabulario: 104,149

🔥 10 términos más comunes (bajo IDF):
year, company, num, ai, __percent__, market, stock, new, share, high

🧊 10 términos más raros (alto IDF):
zuckerberg year, zuckerberg widely, zuckerberg wednesday, zuckerberg time, zuckerberg talk, gpu operate, zuckerberg recognition, zuckerberg recently, zuckerberg quickly, zuckerberg pull

📰 Top 10 términos del documento 0:
autumn budget        0.19996
ons                  0.19609
autumn               0.18688
unemployment         0.17604
fall num             0.17263
estimate number      0.15747
number               0.15146
num july             0.14965
vacancy              0.14668
continue sign        0.14668

🤝 Similitud coseno entre documento 0 y 1: 0.0351

✅ Informe de embedding guardado en: embeddings_tfidf\embedding_summary.txt
