## Importar librerías.

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os.path
import sys
import pickle

* Leemos el corpus desde el archivo CSV y lo almacenamos en un DataFrame.

In [7]:
# Build the path with os.path.join so the notebook also runs on POSIX systems:
# the original literal mixed "/" and "\" separators and only resolved on Windows.
corpus = pd.read_csv(os.path.join("..", "filtered_dataset", "noticias_expansion_filtradas.csv"))

In [8]:
# Display the loaded corpus to inspect its columns.
corpus

Unnamed: 0,titulo,palabras_clave,fuente,contenido,seccion,liga,fecha,texto_completo,texto_comp,coincidencias
0,Emilio Romano asume como presidente de la ABM;...,presidente banco director asociación protesta ...,expansion,El director de Bank of America rindió protesta...,economia,https://expansion.mx/economia/2025/05/09/emili...,09/05/2025,Emilio Romano asume como presidente de la ABM;...,Emilio Romano asume como presidente de la ABM;...,oma
1,#Crónica | Una fiesta bancaria entre laberinto...,deuda humo blanco haciendo banxico optimismo,expansion,La ABM presumió que el sistema financiero es s...,economia,https://expansion.mx/economia/2025/05/09/croni...,09/05/2025,#Crónica | Una fiesta bancaria entre laberinto...,#Crónica | Una fiesta bancaria entre laberinto...,banxico
2,Banxico pide a la banca creatividad para dar s...,financiero banxico sistema banca banco llamado,expansion,"La gobernadora del Banco de México (Banxico), ...",economia,https://expansion.mx/economia/2025/05/08/banxi...,09/05/2025,Banxico pide a la banca creatividad para dar s...,Banxico pide a la banca creatividad para dar s...,banxico
3,Gobierno y bancos acuerdan aumentar 3.5% anual...,pyme periodo presidenta plan claudio abm,expansion,La presidenta Claudia Sheinbaum y la ABM suscr...,economia,https://expansion.mx/economia/2025/05/08/gobie...,09/05/2025,Gobierno y bancos acuerdan aumentar 3.5% anual...,Gobierno y bancos acuerdan aumentar 3.5% anual...,financiamiento
4,"Citigroup enfrenta demanda de 1,000 mmd en cas...",citigroup pese oceanografía dólares petrolero ...,expansion,Los demandantes afirman que Citigroup adelantó...,economia,https://expansion.mx/economia/2025/05/08/citig...,08/05/2025,"Citigroup enfrenta demanda de 1,000 mmd en cas...","Citigroup enfrenta demanda de 1,000 mmd en cas...",demanda
5,Banqueros ven 'inicios' de desaceleración del ...,crédito economía desaceleración empresa person...,expansion,"Ante la desaceleración de la economía, la banc...",economia,https://expansion.mx/economia/2025/05/08/conve...,08/05/2025,Banqueros ven 'inicios' de desaceleración del ...,Banqueros ven 'inicios' de desaceleración del ...,desaceleración
6,Inflación en abril se ubicó en 3.93% y superó ...,producto precio genérico alza mercado jitomate,expansion,Los productos genéricos que más contribuyeron ...,economia,https://expansion.mx/economia/2025/05/08/infla...,08/05/2025,Inflación en abril se ubicó en 3.93% y superó ...,Inflación en abril se ubicó en 3.93% y superó ...,"expectativas, inflación, oma"
7,La Fed mantiene las tasas y menciona riesgo de...,desempleo reserva economía sólido ritmo general,expansion,"La economía en general ""ha seguido expandiéndo...",economia,https://expansion.mx/economia/2025/05/07/la-fe...,07/05/2025,La Fed mantiene las tasas y menciona riesgo de...,La Fed mantiene las tasas y menciona riesgo de...,"desempleo, inflación"
8,Banxico prevé nuevos recortes a la tasa si la ...,estable entorno gobernadora incierto central b...,expansion,La gobernadora del banco central advirtió que ...,economia,https://expansion.mx/economia/2025/05/07/banxi...,07/05/2025,Banxico prevé nuevos recortes a la tasa si la ...,Banxico prevé nuevos recortes a la tasa si la ...,"banxico, inflación"
9,Banxico aún tiene margen para bajar tasa clave...,margen tasa manteniéndola referencia real terr...,expansion,El subgobernador precisó que dada la baja de l...,economia,https://expansion.mx/economia/2025/05/07/banxi...,07/05/2025,Banxico aún tiene margen para bajar tasa clave...,Banxico aún tiene margen para bajar tasa clave...,banxico


## Normalización de texto.

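* Función que normaliza el texto de una columna del corpus: lo pasa a minúsculas, lo lematiza con spaCy y descarta determinantes, preposiciones, conjunciones, pronombres y tokens no alfabéticos.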

In [9]:
import spacy as sp

In [10]:
def normalizador(corpus: pd.DataFrame, col_name: str, obj_nlp=None) -> pd.DataFrame:
    """Normalize the text of one column of ``corpus`` in place.

    Every value of ``corpus[col_name]`` is lower-cased, processed with
    ``obj_nlp`` and rebuilt as the space-joined lemmas of the tokens that
    survive filtering. A token is dropped when its ``pos_`` is one of
    DET, ADP, CCONJ, SCONJ or PRON, or when it is not purely alphabetic.

    Parameters
    ----------
    corpus : pd.DataFrame
        Frame holding the text. It is mutated: missing values in *every*
        column are replaced with ``''`` and ``col_name`` is overwritten
        with the normalized text.
    col_name : str
        Name of the column to normalize.
    obj_nlp : callable, optional
        Text pipeline mapping a string to an iterable of tokens exposing
        ``pos_``, ``is_alpha`` and ``lemma_`` (e.g. a loaded spaCy model).
        When ``None``, the ``"es_core_news_sm"`` model is loaded.

    Returns
    -------
    pd.DataFrame
        The same ``corpus`` object that was passed in.
    """
    excluded_pos = {"DET", "ADP", "CCONJ", "SCONJ", "PRON"}

    # Use the pipeline supplied by the caller. The previous version always
    # reloaded the model here, silently discarding the argument and paying
    # the load cost on every call.
    if obj_nlp is None:
        obj_nlp = sp.load("es_core_news_sm")

    corpus.fillna('', inplace=True)

    normalized_texts = []
    for raw_text in corpus[col_name].tolist():
        # str() keeps non-string cells (e.g. numbers) from failing on .lower().
        doc = obj_nlp(str(raw_text).lower())
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ not in excluded_pos and token.is_alpha
        ]
        normalized_texts.append(' '.join(lemmas))

    corpus[col_name] = normalized_texts

    return corpus

## Nuevos conjuntos de datos.

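* Creamos tres nuevos conjuntos de datos, que estarán divididos de la siguiente manera:

    * **Título**: Únicamente tendrá la columna **titulo**.
    * **Contenido**: Únicamente tendrá la columna **contenido**.
    * **Título + contenido**: Es la concatenación de las columnas **titulo y contenido**.

    * **Nota**: en la implementación actual, `new_sets` toma la columna **palabras_clave** en lugar de **contenido** para los dos últimos conjuntos.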

In [11]:
def new_sets(corpus: pd.DataFrame) -> tuple:
    """Derive three single-column datasets from ``corpus``.

    Missing values in ``corpus`` are replaced with ``''`` in place before
    the columns are extracted.

    Returns
    -------
    tuple
        ``(title_df, keywords_df, combined_df)`` where ``title_df`` is a copy
        of the ``'titulo'`` column, ``keywords_df`` is a copy of the
        ``'palabras_clave'`` column, and ``combined_df`` has a single
        ``'titulo_contenido'`` column holding ``titulo + ' ' + palabras_clave``.
    """
    corpus.fillna('', inplace=True)

    title_df = corpus.loc[:, ['titulo']].copy()
    keywords_df = corpus.loc[:, ['palabras_clave']].copy()

    # Row-wise concatenation of title and keywords, separated by one space.
    joined_text = corpus['titulo'] + ' ' + corpus['palabras_clave']
    combined_df = pd.DataFrame({'titulo_contenido': joined_text})

    return title_df, keywords_df, combined_df

In [12]:
# Load the spaCy pipeline "es_core_news_sm" once so it can be handed to normalizador.
nlp = sp.load("es_core_news_sm")

In [13]:
# Normalize only the "titulo" column. `corpus` is rebound to the result, so
# re-running this cell feeds already-normalized text through the pipeline again.
corpus = normalizador(corpus, col_name="titulo", obj_nlp=nlp)

In [14]:
# Split the corpus into the three single-column datasets.
df_1, df_2, df_3 = new_sets(corpus)
# Tag each frame with the name of its only column; doc_to_vect reads `.name`
# to build the cache file name. This is a plain attribute set on the object —
# presumably it does not survive copies or transformations (TODO confirm).
df_1.name = df_1.columns[0]
df_2.name = df_2.columns[0]
df_3.name = df_3.columns[0]

In [15]:
# Inspect the combined dataset (its single column is "titulo_contenido").
df_3

Unnamed: 0,titulo_contenido
0,emilio romano asumir presidente prometer más d...
1,crónico fiesta bancario laberinto humo blanco ...
2,banxico pedir banco creatividad dar solución f...
3,gobierno banco acordar aumentar anual financia...
4,citigroup enfrento demanda mmd caso fraude pet...
5,banquero ver inicio desaceleración crédito cré...
6,inflación abril ubicar superar expectativa mer...
7,fed mantener tasa mencionar riesgo mayor infla...
8,banxico prever nuevo recorte tasa inflación ma...
9,banxico aún tener margen bajar tasa clave aseg...


## Representación Vectorial.

* Se hará una representación vectorial del corpus de las siguientes formas:
    * **TF-IDF**.

* Se usarán las siguientes características:
    * **Unigramas**.
    * **Bigramas**.

* Función que guarda en un archivo **.pkl** los nombres de las características y la matriz de la representación vectorial.

In [17]:
def doc_to_vect(corpus: pd.DataFrame, col_name: str, form: str, feature: str,
                overwrite: bool = False) -> tuple:
    """Vectorize one text column with TF-IDF and cache the result as a pickle.

    The cache file is ``../embedding/{form}_{feature}_{label}_train_vector.pkl``,
    where ``label`` is ``corpus.name`` when that attribute exists and
    ``col_name`` otherwise. The pickle stores the tuple
    ``(dense_matrix, feature_names, fitted_vectorizer)``.

    The cache is keyed only by these names: if the contents of ``corpus``
    change, pass ``overwrite=True`` to recompute instead of loading a stale
    file.

    Parameters
    ----------
    corpus : pd.DataFrame
        Frame containing the text column.
    col_name : str
        Column of ``corpus`` to vectorize.
    form : str
        Representation to build. Only ``'TF-IDF'`` is supported.
    feature : str
        ``'Unigrama'`` (1-grams) or ``'Bigrama'`` (2-grams).
    overwrite : bool, default False
        When True, ignore an existing cache file and rebuild it.

    Returns
    -------
    tuple
        ``(X, feature_names)``: the dense document-term matrix and the
        feature names produced by the vectorizer.

    Raises
    ------
    ValueError
        If the ``form`` / ``feature`` combination is not supported.
    """
    # n-gram range used for each supported feature name.
    ngram_ranges = {'Unigrama': (1, 1), 'Bigrama': (2, 2)}

    # Validate before touching the filesystem. Previously an unsupported
    # combination failed only after the cache file had been opened for
    # writing, leaving an empty pickle that broke every later cache hit.
    if form != 'TF-IDF' or feature not in ngram_ranges:
        raise ValueError(
            f"Unsupported combination form={form!r}, feature={feature!r}; "
            f"expected form 'TF-IDF' and feature in {sorted(ngram_ranges)}"
        )

    output_dir = os.path.join("..", "embedding")
    os.makedirs(output_dir, exist_ok=True)
    # Fall back to the column name when the frame carries no `.name` attribute.
    dataset_label = getattr(corpus, 'name', col_name)
    file_name = os.path.join(output_dir, f'{form}_{feature}_{dataset_label}_train_vector.pkl')

    if os.path.exists(file_name) and not overwrite:
        print(f'Ya existe {file_name}, cargando...')
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that were produced by this function.
        with open(file_name, 'rb') as vector_file:
            X, feature_names, _cached_vectorizer = pickle.load(vector_file)
        return X, feature_names

    rep_vec = TfidfVectorizer(
        token_pattern=r'(?u)\w+|\w+\n|\.|\¿|\?',
        ngram_range=ngram_ranges[feature],
    )
    # Densify once and reuse the array for both the cache and the return value.
    X = rep_vec.fit_transform(corpus[col_name]).toarray()
    feature_names = rep_vec.get_feature_names_out()

    with open(file_name, 'wb') as vector_file:
        pickle.dump((X, feature_names, rep_vec), vector_file)

    return X, feature_names


In [18]:
# Representations and n-gram features to generate for every dataset.
forms = ['TF-IDF']
features = ['Bigrama', 'Unigrama']
dataframes = [df_1, df_2, df_3]

# Vectorize every (dataset, form, feature) combination; each frame's `.name`
# is passed as the column to vectorize. X and names are overwritten on every
# iteration, so only the last combination stays bound after the loop.
for dataframe in dataframes:
    for form in forms:
        for feature in features:
            X, names = doc_to_vect(dataframe, dataframe.name, form, feature)

Ya existe ..\embedding\TF-IDF_Bigrama_titulo_train_vector.pkl, cargando...
Ya existe ..\embedding\TF-IDF_Unigrama_titulo_train_vector.pkl, cargando...
Ya existe ..\embedding\TF-IDF_Bigrama_palabras_clave_train_vector.pkl, cargando...
Ya existe ..\embedding\TF-IDF_Unigrama_palabras_clave_train_vector.pkl, cargando...
Ya existe ..\embedding\TF-IDF_Bigrama_titulo_contenido_train_vector.pkl, cargando...
Ya existe ..\embedding\TF-IDF_Unigrama_titulo_contenido_train_vector.pkl, cargando...


In [19]:
# Names of the cached vector files, grouped by dataset with the bigram
# file listed before the unigram one.
file_names = [
    f'TF-IDF_{ngram}_{dataset}_train_vector.pkl'
    for dataset in ('titulo', 'palabras_clave', 'titulo_contenido')
    for ngram in ('Bigrama', 'Unigrama')
]




In [21]:
# Inspect the cached bigram TF-IDF matrix of the "palabras_clave" dataset.
# The path is built with os.path.join so it resolves on any OS; the original
# backslash literal only worked on Windows. pickle and pandas are already
# imported in the notebook's first cell, so they are not re-imported here.
file = os.path.join('..', 'embedding', 'TF-IDF_Bigrama_palabras_clave_train_vector.pkl')
with open(file, 'rb') as f:
    # The pickle holds (dense matrix, feature names, fitted vectorizer).
    X, feature_names, vectorizer = pickle.load(f)

# One row per document, one column per bigram.
df_tfidf = pd.DataFrame(X, columns=feature_names)
df_tfidf


Unnamed: 0,abogado hsbc,agropecuario técnico,ajuste año,alto bendición,alza mercado,anuncio warren,asociación protesta,año trimestre,banca banco,banco dato,...,tasa cet,tasa interés,tasa manteniéndola,tecnología cierre,tendencia presidente,trimestre agropecuario,trimestre sector,trump mitin,trump republicano,warren trump
0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Inspect the cached unigram TF-IDF matrix of the "palabras_clave" dataset.
# The path is built with os.path.join so it resolves on any OS; the original
# backslash literal only worked on Windows. pickle and pandas are already
# imported in the notebook's first cell, so they are not re-imported here.
file = os.path.join('..', 'embedding', 'TF-IDF_Unigrama_palabras_clave_train_vector.pkl')
with open(file, 'rb') as f:
    # The pickle holds (dense matrix, feature names, fitted vectorizer).
    X, feature_names, vectorizer = pickle.load(f)

# One row per document, one column per unigram.
df_tfidf = pd.DataFrame(X, columns=feature_names)
df_tfidf


Unnamed: 0,abm,abogado,agropecuario,ajuste,alto,alza,americo,anuncio,asociación,año,...,tasa,tecnología,tendencia,tercio,territorio,tributario,trimestre,trump,técnico,warren
0,0.0,0.0,0.0,0.0,0.0,0.0,0.435851,0.0,0.435851,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.415749,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.338936,0.0,0.0,0.0,0.420743,0.0,0.0,0.0,0.0,0.0
