In [20]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Folder that holds the preprocessed .txt documents.
# NOTE(review): this is an absolute, machine-specific path; it must be
# adjusted before the notebook can run on another machine.
folder_path = r"C:\Users\kevin\OneDrive\Documentos\Apli2024\preprocessed_txt"

# Load the content of every .txt file in the folder into `texts`.
# - The listing is sorted because os.listdir() returns entries in arbitrary
#   order; sorting keeps the document indices used by later cells stable
#   across runs and machines.
# - Each file is opened in a `with` block so its handle is closed right away
#   instead of being left for the garbage collector.
texts = []
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as handle:
        texts.append(handle.read())

In [21]:
# Sanity check: number of documents loaded into `texts`.
len(texts)

3019

# Vectorización usando Bag of Words (BoW)

In [22]:
# Bag-of-Words vectorizer created with scikit-learn's default settings.
vectorizer_bow = CountVectorizer()
# Fit the vectorizer on the corpus and vectorize it in a single step.
# Later cells index the similarity scores and `texts` with the same document
# index, so each row of X_bow is expected to correspond to one entry of `texts`.
X_bow = vectorizer_bow.fit_transform(texts)

# Vectorización usando TF-IDF

In [23]:
# TF-IDF vectorizer created with scikit-learn's default settings.
vectorizer_tfidf = TfidfVectorizer()
# Fit the vectorizer on the corpus and vectorize it in a single step.
# Later cells index the similarity scores and `texts` with the same document
# index, so each row of X_tfidf is expected to correspond to one entry of `texts`.
X_tfidf = vectorizer_tfidf.fit_transform(texts)

# Similitud entre la consulta y los documentos (BoW)

In [24]:
# Number of top-ranked documents to print.
top_k = 10

# Query text.
# NOTE(review): the terms look already stemmed ("revis", "energi"), presumably
# to match the preprocessing applied to the corpus -- confirm.
query = "japan revis longterm energi demand ministri"

# Vectorize the query with the already-fitted BoW vectorizer so that it shares
# the vocabulary of X_bow.
query_vector_bow = vectorizer_bow.transform([query])

# Cosine similarity between the query and the documents in X_bow. A single
# query was passed, so only row 0 of the result is used below.
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)

# Document indices ordered from most to least similar. Sorting the negated
# scores with a stable algorithm makes the ranking deterministic: documents
# with identical scores are ordered by ascending document index, instead of
# depending on the default (non-stable) sort followed by a reversal.
ranked_documents_bow = np.argsort(-similarity_scores_bow[0], kind="stable")

# Print the top_k results: rank, document index, score and a 200-character
# preview of the document text.
for i, doc_index in enumerate(ranked_documents_bow[:top_k]):
    print(f"Resultado {i+1}: Documento {doc_index}, Similitud: {similarity_scores_bow[0][doc_index]}")
    print(texts[doc_index][:200])
    print()


Resultado 1: Documento 2, Similitud: 0.5326236412913076
japan revis longterm energi demand ministri intern trade industri miti revis longterm energi supplydemand outlook august meet forecast downtrend japanes energi demand ministri offici miti expect lower

Resultado 2: Documento 14, Similitud: 0.28545680510919436
japan ministri open farm trade hit japan agricultur ministri anger demand japan open farm product market offici talk month liberalis harm exist farm export japan senior ministri offici import drop due

Resultado 3: Documento 284, Similitud: 0.2810497136596714
econom spotlight telecom key japan ministri japan littleknown ministri post telecommun mpt emerg intern forc reckon polit analyst mpt thrust spotlight trade row britain posit strength due control lucr

Resultado 4: Documento 282, Similitud: 0.2769103481957583
japan ministri comment rice talk report agricultur ministri declin comment local newspap report japan agre hold talk close rice market gatt round idea report commen

# Similitud entre la consulta y los documentos (TF-IDF)

In [28]:
# Number of top-ranked documents to print.
top_k = 10

# Query text.
# NOTE(review): the terms look already stemmed ("revis", "energi"), presumably
# to match the preprocessing applied to the corpus -- confirm.
query = "japan revis longterm energi demand ministri"

# Vectorize the query with the already-fitted TF-IDF vectorizer so that it
# shares the vocabulary of X_tfidf.
query_vector_tfidf = vectorizer_tfidf.transform([query])

# Cosine similarity between the query and the documents in X_tfidf. A single
# query was passed, so only row 0 of the result is used below.
similarity_scores_tfidf = cosine_similarity(query_vector_tfidf, X_tfidf)

# Document indices ordered from most to least similar. Sorting the negated
# scores with a stable algorithm makes the ranking deterministic: documents
# with identical scores are ordered by ascending document index, instead of
# depending on the default (non-stable) sort followed by a reversal.
ranked_documents_tfidf = np.argsort(-similarity_scores_tfidf[0], kind="stable")

# Print the top_k results: rank, document index, score and a 200-character
# preview of the document text.
for i, doc_index in enumerate(ranked_documents_tfidf[:top_k]):
    print(f"Resultado {i+1}: Documento {doc_index}, Similitud: {similarity_scores_tfidf[0][doc_index]}")
    print(texts[doc_index][:200])
    print()



Resultado 1: Documento 2, Similitud: 0.5094756136643848
japan revis longterm energi demand ministri intern trade industri miti revis longterm energi supplydemand outlook august meet forecast downtrend japanes energi demand ministri offici miti expect lower

Resultado 2: Documento 14, Similitud: 0.22764948233074195
japan ministri open farm trade hit japan agricultur ministri anger demand japan open farm product market offici talk month liberalis harm exist farm export japan senior ministri offici import drop due

Resultado 3: Documento 284, Similitud: 0.20425198913736256
econom spotlight telecom key japan ministri japan littleknown ministri post telecommun mpt emerg intern forc reckon polit analyst mpt thrust spotlight trade row britain posit strength due control lucr

Resultado 4: Documento 282, Similitud: 0.20306267500430586
japan ministri comment rice talk report agricultur ministri declin comment local newspap report japan agre hold talk close rice market gatt round idea report comm