# Preprocesamiento

In [185]:
import os
from nltk.stem import SnowballStemmer 
import nltk
import string
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [186]:
#Path de los datos 
#data_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\test_txt'
#data_path = r'D:\U\7. Septimo\RI\ProyectoRI\data\training_txt'
data_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\training_txt'

In [187]:
#Crear un vector de documentos
#Abrimos cada archivo y lo leemos
documents = []
for filename in os.listdir(data_path):
    if filename.endswith('.txt'):
        path = os.path.join(data_path, filename)
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
            documents.append((filename, content))
len(documents)

7769

In [188]:
#Stemmer se usa para reducir las palabras a su raíz
stemmer = SnowballStemmer('english')

In [189]:
#Path de las stopwords
#stopwords_path = r"C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\stopwords.txt"
#stopwords_path = r"D:\U\7. Septimo\RI\ProyectoRI\data\stopwords.txt"
stopwords_path = r"C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\stopwords.txt"

In [190]:
#Abrimos el archivo y leemos las stopwords
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines())

In [191]:
#Definimos una función que normaliza el texto con todos los requisitos necesarios:
#conviertimos en minúsculas
#eliminamos los signos de puntuación
#tokenizamos
#aplicamos stemming
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(processed_tokens)

In [192]:
preprocessed_documents = [(filename, preprocess_text(content)) for filename, content in documents]

In [193]:
len(preprocessed_documents)

7769

# Vectorizacion

# TF-IDF

In [195]:
# Preprocesar los contenidos de los documentos y mantener la estructura de lista de tuplas
preprocessed_documents_list = [(filename, preprocess_text(content)) for filename, content in preprocessed_documents]

# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents_list]

# Inicializar el vectorizador TF-IDF
vectorizer_tfidf = TfidfVectorizer()

# Vectorizar los contenidos preprocesados
X_tfidf = vectorizer_tfidf.fit_transform(preprocessed_contents)

In [196]:
# Imprimir la matriz de términos y documentos TF-IDF
print("Matriz de términos y documentos TF-IDF:")
print(X_tfidf.toarray())

# Imprimir el vocabulario TF-IDF
print("Vocabulario TF-IDF:")
print(vectorizer_tfidf.vocabulary_)

Matriz de términos y documentos TF-IDF:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Vocabulario TF-IDF:
{'bahia': 1174, 'cocoa': 2934, 'review': 16864, 'shower': 17867, 'continu': 3278, 'week': 20694, 'zone': 21217, 'allevi': 444, 'drought': 4520, 'ear': 4637, 'januari': 8010, 'improv': 7490, 'prospect': 16047, 'temporao': 19239, 'normal': 14369, 'humid': 7277, 'level': 8806, 'restor': 16809, 'comissaria': 3035, 'smith': 18116, 'dri': 4505, 'period': 15340, 'late': 8647, 'year': 21102, 'arriv': 883, 'end': 4899, 'februari': 5431, 'bag': 1169, 'kilo': 8359, 'make': 12864, 'cumul': 3637, 'total': 19582, 'season': 17515, 'mln': 13604, 'stage': 18430, 'deliv': 3952, 'earlier': 4640, 'consign': 3229, 'includ': 7525, 'figur': 5540, 'doubt': 4447, 'crop': 3557, 'harvest': 6809, 'practic': 15787, 'estim': 5125, 'sale': 17261, 'stand': 18447, 'hundr': 7283, 'thousand': 19376, 'hand': 

In [197]:
query = "japan revis longterm energi demand ministri"

# Vectorización de la consulta utilizando el vectorizador TF-IDF
query_vector_tfidf = vectorizer_tfidf.transform([query])

# Calcular la similitud entre la consulta y los documentos utilizando TF-IDF
similarity_scores_tfidf = cosine_similarity(query_vector_tfidf, X_tfidf)

# Obtener los documentos ordenados por similitud
ranked_documents_tfidf = np.argsort(similarity_scores_tfidf)[0][::-1]

# Mostrar solo los primeros 10 resultados
for i, doc_index in enumerate(ranked_documents_tfidf[:10]):  # Solo los primeros 10 resultados
    print(f"Resultado {i+1}: Documento {doc_index}, Similitud: {similarity_scores_tfidf[0][doc_index]}")
    print(preprocessed_documents[doc_index][:200])  # Mostrar los primeros 200 caracteres del documento
    print()


Resultado 1: Documento 1742, Similitud: 0.3595570495176247
('12795.txt', 'japan ask bank cut dollar sale dealer financ ministri ask japanes commerci bank moder dollar sale bank dealer ministri telephon citi longterm bank earlier week make request dealer time ministri made request commerci bank financ ministri offici unavail comment dealer ministri ask institut investor reduc sale dollar')

Resultado 2: Documento 4343, Similitud: 0.3462078708722056
('4654.txt', 'bank japan call longterm effort trade shortterm effect foreign exchang rate movement correct intern trade imbal appear wane longterm effort requir cut japan chronic depend extern demand bank japan month report japan trade surplus nomin term remain high futur central bank fundament adjust need long japan hope benefit intern alloc resourc maintain free trade system ad')

Resultado 3: Documento 4382, Similitud: 0.3462078708722056
('4717.txt', 'bank japan call longterm effort trade shortterm effect foreign exchang rate movement corr

# BoW

In [198]:
preprocessed_documents_list = [(filename, preprocess_text(content)) for filename, content in preprocessed_documents]

# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents_list]

# Inicializar el vectorizador
vectorizer_bow = CountVectorizer()

# Vectorizar los contenidos preprocesados
X_bow = vectorizer_bow.fit_transform(preprocessed_contents)


In [199]:
# Imprimir la matriz de términos y documentos
print("Matriz de términos y documentos:")
print(X_bow.toarray())

# Imprimir el vocabulario
print("Vocabulario:")
print(vectorizer_bow.vocabulary_)

Matriz de términos y documentos:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Vocabulario:
{'bahia': 1174, 'cocoa': 2934, 'review': 16864, 'shower': 17867, 'continu': 3278, 'week': 20694, 'zone': 21217, 'allevi': 444, 'drought': 4520, 'ear': 4637, 'januari': 8010, 'improv': 7490, 'prospect': 16047, 'temporao': 19239, 'normal': 14369, 'humid': 7277, 'level': 8806, 'restor': 16809, 'comissaria': 3035, 'smith': 18116, 'dri': 4505, 'period': 15340, 'late': 8647, 'year': 21102, 'arriv': 883, 'end': 4899, 'februari': 5431, 'bag': 1169, 'kilo': 8359, 'make': 12864, 'cumul': 3637, 'total': 19582, 'season': 17515, 'mln': 13604, 'stage': 18430, 'deliv': 3952, 'earlier': 4640, 'consign': 3229, 'includ': 7525, 'figur': 5540, 'doubt': 4447, 'crop': 3557, 'harvest': 6809, 'practic': 15787, 'estim': 5125, 'sale': 17261, 'stand': 18447, 'hundr': 7283, 'thousand': 19376, 'hand': 6721, 'farmer': 5366, 'middlemen': 13402, 'export'

In [200]:
query = "japan revis longterm energi demand ministri"
# Vectorización de la consulta utilizando el vectorizador BoW
query_vector_bow = vectorizer_bow.transform([query])

# Calcular la similitud entre la consulta y los documentos utilizando BoW
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)

# Obtener los documentos ordenados por similitud
ranked_documents_bow = np.argsort(similarity_scores_bow)[0][::-1]

# Mostrar solo los primeros 10 resultados
for i, doc_index in enumerate(ranked_documents_bow[:10]):  # Solo los primeros 10 resultados
    print(f"Resultado {i+1}: Documento {doc_index}, Similitud: {similarity_scores_bow[0][doc_index]}")
    print(preprocessed_documents_list[doc_index][:200])  # Mostrar los primeros 200 caracteres del documento
    print()


Resultado 1: Documento 4343, Similitud: 0.36140316116210053
('4654.txt', 'bank japan call longterm effort trade shortterm effect foreign exchang rate movement correct intern trade imbal wane longterm effort requir cut japan chronic depend extern demand bank japan month report japan trade surplus nomin term remain high futur central bank fundament adjust long japan hope benefit intern alloc resourc maintain free trade system ad')

Resultado 2: Documento 4382, Similitud: 0.36140316116210053
('4717.txt', 'bank japan call longterm effort trade shortterm effect foreign exchang rate movement correct intern trade imbal wane longterm effort requir cut japan chronic depend extern demand bank japan month report japan trade surplus nomin term remain high futur central bank fundament adjust long japan hope benefit intern alloc resourc maintain free trade system ad')

Resultado 3: Documento 7064, Similitud: 0.3380617018914066
('8911.txt', 'fed heller stronger japan demand american good fed heller s

# Indice Invertido

In [201]:
#folder_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\cats.txt'
#folder_path = r'D:\U\7. Septimo\RI\ProyectoRI\data\cats.txt'
folder_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\catslimpios.txt'

In [202]:
lines = []
with open(folder_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [203]:
# Construir el índice invertido
def build_inverted_index(preprocessed_contents):
    inverted_index = {}
    for doc_id, content in enumerate(preprocessed_contents):
        for word in content.split():
            if word not in inverted_index:
                inverted_index[word] = set()  # Usamos un conjunto para evitar duplicados
            inverted_index[word].add(doc_id)
    return inverted_index

# Crear el índice invertido
inverted_index = build_inverted_index(preprocessed_contents)

In [204]:
len(inverted_index)

21228

In [205]:
index_df = pd.DataFrame.from_dict(inverted_index, orient='index')
print(index_df)

           0       1       2       3       4       5       6       7     \
bahia         0  2047.0   941.0  1240.0     NaN     NaN     NaN     NaN   
cocoa         0  3457.0  4226.0  2947.0  5505.0   389.0   262.0  7432.0   
review        0  4098.0  7694.0  4625.0    25.0  5658.0  3099.0  7706.0   
shower        0  5288.0  5292.0  5271.0   248.0  6937.0   991.0     NaN   
continu       0     1.0  2049.0  6146.0  4109.0  2064.0  6160.0    19.0   
...         ...     ...     ...     ...     ...     ...     ...     ...   
genentech  7760     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
ltgene     7760     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
genecor    7760     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
additivi   7760     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
ltkrn      7765     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

             8       9     ...  3425  3426  3427  3428  3429  3430  3431  \
bahia         NaN     N

# Motor de busqueda

In [206]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [207]:
# Función para realizar una consulta de búsqueda utilizando BoW y el índice invertido
def search_bow(query, vectorizer_bow, X_bow, inverted_index):
    query_words = query.split()
    relevant_docs_set = set()
    for word in query_words:
        if word in inverted_index:
            relevant_docs_set.update(inverted_index[word])
    
    relevant_docs = list(relevant_docs_set)
    if not relevant_docs:
        return [], []

    X_bow_relevant = X_bow[relevant_docs]
    query_vector = vectorizer_bow.transform([query]).toarray()
    similarities = cosine_similarity(query_vector, X_bow_relevant.toarray()).flatten()
    sorted_docs = [relevant_docs[i] for i in np.argsort(similarities)[::-1]]
    sorted_similarities = np.sort(similarities)[::-1]
    return sorted_docs, sorted_similarities

# Función para realizar una consulta de búsqueda utilizando TF-IDF y el índice invertido
def search_tfidf(query, vectorizer_tfidf, X_tfidf, inverted_index):
    query_words = query.split()
    relevant_docs_set = set()
    for word in query_words:
        if word in inverted_index:
            relevant_docs_set.update(inverted_index[word])
    
    relevant_docs = list(relevant_docs_set)
    if not relevant_docs:
        return [], []

    X_tfidf_relevant = X_tfidf[relevant_docs]
    query_vector = vectorizer_tfidf.transform([query]).toarray()
    similarities = cosine_similarity(query_vector, X_tfidf_relevant.toarray()).flatten()
    sorted_docs = [relevant_docs[i] for i in np.argsort(similarities)[::-1]]
    sorted_similarities = np.sort(similarities)[::-1]
    return sorted_docs, sorted_similarities

# Solicitar la consulta del usuario
query2 = "the Bahia cocoa zone"

# Realizar la búsqueda utilizando BoW
relevant_docs_bow, similarities_bow = search_bow(query2, vectorizer_bow, X_bow, inverted_index)

# Realizar la búsqueda utilizando TF-IDF
relevant_docs_tfidf, similarities_tfidf = search_tfidf(query2, vectorizer_tfidf, X_tfidf, inverted_index)

# Mostrar resultados
print("\nDocumentos relevantes utilizando BoW:")
num_docs_to_display = min(5, len(relevant_docs_bow))  # Aseguramos que no intentamos acceder a más elementos de los que existen
for i in range(num_docs_to_display):
    doc_id = relevant_docs_bow[i]
    similarity = similarities_bow[i]
    print(f"Documento {doc_id}, Similaridad: {similarity}")

print("\nDocumentos relevantes utilizando TF-IDF:")
num_docs_to_display = min(5, len(relevant_docs_tfidf))  # Aseguramos que no intentamos acceder a más elementos de los que existen
for i in range(num_docs_to_display):
    doc_id = relevant_docs_tfidf[i]
    similarity = similarities_tfidf[i]
    print(f"Documento {doc_id}, Similaridad: {similarity}")


Documentos relevantes utilizando BoW:
Documento 318, Similaridad: 0.3344968040028364
Documento 7736, Similaridad: 0.29595817420019405
Documento 319, Similaridad: 0.29095718698132317
Documento 366, Similaridad: 0.2858309752375148
Documento 4708, Similaridad: 0.2853775569164239

Documentos relevantes utilizando TF-IDF:
Documento 0, Similaridad: 0.3689172958945197
Documento 318, Similaridad: 0.33827616613140915
Documento 4708, Similaridad: 0.29379934295297416
Documento 319, Similaridad: 0.2933748420354321
Documento 366, Similaridad: 0.2930408286981315
