# Preprocesamiento

In [6]:
import os
from nltk.stem import SnowballStemmer 
import nltk
import string
import pandas as pd
import re

# Create the English Snowball stemmer and load the stopword set
# (the stopword file is read as one entry per line).
stemmer = SnowballStemmer('english')
stopwords_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\stopwords.txt'
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stopword_lines = file.read().splitlines()
stop_words = set(stopword_lines)

# Función de preprocesamiento
def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, delete every run of digits, delete
    every character listed in ``string.punctuation``, tokenize with
    ``nltk.word_tokenize``, discard tokens found in the module-level
    ``stop_words`` set, and stem the remaining tokens with the module-level
    ``stemmer``.

    Args:
        text: The raw string to normalize.

    Returns:
        The stemmed, stopword-free tokens joined by single spaces.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = re.sub(r'\d+', '', text.lower()).translate(punctuation_remover)

    stems = []
    for token in nltk.word_tokenize(cleaned):
        # Stopwords are matched against the unstemmed token.
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Read and preprocess every .txt document in the training folder.
data_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\training_txt'
documents = []
# os.listdir() returns entries in arbitrary order. A document's id is its
# position in `documents`, so iterate in sorted order to keep ids reproducible
# across runs and machines.
for filename in sorted(os.listdir(data_path)):
    if filename.endswith('.txt'):
        path = os.path.join(data_path, filename)
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
        # The file is already closed here; preprocessing only needs the text.
        documents.append((filename, preprocess_text(content)))



# Vectorización

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Pull the preprocessed text out of the (filename, text) pairs
preprocessed_contents = [content for _, content in documents]

# TF-IDF vectorization
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(preprocessed_contents)
print("Matriz de términos y documentos TF-IDF:")
# Slice the sparse matrix first so only the 5 displayed rows are densified,
# instead of materializing the full document-term matrix in memory.
print(X_tfidf[:5].toarray())
print("Vocabulario TF-IDF:")
print(list(vectorizer_tfidf.vocabulary_.items())[:20])  # first 20 (term, column) pairs

# Bag-of-words vectorization
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(preprocessed_contents)
print("Matriz de términos y documentos BoW:")
# Same as above: take the 5 rows before converting to a dense array.
print(X_bow[:5].toarray())
print("Vocabulario BoW:")
print(list(vectorizer_bow.vocabulary_.items())[:20])  # first 20 (term, column) pairs


Matriz de términos y documentos TF-IDF:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Vocabulario TF-IDF:
[('bahia', 1187), ('cocoa', 2957), ('review', 17009), ('shower', 18020), ('continu', 3314), ('week', 20869), ('zone', 21400), ('allevi', 447), ('drought', 4568), ('earli', 4688), ('januari', 8099), ('improv', 7567), ('prospect', 16185), ('come', 3048), ('temporao', 19400), ('normal', 14488), ('humid', 7353), ('level', 8903), ('restor', 16954), ('comissaria', 3059)]
Matriz de términos y documentos BoW:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Vocabulario BoW:
[('bahia', 1187), ('cocoa', 2957), ('review', 17009), ('shower', 18020), ('continu', 3314), ('week', 20869), ('zone', 21400), ('allevi', 447), ('drought', 4568), ('earli', 4688), ('januari', 8099), ('improv', 7567), ('prospect', 16185), ('come', 3048), ('temporao', 19400), ('normal', 14488), ('h

# Construcción del Índice Invertido

In [8]:
def build_inverted_index(preprocessed_contents):
    """Map every term to the set of documents in which it appears.

    Args:
        preprocessed_contents: Iterable of strings; each string is split on
            whitespace to obtain its terms. A document's id is its position
            in this iterable.

    Returns:
        A dict whose keys are terms and whose values are sets of document ids.
    """
    postings = {}
    for position, text in enumerate(preprocessed_contents):
        for term in text.split():
            # setdefault creates the empty posting set the first time a term is seen.
            postings.setdefault(term, set()).add(position)
    return postings

# Build the inverted index and report how many distinct terms it holds
inverted_index = build_inverted_index(preprocessed_contents)
print(f"Total de términos en el índice invertido: {len(inverted_index)}")

# Tabular view for inspection: after transposing, each column is a term and its
# cells hold that term's document ids (columns of unequal length are padded, as
# the NaN cells in the printed output show).
postings_as_lists = {term: list(doc_ids) for term, doc_ids in inverted_index.items()}
index_df = pd.DataFrame.from_dict(postings_as_lists, orient='index').T
print("Índice Invertido (primeros 20 términos):")
print(index_df.iloc[:, :20])

Total de términos en el índice invertido: 21411
Índice Invertido (primeros 20 términos):
       bahia   cocoa  review  shower  continu    week    zone  allevi  \
0        0.0     0.0     0.0     0.0      0.0     0.0     0.0     0.0   
1     2047.0  3457.0  4098.0  5288.0      1.0  4096.0  2948.0   714.0   
2      941.0  4226.0  7694.0  5292.0   2049.0     2.0  3077.0  3084.0   
3     1240.0  2947.0  4625.0  5271.0   6146.0  2059.0   134.0  5037.0   
4        NaN  5505.0    25.0   248.0   4109.0  6160.0   136.0  2417.0   
...      ...     ...     ...     ...      ...     ...     ...     ...   
3430     NaN     NaN     NaN     NaN      NaN     NaN     NaN     NaN   
3431     NaN     NaN     NaN     NaN      NaN     NaN     NaN     NaN   
3432     NaN     NaN     NaN     NaN      NaN     NaN     NaN     NaN   
3433     NaN     NaN     NaN     NaN      NaN     NaN     NaN     NaN   
3434     NaN     NaN     NaN     NaN      NaN     NaN     NaN     NaN   

      drought   earli  januari  im

# Implementación del Motor de Búsqueda

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def _rank_by_cosine(query, vectorizer, doc_term_matrix, inverted_index, preprocessor):
    """Shared ranking routine behind ``search_bow`` and ``search_tfidf``.

    The inverted index narrows the collection down to the documents that
    contain at least one query term; only those candidates are scored.

    Args:
        query: Raw query string.
        vectorizer: Fitted vectorizer; ``transform([text])`` must return the
            query as a single row in the same feature space as
            ``doc_term_matrix``.
        doc_term_matrix: Document-term matrix supporting row selection with a
            list of ids; row ``i`` corresponds to document id ``i``.
        inverted_index: Mapping from term to a collection of document ids.
        preprocessor: Callable applied to the raw query before index lookup
            and vectorization.

    Returns:
        ``([], [])`` when no query term is present in the index. Otherwise a
        pair ``(doc_ids, scores)``: a list of document ids ordered by
        descending cosine similarity and a NumPy array with the matching
        scores in the same order.
    """
    processed_query = preprocessor(query)

    # Candidate set: union of the posting sets of every known query term.
    candidate_ids = set()
    for term in processed_query.split():
        candidate_ids.update(inverted_index.get(term, ()))
    if not candidate_ids:
        return [], []
    candidates = list(candidate_ids)

    # Neither the query vector nor the candidate rows are converted to dense
    # arrays; cosine_similarity is given the matrices as they are, which avoids
    # allocating a dense (candidates x vocabulary) array on every query.
    query_vector = vectorizer.transform([processed_query])
    similarities = cosine_similarity(query_vector, doc_term_matrix[candidates]).flatten()

    # Compute the descending order once and reuse it for both ids and scores,
    # so the two returned sequences are aligned by construction.
    order = np.argsort(similarities)[::-1]
    return [candidates[i] for i in order], similarities[order]


def search_bow(query, vectorizer_bow, X_bow, inverted_index, preprocessor):
    """Rank documents for ``query`` using the bag-of-words representation.

    Args:
        query: Raw query string.
        vectorizer_bow: Fitted count vectorizer used to vectorize the query.
        X_bow: Bag-of-words document-term matrix (row ``i`` = document ``i``).
        inverted_index: Mapping from term to a collection of document ids.
        preprocessor: Callable applied to the raw query.

    Returns:
        ``(doc_ids, scores)`` sorted by descending cosine similarity, or
        ``([], [])`` if no query term is in the index.
    """
    return _rank_by_cosine(query, vectorizer_bow, X_bow, inverted_index, preprocessor)


def search_tfidf(query, vectorizer_tfidf, X_tfidf, inverted_index, preprocessor):
    """Rank documents for ``query`` using the TF-IDF representation.

    Args:
        query: Raw query string.
        vectorizer_tfidf: Fitted TF-IDF vectorizer used to vectorize the query.
        X_tfidf: TF-IDF document-term matrix (row ``i`` = document ``i``).
        inverted_index: Mapping from term to a collection of document ids.
        preprocessor: Callable applied to the raw query.

    Returns:
        ``(doc_ids, scores)`` sorted by descending cosine similarity, or
        ``([], [])`` if no query term is in the index.
    """
    return _rank_by_cosine(query, vectorizer_tfidf, X_tfidf, inverted_index, preprocessor)

# Query to run (hard-coded in this cell)
query2 = "the Bahia cocoa zone"

# Rank documents against the query with the bag-of-words representation
relevant_docs_bow, similarities_bow = search_bow(
    query=query2,
    vectorizer_bow=vectorizer_bow,
    X_bow=X_bow,
    inverted_index=inverted_index,
    preprocessor=preprocess_text,
)

# Rank documents against the query with the TF-IDF representation
relevant_docs_tfidf, similarities_tfidf = search_tfidf(
    query=query2,
    vectorizer_tfidf=vectorizer_tfidf,
    X_tfidf=X_tfidf,
    inverted_index=inverted_index,
    preprocessor=preprocess_text,
)

# Mostrar resultados
def display_results(relevant_docs, similarities, method_name):
    """Print a header followed by at most the first five ranked results.

    Args:
        relevant_docs: Sequence of document ids, best match first.
        similarities: Sequence of scores aligned with ``relevant_docs``.
        method_name: Label for the ranking method, shown in the header line.
    """
    print(f"\nDocumentos relevantes utilizando {method_name}:")
    top_docs = relevant_docs[:5]
    for rank, doc_id in enumerate(top_docs):
        print(f"Documento {doc_id}, Similaridad: {similarities[rank]}")

# Show the top results of each ranking method, BoW first and then TF-IDF
for _ranked_docs, _scores, _label in (
    (relevant_docs_bow, similarities_bow, "BoW"),
    (relevant_docs_tfidf, similarities_tfidf, "TF-IDF"),
):
    display_results(_ranked_docs, _scores, _label)



Documentos relevantes utilizando BoW:
Documento 318, Similaridad: 0.33333333333333337
Documento 7736, Similaridad: 0.2948839123097943
Documento 319, Similaridad: 0.2886751345948129
Documento 4708, Similaridad: 0.28530555551325537
Documento 366, Similaridad: 0.2844400619942872

Documentos relevantes utilizando TF-IDF:
Documento 0, Similaridad: 0.36808505309292017
Documento 318, Similaridad: 0.33691516276347605
Documento 4708, Similaridad: 0.2936915872996356
Documento 366, Similaridad: 0.29140135493961683
Documento 319, Similaridad: 0.2908209663059065
