# Preprocesamiento

In [53]:
import os
from nltk.stem import SnowballStemmer 
import nltk
import string
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [54]:
# Directory holding the training corpus; every .txt file in it is loaded as one document.
# The commented-out assignments presumably point to the same directory on other
# developers' machines - enable the one that matches the local checkout.
#data_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\training_txt'
data_path = r'D:\U\7. Septimo\RI\ProyectoRI\data\training_txt'
#data_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\training_txt'

In [55]:
# Load every .txt file found in data_path as a (filename, content) pair.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the document order (and therefore the row indices of the matrices built
# from it) reproducible across runs and machines.
documents = []  # list of (filename, content) tuples
for filename in sorted(os.listdir(data_path)):
    if filename.endswith('.txt'):
        path = os.path.join(data_path, filename)
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
        documents.append((filename, content))

In [56]:
# Path of the stopword list (plain text, read line by line further down).
# The commented-out assignments presumably point to the same file on other
# developers' machines - enable the one that matches the local checkout.
#stopwords_path = r"C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\stopwords.txt"
stopwords_path = r"D:\U\7. Septimo\RI\ProyectoRI\data\stopwords.txt"
#stopwords_path = r"C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\stopwords.txt"

In [57]:
# Load the stopword list (one entry per line) into a set for O(1) lookups.
# preprocess_text() compares tokens against this set only after lower-casing
# the text and deleting punctuation, so each entry is normalized the same way;
# otherwise entries such as "don't" or "The" could never match a token.
# Blank lines are dropped so the set never contains an empty string.
_punctuation_table = str.maketrans('', '', string.punctuation)
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stop_words = {
        word
        for word in (
            line.strip().lower().translate(_punctuation_table)
            for line in file.read().splitlines()
        )
        if word
    }

In [58]:
# Snowball stemmer for English; preprocess_text uses it to reduce each token to its stem.
stemmer = SnowballStemmer('english')

In [59]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, delete runs of digits, delete ASCII
    punctuation, tokenize with NLTK, discard tokens present in the module-level
    ``stop_words`` set and stem the remaining tokens with the module-level
    ``stemmer``.

    Args:
        text: Raw document or query string.

    Returns:
        The surviving stems joined by single spaces.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = without_digits.translate(punctuation_table)

    stems = []
    for token in nltk.word_tokenize(cleaned):
        if token in stop_words:
            continue  # stopwords are filtered before stemming
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [60]:
# Normalize every document, keeping each filename paired with its preprocessed text.
preprocessed_documents = [(filename, preprocess_text(content)) for filename, content in documents]

# Vectorización

### TF-IDF

In [61]:
# Keep only the normalized text of each (filename, text) pair, in corpus order.
preprocessed_contents = [text for _name, text in preprocessed_documents]

# TF-IDF model with default settings: learn the vocabulary and weights from the
# corpus and build the document-term matrix in a single step.
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(preprocessed_contents)

### BoW 

In [70]:
# Keep only the normalized text of each (filename, text) pair, in corpus order.
preprocessed_contents = [text for _name, text in preprocessed_documents]

# Bag-of-words model with default settings: learn the vocabulary from the
# corpus and count term occurrences per document.
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(preprocessed_contents)

# Vocabulary terms; build_inverted_index uses them as the labels of X_bow's columns.
feature_names_bow = vectorizer_bow.get_feature_names_out()


# Indexación

In [71]:
# Path of the cleaned category file (whitespace-separated tokens, one record per line).
# The commented-out assignments presumably point to the same file on other
# developers' machines - enable the one that matches the local checkout.
#data_path_cats = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\catslimpia.txt'
#data_path_cats = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\catslimpia.txt'
data_path_cats = r'D:\U\7. Septimo\RI\ProyectoRI\data\catslimpia.txt'

In [72]:
# Load the category file into a dict: the first whitespace-separated token of
# each line becomes the key and the remaining tokens become its list of values.
# A key that appears on several lines keeps only the values of its last line.
# NOTE(review): a later cell reports 10788 keys. If this file follows the
# Reuters cats.txt layout ("<document id> <category> ...") the keys are
# document ids and the values are categories, i.e. the opposite of what the
# names `term` / `docs` suggest - confirm against the file.
cats = {}
with open(data_path_cats, 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        if not tokens:
            # Skip blank lines; unpacking an empty token list raises ValueError.
            continue
        term, *docs = tokens
        cats[term] = docs

In [83]:
def build_inverted_index_from_cat(cats):
    """Return a new dict mapping each key of ``cats`` to the set of its values.

    Args:
        cats: Mapping from a key to an iterable of hashable items.

    Returns:
        A dict with the same keys, in the same order, whose values are ``set``
        objects built from the corresponding iterables (duplicates collapse).
        The input mapping is not modified.
    """
    return {term: set(docs) for term, docs in cats.items()}

# Set-valued copy of `cats`; the evaluation cell uses it as the lookup for relevant documents.
inverted_index_cats = build_inverted_index_from_cat(cats)

In [84]:
# Notebook display: number of keys in inverted_index_cats.
len(inverted_index_cats)

10788

## Indexación BoW

In [78]:
def build_inverted_index(X, feature_names):
    """Build an inverted index (term -> document rows) from a document-term matrix.

    Args:
        X: 2-D matrix with one row per document and one column per term. It
            must expose ``shape`` and ``nonzero()``, as SciPy sparse matrices
            and NumPy arrays do.
        feature_names: Iterable of term labels; the label at position ``j``
            names column ``j`` of ``X``. Columns beyond the last label are
            ignored.

    Returns:
        A dict mapping every term to the set of row indices (plain Python
        ints) whose entry in that term's column is non-zero. A term that
        occurs in no document maps to an empty set.

    Raises:
        IndexError: If there are more feature names than columns in ``X``.
    """
    terms = list(feature_names)
    n_terms = len(terms)
    if n_terms > X.shape[1]:
        raise IndexError(
            f"{n_terms} feature names given for a matrix with {X.shape[1]} columns"
        )

    # Walk the non-zero coordinates once instead of slicing one column per
    # term: per-column slicing of a row-oriented sparse matrix rescans the
    # stored entries for every term in the vocabulary.
    rows, cols = X.nonzero()
    postings = [set() for _ in range(n_terms)]
    for row, col in zip(rows.tolist(), cols.tolist()):
        if col < n_terms:
            postings[col].add(row)

    return {term: postings[idx] for idx, term in enumerate(terms)}

# Inverted index over the BoW matrix: term -> set of row indices of the documents containing it.
inverted_index_bow = build_inverted_index(X_bow, feature_names_bow)
# TF-IDF counterpart left disabled; `feature_names_tfidf` is not defined in the visible cells.
#inverted_index_tfidf = build_inverted_index(X_tfidf, feature_names_tfidf)

# Motor de búsqueda

### BoW

In [None]:
# The query is used exactly as written: it is not passed through
# preprocess_text, so it has to be typed in already-normalized form.
query = "japan revis longterm energi demand ministri"

# Project the query onto the BoW vocabulary learned from the corpus
query_vector_bow = vectorizer_bow.transform([query])

# Cosine similarity between the query and every document row of X_bow
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)

# Row indices of the documents, sorted from most to least similar
ranked_documents_bow = np.argsort(similarity_scores_bow, axis=1)[0, ::-1]

# Métricas de Evaluación

In [85]:
# Evaluation query: every key of `cats` joined into one space-separated string,
# vectorized with the BoW vocabulary and ranked by cosine similarity.
# NOTE(review): these assignments overwrite the query vector, scores and
# ranking produced by the search cell.
query_vector_bow = vectorizer_bow.transform([" ".join(cats.keys())])
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)
ranked_documents_bow = np.argsort(similarity_scores_bow)[0][::-1]

In [87]:
def relevant_documents_for_query(query_terms, index):
    """Collect the union of the index entries of all query terms.

    Args:
        query_terms: Iterable of terms to look up.
        index: Mapping from a term to an iterable of document identifiers.

    Returns:
        A new set with every identifier listed under at least one query term.
        Terms missing from ``index`` are ignored; the set is empty when no
        term matches.
    """
    matched_postings = (index[term] for term in query_terms if term in index)
    return set().union(*matched_postings)

# Every key of `cats` is used as a query term.
query_terms = list(cats)

# Union of the inverted_index_cats entries for those terms. Both models are
# evaluated against the same lookup, so the two sets have equal contents.
relevant_docs_bow = relevant_documents_for_query(query_terms, inverted_index_cats)
relevant_docs_tfidf = relevant_documents_for_query(query_terms, inverted_index_cats)

def precision_recall(ranked_documents, relevant_docs, total_documents, k=None):
    """Compute precision and recall of a retrieved list against a relevant set.

    Args:
        ranked_documents: Iterable of retrieved document identifiers, best
            match first.
        relevant_docs: Iterable of relevant document identifiers. They must
            use the same identifier type as ``ranked_documents``, otherwise
            nothing can match.
        total_documents: Corpus size. Accepted for call compatibility; it does
            not take part in the computation.
        k: Optional cut-off. When given, only the first ``k`` ranked documents
            count as retrieved. ``None`` (default) evaluates the whole ranking.

    Returns:
        A ``(precision, recall)`` tuple of floats in ``[0, 1]``. Each value is
        ``0.0`` when its denominator is zero (nothing retrieved / nothing
        relevant).

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k is not None and k < 0:
        raise ValueError("k must be non-negative")

    ranked = list(ranked_documents)
    if k is not None:
        ranked = ranked[:k]

    # Work on sets so an identifier repeated in the ranking is counted once;
    # counting duplicates lets true positives exceed the number of relevant
    # documents, which yields recall above 1.
    retrieved = set(ranked)
    relevant = set(relevant_docs)
    true_positives = len(retrieved & relevant)

    precision = true_positives / len(retrieved) if retrieved else 0.0
    recall = true_positives / len(relevant) if relevant else 0.0
    return precision, recall

# Number of rows of the BoW matrix, i.e. documents in the corpus.
total_documents = X_bow.shape[0]

# Precision and recall of the BoW ranking against the category-derived relevant set.
# NOTE(review): ranked_documents_bow holds integer row indices produced by
# np.argsort, while relevant_docs_bow holds string tokens read from the
# category file, so the comparison inside precision_recall cannot match and
# both metrics come out as 0.0. The ranking has to be mapped to the same
# identifiers the category file uses before this evaluation is meaningful -
# confirm the identifier format and add that mapping.
precision_bow, recall_bow = precision_recall(ranked_documents_bow, relevant_docs_bow, total_documents)

print("Precisión y recall para BoW:")
print("Precisión:", precision_bow)
print("Recall:", recall_bow)


Precisión y recall para BoW:
Precisión: 0.0
Recall: 0.0


'\nprint("\nPrecisión y recall para TF-IDF:")\nprint("Precisión:", precision_tfidf)\nprint("Recall:", recall_tfidf)\n'