# Preprocesamiento

In [105]:
import os
from nltk.stem import SnowballStemmer 
import nltk
import string
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [106]:
#Path de los datos 
#data_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\training_txt'
data_path = r'D:\U\7. Septimo\RI\ProyectoRI\data\training_txt'
#data_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\training_txt'

In [107]:
documents = [] #  Vector de documentos
for filename in os.listdir(data_path):
    if filename.endswith('.txt'): 
        path = os.path.join(data_path, filename) #Abrimos cada archivo 
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read() # leemos
            documents.append((filename, content))

In [108]:
#Path de las stopwords
#stopwords_path = r"C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\stopwords.txt"
stopwords_path = r"D:\U\7. Septimo\RI\ProyectoRI\data\stopwords.txt"
#stopwords_path = r"C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\stopwords.txt"

In [109]:
#Abrimos el archivo 
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines()) # leemos las stopwords

In [110]:
#Stemmer se usa para reducir las palabras a su raíz
stemmer = SnowballStemmer('english')

In [111]:
#Definimos una función que normaliza el texto con todos los requisitos necesarios:
def preprocess_text(text):
    text = text.lower() #conviertimos en minúsculas
    text = re.sub(r'\d+', '', text)  
    text = text.translate(str.maketrans('', '', string.punctuation))#eliminamos los signos de puntuación
    tokens = nltk.word_tokenize(text)#tokenizamos
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words] #aplicamos stemming
    return ' '.join(processed_tokens)

In [112]:
preprocessed_documents = [(filename, preprocess_text(content)) for filename, content in documents]

# Vectorizacion

### TF-IDF

In [113]:
# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents]

# Inicializar el vectorizador TF-IDF
vectorizer_tfidf = TfidfVectorizer()
# Vectorizar los contenidos preprocesados
X_tfidf = vectorizer_tfidf.fit_transform(preprocessed_contents)

### BoW 

In [114]:
# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents]

# Inicializar el vectorizador
vectorizer_bow = CountVectorizer()

# Vectorizar los contenidos preprocesados
X_bow = vectorizer_bow.fit_transform(preprocessed_contents)
feature_names_bow = vectorizer_bow.get_feature_names_out()


# Indexación

In [115]:
#data_path_cats = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\catslimpia.txt'
#data_path_cats = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\catslimpia.txt'
data_path_cats = r'D:\U\7. Septimo\RI\ProyectoRI\data\catslimpia.txt'

In [116]:
# Cargar cats.txt
cats = []
with open(data_path_cats, 'r') as file:
        cats = file.readlines()
        

In [117]:
print(cats)

['14826 trade\n', '14828 grain\n', '14829 nat-gas crude\n', '14832 rubber tin sugar corn rice grain trade\n', '14833 palm-oil veg-oil\n', '14839 ship\n', '14840 rubber coffee lumber palm-oil veg-oil\n', '14841 wheat grain\n', '14842 gold\n', '14843 acq\n', '14844 tin\n', '14849 interest money-fx\n', '14852 copper acq\n', '14854 ipi\n', '14858 soybean oilseed corn grain carcass livestock rice trade\n', '14859 earn\n', '14860 earn\n', '14861 interest money-fx\n', '14862 bop trade\n', '14863 lead gas\n', '14865 acq\n', '14867 jobs\n', '14872 earn\n', '14873 earn\n', '14875 earn\n', '14876 earn\n', '14877 tin\n', '14881 trade\n', '14882 zinc\n', '14885 sugar\n', '14886 sugar\n', '14888 acq\n', '14890 money-fx interest\n', '14891 cpi gnp\n', '14892 soybean oilseed soy-oil palm-oil veg-oil\n', '14899 earn\n', '14900 acq\n', '14903 earn\n', '14904 trade\n', '14907 acq\n', '14909 acq\n', '14911 earn\n', '14912 trade\n', '14913 yen dlr money-fx\n', '14918 cpi\n', '14919 interest money-fx\n', '1

In [118]:
def build_inverted_index(lines):
    inverted_index = {}
    for line in lines:
        parts = line.strip().split()
        doc_id = parts[0]  # El primer elemento es el identificador del documento
        terms = parts[1:]  # Los siguientes elementos son los términos
        for term in terms:
            if term in inverted_index:
                inverted_index[term].add(doc_id)
            else:
                inverted_index[term] = {doc_id}
    return inverted_index

# Construir el índice invertido
inverted_index_cats = build_inverted_index(cats)

# Convertir los sets a listas para facilitar el manejo posterior (opcional)
for term in inverted_index_cats:
    inverted_index_cats[term] = list(inverted_index_cats[term])

# Verificar el índice invertido
print(inverted_index_cats)


{'trade': ['3532', '5850', '9784', '18061', '8599', '4031', '5498', '5810', '15352', '8596', '16871', '8671', '4903', '19918', '10780', '11771', '7628', '10005', '16926', '11446', '10265', '13045', '16745', '4654', '1932', '9060', '12401', '6926', '6406', '11222', '9051', '8244', '3267', '6716', '7135', '8044', '20649', '8189', '9957', '7477', '6757', '4629', '894', '15725', '1347', '1964', '10209', '2417', '8080', '10347', '11076', '9777', '5189', '11357', '9763', '15372', '7804', '8699', '9184', '20865', '16932', '20441', '10695', '10255', '14012', '15375', '9836', '7632', '5692', '11580', '14881', '5288', '16125', '11198', '5274', '11420', '3931', '8624', '9749', '8135', '15171', '16763', '1656', '17871', '8635', '19546', '4552', '15313', '16775', '4156', '3690', '10362', '9076', '3902', '10767', '5954', '11260', '19062', '5684', '17926', '4115', '10264', '18347', '9821', '18798', '9697', '12563', '16856', '16505', '4595', '6593', '1499', '20248', '4987', '18302', '16766', '14912', 

In [119]:
len(inverted_index_cats)

90

## Indexación BoW

In [120]:
def build_inverted_index(X, feature_names):
    inverted_index = {}
    for term_idx, term in enumerate(feature_names):
        term_docs = set(X[:, term_idx].nonzero()[0])
        inverted_index[term] = term_docs
    return inverted_index

inverted_index_bow = build_inverted_index(X_bow, feature_names_bow)
#inverted_index_tfidf = build_inverted_index(X_tfidf, feature_names_tfidf)

# Motor de busqueda

### BoW

In [121]:
query = "japan revis longterm energi demand ministri"
# Vectorización de la consulta utilizando el vectorizador BoW
query_vector_bow = vectorizer_bow.transform([query])

# Calcular la similitud entre la consulta y los documentos utilizando BoW
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)

# Obtener los documentos ordenados por similitud
ranked_documents_bow = np.argsort(similarity_scores_bow)[0][::-1]

# Metricas de Evaluacion

In [124]:
query_vector_bow = vectorizer_bow.transform([" ".join(inverted_index_cats.keys())])
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)
ranked_documents_bow = np.argsort(similarity_scores_bow)[0][::-1]

In [125]:
def relevant_documents_for_query(query_terms, index):
    relevant_docs = set()
    for term in query_terms:
        if term in index:
            relevant_docs.update(index[term])
    return relevant_docs


In [126]:
query_terms = list(inverted_index_cats.keys())

In [127]:
len(query_terms)

90

In [128]:
relevant_docs_bow = relevant_documents_for_query(query_terms, inverted_index_cats)


In [129]:
def precision_recall(ranked_documents, relevant_docs):
    retrieved_docs = [doc for doc in ranked_documents if doc in relevant_docs]
    TP = len(retrieved_docs)
    FP = len(ranked_documents) - TP
    FN = len(relevant_docs) - TP
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    return precision, recall

In [130]:
# Calcular precisión y recall para BoW
precision_bow, recall_bow = precision_recall(ranked_documents_bow, relevant_docs_bow)


In [131]:
print("Precisión y recall para BoW:")
print("Precisión:", precision_bow)
print("Recall:", recall_bow)

Precisión y recall para BoW:
Precisión: 0.0
Recall: 0.0
