# Preprocesamiento

In [55]:
import os
from nltk.stem import SnowballStemmer 
import nltk
import string
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [56]:
#Path de los datos 
#data_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\test_txt'
#data_path = r'D:\U\7. Septimo\RI\ProyectoRI\data\training_txt'
data_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\training_txt'

In [57]:
documents = [] #  Vector de documentos
for filename in os.listdir(data_path):
    if filename.endswith('.txt'): 
        path = os.path.join(data_path, filename) #Abrimos cada archivo 
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read() # leemos
            documents.append((filename, content))

In [58]:
#Path de las stopwords
#stopwords_path = r"C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\stopwords.txt"
#stopwords_path = r"D:\U\7. Septimo\RI\ProyectoRI\data\stopwords.txt"
stopwords_path = r"C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\stopwords.txt"

In [59]:
#Abrimos el archivo 
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines()) # leemos las stopwords

In [60]:
#Stemmer se usa para reducir las palabras a su raíz
stemmer = SnowballStemmer('english')

In [61]:
#Definimos una función que normaliza el texto con todos los requisitos necesarios:
def preprocess_text(text):
    text = text.lower() #conviertimos en minúsculas
    text = re.sub(r'\d+', '', text)  
    text = text.translate(str.maketrans('', '', string.punctuation))#eliminamos los signos de puntuación
    tokens = nltk.word_tokenize(text)#tokenizamos
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words] #aplicamos stemming
    return ' '.join(processed_tokens)

In [62]:
preprocessed_documents = [(filename, preprocess_text(content)) for filename, content in documents]

# Vectorizacion

### TF-IDF

In [63]:
# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents]

# Inicializar el vectorizador TF-IDF
vectorizer_tfidf = TfidfVectorizer()
# Vectorizar los contenidos preprocesados
X_tfidf = vectorizer_tfidf.fit_transform(preprocessed_contents)

### BoW 

In [64]:
preprocessed_documents_list = [(filename, preprocess_text(content)) for filename, content in preprocessed_documents]

# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents_list]

# Inicializar el vectorizador
vectorizer_bow = CountVectorizer()

# Vectorizar los contenidos preprocesados
X_bow = vectorizer_bow.fit_transform(preprocessed_contents)


# Indice Invertido

In [65]:
data_path_cats = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\catslimpia.txt'

In [66]:
# Cargar cats.txt
cats = {}
with open(data_path_cats, 'r') as f:
    for line in f:
        term, *docs = line.strip().split()
        cats[term] = docs

In [67]:
lines = []
with open(data_path_cats, 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [68]:
def build_inverted_index(lines):
    index = {}
    for line in lines:
        # Separar la ruta del documento y los términos
        parts = line.strip().split()
        document = parts[0]
        terms = parts[1:]
        
        for term in terms:
            if term in index:
                if document not in index[term]:  # Verificar que el documento no esté ya en la lista
                    index[term].append(document)
            else:
                index[term] = [document]
    return index

In [69]:
index_cats = build_inverted_index(lines)

# Motor de busqueda

### BoW

In [70]:
query = "japan revis longterm energi demand ministri"
# Vectorización de la consulta utilizando el vectorizador BoW
query_vector_bow = vectorizer_bow.transform([query])

# Calcular la similitud entre la consulta y los documentos utilizando BoW
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)

# Obtener los documentos ordenados por similitud
ranked_documents_bow = np.argsort(similarity_scores_bow)[0][::-1]

# Metricas de Evaluacion

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# División de Datos
X_train, X_test, y_train, y_test = train_test_split(documents, index_cats, test_size=0.2, random_state=42)

# Vectorización TF-IDF
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

# Vectorización BoW
vectorizer_bow = CountVectorizer()
X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

# Consulta de Búsqueda
query = "consulta de ejemplo"
query_vector_tfidf = vectorizer_tfidf.transform([query])
query_vector_bow = vectorizer_bow.transform([query])

# Cálculo de Similitud
similarity_scores_tfidf = cosine_similarity(query_vector_tfidf, X_test_tfidf)
similarity_scores_bow = cosine_similarity(query_vector_bow, X_test_bow)

# Métricas de Evaluación
precision_tfidf = precision_score(y_test, similarity_scores_tfidf)
recall_tfidf = recall_score(y_test, similarity_scores_tfidf)
f1_tfidf = f1_score(y_test, similarity_scores_tfidf)

precision_bow = precision_score(y_test, similarity_scores_bow)
recall_bow = recall_score(y_test, similarity_scores_bow)
f1_bow = f1_score(y_test, similarity_scores_bow)

# Comparación de Resultados
print("Métricas para TF-IDF:")
print("Precisión:", precision_tfidf)
print("Recall:", recall_tfidf)
print("F1 Score:", f1_tfidf)

print("\nMétricas para BoW:")
print("Precisión:", precision_bow)
print("Recall:", recall_bow)
print("F1 Score:", f1_bow)


ValueError: Found input variables with inconsistent numbers of samples: [7769, 90]