# Preprocesamiento

In [33]:
import os
from nltk.stem import SnowballStemmer 
import nltk
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [34]:
# Directory that holds the input documents.
# NOTE(review): hard-coded absolute path on the author's machine; adjust before running elsewhere.
data_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\test_txt' 

In [35]:
# Build the list of documents as (filename, content) pairs by reading every
# .txt file in data_path.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# each document's position reproducible across runs and machines.
documents = []
for filename in sorted(os.listdir(data_path)):
    if not filename.endswith('.txt'):
        continue
    path = os.path.join(data_path, filename)
    with open(path, 'r', encoding='utf-8') as file:
        content = file.read()
    documents.append((filename, content))
len(documents)

3019

In [36]:
# The stemmer is used to reduce words to their root form (English Snowball stemmer)
stemmer = SnowballStemmer('english')

In [37]:
# Path to the stop-word list file.
# NOTE(review): hard-coded absolute path on the author's machine; adjust before running elsewhere.
stopwords_path = r"C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\stopwords.txt"

In [38]:
# Read the stop-word file (one entry per line) and keep the entries in a set
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stopword_lines = file.read().splitlines()
stop_words = set(stopword_lines)

In [39]:
def preprocess_text(text):
    """Normalize a raw text and return it as a space-separated string of stems.

    Steps, in order:
      1. convert the text to lowercase;
      2. remove every character listed in ``string.punctuation``;
      3. tokenize with ``nltk.word_tokenize``;
      4. drop tokens found in the module-level ``stop_words`` set;
      5. stem the remaining tokens with the module-level ``stemmer``.
    """
    lowered = text.lower()
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = lowered.translate(punctuation_table)

    stems = []
    for token in nltk.word_tokenize(without_punctuation):
        # Stop words are filtered out before stemming
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [40]:
# Run the normalization over every document, keeping each filename next to its processed text
preprocessed_documents = [(name, preprocess_text(raw_text)) for name, raw_text in documents]

In [41]:
# Display how many documents were preprocessed
len(preprocessed_documents)

3019

# Vectorización

In [42]:
# Drop the filenames and keep only the preprocessed text of each document
preprocessed_texts = [entry[1] for entry in preprocessed_documents]

# Fit a bag-of-words vectorizer on the preprocessed texts and transform them
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(preprocessed_texts)


# Índice Invertido

In [45]:
# Path to cats.txt, the input used to build the inverted index.
# NOTE(review): despite its name, `folder_path` points to a file (it is passed to open()), not a folder.
folder_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\cats.txt'

In [46]:
# Collect every line of the file, line endings included
lines = []
with open(folder_path, 'r', encoding='utf-8') as file:
    lines.extend(file)

In [47]:
def build_inverted_index(lines):
    """Build an inverted index mapping each term to the documents that list it.

    Each line is split on whitespace: the first token is taken as the
    document identifier and every remaining token as a term of that document.

    Args:
        lines: Iterable of strings, one record per line.

    Returns:
        A dict mapping each term to the list of document identifiers, in the
        order the lines were read. A document is listed once per occurrence
        of the term on its line.
    """
    index = {}
    for line in lines:
        parts = line.split()
        # Blank or whitespace-only lines carry no document identifier; skip
        # them instead of failing with IndexError on parts[0].
        if not parts:
            continue
        document, *terms = parts
        for term in terms:
            index.setdefault(term, []).append(document)
    return index

# Build the inverted index from the lines read from the file
inverted_index = build_inverted_index(lines)

In [None]:
# Turn the inverted index into a DataFrame with one row per term (orient='index'
# uses the dict keys as row labels) and print it.
# NOTE(review): terms have posting lists of different lengths, so shorter rows are
# presumably padded with missing values — confirm against the printed output.
index_df = pd.DataFrame.from_dict(inverted_index, orient='index')
print(index_df)

# Comparación del índice con BoW

In [None]:
# NOTE(review): appears to be a placeholder; no index/BoW comparison is implemented in this cell.
print("hola")