# Preprocesamiento

In [113]:
import os
from nltk.stem import SnowballStemmer 
import nltk
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [114]:
#Path de los datos 
data_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\test_txt' 

In [87]:
#Crear un vector de documentos
#Abrimos cada archivo y lo leemos
documents = []
for filename in os.listdir(data_path):
    if filename.endswith('.txt'):
        path = os.path.join(data_path, filename)
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
            documents.append((filename, content))
len(documents)

3019

In [88]:
#Stemmer se usa para reducir las palabras a su raíz
stemmer = SnowballStemmer('english')

In [89]:
#Path de las stopwords
stopwords_path = r"C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\stopwords.txt"

In [90]:
#Abrimos el archivo y leemos las stopwords
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines())

In [91]:
#Definimos una función que normaliza el texto con todos los requisitos necesarios:
#conviertimos en minúsculas
#eliminamos los signos de puntuación
#tokenizamos
#aplicamos stemming
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(processed_tokens)

In [92]:
preprocessed_documents = [(filename, preprocess_text(content)) for filename, content in documents]

In [103]:
print (preprocessed_documents)

[('14826.txt', 'asian export fear damag usjapan rift mount trade friction japan rais fear asia export nation row inflict farreach econom damag businessmen offici told reuter correspond asian capit move japan boost protectionist sentiment lead curb american import product export conflict hurt longrun shortterm tokyo loss gain impos 300 mln dlrs tariff import japanes electron good april 17 retali japan alleg failur stick pact sell semiconductor world market cost unoffici japanes estim put impact tariff 10 billion dlrs spokesmen major electron firm virtual halt export product hit tax wouldnt busi spokesman lead japanes electron firm matsushita electr industri ltmct tariff remain place length time month complet eros export good subject tariff tom murtha stock analyst tokyo offic broker ltjame capel taiwan businessmen offici worri awar serious threat japan serv warn senior taiwanes trade offici ask name taiwan trade trade surplus 156 billion dlrs year 95 pct surplus help swell taiwan foreig

# Vectorizacion

# TF-IDF

In [124]:
# Preprocesar los contenidos de los documentos y mantener la estructura de lista de tuplas
preprocessed_documents_list = [(filename, preprocess_text(content)) for filename, content in preprocessed_documents]

# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents_list]

# Inicializar el vectorizador TF-IDF
vectorizer_tfidf = TfidfVectorizer()

# Vectorizar los contenidos preprocesados
X_tfidf = vectorizer_tfidf.fit_transform(preprocessed_contents)



In [125]:
query = "japan revis longterm energi demand ministri"

# Vectorización de la consulta utilizando el vectorizador TF-IDF
query_vector_tfidf = vectorizer_tfidf.transform([query])

# Calcular la similitud entre la consulta y los documentos utilizando TF-IDF
similarity_scores_tfidf = cosine_similarity(query_vector_tfidf, X_tfidf)

# Obtener los documentos ordenados por similitud
ranked_documents_tfidf = np.argsort(similarity_scores_tfidf)[0][::-1]

# Mostrar solo los primeros 10 resultados
for i, doc_index in enumerate(ranked_documents_tfidf[:10]):  # Solo los primeros 10 resultados
    print(f"Resultado {i+1}: Documento {doc_index}, Similitud: {similarity_scores_tfidf[0][doc_index]}")
    print(preprocessed_documents[doc_index][:200])  # Mostrar los primeros 200 caracteres del documento
    print()


Resultado 1: Documento 2, Similitud: 0.49935418251680164
('14829.txt', 'japan revis longterm energi demand ministri intern trade industri miti revis longterm energi supplydemand outlook august meet forecast downtrend japanes energi demand ministri offici miti expect lower project primari energi suppli year 2000 550 mln kilolitr kl 600 mln decis emerg structur japanes industri rise yen declin domest electr power demand miti plan work revis energi supplydemand outlook deliber committe meet agenc natur resourc energi offici miti review breakdown energi suppli sourc includ oil nuclear coal natur gas nuclear energi provid bulk japan electr power fiscal year end march 31 suppli estim 27 pct kilowatthour basi oil 23 pct liquefi natur gas 21 pct note')

Resultado 2: Documento 14, Similitud: 0.27246581930604774
('14858.txt', 'japan ministri open farm trade hit japan agricultur ministri anger demand japan open farm product market offici talk month liberalis harm exist farm export japan senior mi

# BoW

In [126]:
preprocessed_documents_list = [(filename, preprocess_text(content)) for filename, content in preprocessed_documents]

# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents_list]

# Inicializar el vectorizador
vectorizer_bow = CountVectorizer()

# Vectorizar los contenidos preprocesados
X_bow = vectorizer_bow.fit_transform(preprocessed_contents)


In [127]:
query = "japan revis longterm energi demand ministri"
# Vectorización de la consulta utilizando el vectorizador BoW
query_vector_bow = vectorizer_bow.transform([query])

# Calcular la similitud entre la consulta y los documentos utilizando BoW
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)

# Obtener los documentos ordenados por similitud
ranked_documents_bow = np.argsort(similarity_scores_bow)[0][::-1]

# Mostrar solo los primeros 10 resultados
for i, doc_index in enumerate(ranked_documents_bow[:10]):  # Solo los primeros 10 resultados
    print(f"Resultado {i+1}: Documento {doc_index}, Similitud: {similarity_scores_bow[0][doc_index]}")
    print(preprocessed_documents_list[doc_index][:200])  # Mostrar los primeros 200 caracteres del documento
    print()


Resultado 1: Documento 2, Similitud: 0.5450865450217846
('14829.txt', 'japan revi longterm energi demand ministri intern trade industri miti revi longterm energi supplydemand outlook august meet forecast downtrend japan energi demand ministri offici miti expect lower project primari energi suppli year 2000 550 mln kilolitr kl 600 mln deci emerg structur japan industri rise yen declin domest electr power demand miti plan work revi energi supplydemand outlook delib committ meet agenc natur resourc energi offici miti review breakdown energi suppli sourc includ oil nuclear coal natur gas nuclear energi provid bulk japan electr power fiscal year end march 31 suppli estim 27 pct kilowatthour basi oil 23 pct liquefi natur gas 21 pct note')

Resultado 2: Documento 14, Similitud: 0.34112114616897665
('14858.txt', 'japan ministri open farm trade hit japan agricultur ministri anger demand japan open farm product market offici talk month liberali harm exist farm export japan senior ministri offici

# Indice Invertido

In [115]:
folder_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\cats.txt'

In [116]:
lines = []
with open(folder_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [117]:
def build_inverted_index(lines):
    index = {}
    for line in lines:
        # Separar la ruta del documento y los términos
        parts = line.strip().split()
        document = parts[0]
        terms = parts[1:]
        
        for term in terms:
            if term in index:
                index[term].append(document)
            else:
                index[term] = [document]
    return index

# Construir el índice invertido
inverted_index = build_inverted_index(lines)

In [118]:
index_df = pd.DataFrame.from_dict(inverted_index, orient='index')
print(index_df)

                  0               1               2              3     \
trade       test/14826      test/14832      test/14858     test/14862   
grain       test/14828      test/14832      test/14841     test/14858   
nat-gas     test/14829      test/15322      test/15416     test/16007   
crude       test/14829      test/15063      test/15200     test/15230   
rubber      test/14832      test/14840      test/15409     test/15424   
...                ...             ...             ...            ...   
castor-oil  test/19672  training/10300            None           None   
jet         test/20031   training/2957   training/6828  training/7397   
palmkernel  test/20911    training/235  training/11778           None   
cpu         test/21245   training/5388   training/5460  training/5485   
rand        test/21535   training/7043   training/9336           None   

                     4           5           6           7           8     \
trade          test/14881  test/14904  test/14

# Comparacion index con bow