# Preprocesamiento

In [65]:
import os
from nltk.stem import SnowballStemmer 
import nltk
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [66]:
#Path de los datos 
#data_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\test_txt'
#data_path = r 'D:\U\7. Septimo\RI\ProyectoRI\data\training_txt'
data_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\training_txt'

In [67]:
#Crear un vector de documentos
#Abrimos cada archivo y lo leemos
documents = []
for filename in os.listdir(data_path):
    if filename.endswith('.txt'):
        path = os.path.join(data_path, filename)
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
            documents.append((filename, content))
len(documents)

7769

In [68]:
#Stemmer se usa para reducir las palabras a su raíz
stemmer = SnowballStemmer('english')

In [69]:
#Path de las stopwords
#stopwords_path = r"C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\stopwords.txt"
#stopwords_path = r"D:\U\7. Septimo\RI\ProyectoRI\data\stopwords.txt"
stopwords_path = r"C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\stopwords.txt"

In [70]:
#Abrimos el archivo y leemos las stopwords
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines())

In [71]:
#Definimos una función que normaliza el texto con todos los requisitos necesarios:
#conviertimos en minúsculas
#eliminamos los signos de puntuación
#tokenizamos
#aplicamos stemming
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(processed_tokens)

In [72]:
preprocessed_documents = [(filename, preprocess_text(content)) for filename, content in documents]

In [73]:
len(preprocessed_documents)

7769

In [74]:
#Guardamos los documentos preprocesados en un directorio 
#preprocessed_data_path = r'D:\U\7. Septimo\Rec. Info\preprocessed_txt'
preprocessed_data_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\data\preprocessedTraining_txt'
os.makedirs(preprocessed_data_path, exist_ok=True)
for filename, content in preprocessed_documents:
    with open(os.path.join(preprocessed_data_path, filename), 'w', encoding='utf-8') as file:
        file.write(content)

In [75]:
print (preprocessed_documents)

[('1.txt', 'bahia cocoa review shower continu week bahia cocoa zone allevi drought earli januari improv prospect come temporao normal humid level restor comissaria smith week review dri period mean temporao late year arriv week end februari 22 155221 bag 60 kilo make cumul total season 593 mln 581 stage year cocoa deliv earlier consign includ arriv figur comissaria smith doubt crop cocoa harvest practic end total bahia crop estim 64 mln bag sale stand 62 mln hundr thousand bag hand farmer middlemen export processor doubt cocoa fit export shipper experienc dificulti obtain bahia superior certif view lower qualiti recent week farmer sold good part cocoa held consign comissaria smith spot bean price rose 340 350 cruzado arroba 15 kilo bean shipper reluct offer nearbi shipment limit sale book march shipment 1750 1780 dlrs tonn port name crop sale light open port junejuli 1850 1880 dlrs 35 45 dlrs york juli augsept 1870 1875 1880 dlrs tonn fob routin sale butter made marchapril sold 4340 43

# Vectorizacion

# TF-IDF

In [76]:
# Preprocesar los contenidos de los documentos y mantener la estructura de lista de tuplas
preprocessed_documents_list = [(filename, preprocess_text(content)) for filename, content in preprocessed_documents]

# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents_list]

# Inicializar el vectorizador TF-IDF
vectorizer_tfidf = TfidfVectorizer()

# Vectorizar los contenidos preprocesados
X_tfidf = vectorizer_tfidf.fit_transform(preprocessed_contents)



In [77]:
# Imprimir la matriz de términos y documentos TF-IDF
print("Matriz de términos y documentos TF-IDF:")
print(X_tfidf.toarray())

# Imprimir el vocabulario TF-IDF
print("Vocabulario TF-IDF:")
print(vectorizer_tfidf.vocabulary_)

Matriz de términos y documentos TF-IDF:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Vocabulario TF-IDF:
{'bahia': 13274, 'cocoa': 15045, 'review': 29047, 'shower': 30054, 'continu': 15389, 'week': 32884, 'zone': 33410, 'allevi': 12541, 'drought': 16635, 'ear': 16753, 'januari': 20147, 'improv': 19626, 'prospect': 28229, 'temporao': 31427, 'normal': 26532, 'humid': 19413, 'level': 20947, 'restor': 28992, 'comissaria': 15146, 'smith': 30304, 'dri': 16620, 'period': 27517, 'late': 20788, 'year': 33295, 'arriv': 12980, 'end': 17015, 'februari': 17561, '22': 3960, '155221': 2203, 'bag': 13269, '60': 9158, 'kilo': 20497, 'make': 25019, 'cumul': 15750, 'total': 31769, 'season': 29697, '593': 9082, 'mln': 25769, '581': 8970, 'stage': 30618, 'deliv': 16065, 'earlier': 16756, 'consign': 15340, 'includ': 19661, 'figur': 17670, 'doubt': 16562, 'crop': 15670, 'harvest': 18944, 'practic': 

In [78]:
query = "japan revis longterm energi demand ministri"

# Vectorización de la consulta utilizando el vectorizador TF-IDF
query_vector_tfidf = vectorizer_tfidf.transform([query])

# Calcular la similitud entre la consulta y los documentos utilizando TF-IDF
similarity_scores_tfidf = cosine_similarity(query_vector_tfidf, X_tfidf)

# Obtener los documentos ordenados por similitud
ranked_documents_tfidf = np.argsort(similarity_scores_tfidf)[0][::-1]

# Mostrar solo los primeros 10 resultados
for i, doc_index in enumerate(ranked_documents_tfidf[:10]):  # Solo los primeros 10 resultados
    print(f"Resultado {i+1}: Documento {doc_index}, Similitud: {similarity_scores_tfidf[0][doc_index]}")
    print(preprocessed_documents[doc_index][:200])  # Mostrar los primeros 200 caracteres del documento
    print()


Resultado 1: Documento 1742, Similitud: 0.3595504202853473
('12795.txt', 'japan ask bank cut dollar sale dealer financ ministri ask japanes commerci bank moder dollar sale bank dealer ministri telephon citi longterm bank earlier week make request dealer time ministri made request commerci bank financ ministri offici unavail comment dealer ministri ask institut investor reduc sale dollar')

Resultado 2: Documento 4382, Similitud: 0.3462025850907729
('4717.txt', 'bank japan call longterm effort trade shortterm effect foreign exchang rate movement correct intern trade imbal appear wane longterm effort requir cut japan chronic depend extern demand bank japan month report japan trade surplus nomin term remain high futur central bank fundament adjust need long japan hope benefit intern alloc resourc maintain free trade system ad')

Resultado 3: Documento 4343, Similitud: 0.3462025850907729
('4654.txt', 'bank japan call longterm effort trade shortterm effect foreign exchang rate movement corr

# BoW

In [79]:
preprocessed_documents_list = [(filename, preprocess_text(content)) for filename, content in preprocessed_documents]

# Extraer solo los contenidos preprocesados para la vectorización
preprocessed_contents = [content for _, content in preprocessed_documents_list]

# Inicializar el vectorizador
vectorizer_bow = CountVectorizer()

# Vectorizar los contenidos preprocesados
X_bow = vectorizer_bow.fit_transform(preprocessed_contents)


In [80]:
# Imprimir la matriz de términos y documentos
print("Matriz de términos y documentos:")
print(X_bow.toarray())

# Imprimir el vocabulario
print("Vocabulario:")
print(vectorizer_bow.vocabulary_)

Matriz de términos y documentos:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Vocabulario:
{'bahia': 13274, 'cocoa': 15045, 'review': 29047, 'shower': 30054, 'continu': 15389, 'week': 32884, 'zone': 33410, 'allevi': 12541, 'drought': 16635, 'ear': 16753, 'januari': 20147, 'improv': 19626, 'prospect': 28229, 'temporao': 31427, 'normal': 26532, 'humid': 19413, 'level': 20947, 'restor': 28992, 'comissaria': 15146, 'smith': 30304, 'dri': 16620, 'period': 27517, 'late': 20788, 'year': 33295, 'arriv': 12980, 'end': 17015, 'februari': 17561, '22': 3960, '155221': 2203, 'bag': 13269, '60': 9158, 'kilo': 20497, 'make': 25019, 'cumul': 15750, 'total': 31769, 'season': 29697, '593': 9082, 'mln': 25769, '581': 8970, 'stage': 30618, 'deliv': 16065, 'earlier': 16756, 'consign': 15340, 'includ': 19661, 'figur': 17670, 'doubt': 16562, 'crop': 15670, 'harvest': 18944, 'practic': 27966, 'estim': 17247, '64': 9500, 'sale': 29443, 

In [81]:
query = "japan revis longterm energi demand ministri"
# Vectorización de la consulta utilizando el vectorizador BoW
query_vector_bow = vectorizer_bow.transform([query])

# Calcular la similitud entre la consulta y los documentos utilizando BoW
similarity_scores_bow = cosine_similarity(query_vector_bow, X_bow)

# Obtener los documentos ordenados por similitud
ranked_documents_bow = np.argsort(similarity_scores_bow)[0][::-1]

# Mostrar solo los primeros 10 resultados
for i, doc_index in enumerate(ranked_documents_bow[:10]):  # Solo los primeros 10 resultados
    print(f"Resultado {i+1}: Documento {doc_index}, Similitud: {similarity_scores_bow[0][doc_index]}")
    print(preprocessed_documents_list[doc_index][:200])  # Mostrar los primeros 200 caracteres del documento
    print()


Resultado 1: Documento 4382, Similitud: 0.36140316116210053
('4717.txt', 'bank japan call longterm effort trade shortterm effect foreign exchang rate movement correct intern trade imbal wane longterm effort requir cut japan chronic depend extern demand bank japan month report japan trade surplus nomin term remain high futur central bank fundament adjust long japan hope benefit intern alloc resourc maintain free trade system ad')

Resultado 2: Documento 4343, Similitud: 0.36140316116210053
('4654.txt', 'bank japan call longterm effort trade shortterm effect foreign exchang rate movement correct intern trade imbal wane longterm effort requir cut japan chronic depend extern demand bank japan month report japan trade surplus nomin term remain high futur central bank fundament adjust long japan hope benefit intern alloc resourc maintain free trade system ad')

Resultado 3: Documento 7064, Similitud: 0.3380617018914066
('8911.txt', 'fed heller stronger japan demand american good fed heller s

# Indice Invertido

In [82]:
#folder_path = r'C:\Users\kevin\OneDrive\Documentos\GitHub\ProyectoRI\data\cats.txt'
#folder_path = r'D:\U\7. Septimo\RI\ProyectoRI\data\cats.txt'
folder_path = r'C:\Users\usuario\Fer-Pc\Escritorio\EPN\2024-A\SEPTIMO_SEMESTRE\RECUPERACION_DE_INFORMACION\repoMantillaRI\ProyectoRI\catslimpios.txt'

In [83]:
lines = []
with open(folder_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [84]:
# Construir el índice invertido
def build_inverted_index(preprocessed_contents):
    inverted_index = {}
    for doc_id, content in enumerate(preprocessed_contents):
        for word in content.split():
            if word not in inverted_index:
                inverted_index[word] = set()  # Usamos un conjunto para evitar duplicados
            inverted_index[word].add(doc_id)
    return inverted_index

# Crear el índice invertido
inverted_index = build_inverted_index(preprocessed_contents)

In [85]:
len(inverted_index)

33431

In [86]:
index_df = pd.DataFrame.from_dict(inverted_index, orient='index')
print(index_df)

         0       1       2       3       4       5       6       7       8     \
bahia       0  2047.0   941.0  1240.0     NaN     NaN     NaN     NaN     NaN   
cocoa       0  3457.0  4226.0  2947.0  5505.0   389.0   262.0  7432.0     9.0   
review      0  4098.0  7694.0  4625.0    25.0  5658.0  3099.0  7706.0  3108.0   
shower      0  5288.0  5292.0  5271.0   248.0  6937.0   991.0     NaN     NaN   
continu     0     1.0  2049.0  6146.0  4109.0  2064.0  6160.0    19.0  2073.0   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
534099   7767     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
811836   7767     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5167573  7767     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
1251337  7767     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
1916000  7768     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

           9     ...  3424 

# Motor de busqueda

In [87]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [88]:
# Función para realizar una consulta de búsqueda utilizando BoW y el índice invertido
def search_bow(query, vectorizer_bow, X_bow, inverted_index):
    query_words = query.split()
    relevant_docs_set = set()
    for word in query_words:
        if word in inverted_index:
            relevant_docs_set.update(inverted_index[word])
    
    relevant_docs = list(relevant_docs_set)
    if not relevant_docs:
        return [], []

    X_bow_relevant = X_bow[relevant_docs]
    query_vector = vectorizer_bow.transform([query]).toarray()
    similarities = cosine_similarity(query_vector, X_bow_relevant.toarray()).flatten()
    sorted_docs = [relevant_docs[i] for i in np.argsort(similarities)[::-1]]
    sorted_similarities = np.sort(similarities)[::-1]
    return sorted_docs, sorted_similarities

# Función para realizar una consulta de búsqueda utilizando TF-IDF y el índice invertido
def search_tfidf(query, vectorizer_tfidf, X_tfidf, inverted_index):
    query_words = query.split()
    relevant_docs_set = set()
    for word in query_words:
        if word in inverted_index:
            relevant_docs_set.update(inverted_index[word])
    
    relevant_docs = list(relevant_docs_set)
    if not relevant_docs:
        return [], []

    X_tfidf_relevant = X_tfidf[relevant_docs]
    query_vector = vectorizer_tfidf.transform([query]).toarray()
    similarities = cosine_similarity(query_vector, X_tfidf_relevant.toarray()).flatten()
    sorted_docs = [relevant_docs[i] for i in np.argsort(similarities)[::-1]]
    sorted_similarities = np.sort(similarities)[::-1]
    return sorted_docs, sorted_similarities

# Solicitar la consulta del usuario
query2 = "the Bahia cocoa zone"

# Realizar la búsqueda utilizando BoW
relevant_docs_bow, similarities_bow = search_bow(query2, vectorizer_bow, X_bow, inverted_index)

# Realizar la búsqueda utilizando TF-IDF
relevant_docs_tfidf, similarities_tfidf = search_tfidf(query2, vectorizer_tfidf, X_tfidf, inverted_index)

# Mostrar resultados
print("\nDocumentos relevantes utilizando BoW:")
num_docs_to_display = min(5, len(relevant_docs_bow))  # Aseguramos que no intentamos acceder a más elementos de los que existen
for i in range(num_docs_to_display):
    doc_id = relevant_docs_bow[i]
    similarity = similarities_bow[i]
    print(f"Documento {doc_id}, Similaridad: {similarity}")

print("\nDocumentos relevantes utilizando TF-IDF:")
num_docs_to_display = min(5, len(relevant_docs_tfidf))  # Aseguramos que no intentamos acceder a más elementos de los que existen
for i in range(num_docs_to_display):
    doc_id = relevant_docs_tfidf[i]
    similarity = similarities_tfidf[i]
    print(f"Documento {doc_id}, Similaridad: {similarity}")


Documentos relevantes utilizando BoW:
Documento 318, Similaridad: 0.332564397284216
Documento 7736, Similaridad: 0.2956885083818292
Documento 319, Similaridad: 0.29095718698132317
Documento 4708, Similaridad: 0.2848746885840189
Documento 366, Similaridad: 0.28171808490950556

Documentos relevantes utilizando TF-IDF:
Documento 0, Similaridad: 0.34161018227459233
Documento 318, Similaridad: 0.3356795342654957
Documento 319, Similaridad: 0.29336797798883985
Documento 4708, Similaridad: 0.29314038752242794
Documento 366, Similaridad: 0.2908424377346394


In [89]:
print(X_bow[318])

  (0, 15045)	12
  (0, 15389)	1
  (0, 32884)	3
  (0, 16753)	1
  (0, 20147)	1
  (0, 20947)	1
  (0, 31769)	1
  (0, 18856)	1
  (0, 28097)	4
  (0, 2022)	1
  (0, 26765)	1
  (0, 26184)	1
  (0, 21035)	2
  (0, 31737)	1
  (0, 29783)	2
  (0, 17364)	3
  (0, 31201)	1
  (0, 30786)	5
  (0, 28319)	1
  (0, 14242)	3
  (0, 6982)	1
  (0, 27414)	2
  (0, 19996)	1
  (0, 25152)	4
  (0, 15760)	1
  :	:
  (0, 26996)	1
  (0, 15540)	5
  (0, 31809)	1
  (0, 17951)	1
  (0, 15888)	1
  (0, 14152)	4
  (0, 19497)	3
  (0, 16051)	4
  (0, 29341)	5
  (0, 26453)	1
  (0, 16256)	1
  (0, 18373)	1
  (0, 17751)	1
  (0, 17370)	1
  (0, 16210)	1
  (0, 17889)	3
  (0, 19959)	1
  (0, 25265)	1
  (0, 2522)	1
  (0, 30416)	1
  (0, 20296)	1
  (0, 27738)	1
  (0, 25035)	1
  (0, 25034)	1
  (0, 15275)	1


In [90]:
print(X_tfidf[318])

  (0, 15275)	0.05824489324354672
  (0, 25034)	0.0586315559027626
  (0, 25035)	0.06134655539953796
  (0, 27738)	0.08562868633806957
  (0, 20296)	0.08284815928543868
  (0, 30416)	0.08954761911126703
  (0, 2522)	0.08562868633806957
  (0, 25265)	0.08954761911126703
  (0, 19959)	0.05223259522758915
  (0, 17889)	0.13411461643438288
  (0, 16210)	0.06553030686058452
  (0, 17370)	0.04701456989689059
  (0, 17751)	0.04989443263857358
  (0, 18373)	0.06831083391321541
  (0, 16256)	0.059894036208660364
  (0, 26453)	0.06729249529527866
  (0, 29341)	0.22036169306493364
  (0, 16051)	0.20256591246646244
  (0, 19497)	0.187357089566426
  (0, 14152)	0.23076976525675832
  (0, 15888)	0.047194115817793626
  (0, 17951)	0.056674102696252844
  (0, 31809)	0.04343762850655068
  (0, 15540)	0.23332592238961875
  (0, 26996)	0.047753677182901515
  :	:
  (0, 15760)	0.032970581422847575
  (0, 25152)	0.10681438823182594
  (0, 19996)	0.046665184477923755
  (0, 27414)	0.04213701367229813
  (0, 6982)	0.042244770374536625
  