In [2]:
import nltk
import pandas as pd

In [7]:
# Descargar corpus de texto y modelos entrenados
nltk.download('punkt') # Modelo para tokenizar por oraciones basado en caracteres de puntuación
nltk.download('stopwords') # Corpus con palabras de parada para cada lenguaje
nltk.download('wordnet')
##nltk.download('wordnet') # Gran base de datos léxica en inglés con las relaciones entre palabras

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\juane\AppData\Roaming\nltk_data...


True

In [4]:
##################
## Paths para apuntar a la data
###################
path_docs = './data/docs-raw-texts' #path de los documentos
path_queries = './data/queries-raw-texts'

In [8]:
# Pasos de preprocesamiento: para los siguientes puntos,
#  debe preprocesar documentos y consultas mediante tokenización a nivel de palabra,
#  eliminación de palabras vacías, normalización y stemming

import os
from pathlib import Path
import re

class procesamientotexto:

    def __init__(self,path_dir):
        self.path = path_dir
        self.tokens_doc = {} 
        self.word_tok_nltk_es_sw = {}
        self.nltk_lemmaList = {}

    def tokenizacion(self):
        tokenizer = nltk.RegexpTokenizer(r'\w+')
        for doc in os.listdir(self.path):
            path = os.path.join(self.path,doc)
            content_archivo = open(path,encoding='utf8').read()
            texto = re.match('[\w\W]+<raw><!\[CDATA\[(?P<texto>(.|\n|\s|\s)+)\]\]></raw>',content_archivo).groupdict()['texto'].lower()
            # self.tokens_doc[doc] = nltk.word_tokenize(texto,preserve_line=True)
            self.tokens_doc[doc] = tokenizer.tokenize(texto)
            
        return self.tokens_doc
    
    def stopwords(self):
        nltk_stop_words_es = set(nltk.corpus.stopwords.words('english'))
        for name_doc,doc in self.tokens_doc.items():
            self.word_tok_nltk_es_sw[name_doc] = [token for token in doc if token not in nltk_stop_words_es ]
        return self.word_tok_nltk_es_sw
    
    def stemming(self): 
        wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
        index = 0
        for name_doc,doc in self.word_tok_nltk_es_sw.items():
            index += 1
            self.nltk_lemmaList[name_doc] = {
                'index': int(re.match('wes2015.(d|q)(?P<num>\d+).naf',name_doc).groupdict()['num']),
                'text': [ wordnet_lemmatizer.lemmatize(word) for word in doc ]
            }
        return self.nltk_lemmaList
    
    def dicterminos(self):
        dic = set([])
        for doc in self.nltk_lemmaList.values():
            dic = dic.union(set(doc['text'])) 
        return dic

    
text_process = procesamientotexto(path_docs)
doc_tokens = text_process.tokenizacion()
word_tok_nltk_es_sw = text_process.stopwords()
nltk_lemmaList = text_process.stemming()  # los textos lemmatizados
dicterminos = text_process.dicterminos()  #vocabulario

print(nltk_lemmaList['wes2015.d001.naf'])
print(dicterminos)


  texto = re.match('[\w\W]+<raw><!\[CDATA\[(?P<texto>(.|\n|\s|\s)+)\]\]></raw>',content_archivo).groupdict()['texto'].lower()
  'index': int(re.match('wes2015.(d|q)(?P<num>\d+).naf',name_doc).groupdict()['num']),


{'index': 1, 'text': ['william', 'beaumont', 'human', 'digestion', 'william', 'beaumont', 'physiology', 'digestion', 'image', 'source', 'november', '21', '1785', 'u', 'american', 'surgeon', 'william', 'beaumont', 'born', 'became', 'best', 'known', 'father', 'gastric', 'physiology', 'following', 'research', 'human', 'digestion', 'william', 'beaumont', 'born', 'lebanon', 'connecticut', 'became', 'physician', 'served', 'surgeon', 'mate', 'army', 'war', '1812', 'opened', 'private', 'practice', 'plattsburgh', 'new', 'york', 'rejoined', 'army', 'surgeon', '1819', 'beaumont', 'stationed', 'fort', 'mackinac', 'mackinac', 'island', 'michigan', 'early', '1820s', 'existed', 'protect', 'interest', 'american', 'fur', 'company', 'fort', 'became', 'refuge', 'wounded', '19', 'year', 'old', 'french', 'canadian', 'fur', 'trader', 'named', 'alexis', 'st', 'martin', 'shotgun', 'went', 'accident', 'american', 'fur', 'company', 'store', 'close', 'range', 'june', '6th', '1822', 'st', 'martin', 'wound', 'quit

In [9]:
# [10p] Cree su propia implementación del índice invertido usando 
# los 331 documentos en el conjunto de datos.

def indiceinvertido(doc_lemalist: dict,terminos:dict):
    indiceinvertido = {}
    for termino in terminos: 
         indiceinvertido[termino] = {'IDdocs':[],'len':0}
  
    for documento in doc_lemalist.values():   ## no me gusta el doble for
        set_texto = set(documento['text'])
        for termino in set_texto: 
            indiceinvertido[termino]['IDdocs'].append(documento['index'])
            indiceinvertido[termino]['len'] +=1 
    return indiceinvertido


list_indiceinvertido = indiceinvertido(nltk_lemmaList,dicterminos)
print(list_indiceinvertido)



In [10]:
# [10p] Cree una función que lea el índice invertido y calcule consultas booleanas mediante el algoritmo de mezcla.
# El algoritmo de mezcla debe ser capaz de calcular: AND, y NOT.

def And(indiceInvertido: dict,termino1: str,termino2: str):
    intersect_id = []
     
    indices_doc_t1 = indiceInvertido[termino1]['IDdocs']
    indices_doc_t2 = indiceInvertido[termino2]['IDdocs']
 
    i,j = 0,0 
    while  i<len(indices_doc_t1) and j<len(indices_doc_t2):
        if indices_doc_t1[i] == indices_doc_t2[j]:
            intersect_id.append(indices_doc_t1[i])
            i,j = i+1,j+1
        elif indices_doc_t1[i]>indices_doc_t2[j]: 
            j += 1
        else: 
            i += 1
    return intersect_id

# set1 = set(list_indiceinvertido['despite']['IDdocs'])
# set2 = set(list_indiceinvertido['plan']['IDdocs'])
# print(set1.intersection(set2))
print(And(list_indiceinvertido,'despite','plan'))

def Not(indiceInvertido: dict,termino:str = None, lstTerminos: str = None): 
    
    if termino is not None: 
        setTerminos = set(indiceInvertido[termino])
    else: 
        setTerminos = lstTerminos
    setUniversal = set(range(1,332))
    return list(setUniversal - setTerminos)


print(Not(list_indiceinvertido,'despite'))

[3, 26, 134, 277]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,

In [11]:
# [5p] Para cada una de las 35 consultas en el conjunto de datos, 
# recupere los documentos utilizando consultas binarias
#  AND (i.e. termino_1 AND termino_2 AND termino_3…). 
# Escriba un archivo (BSII-AND-queries_results) 
# con los resultados siguiendo el mismo formato que "relevance-judgments":
# q01 dXX,dYY,dZZ…
# Nota: pueden resultar archivos vacíos.
# path_queries

proc_querys = procesamientotexto(path_queries)
tokens_querys = proc_querys.tokenizacion()
tokens_querys_sw = proc_querys.stopwords()
terminos_querys = proc_querys.stemming()
print(terminos_querys)

def queries(list_indiceinvertido,terminos_querys):

    data = {'query':[],'results':[]}
    def And2(indiceinvertido:dict,query: list):
        
        IdDocs_list = []
        for termino in query:
            if termino in indiceinvertido: 
                IdDocs_list.append(indiceinvertido[termino]['IDdocs'])
            else: 
                return []
            
        and_set = set(IdDocs_list[0])
        for iddocs in IdDocs_list:
            and_set=and_set.intersection(iddocs)
        
        return and_set
    
    for docquery in terminos_querys.values():
        indice = docquery['index']
        texto = docquery['text']
        results = list(And2(list_indiceinvertido,texto))
        data['query'].append(f'q{indice:02d}')
        
        result = ','.join(
            list(map(lambda x: f'd{x:03d}',results))
            ) if len(results) != 0 else ''
        
        data['results'].append(result)

    df = pd.DataFrame(data)

    return df 


df_q = queries(list_indiceinvertido,terminos_querys)
df_q.head()
df_q.to_csv('BSII-AND-queries_results.tsv',sep='\t',header=False,index=False)


{'wes2015.q01.naf': {'index': 1, 'text': ['fabrication', 'music', 'instrument']}, 'wes2015.q02.naf': {'index': 2, 'text': ['famous', 'german', 'poetry']}, 'wes2015.q03.naf': {'index': 3, 'text': ['romanticism']}, 'wes2015.q04.naf': {'index': 4, 'text': ['university', 'edinburgh', 'research']}, 'wes2015.q06.naf': {'index': 6, 'text': ['bridge', 'construction']}, 'wes2015.q07.naf': {'index': 7, 'text': ['walk', 'fame', 'star']}, 'wes2015.q08.naf': {'index': 8, 'text': ['scientist', 'worked', 'atomic', 'bomb']}, 'wes2015.q09.naf': {'index': 9, 'text': ['invention', 'internet']}, 'wes2015.q10.naf': {'index': 10, 'text': ['early', 'telecommunication', 'method']}, 'wes2015.q12.naf': {'index': 12, 'text': ['explored', 'south', 'pole']}, 'wes2015.q13.naf': {'index': 13, 'text': ['famous', 'member', 'royal', 'navy']}, 'wes2015.q14.naf': {'index': 14, 'text': ['nobel', 'prize', 'winning', 'invention']}, 'wes2015.q16.naf': {'index': 16, 'text': ['south', 'america']}, 'wes2015.q17.naf': {'index': 

In [12]:
def And2(indiceinvertido:dict,query: list):
    len_terminos_q = len(query)
    IdDocs_list = []
    for termino in query: 
        IdDocs_list.append(indiceinvertido[termino]['IDdocs'])
    and_set = set(IdDocs_list[0])
    for iddocs in IdDocs_list:
        and_set=and_set.intersection(iddocs)
    
    return and_set

And2(list_indiceinvertido, ['famous', 'german', 'poetry'])

{291, 293}