# Librerías

In [1]:
# Librerias para el procesamiento de lenguaje natural

from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk

# Librerias para el procesamiento de archivos xml
import xml.etree.ElementTree as ET

# Librerias de uso general
from collections import defaultdict
import os
import re

# Librerias para tipado de funciones
from typing import List

# Stop-word setup: fetch the NLTK "stopwords" corpus, then keep the English
# list as a set so membership checks during preprocessing are cheap.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}

# Stemmer setup: a single shared Snowball stemmer for English.
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to /home/nico/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Proceso de preprocesamiento

In [4]:
def preprocess(text: str) -> List[str]:
    """Convert raw text into a list of stemmed tokens without stop words.

    The text is lower-cased and split into runs of word characters
    (``\\w+``). Tokens present in the module-level ``stop_words`` set are
    discarded; every remaining token is reduced with the module-level
    ``stemmer``. Token order is preserved.
    """
    stems = []
    for word in re.findall(r"\w+", text.lower()):
        # The stop-word check is made on the lower-cased token, before
        # stemming.
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return stems

In [None]:
# Sanity check: run the preprocessing pipeline on a sample sentence and
# print the resulting token list.
print(preprocess("The cats are running quickly in the gardens"))

['cat', 'run', 'quick', 'garden']


# Ingesta de documentos

In [None]:
# Directory that holds the documents to process
path_docs = "../data/test"

def load_documents(path_docs: str) -> dict:
    """Load and preprocess every ``.naf`` document found in a directory.

    Each file whose name ends in ``.naf`` is parsed as XML. The document id
    is the ``publicId`` attribute of the ``nafHeader/public`` element. The
    text handed to ``preprocess`` is the ``title`` attribute of
    ``nafHeader/fileDesc`` followed by a space and the text of the ``raw``
    element. A missing ``fileDesc`` element, ``title`` attribute or ``raw``
    element, or an empty ``raw`` element, contributes an empty string.

    Args:
        path_docs: Directory to scan. Only its direct entries are examined.

    Returns:
        A dict mapping each document id to the token list returned by
        ``preprocess``. Files are visited in sorted name order, so the
        insertion order is reproducible. If two files share an id, the one
        visited last wins.

    Raises:
        AttributeError: If a file has no ``nafHeader/public`` element.
        KeyError: If that element has no ``publicId`` attribute.

    Errors raised by ``os.listdir`` or ``ET.parse`` (unreadable directory,
    malformed XML) are not caught here and propagate to the caller.
    """
    # Documents indexed by their id
    docs = {}
    # os.listdir gives no ordering guarantee; sort for reproducible results.
    for file_name in sorted(os.listdir(path_docs)):
        # Only the relevant files are inspected
        if not file_name.endswith(".naf"):
            continue
        # Build the element tree of the document
        root = ET.parse(os.path.join(path_docs, file_name)).getroot()
        # Id taken from the publicId attribute (mandatory)
        doc_id = root.find("nafHeader/public").attrib["publicId"]

        # The title is optional: tolerate a missing element as well as a
        # missing attribute.
        file_desc = root.find("nafHeader/fileDesc")
        title = file_desc.attrib.get("title", "") if file_desc is not None else ""

        # Element.text is None for an empty element such as <raw/>, which
        # would break the string concatenation below.
        raw_element = root.find("raw")
        content = (raw_element.text or "") if raw_element is not None else ""

        docs[doc_id] = preprocess(title + " " + content)

    return docs

In [15]:
# Load the documents under path_docs and print the resulting
# id -> tokens mapping.
print(load_documents(path_docs))

d001
William Beaumont and the Human Digestion
--
William Beaumont and the Human Digestion.

William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and seve