In [1]:
# based on tutorial at https://vegibit.com/python-xml-parsing/
import xml.sax


class XMLContentHandler(xml.sax.ContentHandler):
    """SAX handler that copies the text of one kind of element to a file.

    For every element named ``tag_of_interest`` the character data found
    inside it (including text of nested child elements) is written to the
    output file, followed by a single newline when the element closes.

    The output file is created/truncated when the handler is constructed
    and closed in ``endDocument``. If parsing aborts before the document
    ends, ``endDocument`` is never called, so the handler can also be
    closed explicitly with ``close()`` or used as a context manager::

        with XMLContentHandler("AUTHOR", "autores.txt") as handler:
            xml.sax.parse("cf79.xml", handler)

    Args:
        tag_of_interest: Name of the element whose text should be saved.
        output_file: Path of the text file to (over)write.
        encoding: Optional text encoding for the output file. ``None``
            (the default) keeps the platform default used by ``open``.
    """

    def __init__(self, tag_of_interest, output_file, encoding=None):
        super().__init__()
        self.tag_of_interest = tag_of_interest
        self.inside_tag_of_interest = False
        # Nesting depth of currently open elements named tag_of_interest;
        # a plain boolean would be reset too early by a nested occurrence.
        self._depth = 0

        # Mode "w" truncates an existing file, so a single open is enough.
        self.output_file = open(output_file, "w", encoding=encoding)

    def close(self):
        """Close the output file; safe to call more than once."""
        self.output_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow parsing errors

    # Handle startElement
    def startElement(self, current_tag, attributes):
        """Start capturing text when the tag of interest opens."""
        if current_tag == self.tag_of_interest:
            self._depth += 1
            self.inside_tag_of_interest = True

    # Handle endElement
    def endElement(self, current_tag):
        """Stop capturing and terminate the line when the outermost tag closes."""
        if current_tag == self.tag_of_interest and self._depth > 0:
            self._depth -= 1
            if self._depth == 0:
                self.inside_tag_of_interest = False
                # The parser may deliver an element's text through several
                # characters() calls, so the line break is written here
                # rather than after each chunk.
                self.output_file.write("\n")

    # Handle text data
    def characters(self, text):
        """Write a chunk of character data if it lies inside the tag of interest."""
        if self.inside_tag_of_interest:
            self.output_file.write(text)

    def startDocument(self):
        print("Reading started.")

    # Handle endDocument
    def endDocument(self):
        print("Reading finished.")
        print("Closing output file.")
        self.close()

In [2]:
# Stream-parse cf79.xml with SAX and save the text found inside every
# AUTHOR element to autores.txt (a newline is written after each element).
xml.sax.parse("cf79.xml", XMLContentHandler("AUTHOR", "autores.txt"))


Reading started.
Reading finished.
Closing output file.


In [3]:
import xml.dom.minidom

def _element_text(node):
    """Return the concatenated text/CDATA content of ``node`` and its descendants."""
    parts = []
    for child in node.childNodes:
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
            parts.append(child.data)
        elif child.nodeType == child.ELEMENT_NODE:
            parts.append(_element_text(child))
    return "".join(parts)


def retrieve_data_from_tag(input_file, output_file_name, tag_of_interest, encoding=None):
    """Write the text of every ``tag_of_interest`` element to a file, one per line.

    The input document is parsed into a DOM tree with ``xml.dom.minidom``.
    For each matching element, all text it contains is collected, runs of
    whitespace (including line breaks) are collapsed to single spaces, and
    the result is written as one line. Elements without any text produce
    an empty line instead of raising or writing the string "None".

    Args:
        input_file: Path (or file object) of the XML document to read.
        output_file_name: Path of the text file to (over)write.
        tag_of_interest: Name of the elements whose text is extracted.
        encoding: Optional text encoding for the output file. ``None``
            (the default) keeps the platform default used by ``open``.
    """
    # Parse first so that a malformed input does not clobber an existing output file.
    domtree = xml.dom.minidom.parse(input_file)
    tag_elements = domtree.getElementsByTagName(tag_of_interest)

    # Mode "w" resets the file; the with-block closes it even if a write fails.
    with open(output_file_name, "w", encoding=encoding) as output_file:
        for element in tag_elements:
            text_content = " ".join(_element_text(element).split())
            output_file.write(text_content + "\n")

    print("Data saved to file " + output_file_name)


In [4]:
# Load cf79.xml as a DOM tree and write the whitespace-normalized text of
# each TITLE element to titulo.txt, one line per element.
retrieve_data_from_tag("cf79.xml", "titulo.txt", "TITLE")

Data saved to file titulo.txt
