# Extract text
Text is the key element of the XML data for any subsequent corpus analysis, so extracting it from the XML files is an important step. We first tested the extraction on a single XML file and then applied it to a whole folder in batch mode.

In [20]:
import glob
import xml.etree.ElementTree as ET

def extract_text_from_xml(xml_file):
    """Return the text of every TextEquiv/Unicode element in a PAGE XML file.

    Elements are matched in any version of the PAGE content namespace
    (2019-07-15, 2013-07-15, ...), not only 2019-07-15, so files written
    against a different schema version no longer yield an empty result.

    Args:
        xml_file: Path or file object, passed straight to ``ET.parse``.

    Returns:
        A list of strings in document order, one per non-empty ``Unicode``
        element, with leading and trailing whitespace removed. A
        whitespace-only element contributes an empty string.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Every PAGE content schema version shares this namespace stem; only the
    # trailing date differs between versions.
    page_ns_stem = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/'

    def is_page_element(element, local_name):
        # True when the tag is `local_name` inside some PAGE content namespace.
        tag = element.tag
        return (
            isinstance(tag, str)
            and tag.startswith(page_ns_stem)
            and tag.endswith('}' + local_name)
        )

    root = ET.parse(xml_file).getroot()

    # NOTE(review): TextEquiv elements are collected at every depth; if a file
    # stores the same text at several levels it will be repeated - confirm
    # that this is wanted.
    all_texts = []
    for element in root.iter():
        # Only descendants are searched, never the root element itself.
        if element is root or not is_page_element(element, 'TextEquiv'):
            continue
        for child in element:
            if is_page_element(child, 'Unicode') and child.text:
                all_texts.append(child.text.strip())

    return all_texts

def save_to_file(text_list, file_name):
    """Write each string in ``text_list`` to ``file_name``, one per line.

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced. Every entry is followed by a newline.
    """
    with open(file_name, "w", encoding="utf-8") as out:
        # The generator keeps the per-entry `text + "\n"` concatenation.
        out.writelines(entry + "\n" for entry in text_list)

# Single-file run: convert one XML file into a plain-text file.
# NOTE: both paths are absolute and machine-specific; adjust them before running.
# Path of the input XML file
xml_file_path = "/home/vivek/Desktop/output_page.xml"
# Path of the output text file
output_file_path = "/home/vivek/Desktop/fulltext.txt"

# Collect the text content of the input file as a list of strings
text_content = extract_text_from_xml(xml_file_path)
# Write the extracted strings to the output file, one per line
save_to_file(text_content, output_file_path)

In [29]:
import os
import glob
import xml.etree.ElementTree as ET

def extract_text_from_xml(xml_file):
    """Return the text of every TextEquiv/Unicode element in a PAGE XML file.

    Elements are matched in any version of the PAGE content namespace
    (2019-07-15, 2013-07-15, ...), not only 2019-07-15, so files written
    against a different schema version no longer yield an empty result.

    Args:
        xml_file: Path or file object, passed straight to ``ET.parse``.

    Returns:
        A list of strings in document order, one per non-empty ``Unicode``
        element, with leading and trailing whitespace removed. A
        whitespace-only element contributes an empty string.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Every PAGE content schema version shares this namespace stem; only the
    # trailing date differs between versions.
    page_ns_stem = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/'

    def is_page_element(element, local_name):
        # True when the tag is `local_name` inside some PAGE content namespace.
        tag = element.tag
        return (
            isinstance(tag, str)
            and tag.startswith(page_ns_stem)
            and tag.endswith('}' + local_name)
        )

    root = ET.parse(xml_file).getroot()

    # NOTE(review): TextEquiv elements are collected at every depth; if a file
    # stores the same text at several levels it will be repeated - confirm
    # that this is wanted.
    all_texts = []
    for element in root.iter():
        # Only descendants are searched, never the root element itself.
        if element is root or not is_page_element(element, 'TextEquiv'):
            continue
        for child in element:
            if is_page_element(child, 'Unicode') and child.text:
                all_texts.append(child.text.strip())

    return all_texts

def save_to_file(text_list, file_name):
    """Store the given strings in a UTF-8 text file, one string per line.

    ``file_name`` is opened in write mode, so any previous content is
    replaced. A newline is appended after every entry.
    """
    line_end = "\n"
    with open(file_name, "w", encoding="utf-8") as handle:
        emit = handle.write
        for entry in text_list:
            emit(entry + line_end)

def process_folder(folder_path):
    """Write a ``fulltext.txt`` into every OCR result subfolder.

    Looks in ``folder_path`` for directories whose names end in ``_ocr``, and
    in each of their immediate subdirectories for files ending in
    ``_page.xml``. The text of all such files in one subdirectory is combined,
    in sorted file-name order, and written once to ``fulltext.txt`` in that
    subdirectory.

    Previously ``fulltext.txt`` was rewritten for each XML file, so a
    subdirectory with several matching files kept only the text of whichever
    file glob happened to return last.

    Args:
        folder_path: Root directory that contains the ``*_ocr`` folders.

    Subdirectories without any ``*_page.xml`` file are left untouched.
    """
    # glob returns matches in arbitrary order; sorting makes the processing
    # order and the combined text reproducible.
    for ocr_folder in sorted(glob.glob(os.path.join(folder_path, '*_ocr'))):
        # The trailing separator in '*/' restricts the match to directories.
        for subfolder in sorted(glob.glob(os.path.join(ocr_folder, '*/'))):
            xml_files = sorted(glob.glob(os.path.join(subfolder, '*_page.xml')))
            if not xml_files:
                # Nothing to extract: do not create an empty fulltext.txt.
                continue

            text_content = []
            for xml_file in xml_files:
                text_content.extend(extract_text_from_xml(xml_file))

            txt_file_name = os.path.join(subfolder, 'fulltext.txt')
            save_to_file(text_content, txt_file_name)

# Batch run over a whole result directory.
# NOTE: the path is absolute and machine-specific; adjust it before running.
# Root directory to process
root_folder = "/home/vivek/Desktop/result"
# Process the folders found inside the root directory
process_folder(root_folder)