In [12]:
import os
from lxml import etree
import re

# Helper that strips quotes, dashes, bullets, angle brackets, ampersands and all non-ASCII characters from extracted text

In [13]:

def clean_special_characters(content):
    """Strip quotation marks, dashes, markup characters and non-ASCII text.

    Every pattern below is applied in order and each match is deleted
    (replaced with the empty string, not with a space), so the text on either
    side of a removed character ends up adjacent.

    Args:
        content: The string to clean.

    Returns:
        The string with all matched characters removed.
    """
    removal_patterns = (
        r'“|”|‘|’|〝|〞|„|”',   # smart quotes and other typographic quotation marks
        r'"',                  # straight double quotes
        r"'",                  # straight single quotes
        r'–|—|−|•',            # en dash, em dash, minus sign, bullet
        r'<',                  # opening angle bracket
        r'>',                  # closing angle bracket
        r'&',                  # ampersand
        r'[^\x00-\x7F]+',      # any remaining run of non-ASCII characters
    )

    for pattern in removal_patterns:
        content = re.sub(pattern, '', content)

    return content
    

In [14]:
def extract_text_from_tei(filepath):
    """Return the cleaned paragraph text of a single TEI XML file.

    The file is parsed with lxml. The text nodes that are direct children of
    each ``<p>`` element below ``<text>`` (TEI namespace) are collected, each
    one is passed through ``clean_special_characters``, and the results are
    joined with single spaces.

    Args:
        filepath: Path of the TEI file to read.

    Returns:
        The joined text as a ``str`` (empty if no paragraph text matched), or
        ``None`` if reading or parsing raised an exception. In that case the
        error is printed rather than propagated.
    """
    try:
        # Read raw bytes instead of decoded text: lxml raises ValueError when
        # given a str that carries an XML encoding declaration
        # (<?xml version="1.0" encoding="UTF-8"?>). With bytes the parser
        # honours the declared encoding itself.
        with open(filepath, "rb") as file:
            content = file.read()

        # Parse the XML content; fromstring returns the root element
        tree = etree.fromstring(content)
        namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

        # Restricting the search to <text> leaves out the TEI header.
        # text() selects only text nodes that are direct children of <p>;
        # text wrapped in nested inline elements is not matched.
        paragraphs = tree.xpath("//tei:text//tei:p/text()", namespaces=namespaces)

        # Clean each extracted text node
        cleaned_paragraphs = [clean_special_characters(paragraph) for paragraph in paragraphs]

        return ' '.join(cleaned_paragraphs)  # Join the pieces to form the document text
    except Exception as e:
        # Best-effort: report the failing file and let the caller carry on with the rest
        print(f"Error processing {filepath}: {e}")
        return None  # Return None if there's an error

def convert_tei_to_txt(input_directory, output_directory):
    """Convert every ``.tei`` file in a directory into a plain-text file.

    Each entry of ``input_directory`` whose name ends with ``.tei`` is passed
    to ``extract_text_from_tei``. When that returns non-empty text, it is
    written as UTF-8 to ``output_directory`` under the same base name with a
    ``.txt`` extension. Otherwise a warning is printed and no file is written.

    Args:
        input_directory: Directory containing the ``.tei`` files.
        output_directory: Directory that receives the ``.txt`` files; it is
            created if it does not exist yet.
    """
    tei_suffix = ".tei"

    # Create the destination up front so open() below cannot fail on a missing folder
    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(input_directory):
        if not filename.endswith(tei_suffix):
            continue

        filepath = os.path.join(input_directory, filename)
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".tei" that appears earlier in the file name.
        output_name = filename[:-len(tei_suffix)] + ".txt"
        output_filepath = os.path.join(output_directory, output_name)

        # Try to extract text from the TEI file (None or "" means nothing usable)
        text = extract_text_from_tei(filepath)

        if text:
            # Write the cleaned text to a txt file
            with open(output_filepath, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text)
        else:
            print(f"Warning: No text extracted from {filename}")

In [11]:
# Source folder holding the .tei files and destination folder for the generated .txt files.
# NOTE(review): both are absolute paths under one user's home directory; they must be
# adjusted before this cell can run on another machine.
input_directory = "/Users/tildeidunsloth/Desktop/chicago_corpus/texts_renamed"
output_directory = "/Users/tildeidunsloth/Desktop/chicago_corpus/texts_renamed_txt_try"
# Run the conversion over every .tei file found in input_directory.
convert_tei_to_txt(input_directory, output_directory)

KeyboardInterrupt: 