In [None]:
import bz2
import xml.etree.ElementTree as ET

def extract_and_clean_wikipedia(xml_bz2_file, output_file):
    """
    Extract text content from a bz2-compressed Wikipedia XML dump.

    The dump is parsed incrementally. For every element whose local tag name
    (the tag with any ``{namespace}`` prefix removed) is exactly ``text``, the
    element's text is whitespace-normalized and written to ``output_file`` as
    a single line: every run of whitespace (spaces, tabs, ``\\n``, ``\\r``, ...)
    becomes one space and leading/trailing whitespace is dropped. Elements
    whose text is missing or whitespace-only produce no output line.

    Args:
        xml_bz2_file: Path to the bz2-compressed XML dump to read.
        output_file: Path of the UTF-8 text file to create (overwritten if it
            already exists).

    Raises:
        OSError: If either file cannot be opened.
        xml.etree.ElementTree.ParseError: If the XML is not well-formed.
    """
    with bz2.open(xml_bz2_file, 'rb') as bz2file, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        root = None
        depth = 0  # number of currently open elements
        for event, elem in ET.iterparse(bz2file, events=('start', 'end')):
            if event == 'start':
                if root is None:
                    # The first element opened is the document root.
                    root = elem
                depth += 1
                continue

            depth -= 1
            # Compare the namespace-free tag exactly, so that unrelated tags
            # which merely end in "text" (e.g. "context") are not extracted.
            if elem.tag.rpartition('}')[2] == 'text':
                # split() with no arguments splits on any whitespace run and
                # discards empty pieces, so joining with ' ' both removes
                # newlines and collapses extra whitespace in one pass.
                words = (elem.text or '').split()
                if words:
                    outfile.write(' '.join(words) + '\n')
                # Release the (potentially very large) text payload.
                elem.clear()

            if depth == 1:
                # A direct child of the root (a whole <page> in a Wikipedia
                # dump) just closed. Detach all finished children from the
                # root; otherwise the tree keeps growing for the entire dump.
                # Only children are removed, so the root's own text and
                # attributes stay intact.
                del root[:]
    print(f"Extracted and cleaned Wikipedia data is now in: {output_file}")

# Input: the bz2-compressed Wikipedia XML dump that was downloaded beforehand.
wikipedia_file = 'enwiki-latest-pages-articles-multistream.xml.bz2'
# Output: the plain-text file the extracted text is written to.
output_file = 'dump.txt'

# Run the extraction, naming each argument explicitly.
extract_and_clean_wikipedia(
    xml_bz2_file=wikipedia_file,
    output_file=output_file,
)