In [1]:
import xml.sax
import csv

class DataDocumentHandler(xml.sax.ContentHandler):
    """SAX content handler that streams selected ``<row>`` attributes into a CSV file.

    For every ``row`` element encountered, one CSV line is written containing
    the values of the requested attributes. Attributes that are absent on a
    given element are written as empty fields. All other elements are ignored.

    Args:
        output_file_name: Path of the CSV file to create (truncated if it exists).
        attributes_to_include: Names of the XML attributes to export. When a
            ``set``/``frozenset`` is given, the columns are emitted in sorted
            order so the file layout is the same on every run; any other
            iterable keeps the order in which it yields the names.

    Attributes:
        csvfile: The open output file object (closed by ``endDocument``/``close``).
        csvwriter: ``csv.writer`` bound to ``csvfile``.
        headers_written: Whether the header line has been emitted yet.
        rows_processed: Number of ``row`` elements seen so far.
        attributes_to_include: The object passed in by the caller, unchanged.
    """

    def __init__(self, output_file_name, attributes_to_include):
        super().__init__()
        self.attributes_to_include = attributes_to_include
        # Freeze the column order once. Iterating a set of strings directly
        # gives an order that depends on string hashing, which can differ
        # between interpreter runs; sorting makes the CSV layout reproducible.
        # This is done before opening the file so bad input cannot leak a handle.
        if isinstance(attributes_to_include, (set, frozenset)):
            self._columns = sorted(attributes_to_include)
        else:
            self._columns = list(attributes_to_include)
        self.headers_written = False
        self.rows_processed = 0
        self.csvfile = open(output_file_name, "w", newline='', encoding='utf-8')
        self.csvwriter = csv.writer(self.csvfile)

        print(f"Initialized handler and opened {output_file_name} for writing.")

    def _write_headers_once(self):
        """Emit the header line the first time it is needed; no-op afterwards."""
        if not self.headers_written:
            self.csvwriter.writerow(self._columns)
            self.headers_written = True
            print("CSV headers written.")

    def startDocument(self):
        """Announce the start of parsing."""
        print("Started processing XML document.")

    def startElement(self, name, attrs):
        """Write one CSV line for each ``row`` element; ignore everything else."""
        if name != 'row':
            return
        self.rows_processed += 1
        self._write_headers_once()
        # attrs.get() returns None for a missing attribute, which csv.writer
        # renders as an empty field.
        self.csvwriter.writerow([attrs.get(column) for column in self._columns])

    def endDocument(self):
        """Finalize the CSV: guarantee a header line exists, then close the file."""
        # A document without any <row> would otherwise leave a zero-byte CSV
        # that has no header line at all.
        self._write_headers_once()
        self.close()
        print(f"Finished processing and closed the file. Total rows processed: {self.rows_processed}")

    def close(self):
        """Close the output file. Safe to call more than once.

        Callers can use this to release the file handle when parsing aborts
        before ``endDocument`` is reached.
        """
        if not self.csvfile.closed:
            self.csvfile.close()


def prepare_data(xml_file: str, csv_file: str, attributes_to_include: set[str]):
    """Stream an XML file through a SAX parser and export ``<row>`` attributes to CSV.

    Args:
        xml_file: Path of the XML file to read.
        csv_file: Path of the CSV file to create.
        attributes_to_include: Names of the attributes to copy from each
            ``row`` element into the CSV.

    Raises:
        Whatever ``parser.parse`` raises (for example ``xml.sax.SAXParseException``
        on malformed XML or ``OSError`` when the input cannot be opened); the
        output file is closed before the exception propagates.
    """
    parser = xml.sax.make_parser()
    # Plain element/attribute names are enough here; namespace processing is off.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)

    handler = DataDocumentHandler(csv_file, attributes_to_include)
    parser.setContentHandler(handler)

    print(f"Starting XML parsing: {xml_file}")
    try:
        parser.parse(xml_file)
    finally:
        # The handler closes its file in endDocument(), which is only reached
        # on a clean parse. Closing here as well releases the handle when
        # parsing aborts; closing an already-closed file is a no-op.
        handler.csvfile.close()
    print("XML parsing completed successfully.")

In [2]:
# Export the selected attributes of every Posts.xml <row> into Posts.csv.
posts_xml_file = "Posts.xml"
posts_csv_file = "Posts.csv"
# A set literal carries no ordering: the order of the entries below does not
# determine the CSV column order, so downstream code should select columns
# by header name rather than by position.
posts_attributes_to_include = {
    'Id', # needed for analysis
    'PostTypeId', # needed for analysis
    'ParentId', # needed for analysis
    'CreationDate', # needed for analysis
    'DeletionDate', # needed to filter out deleted posts
    'Tags', # needed to explore tags
    'ClosedDate' # needed to filter out closed posts
}

prepare_data(posts_xml_file, posts_csv_file, posts_attributes_to_include)

Initialized handler and opened Posts.csv for writing.
Starting XML parsing: Posts.xml
Started processing XML document.
CSV headers written.
Finished processing and closed the file. Total rows processed: 59749049
XML parsing completed successfully.


In [15]:
# Export the selected attributes of every Votes.xml <row> into Votes.csv.
votes_xml_file = "Votes.xml"
votes_csv_file = "Votes.csv"
# A set literal carries no ordering: the order of the entries below does not
# determine the CSV column order, so downstream code should select columns
# by header name rather than by position.
votes_attributes_to_include = {
    'PostId', # needed for analysis
    'VoteTypeId', # needed for analysis
    'CreationDate' # needed for analysis
}

prepare_data(votes_xml_file, votes_csv_file, votes_attributes_to_include)

Initialized handler and opened Votes.csv for writing.
Starting XML parsing: Votes.xml
Started processing XML document.
CSV headers written.
Finished processing and closed the file. Total rows processed: 238041583
XML parsing completed successfully.
