In [36]:
import xml.sax
import csv

class XMLToCSVHandler(xml.sax.ContentHandler):
    """SAX handler that exports every ``<incollection>`` record to chunked CSV files.

    Each direct child element of an ``<incollection>`` becomes a CSV column.
    A child that occurs more than once in a record (e.g. several authors) is
    stored as a list of its values. The record's ``key`` attribute is written
    to the extra ``parent_key`` column.

    Records are buffered in memory and flushed to
    ``"{csv_filename}_{chunk_index}.csv"`` every ``chunk_size`` records. Call
    :meth:`write_to_csv` once after parsing to flush the final partial chunk.

    Args:
        csv_filename: Prefix of the output file names.
        chunk_size: Number of records per output file.
    """

    RECORD_TAG = "incollection"
    KEY_COLUMN = "parent_key"

    def __init__(self, csv_filename, chunk_size=1000000):
        super().__init__()
        self.csv_filename = csv_filename
        # Taken as a parameter instead of being read from a module global, so
        # the class also works when imported from another module.
        self.chunk_size = chunk_size
        self.is_in_proceedings = False
        self.current_data = {}
        self.proceedings_data = []
        self.current_element = ""
        self.fieldnames = set()
        self.current_chunk = 0
        self.current_parent_key = ""
        # Nesting depth below the current record (1 == direct child field).
        self._depth = 0
        # Name of the direct child field currently being read, if any.
        self._field = None
        # Text fragments collected for the current field.
        self._text_parts = []

    def startElement(self, name, attrs):
        """Begin a new record or a new field inside the current record."""
        if name == self.RECORD_TAG:
            self.is_in_proceedings = True
            self.current_data = {}
            self.current_parent_key = attrs.get("key", "")
            self._depth = 0
            self._field = None
            self._text_parts = []
        elif self.is_in_proceedings:
            self._depth += 1
            if self._depth == 1:
                # Only direct children of the record are fields. Deeper
                # elements are inline markup (e.g. <i>, <sub>) whose text
                # belongs to the enclosing field.
                self._field = name
                self._text_parts = []
        self.current_element = name

    def characters(self, content):
        """Collect a fragment of text for the field currently being read.

        A parser may deliver one text node in several calls (for instance
        around entity references), so fragments are only buffered here and
        are joined when the field's end tag arrives.
        """
        if self.is_in_proceedings and self._field is not None:
            self._text_parts.append(content)

    def endElement(self, name):
        """Finish a field, or finish a record and flush a full chunk."""
        if not self.is_in_proceedings:
            return

        if name != self.RECORD_TAG:
            if self._depth == 1 and self._field is not None:
                self._store_field(self._field, "".join(self._text_parts).strip())
                self._field = None
                self._text_parts = []
            if self._depth > 0:
                self._depth -= 1
            return

        self.is_in_proceedings = False
        self.current_data[self.KEY_COLUMN] = self.current_parent_key
        self.proceedings_data.append(self.current_data)
        # Start from a fresh dict so the stored record is never aliased.
        self.current_data = {}
        if len(self.proceedings_data) >= self.chunk_size:
            self.write_to_csv()
            self.current_chunk += 1
            self.proceedings_data = []

    def _store_field(self, field, text):
        """Record ``text`` under ``field``, using a list for repeated fields."""
        if not text:
            return
        if field not in self.current_data:
            self.current_data[field] = text
        else:
            existing = self.current_data[field]
            if isinstance(existing, list):
                existing.append(text)
            else:
                self.current_data[field] = [existing, text]
        self.fieldnames.add(field)

    def write_to_csv(self):
        """Write the buffered records to the CSV file of the current chunk.

        Does nothing when no records are buffered, so no empty file is
        created. Columns are every field name seen so far in sorted order,
        followed by ``parent_key``; sorting keeps the column order identical
        across chunks and across runs. List values are written using their
        Python ``repr``.
        """
        if not self.proceedings_data:
            return
        columns = sorted(self.fieldnames - {self.KEY_COLUMN}) + [self.KEY_COLUMN]
        path = f"{self.csv_filename}_{self.current_chunk}.csv"
        with open(path, mode="w", newline="", encoding="utf-8") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=columns)
            writer.writeheader()
            writer.writerows(self.proceedings_data)

if __name__ == "__main__":
    # Script entry point: stream-parse the XML dump and export the
    # <incollection> records to chunked CSV files.
    xml_filename = "dblp.xml"  # Replace with your XML file name
    csv_filename = "incollection_list_output_"  # Replace with your desired CSV output file name
    chunk_size = 1000000  # Number of records buffered before a CSV chunk is written

    handler = XMLToCSVHandler(csv_filename)
    # Hand the chunk size to the handler explicitly rather than relying on it
    # looking up a module-level name.
    handler.chunk_size = chunk_size

    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    parser.parse(xml_filename)

    # Flush whatever records are still buffered after the last full chunk.
    handler.write_to_csv()

    print("XML data has been successfully converted to CSV.")


XML data has been successfully converted to CSV.
