In [20]:
import xml.sax
import csv
import os

In [21]:
if os.getcwd().endswith('notebook'):
    os.chdir('../')
print(f'current working directory: {os.getcwd()}')

current working directory: /Users/howechen/Project/ntu_sd6103_team_project/ntu_sd6103_data_systems_team_project


In [22]:
class Find_K_Records(xml.sax.ContentHandler):
    def __init__(self, target_type, k):
        super().__init__()
        self.current_element = ""
        self.current_data = {}
        self.record_count = 0  # 用于计数已找到的记录
        self.found_target = False  # 标记是否找到目标记录
        self.target_type = target_type

    def startElement(self, tag, attributes):
        self.current_element = tag
        # 如果找到incollection类型的记录，则初始化记录数据
        if tag == self.target_type:
            self.current_data = {"type": tag, "id": attributes["key"], "date": attributes["mdate"]}
            self.found_target = True

    def endElement(self, tag):
        # 在incollection的记录结束时打印该记录的所有元素
        if tag == self.target_type and self.found_target:
            self.record_count += 1
            print(f"Record {self.record_count}: {self.current_data}")
            self.found_target = False  # 重置标志

            # 如果只需要查看一条记录，这里可以终止解析
            if self.record_count == k:
                raise xml.sax.SAXException("Found target, stop parsing")  # 通过抛出异常停止解析

        # 在找到incollection的记录时，将当前元素的内容添加到记录中
        if self.found_target and self.current_element:
            self.current_data[self.current_element] = self.content

    def characters(self, content):
        # 记录当前元素的内容
        self.content = content.strip()

# 初始化解析器和处理器
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
target_type = "mastersthesis"
k = 5
handler = Find_K_Records(target_type=target_type, k=k)
parser.setContentHandler(handler)

# 解析XML文件
xml_file = "../ntu_sd6103_team_project_data/dblp.xml"
try:
    parser.parse(xml_file)
except xml.sax.SAXException:
    print(f"Found and printed {k} records of '{target_type}'")

Record 1: {'type': 'mastersthesis', 'id': 'ms/Klaas2007', 'date': '2020-03-12', 'author': 'Vanessa C. Klaas', 'title': "Who's Who in the World Wide Web: Approaches to Name Disambiguation", 'year': '2007', 'school': 'nchen, Informatik', 'ee': 'http://www.pms.ifi.lmu.de/publikationen/diplomarbeiten/Vanessa.Klaas/thesis.pdf'}
Record 2: {'type': 'mastersthesis', 'id': 'ms/Ley2006', 'date': '2020-03-12', 'author': 'Rita Ley', 'title': 'ckhaltebecken -.', 'year': '2006', 'school': 't Trier, FB VI, Physische Geographie', 'ee': 'http://dblp.uni-trier.de/papers/DiplomarbeitRitaLey.pdf'}
Record 3: {'type': 'mastersthesis', 'id': 'ms/Yurek97', 'date': '2018-06-13', 'author': 'Tolga Yurek', 'title': 'Efficient View Maintenance at Data Warehouses.', 'year': '1997', 'school': 'University of California at Santa Barbara, Department of Computer Science, CA, USA'}
Record 4: {'type': 'mastersthesis', 'id': 'ms/Hoffmann2008', 'date': '2020-03-12', 'author': 'Oliver Hoffmann 0002', 'title': 'Regelbasierte 

## Classes

In [23]:
class incollectionParser(xml.sax.ContentHandler):
    def __init__(self, batch_size=5000):
        super().__init__()
        self.current_element = ""
        self.current_data = {}
        self.records = []
        self.authors = []
        self.batch_size = batch_size
        self.file_count = 1

    def startElement(self, tag, attributes):
        self.current_element = tag
        if tag == 'incollection':
            # Reset for each new publication record
            # print(self.current_data)
            # print(f"*****Processing {tag}...******")
            self.current_data = {
                "type": tag,
                "id": attributes["key"],
                "date": attributes["mdate"],
                "title": "",
                "pages": "",
                "year": "",
                "booktitle": "",
                "publisher": "",
                "ee": "",
                "url": ""
            }
            self.authors = []

    def endElement(self, tag):
        if tag in ["incollection"]:
            # Add authors as a comma-separated string
            self.current_data["authors"] = ", ".join(self.authors)
            self.records.append(self.current_data)
            if len(self.records) >= self.batch_size:
                self.save_to_csv()
                self.records = []  # Reset records
        elif tag == "author":
            # Append author to authors list
            self.authors.append(self.content)
            # print(f"author: {self.content}")
        elif tag in self.current_data:
            # Save the content to the current field
            self.current_data[tag] = self.content
            # print(f"{tag}: {self.content}")

    def characters(self, content):
        self.content = content.strip()

    def save_to_csv(self):
        if not os.path.exists('../ntu_sd6103_team_project_data'):
            os.mkdir('../ntu_sd6103_team_project_data')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files/incollection'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files/incollection')
        csv_file = f"../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_{self.file_count}.csv"
        with open(csv_file, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.records[0].keys())
            writer.writeheader()
            writer.writerows(self.records)
        print(f"Saving batch {self.file_count} to {csv_file}")
        self.file_count += 1

In [24]:
class phdthesisParser(xml.sax.ContentHandler):
    def __init__(self, batch_size=5000):
        super().__init__()
        self.current_element = ""
        self.current_data = {}
        self.records = []
        self.authors = []
        self.batch_size = batch_size
        self.file_count = 1

    def startElement(self, tag, attributes):
        self.current_element = tag
        if tag == 'phdthesis':
            # Reset for each new publication record
            # print(self.current_data)
            # print(f"*****Processing {tag}...******")
            self.current_data = {
                "type": tag,
                "id": attributes["key"],
                "date": attributes["mdate"],
                "title": "",
                "pages": "",
                "publisher": "",
                "year": "",
                "series": "",
                "volume": "",
                "school": "",
                "isbn": "",
                "ee": ""
            }
            self.authors = []

    def endElement(self, tag):
        if tag in ["phdthesis"]:
            # Add authors as a comma-separated string
            self.current_data["authors"] = ", ".join(self.authors)
            self.records.append(self.current_data)
            if len(self.records) >= self.batch_size:
                self.save_to_csv()
                self.records = []  # Reset records
        elif tag == "author":
            # Append author to authors list
            self.authors.append(self.content)
            # print(f"author: {self.content}")
        elif tag in self.current_data:
            # Save the content to the current field
            self.current_data[tag] = self.content
            # print(f"{tag}: {self.content}")

    def characters(self, content):
        self.content = content.strip()

    def save_to_csv(self):
        if not os.path.exists('../ntu_sd6103_team_project_data'):
            os.mkdir('../ntu_sd6103_team_project_data')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files/phdthesis'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files/phdthesis')
        csv_file = f"../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_{self.file_count}.csv"
        with open(csv_file, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.records[0].keys())
            writer.writeheader()
            writer.writerows(self.records)
        print(f"Saving batch {self.file_count} to {csv_file}")
        self.file_count += 1

In [25]:
class masterthesisParser(xml.sax.ContentHandler):
    def __init__(self, batch_size=5000):
        super().__init__()
        self.current_element = ""
        self.current_data = {}
        self.records = []
        self.authors = []
        self.batch_size = batch_size
        self.file_count = 1

    def startElement(self, tag, attributes):
        self.current_element = tag
        if tag == 'masterthesis':
            # Reset for each new publication record
            # print(self.current_data)
            print(f"*****Processing {tag}...******")
            self.current_data = {
                "type": tag,
                "id": attributes["key"],
                "date": attributes["mdate"],
                "title": "",
                "year": "",
                "school": "",
                "ee": ""
            }
            self.authors = []

    def endElement(self, tag):
        if tag in ["masterthesis"]:
            # Add authors as a comma-separated string
            self.current_data["authors"] = ", ".join(self.authors)
            self.records.append(self.current_data)
            if len(self.records) >= self.batch_size:
                self.save_to_csv()
                self.records = []  # Reset records
        elif tag == "author":
            # Append author to authors list
            self.authors.append(self.content)
            # print(f"author: {self.content}")
        elif tag in self.current_data:
            # Save the content to the current field
            self.current_data[tag] = self.content
            # print(f"{tag}: {self.content}")

    def characters(self, content):
        self.content = content.strip()

    def save_to_csv(self):
        if not os.path.exists('../ntu_sd6103_team_project_data'):
            os.mkdir('../ntu_sd6103_team_project_data')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files/masterthesis'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files/masterthesis')
        csv_file = f"../ntu_sd6103_team_project_data/csv_files/masterthesis/dplr_masterthesis_part_{self.file_count}.csv"
        with open(csv_file, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.records[0].keys())
            writer.writeheader()
            writer.writerows(self.records)
        print(f"Saving batch {self.file_count} to {csv_file}")
        self.file_count += 1

In [26]:
class wwwParser(xml.sax.ContentHandler):
    def __init__(self, batch_size=5000):
        super().__init__()
        self.current_element = ""
        self.current_data = {}
        self.records = []
        self.authors = []
        self.batch_size = batch_size  # batch size
        self.file_count = 1  # csv No.

    def startElement(self, tag, attributes):
        self.current_element = tag
        if tag == 'www':
            # Reset for each new publication record
            # print(self.current_data)
            # print(f"*****Processing {tag}...******")
            self.current_data = {
                "type": tag,
                "id": attributes["key"],
                "date": attributes["mdate"],
                "title": "",
                "pages": "",
                "note": "",
                "url": ""
            }
            self.authors = []

    def endElement(self, tag):
        if tag in ["www"]:
            # Add authors as a comma-separated string
            self.current_data["authors"] = ", ".join(self.authors)
            self.records.append(self.current_data)
            if len(self.records) >= self.batch_size:
                self.save_to_csv()
                self.records = []  # Reset records
        elif tag == "author":
            # Append author to authors list
            self.authors.append(self.content)
            # print(f"author: {self.content}")
        elif tag in self.current_data:
            # Save the content to the current field
            self.current_data[tag] = self.content
            # print(f"{tag}: {self.content}")

    def characters(self, content):
        self.content = content.strip()
    
    def save_to_csv(self):
        if not os.path.exists('../ntu_sd6103_team_project_data'):
            os.mkdir('../ntu_sd6103_team_project_data')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files/www'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files/www')
        csv_file = f"../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_{self.file_count}.csv"
        with open(csv_file, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.records[0].keys())
            writer.writeheader()
            writer.writerows(self.records)
        print(f"Saving batch {self.file_count} to {csv_file}")
        self.file_count += 1

In [27]:
class dataParser(xml.sax.ContentHandler):
    def __init__(self, batch_size=5000):
        super().__init__()
        self.current_element = ""
        self.current_data = {}
        self.records = []
        self.authors = []
        self.batch_size = batch_size  # batch size
        self.file_count = 1  # csv No.

    def startElement(self, tag, attributes):
        self.current_element = tag
        if tag == 'data':
            # Reset for each new publication record
            # print(self.current_data)
            # print(f"*****Processing {tag}...******")
            self.current_data = {
                "type": tag,
                "id": attributes["key"],
                "date": attributes["mdate"],
                "title": "",
                "pages": "",
                "publisher": "",
                "year": "",
                "month": "",
                "ee": "",
                "publisher": "",
                "stream": "",
                "rel": ""
            }
            self.authors = []

    def endElement(self, tag):
        if tag in ["data"]:
            # Add authors as a comma-separated string
            self.current_data["authors"] = ", ".join(self.authors)
            self.records.append(self.current_data)
            if len(self.records) >= self.batch_size:
                self.save_to_csv()
                self.records = []  # Reset records
        elif tag == "author":
            # Append author to authors list
            self.authors.append(self.content)
            # print(f"author: {self.content}")
        elif tag in self.current_data:
            # Save the content to the current field
            self.current_data[tag] = self.content
            # print(f"{tag}: {self.content}")

    def characters(self, content):
        self.content = content.strip()
    
    def save_to_csv(self):
        if not os.path.exists('../ntu_sd6103_team_project_data'):
            os.mkdir('../ntu_sd6103_team_project_data')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files')
        if not os.path.exists('../ntu_sd6103_team_project_data/csv_files/data'):
            os.mkdir('../ntu_sd6103_team_project_data/csv_files/data')
        csv_file = f"../ntu_sd6103_team_project_data/csv_files/data/dplr_data_part_{self.file_count}.csv"
        with open(csv_file, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.records[0].keys())
            writer.writeheader()
            writer.writerows(self.records)
        print(f"Saving batch {self.file_count} to {csv_file}")
        self.file_count += 1

## Running

In [28]:
# Initialize the parser and handler
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
handler = incollectionParser(batch_size=5000)
parser.setContentHandler(handler)

# Parse the XML file
xml_file = "../ntu_sd6103_team_project_data/dblp.xml"
parser.parse(xml_file)

if handler.records:
    print("Saving the last batch...")
    handler.save_to_csv()   # Save the last batch

Saving batch 1 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_1.csv
Saving batch 2 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_2.csv
Saving batch 3 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_3.csv
Saving batch 4 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_4.csv
Saving batch 5 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_5.csv
Saving batch 6 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_6.csv
Saving batch 7 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_7.csv
Saving batch 8 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_8.csv
Saving batch 9 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_incollection_part_9.csv
Saving batch 10 to ../ntu_sd6103_team_project_data/csv_files/incollection/dplr_inc

In [29]:
# Initialize the parser and handler
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
handler = phdthesisParser(batch_size=5000)
parser.setContentHandler(handler)

# Parse the XML file
xml_file = "../ntu_sd6103_team_project_data/dblp.xml"
parser.parse(xml_file)

if handler.records:
    print("Saving the last batch...")
    handler.save_to_csv()   # Save the last batch

Saving batch 1 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_1.csv
Saving batch 2 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_2.csv
Saving batch 3 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_3.csv
Saving batch 4 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_4.csv
Saving batch 5 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_5.csv
Saving batch 6 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_6.csv
Saving batch 7 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_7.csv
Saving batch 8 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_8.csv
Saving batch 9 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_9.csv
Saving batch 10 to ../ntu_sd6103_team_project_data/csv_files/phdthesis/dplr_phdthesis_part_10.csv
Saving batch 11 to ../ntu_sd6103_team_

In [30]:
# Initialize the parser and handler
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
handler = masterthesisParser(batch_size=5000)
parser.setContentHandler(handler)

# Parse the XML file
xml_file = "../ntu_sd6103_team_project_data/dblp.xml"
parser.parse(xml_file)

if handler.records:
    print("Saving the last batch...")
    handler.save_to_csv()   # Save the last batch

In [31]:
# Initialize the parser and handler
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
handler = wwwParser(batch_size=5000)
parser.setContentHandler(handler)

# Parse the XML file
xml_file = "../ntu_sd6103_team_project_data/dblp.xml"
parser.parse(xml_file)

if handler.records:
    print("Saving the last batch...")
    handler.save_to_csv()   # Save the last batch

Saving batch 1 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_1.csv
Saving batch 2 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_2.csv
Saving batch 3 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_3.csv
Saving batch 4 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_4.csv
Saving batch 5 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_5.csv
Saving batch 6 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_6.csv
Saving batch 7 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_7.csv
Saving batch 8 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_8.csv
Saving batch 9 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_9.csv
Saving batch 10 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_10.csv
Saving batch 11 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_www_part_11.csv
Saving batch 12 to ../ntu_sd6103_team_project_data/csv_files/www/dplr_ww

In [32]:
# Initialize the parser and handler
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
handler = dataParser(batch_size=5000)
parser.setContentHandler(handler)

# Parse the XML file
xml_file = "../ntu_sd6103_team_project_data/dblp.xml"
parser.parse(xml_file)

if handler.records:
    print("Saving the last batch...")
    handler.save_to_csv()   # Save the last batch

Saving batch 1 to ../ntu_sd6103_team_project_data/csv_files/data/dplr_data_part_1.csv
Saving the last batch...
Saving batch 2 to ../ntu_sd6103_team_project_data/csv_files/data/dplr_data_part_2.csv
