In [1]:
def get_mapping():
    """Build the text-replacement table used to clean the XML sample.

    Every ``<!ENTITY name "value">`` declaration in the embedded DTD excerpt
    becomes an entry ``"&name;" -> "value"`` (for example
    ``"&Agrave;" -> "&#192;"``), so named entities can be rewritten as
    numeric character references.  One extra entry, ``"key=" -> "id="``,
    is added at the end.

    Returns:
        dict[str, str]: text to search for, mapped to its replacement.
    """
    import re

    dtd = """<!ENTITY Agrave  "&#192;" ><!-- capital A, grave accent -->
        <!ENTITY micro "&#181;">
        <!ENTITY Aacute  "&#193;" ><!-- capital A, acute accent -->
        <!ENTITY Acirc   "&#194;" ><!-- capital A, circumflex accent -->
        <!ENTITY Atilde  "&#195;" ><!-- capital A, tilde -->
        <!ENTITY Auml    "&#196;" ><!-- capital A, dieresis or umlaut mark -->
        <!ENTITY Aring   "&#197;" ><!-- capital A, ring -->
        <!ENTITY AElig   "&#198;" ><!-- capital AE diphthong (ligature) -->
        <!ENTITY Ccedil  "&#199;" ><!-- capital C, cedilla -->
        <!ENTITY Egrave  "&#200;" ><!-- capital E, grave accent -->
        <!ENTITY Eacute  "&#201;" ><!-- capital E, acute accent -->
        <!ENTITY Ecirc   "&#202;" ><!-- capital E, circumflex accent -->
        <!ENTITY Euml    "&#203;" ><!-- capital E, dieresis or umlaut mark -->
        <!ENTITY Igrave  "&#204;" ><!-- capital I, grave accent -->
        <!ENTITY Iacute  "&#205;" ><!-- capital I, acute accent -->
        <!ENTITY Icirc   "&#206;" ><!-- capital I, circumflex accent -->
        <!ENTITY Iuml    "&#207;" ><!-- capital I, dieresis or umlaut mark -->
        <!ENTITY ETH     "&#208;" ><!-- capital Eth, Icelandic -->
        <!ENTITY Ntilde  "&#209;" ><!-- capital N, tilde -->
        <!ENTITY Ograve  "&#210;" ><!-- capital O, grave accent -->
        <!ENTITY Oacute  "&#211;" ><!-- capital O, acute accent -->
        <!ENTITY Ocirc   "&#212;" ><!-- capital O, circumflex accent -->
        <!ENTITY Otilde  "&#213;" ><!-- capital O, tilde -->
        <!ENTITY Ouml    "&#214;" ><!-- capital O, dieresis or umlaut mark -->
        <!ENTITY Oslash  "&#216;" ><!-- capital O, slash -->
        <!ENTITY Ugrave  "&#217;" ><!-- capital U, grave accent -->
        <!ENTITY Uacute  "&#218;" ><!-- capital U, acute accent -->
        <!ENTITY Ucirc   "&#219;" ><!-- capital U, circumflex accent -->
        <!ENTITY Uuml    "&#220;" ><!-- capital U, dieresis or umlaut mark -->
        <!ENTITY Yacute  "&#221;" ><!-- capital Y, acute accent -->
        <!ENTITY THORN   "&#222;" ><!-- capital THORN, Icelandic -->
        <!ENTITY szlig   "&#223;" ><!-- small sharp s, German (sz ligature) -->
        <!ENTITY agrave  "&#224;" ><!-- small a, grave accent -->
        <!ENTITY aacute  "&#225;" ><!-- small a, acute accent -->
        <!ENTITY acirc   "&#226;" ><!-- small a, circumflex accent -->
        <!ENTITY atilde  "&#227;" ><!-- small a, tilde -->
        <!ENTITY auml    "&#228;" ><!-- small a, dieresis or umlaut mark -->
        <!ENTITY aring   "&#229;" ><!-- small a, ring -->
        <!ENTITY aelig   "&#230;" ><!-- small ae diphthong (ligature) -->
        <!ENTITY ccedil  "&#231;" ><!-- small c, cedilla -->
        <!ENTITY egrave  "&#232;" ><!-- small e, grave accent -->
        <!ENTITY eacute  "&#233;" ><!-- small e, acute accent -->
        <!ENTITY ecirc   "&#234;" ><!-- small e, circumflex accent -->
        <!ENTITY euml    "&#235;" ><!-- small e, dieresis or umlaut mark -->
        <!ENTITY igrave  "&#236;" ><!-- small i, grave accent -->
        <!ENTITY iacute  "&#237;" ><!-- small i, acute accent -->
        <!ENTITY icirc   "&#238;" ><!-- small i, circumflex accent -->
        <!ENTITY iuml    "&#239;" ><!-- small i, dieresis or umlaut mark -->
        <!ENTITY eth     "&#240;" ><!-- small eth, Icelandic -->
        <!ENTITY ntilde  "&#241;" ><!-- small n, tilde -->
        <!ENTITY ograve  "&#242;" ><!-- small o, grave accent -->
        <!ENTITY oacute  "&#243;" ><!-- small o, acute accent -->
        <!ENTITY ocirc   "&#244;" ><!-- small o, circumflex accent -->
        <!ENTITY otilde  "&#245;" ><!-- small o, tilde -->
        <!ENTITY ouml    "&#246;" ><!-- small o, dieresis or umlaut mark -->
        <!ENTITY oslash  "&#248;" ><!-- small o, slash -->
        <!ENTITY ugrave  "&#249;" ><!-- small u, grave accent -->
        <!ENTITY uacute  "&#250;" ><!-- small u, acute accent -->
        <!ENTITY ucirc   "&#251;" ><!-- small u, circumflex accent -->
        <!ENTITY uuml    "&#252;" ><!-- small u, dieresis or umlaut mark -->
        <!ENTITY yacute  "&#253;" ><!-- small y, acute accent -->
        <!ENTITY thorn   "&#254;" ><!-- small thorn, Icelandic -->
        <!ENTITY yuml    "&#255;" ><!-- small y, dieresis or umlaut mark -->
        <!ENTITY reg   "&#174;">
        <!ENTITY micro "&#181;">
        <!ENTITY times "&#215;">"""

    # Match the declaration itself rather than splitting on whitespace: some
    # entries close with `">` (no space before `>`), and slicing the split
    # token would leave a stray quote in the replacement value.
    declaration = re.compile(r'<!ENTITY\s+(\w+)\s+"([^"]*)"\s*>')
    mapping = {
        f"&{name};": value for name, value in declaration.findall(dtd)
    }
    mapping["key="] = "id="
    return mapping

In [2]:
def transformer(file_in, file_out, mapping):
    """Copy ``file_in`` to ``file_out``, applying every replacement in ``mapping``.

    The input is processed one line at a time.  For each line, every key of
    ``mapping`` is replaced by its value, in the dict's iteration order, and
    the resulting line is written to ``file_out``.

    Args:
        file_in: path of the text file to read.
        file_out: path of the text file to create or overwrite.
        mapping: dict of ``text to find -> replacement text``.
    """
    with open(file_in, "r") as source:
        with open(file_out, "w") as sink:
            for text in source:
                for old, new in mapping.items():
                    text = text.replace(old, new)
                sink.write(text)

In [3]:
import os
# Number of leading lines of the input that the script below reads first.
N_LINES = 30000
# Minimum number of publications wanted for every type listed in main_tags.
K = 10
# Element names that are treated as publication records.
main_tags = [
    "incollection",
    "article",
    "inproceedings",
    "proceedings",
    "mastersthesis",
    "phdthesis",
]
# Running count of publications per type; every counter starts at zero.
publication_counter = dict.fromkeys(
    (
        "incollection",
        "article",
        "inproceedings",
        "mastersthesis",
        "phdthesis",
        "proceedings",
    ),
    0,
)

class WWWIgnorer():
    """Write fed text to a file while leaving out ``<www ...>...</www>`` records.

    The object remembers, across calls to :meth:`feed`, whether it is
    currently inside a ``<www>`` record.  Text outside such records is
    written to ``file``; text from an opening ``<www`` up to and including
    the matching ``</www>`` is dropped, wherever on the line the tags occur.

    Attributes:
        write_down: ``True`` while outside a ``<www>`` record (text is
            written), ``False`` while inside one (text is dropped).
        file: object with a ``write(str)`` method that receives the output.
    """

    _OPEN = "<www"
    _CLOSE = "</www>"

    def __init__(self, file):
        self.write_down = True
        self.file = file

    def feed(self, line, debug=False):
        """Process one chunk of input, writing only the parts outside ``<www>``.

        Args:
            line: text to process; it may contain several opening and
                closing ``www`` tags at any position.
            debug: when true, print a message if nothing from ``line`` was
                written.
        """
        was_written = False
        pos = 0
        while pos < len(line):
            if self.write_down:
                # Emit everything up to the next "<www" (or to end of line).
                start = line.find(self._OPEN, pos)
                chunk = line[pos:] if start == -1 else line[pos:start]
                if chunk:
                    self.file.write(chunk)
                    was_written = True
                if start == -1:
                    break
                self.write_down = False
                pos = start
            else:
                # Skip up to and including the next "</www>"; if it is not on
                # this line, the record continues on a later line.
                end = line.find(self._CLOSE, pos)
                if end == -1:
                    break
                self.write_down = True
                pos = end + len(self._CLOSE)

        if was_written is False and debug is True:
            print(f"->{line} was not written!")
    
def check_closed_main_tag(line):
    """Return the first ``</tag>`` string from ``main_tags`` found in ``line``.

    Tags are tried in the order they appear in ``main_tags``.  Returns
    ``None`` when ``line`` contains no closing tag of a main publication type.
    """
    closers = (f"</{tag}>" for tag in main_tags)
    return next((closer for closer in closers if closer in line), None)

def check_publ_to_fill():
    """List the publication types that still have fewer than ``K`` entries.

    Reads the module-level ``publication_counter`` and returns the matching
    tags in ``main_tags`` order; the list is empty once every type has
    reached ``K``.
    """
    return [tag for tag in main_tags if publication_counter[tag] < K]

def get_opened_tag(line, tags):
    """Return the first tag in ``tags`` whose opening ``<tag`` occurs in ``line``.

    The check is a plain substring test on ``"<" + tag``.  Returns an empty
    string when none of the tags is opened on the line.
    """
    return next((name for name in tags if f"<{name}" in line), '')

def get_closed_tag(line, tags):
    """Return the first tag in ``tags`` whose closing ``</tag`` occurs in ``line``.

    The check is a plain substring test on ``"</" + tag``.  Returns an empty
    string when none of the tags is closed on the line.
    """
    return next((name for name in tags if f"</{name}" in line), '')

def _cut_after(line, marker):
    """Return ``line`` truncated right after the first occurrence of ``marker``.

    If ``marker`` does not occur, ``line`` is returned unchanged.
    """
    index = line.find(marker)
    return line if index == -1 else line[:index + len(marker)]


# Build "db-reduced.xml", a small sample of "dblp.xml", in three phases:
#   1. copy the first N_LINES lines (without DOCTYPE lines and <www> records);
#   2. keep copying until the publication that is open at that point is closed;
#   3. scan the rest of the input and copy further publications of every type
#      that still has fewer than K entries.
with open("dblp.xml","r") as fin, open("db-reduced.xml","w") as fout:
    tool = WWWIgnorer(fout)

    # Phase 1: fixed-size prefix; count each publication by its opening tag.
    for _ in range(N_LINES):
        line = fin.readline()
        if not line:
            break  # input is shorter than N_LINES
        if "DOCTYPE" not in line:
            tool.feed(line)
            tag = get_opened_tag(line, main_tags)
            if tag in main_tags:
                publication_counter[tag] += 1

    # Phase 2: finish the record that was being copied.  readline() returns
    # '' at end of file, so `line` must be tested to avoid looping forever.
    line = fin.readline()
    while line and check_closed_main_tag(line) is None:
        tool.feed(line)
        line = fin.readline()

    if line:
        # The closing tag may share its line with the next record's opening
        # tag; keep only the text up to and including the closing tag.
        closed_main_tag = check_closed_main_tag(line)
        final_line = _cut_after(line, closed_main_tag)
        fout.write(f"{final_line}")

    # Phase 3: top up the under-represented publication types.
    publications_to_fill = check_publ_to_fill()
    write_down = False  # True while a selected publication is being copied
    current_tag = ''

    while len(publications_to_fill) > 0:
        line = fin.readline()
        if not line:
            break  # end of input reached before every type got K entries
        opened_tag = get_opened_tag(line, publications_to_fill)
        closed_tag = get_closed_tag(line, publications_to_fill)
        if write_down:
            if closed_tag == current_tag:
                publication_counter[current_tag] += 1 # When we finish adding an element, we increment its tag's counter
                write_down = False
                # Drop whatever follows the closing tag on this line.
                modified_line = _cut_after(line, f"</{current_tag}>")
                tool.feed(modified_line)
            else:
                tool.feed(line)
        elif opened_tag in publications_to_fill:
            print(f"Found tag: {opened_tag}")
            current_tag = opened_tag
            write_down = True
            # Drop whatever precedes the opening tag on this line.
            modified_line = line[line.find(f"<{opened_tag}"):len(line)]
            tool.feed(modified_line)
        publications_to_fill = check_publ_to_fill()

    # Close the root element so the sample stays well-formed.
    fout.write("</dblp>")

# Rewrite named entities (and "key=" -> "id=") into the final file, then
# delete the intermediate one.
transformer("db-reduced.xml", "db-final.xml", get_mapping())
os.remove("db-reduced.xml")

print("preprocessing completed")

Found tag: proceedings
Found tag: proceedings
Found tag: proceedings
Found tag: proceedings
Found tag: proceedings
Found tag: proceedings
Found tag: proceedings
Found tag: proceedings
Found tag: inproceedings
Found tag: inproceedings
Found tag: inproceedings
Found tag: inproceedings
Found tag: inproceedings
Found tag: inproceedings
Found tag: inproceedings
Found tag: inproceedings
Found tag: phdthesis
Found tag: phdthesis
Found tag: phdthesis
Found tag: phdthesis
Found tag: phdthesis
Found tag: phdthesis
Found tag: phdthesis
Found tag: phdthesis
Found tag: phdthesis
Found tag: phdthesis
Found tag: mastersthesis
Found tag: mastersthesis
Found tag: mastersthesis
Found tag: mastersthesis
preprocessing completed
