In [1]:
import pandas as pd
import sys
import xml.etree.ElementTree as ET

In [2]:
xml_file = 'test.xml'
test_galley = 'C:/Users/kayp/GitHub/ojs-tools-martijn/test/test.txt'

In [3]:
def main():
    if len(sys.argv) < 2:
        print("Usage: python script.py <xml_file>")
        sys.exit(1)  # Exit with an error code

    # Retrieve the XML file from command-line arguments
    #xml_file = sys.argv[1]
    
    # Open the XML file with the correct encoding
    with open(xml_file, 'rb') as file:
        # Parse the XML file
        tree = ET.parse(file)
        root = tree.getroot()
    
    articles = root.findall('.//{http://pkp.sfu.ca}article')
    
    rows = []
    row_id = 0
    
    for article in articles:
        print(row_id)
        processed = get_article_info(article, root, row_id)
        df = pd.DataFrame.from_dict(processed.to_row())
        rows.append(df)
        row_id += 1
        
    df = pd.concat(rows)
    
    df = df.fillna('')
    
    df['section_policy'] = df['section_policy'].replace('', 'no section policy').fillna('no section policy')
    
    df = df.rename(columns={"article_id": "id"})
    
    df['issue'] = df['issue'].astype(str)
        
    df.to_csv('output.csv', sep=';', index=False, encoding='utf-8')
    
    return df

In [23]:
def extract_base64(article_node):
    # Find all submission files in the article node
    submission_files = article_node.findall('{http://pkp.sfu.ca}submission_file')

    # Iterate through each submission file
    for submission in submission_files:
        # Check each file inside the submission file
        for file in submission.findall('{http://pkp.sfu.ca}file'):
            # Check if the genre is 'Manuscript'
            if submission.get('genre') == 'Manuscript':
                # Find the <embed> tag that contains the base64 content
                embed = file.find('{http://pkp.sfu.ca}embed')
                if embed is not None:
                    # Add the base64 content to the list
                    base64_contents = embed.text
                    
    return base64_contents

In [25]:
extract_base64(article_node)

'JVBERi0xLjYNJeLjz9MNCjU0IDAgb2JqDTw8L0xpbmVhcml6ZWQgMS9MIDE0MDQ0OC9PIDU2L0UgMTI2NjU4L04gMy9UIDE0MDEyMS9IIFsgNDk3IDIxN10+Pg1lbmRvYmoNICAgICAgICAgICAgICAgDQo3MiAwIG9iag08PC9EZWNvZGVQYXJtczw8L0NvbHVtbnMgNS9QcmVkaWN0b3IgMTI+Pi9GaWx0ZXIvRmxhdGVEZWNvZGUvSURbPDNDNEIzMEZCRkU3RDNBNDA4Mjg4Q0ZCNDU1ODM4QzgxPjxDMDg3RjVEMUI3NjdEMzQxODkxNkU4QTE5QkQxRDRCRj5dL0luZGV4WzU0IDM1XS9JbmZvIDUzIDAgUi9MZW5ndGggOTQvUHJldiAxNDAxMjIvUm9vdCA1NSAwIFIvU2l6ZSA4OS9UeXBlL1hSZWYvV1sxIDMgMV0+PnN0cmVhbQ0KaN5iYmRgEGBgYmBg2gUiGXPBJFiERR1EMm8FiySCSNbZYPEAsHpRsMgpsAg/mDQAkzlgcbAsszGQZHygDWI3xADJ/wVrGJiANh4FiQBNpTL5n4FxxyeAAAMAjacNOg0KZW5kc3RyZWFtDWVuZG9iag1zdGFydHhyZWYNCjANCiUlRU9GDQogICAgICAgDQo4OCAwIG9iag08PC9DIDEzMi9GaWx0ZXIvRmxhdGVEZWNvZGUvSSAxNTQvTGVuZ3RoIDEzMC9TIDc0Pj5zdHJlYW0NCmjeYmBgYAKiOAYWBgbOGQyCDAggyMAMFGVh4Hgg8OFn8zSgyPcGIMGodk/I7P4DBqaOBoYOBteOBqgkEhCAYgZGARDNLM64lCGQuZvZgZGVeRfTf2YF5hhmVibl+9L1PjGvwDrYGJgEvEAaGBhYXODmcDIwKWVARBk+AQQYAPg3GWcNCmVuZHN0cmVhbQ1lbmRvYmoNNTUgMCBvYmoNPDwvTWFya0luZm88PC9NYXJrZWQgdHJ1ZT4

In [4]:
class Author:
    def __init__(self, first_name, last_name):
        self.first_name = first_name
        self.last_name = last_name

In [5]:
class Article:
    def __init__(self, 
                 article_id, 
                 title, 
                 publication, 
                 abstract, 
                 base64_file, 
                 publication_date, 
                 year, 
                 issue, 
                 page_number, 
                 section_title,
                 section_policy,
                 section_reference,
                 doi,
                 authors, 
                 locale):
        
        self.article_id = article_id
        self.title = title
        self.publication = publication
        self.abstract = abstract
        self.base64_file = base64_file
        self.publication_date = publication_date
        self.year = year
        self.issue = issue
        self.page_number = page_number
        self.section_title = section_title
        self.section_policy = section_policy
        self.section_reference = section_reference
        self.doi = doi
        self.authors = authors
        self.locale = locale
    
    def export_authors(self):
        #generate a dict with authors and column titles
        amount_of_authors = len(self.authors)
        author_id = 0
        output = {}
        for a in self.authors:
            first_name_column = 'author_given_name_' + str(author_id)
            last_name_column = 'author_family_name_' + str(author_id)
            output[first_name_column] = [a.first_name]
            output[last_name_column] = [a.last_name]
            author_id += 1
        
        return output
    
    def to_row(self):
        #function that outputs the article as a single row for a df, as a list
        output = {'article_id': [self.article_id],
                 'title': [self.title],
                 'publication': [self.publication],
                'abstract': [self.abstract],
                'file': [self.base64_file],
                'publication_date': [self.publication_date],
                'year': [self.year],
                'issue': [self.issue],
                'page_number': [self.page_number],
                'section_title': [self.section_title],
                'section_policy': [self.section_policy],
                'section_reference': [self.section_reference],
                'doi': [self.doi]}
        
        authors = self.export_authors()
        
        output = output | authors
        
        return output

In [6]:
# Function to find the parent issue of a given article node
def find_parent_issue(article_node, root):
    for issue in root.findall('.//{http://pkp.sfu.ca}issue'):  # Iterate through all issues
        if article_node in issue.findall('.//{http://pkp.sfu.ca}article'):  # Check if the article is in this issue
            return issue
    return None

In [7]:
def get_article_info(article_node, root, article_id):
    
    base64_file = extract_base64(article_node)
    
    publications = article_node.findall('{http://pkp.sfu.ca}publication')
    publication = publications[0]
    
    locale = publication.attrib['locale']
    publication_date = publication.attrib['date_published']
    section_reference = publication.attrib['section_ref']
    
    for id_node in publication.findall('{http://pkp.sfu.ca}id'):
        if id_node.get('type') == 'doi':  # Check for the 'type' attribute
            doi = id_node.text
    
    for title_node in publication.findall('{http://pkp.sfu.ca}title'):
        if title_node.get('locale') == locale:
            title = title_node.text
    
    for abstract_node in publication.findall('{http://pkp.sfu.ca}abstract'):
        if abstract_node.get('locale') == locale:
            abstract = abstract_node.text
    
    try:
        page_number = publication.findall('{http://pkp.sfu.ca}pages')[0].text
    except IndexError:
        page_number = ''

        
    author_list = publication.findall('.//{http://pkp.sfu.ca}author')
    authors = []
    for a in author_list:
        first_name = a.find('{http://pkp.sfu.ca}givenname').text
        last_name = a.find('{http://pkp.sfu.ca}familyname').text
        authors.append(Author(first_name, last_name))
        
    parent_issue = find_parent_issue(article_node, root)
    issue_identification = parent_issue.find('{http://pkp.sfu.ca}issue_identification')
    
    issue = issue_identification.find('{http://pkp.sfu.ca}number').text
    
    try:
        year = issue_identification.find('{http://pkp.sfu.ca}year').text
    except AttributeError:
        year = publication_date[:4]
    
    for publication_node in issue_identification.findall('{http://pkp.sfu.ca}title'):
        if publication_node.get('locale') == locale:
            publication = publication_node.text
            
    section_information = parent_issue.find('{http://pkp.sfu.ca}sections')
    for section_node in section_information.findall('{http://pkp.sfu.ca}section'):
        if section_node.get('ref') == section_reference:
            for section_title_node in section_node.findall('{http://pkp.sfu.ca}title'):
                if section_title_node.get('locale') == locale:
                    section_title = section_title_node.text
            
            section_policy = ""
            for section_policy_node in section_node.findall('{http://pkp.sfu.ca}policy'):
                if section_policy_node.get('locale') == locale:
                    section_policy = section_policy_node.text
                    
                else:
                    section_policy = 'no section policy'
                    
    return Article(article_id, 
                 title, 
                 publication, 
                 abstract, 
                 base64_file, 
                 publication_date,
                 year, 
                 issue, 
                 page_number, 
                 section_title,
                 section_policy,
                 section_reference,
                 doi,
                 authors, 
                 locale)

In [8]:
if __name__ == "__main__":
    main()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [9]:
main()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


Unnamed: 0,id,title,publication,abstract,file,publication_date,year,issue,page_number,section_title,section_policy,section_reference,doi,author_given_name_0,author_family_name_0,author_given_name_1,author_family_name_1,author_given_name_2,author_family_name_2
0,0,<em>Amigo Secreto</em>,January-June,"<em>Amigo Secreto</em>, directed by Maria Augu...",C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-06-13,2024,117,Rev. 2,Film reviews | Críticas de cine,<p>If you are interested in writing a review o...,Film,10.32992/erlacs.11196,Débora,Póvoa,,,,
0,1,<em>Chatd&yuml;e Tsimane</em>,January-June,<em>Chatdÿe Tsimane </em>[<em>Pariente Chimán<...,C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-05-24,2024,117,Rev. 1,Film reviews | Críticas de cine,<p>If you are interested in writing a review o...,Film,10.32992/erlacs.11195,Ara,Goudsmit L.,,,,
0,2,<em>In the Shadows of Tungurahua; Disaster Pol...,January-June,<p><em>In the Shadows of Tungurahua; Disaster ...,C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-05-24,2024,117,Rev. 3,Book Reviews | Reseñas,<p>If you are interested in writing a book&nbs...,Book,10.32992/erlacs.11187,Ricardo,Fuentealba,,,,
0,3,<em>The Latin American crisis and the new auth...,January-June,<p><em>The Latin American crisis and the new a...,C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-05-24,2024,117,Rev. 2,Book Reviews | Reseñas,<p>If you are interested in writing a book&nbs...,Book,10.32992/erlacs.11186,Patrick,Clark,,,,
0,4,<em>Andean meltdown; A climate ethnography of ...,January-June,<p><em>Andean meltdown; A climate ethnography ...,C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-04-08,2024,117,Rev. 1,Book Reviews | Reseñas,<p>If you are interested in writing a book&nbs...,Book,10.32992/erlacs.11170,Armando,Guevara Gil,,,,
0,5,La justicia transicional desde abajo en Guatem...,January-June,<p>Abstract: Transitional justice from below i...,C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-06-25,2024,117,81-99,Articles | Artículos,<p>ERLACS welcomes articles on Latin America a...,ART,10.32992/erlacs.11128,Maira Ixchel,Benítez-Jiménez,Gabriela,Escobar Urrutia,,
0,6,Genealog&iacute;a h&iacute;brida de las activi...,January-June,<p>Abstract: The hybrid genealogy of young wo...,C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-05-24,2024,117,63-80,Articles | Artículos,<p>ERLACS welcomes articles on Latin America a...,ART,10.32992/erlacs.11078,Elisabeth,Jay Friedman,Ana Laura,Rodríguez Gustá,,
0,7,Iras y resistencias de larga duraci&oacute;n e...,January-June,<p>Abstract: Anger and long-lasting resistance...,C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-05-06,2024,117,43-62,Articles | Artículos,<p>ERLACS welcomes articles on Latin America a...,ART,10.32992/erlacs.11057,Erika Paola,Parrado Pardo,Jefferson,Jaramillo Marín,,
0,8,"Political elites, agency discretion and anti-c...",January-June,"<p>Between 2015 and 2018, Chile experienced se...",C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-02-27,2024,117,1-24,Articles | Artículos,<p>ERLACS welcomes articles on Latin America a...,ART,10.32992/erlacs.11035,Bettina,Schorr,Sebastián,Carrasco,Emilio,Moya
0,9,&ldquo;En remuneraci&oacute;n de sus servicios...,January-June,<p>&ldquo;In payment for its services&rdquo; i...,C:/Users/kayp/GitHub/ojs-tools-martijn/test/te...,2024-03-08,2024,117,,Articles | Artículos,<p>ERLACS welcomes articles on Latin America a...,ART,10.32992/erlacs.11027,Pedro,Valenzuela,,,,
