In [172]:
import pandas as pd
import sys
import xml.etree.ElementTree as ET

In [177]:
# Path of the XML file to convert; main() opens this module-level name.
xml_file = 'test.xml'

In [213]:
def main():
    """Convert every article of the XML file into one row of ``output.csv``.

    Parses the file named by the module-level ``xml_file``, builds a
    single-row DataFrame per ``{http://pkp.sfu.ca}article`` element from
    ``get_article_info(...).to_row()``, concatenates the rows, replaces
    missing values with empty strings and writes the table as UTF-8 CSV
    without the index column.

    Exits with status 1 when ``sys.argv`` holds fewer than two entries.
    When the document contains no article elements a message is printed and
    no CSV file is written.
    """
    if len(sys.argv) < 2:
        print("Usage: python script.py <xml_file>")
        sys.exit(1)  # Exit with an error code

    # NOTE(review): the path is not taken from sys.argv[1]; the module-level
    # `xml_file` is used instead. Confirm whether the argv check above is
    # still wanted.

    # Opened in binary mode and handed to the parser as a file object.
    with open(xml_file, 'rb') as file:
        root = ET.parse(file).getroot()

    articles = root.findall('.//{http://pkp.sfu.ca}article')
    if not articles:
        # pd.concat() raises ValueError on an empty list, so stop early.
        print("No articles found in " + str(xml_file))
        return

    frames = []
    for row_id, article in enumerate(articles):
        print(row_id)
        processed = get_article_info(article, root, row_id)
        frames.append(pd.DataFrame.from_dict(processed.to_row()))

    # Articles with fewer authors lack some author columns; blank them out.
    table = pd.concat(frames).fillna('')
    table.to_csv('output.csv', index=False, encoding='utf-8')

In [2]:
class Author:
    """Holds one author's name as a first-name / last-name pair."""

    def __init__(self, first_name, last_name):
        # Store both name parts unchanged on the instance.
        self.first_name, self.last_name = first_name, last_name

In [161]:
class Article:
    """All values collected for one article, plus helpers to flatten them.

    Every constructor argument is stored unchanged under an attribute of the
    same name. ``authors`` is iterated by ``export_authors`` and each item
    must expose ``first_name`` and ``last_name`` attributes.
    """

    def __init__(self, 
                 article_id, 
                 title, 
                 publication, 
                 abstract, 
                 base64_file, 
                 publication_date, 
                 year, 
                 issue, 
                 page_number, 
                 section_title,
                 section_policy,
                 section_reference,
                 doi,
                 authors, 
                 locale):
        
        self.article_id = article_id
        self.title = title
        self.publication = publication
        self.abstract = abstract
        self.base64_file = base64_file
        self.publication_date = publication_date
        self.year = year
        self.issue = issue
        self.page_number = page_number
        self.section_title = section_title
        self.section_policy = section_policy
        self.section_reference = section_reference
        self.doi = doi
        self.authors = authors
        self.locale = locale
    
    def export_authors(self):
        """Return the author name columns as a dict of single-element lists.

        The author at position ``i`` (0-based) contributes the keys
        ``author_given_name_<i>`` and ``author_family_name_<i>``. An empty
        author collection yields an empty dict.
        """
        output = {}
        for author_id, author in enumerate(self.authors):
            output['author_given_name_' + str(author_id)] = [author.first_name]
            output['author_family_name_' + str(author_id)] = [author.last_name]
        return output
    
    def to_row(self):
        """Return the article as a dict mapping column name to a one-item list.

        The fixed columns come first, followed by the per-author columns from
        ``export_authors``. ``locale`` is not part of the row.
        """
        output = {'article_id': [self.article_id],
                  'title': [self.title],
                  'publication': [self.publication],
                  'abstract': [self.abstract],
                  'file': [self.base64_file],
                  'publication_date': [self.publication_date],
                  'year': [self.year],
                  'issue': [self.issue],
                  'page_number': [self.page_number],
                  'section_title': [self.section_title],
                  'section_policy': [self.section_policy],
                  'section_reference': [self.section_reference],
                  'doi': [self.doi]}
        
        # Dict union keeps the fixed columns first and appends author columns.
        return output | self.export_authors()

In [144]:
def find_parent_issue(article_node, root):
    """Return the issue element that contains `article_node`, or None.

    Args:
        article_node: the article element to locate (matched by identity).
        root: element whose tree is searched for
            ``{http://pkp.sfu.ca}issue`` elements.

    Returns:
        The first issue element, in document order, that has `article_node`
        among its descendants; None when no issue contains it.
    """
    issue_tag = '{http://pkp.sfu.ca}issue'
    article_tag = '{http://pkp.sfu.ca}article'

    # iter() also yields `root` itself, so a document whose root element is
    # the issue is handled; a './/' search only looks below the root.
    for issue in root.iter(issue_tag):
        # Stop at the first identical node instead of building a full list.
        if any(candidate is article_node for candidate in issue.iter(article_tag)):
            return issue
    return None

In [186]:
def _localized_text(parent, tag, locale, default=''):
    """Return the text of the last direct child `tag` whose locale matches, else `default`."""
    text = default
    if parent is None:
        return text
    for node in parent.findall(tag):
        if node.get('locale') == locale:
            text = node.text
    return text


def _child_text(parent, tag, default=''):
    """Return the text of the first direct child `tag`, or `default` when parent or child is missing."""
    if parent is None:
        return default
    node = parent.find(tag)
    return default if node is None else node.text


def get_article_info(article_node, root, article_id):
    """Collect the values of one article element into an ``Article``.

    Reads the first ``publication`` child of `article_node`, takes its
    ``locale`` attribute as the language for all localized fields, and looks
    up issue- and section-level values in the issue returned by
    ``find_parent_issue``. Optional values that are absent from the XML
    (DOI, title, abstract, pages, author name parts, issue number, journal
    title, section title/policy) are returned as empty strings.

    Args:
        article_node: the ``{http://pkp.sfu.ca}article`` element to read.
        root: root element of the document, used to find the parent issue.
        article_id: identifier stored on the returned ``Article``.

    Returns:
        An ``Article`` populated from the XML; ``base64_file`` is always the
        string ``'placeholder'``.

    Raises:
        IndexError: `article_node` has no ``publication`` child.
        KeyError: the publication lacks a ``locale``, ``date_published`` or
            ``section_ref`` attribute.
    """
    ns = '{http://pkp.sfu.ca}'
    base64_file = 'placeholder'

    publication_node = article_node.findall(ns + 'publication')[0]

    locale = publication_node.attrib['locale']
    publication_date = publication_node.attrib['date_published']
    section_reference = publication_node.attrib['section_ref']

    # The last <id type="doi"> wins; articles without one get ''.
    doi = ''
    for id_node in publication_node.findall(ns + 'id'):
        if id_node.get('type') == 'doi':
            doi = id_node.text

    title = _localized_text(publication_node, ns + 'title', locale)
    abstract = _localized_text(publication_node, ns + 'abstract', locale)
    page_number = _child_text(publication_node, ns + 'pages')

    authors = [
        Author(_child_text(author_node, ns + 'givenname'),
               _child_text(author_node, ns + 'familyname'))
        for author_node in publication_node.findall('.//' + ns + 'author')
    ]

    # Issue-level data; every lookup tolerates a missing parent element.
    parent_issue = find_parent_issue(article_node, root)
    issue_identification = None
    section_information = None
    if parent_issue is not None:
        issue_identification = parent_issue.find(ns + 'issue_identification')
        section_information = parent_issue.find(ns + 'sections')

    issue = _child_text(issue_identification, ns + 'number')

    # Fall back to the year part of the publication date when no <year> exists.
    year_node = None
    if issue_identification is not None:
        year_node = issue_identification.find(ns + 'year')
    year = year_node.text if year_node is not None else publication_date[:4]

    # Kept in its own name so the publication element can never leak into
    # the result when no localized issue title exists.
    publication_title = _localized_text(issue_identification, ns + 'title', locale)

    section_title = ''
    section_policy = ''
    if section_information is not None:
        for section_node in section_information.findall(ns + 'section'):
            if section_node.get('ref') != section_reference:
                continue
            # Keep an earlier title when this section has no localized one.
            section_title = _localized_text(section_node, ns + 'title', locale, section_title)
            section_policy = _localized_text(section_node, ns + 'policy', locale)

    return Article(article_id,
                   title,
                   publication_title,
                   abstract,
                   base64_file,
                   publication_date,
                   year,
                   issue,
                   page_number,
                   section_title,
                   section_policy,
                   section_reference,
                   doi,
                   authors,
                   locale)

In [199]:
# Run the conversion only when this code executes as the top-level module.
if __name__ == "__main__":
    main()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [212]:
# Unconditional call: runs the conversion regardless of __name__.
main()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
