In [57]:
import os, sys, json, numpy as np, pandas as pd, json
import xml.etree.ElementTree as ET

In [4]:
# Parse the XML export (expected in the current working directory) into an ElementTree.
tree = ET.parse('export.xml')

In [5]:
# Root element of the parsed export; the records are looked up beneath it.
root = tree.getroot()

In [55]:
def _child_text(node, path):
    """Return ``.text`` of the first element matching ``path`` under ``node``.

    Returns None when no element matches, or when the matched element
    carries no text.
    """
    element = node.find(path)
    return None if element is None else element.text


def _author_name_parts(raw_name):
    """Split an author string into a list of name parts, family name last.

    ``"Family, Given Middle"`` becomes ``["Given", "Middle", "Family"]``.
    A name without ``", "`` is kept as a single part instead of raising
    ValueError, and any extra comma-separated pieces are appended after the
    family name. Missing or blank text yields an empty list.
    """
    cleaned = (raw_name or '').strip()
    if not cleaned:
        return []
    family, *rest = cleaned.split(', ')
    if not rest:
        # No "Family, Given" separator: keep the whole string as one part.
        return [family]
    given, *extras = rest
    return [*given.split(' '), family, *extras]


def parse_record(record):
    """Convert one ``<record>`` element into a plain, JSON-serialisable dict.

    Scalar fields are None when the corresponding element is absent or has
    no text. ``authors``, ``keywords`` and ``urls`` are always lists, and
    entries without text are skipped.
    """
    ref_type = record.find('ref-type')

    authors = []
    for author in record.findall('contributors/*/author'):
        parts = _author_name_parts(author.text)
        if parts:
            authors.append({"name": parts})

    # findall() never yields None, so the element's text is what must be
    # checked to keep empty entries out of the lists.
    keywords = [
        kw.text
        for kw in record.findall('keywords/keyword')
        if kw.text is not None
    ]
    urls = [
        {"href": url.text}
        for url in record.findall('urls/*/url')
        if url.text is not None
    ]

    # NOTE(review): the original looked up 'volumes', presumably a typo for
    # 'volume' -- confirm against export.xml. Both tags are tried so either
    # spelling keeps working.
    volume = _child_text(record, 'volume')
    if volume is None:
        volume = _child_text(record, 'volumes')

    return {
        "title": _child_text(record, '*/title'),
        "type": ref_type.get('name') if ref_type is not None else None,
        "authors": authors,
        "year": _child_text(record, '*/year'),
        # "month" holds the raw text of the pub-dates/date element.
        "month": _child_text(record, '*/pub-dates/date'),
        "pages": _child_text(record, 'pages'),
        "abstract": _child_text(record, 'abstract'),
        "publisher": _child_text(record, 'publisher'),
        "periodical": _child_text(record, 'periodical/full-title'),
        "keywords": keywords,
        "urls": urls,
        "volume": volume,
        "issue": _child_text(record, 'issue'),
    }


# One dict per <record> found under <records> in the export.
results = [parse_record(record) for record in root.findall('./records/record')]

In [61]:
# Load the citation entries that are matched against the parsed records.
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's locale default.
with open('pop_citations.json', 'r', encoding='utf-8') as f:
    citations = json.load(f)

In [74]:
# Compare the number of records parsed from the XML with the number of loaded citations.
len(results), len(citations)

(126, 176)

In [71]:
# Collects the citations whose title matches a parsed record (filled by the matching loop).
found = []

In [72]:
# Merge citation data into the parsed records by exact title match.
#
# `found` is reset here so that re-running this cell does not accumulate
# duplicate entries from a previous run.
found = []

# Index the records by title once instead of rescanning `results` for every
# citation. Records sharing a title are all kept and all updated.
articles_by_title = {}
for article in results:
    articles_by_title.setdefault(article['title'], []).append(article)

# Fields copied from the citation only when the record has no value for them.
backfill_fields = ("month", "year", "volume", "issue")

for citation in citations:
    matching_articles = articles_by_title.get(citation['title'])
    if not matching_articles:
        continue

    # Record each matched citation once, however many records share its title.
    found.append(citation)

    for article in matching_articles:
        article["google_scholar_url"] = citation["citation_url"]

        for field in backfill_fields:
            if article[field] is None and field in citation:
                article[field] = citation[field]

        article["journal_source"] = citation["source"]
        article["cites"] = citation["cites"]

In [75]:
# Write the enriched records out as JSON to ../src/lib/data/citations.json.
with open('../src/lib/data/citations.json', mode='w') as out_file:
    json.dump(obj=results, fp=out_file)