In [12]:
from bibutils import load_bib, write_bib
import bibtexparser
from bibtexparser.customization import author, page_double_hyphen
from copy import deepcopy
from titlecase import titlecase
import calendar
import re
from StringIO import StringIO

In [13]:
def change_journal_to_journaltitle(entry):
    journal = entry.pop('journal', None)
    if journal:
        print(u"Changing 'journal={}' to 'journaltitle' from ID:{}".format(journal, entry['ID']))
        entry['journaltitle'] = journal

def remove_curly_brackets(string):
    return string.replace('{', '').replace('}', '')

def change_month(entry):
    try:
        month = entry['month']
    except KeyError:
        print("no month")
        return
    else:
        month = month.lower()
        month = month[:3]
    print("Month", month)
    month_names = [month_name.lower() for month_name in calendar.month_abbr]
    if month in month_names:
        month_i = str(month_names.index(month))
        print(u"Changing month from", month, u"to", month_i)
        entry['month'] = month_i
    
def lower_case_id(entry):
    bib_id = entry['ID']
    if len(bib_id) >= 2 and bib_id[1].islower():
        entry['ID'] = bib_id[0].lower() + bib_id[1:]
        
def fix_IEEE_journals(s):
    split = s.split(',')
    if len(split) > 1 and 'IEEE' in split[-1]:
        print("replacing '{}' ".format(s), end='')
        s = split[1].strip() + ' ' + split[0].strip()
        print("with '{}'".format(s))
    return s

In [14]:
def process(bib_database):
    for entry in bib_database.entries:
        # Change ID
        surname_of_first_author = author(deepcopy(entry))['author'][0].split(',')[0].strip().lower()
        entry['ID'] = '{:s}{}'.format(surname_of_first_author, entry['year'])
        
        # journaltitle and booktitle
        change_journal_to_journaltitle(entry)
        for key in ['journaltitle', 'booktitle']:
            if key in entry:
                entry[key] = fix_IEEE_journals(entry[key])
                if ' &' in entry[key]:
                    entry[key] = entry[key].replace(' &', ' \&')

        if 'booktitle' in entry:
            booktitle = entry['booktitle']

            # Extract 'series' from 'booktitle'
            if '(' in booktitle and ')' in booktitle:
                start = booktitle.index('(')
                end = booktitle.index(')')
                if 'series' not in entry:
                    entry['series'] = booktitle[start+1:end]
                    print("setting 'series'")
                print("removing series from booktitle")
                booktitle = booktitle[:start].strip() + booktitle[end+1:].strip()

            # Extract 'Proceedings of the ' from 'booktitle'
            booktitle = booktitle.replace('Proceedings of the ', '')
            
            # Remove date from 'booktitle':
            if re.match(r"""\d{4}""", booktitle):
                print("removing date from booktitle")
                booktitle = booktitle[5:]    
        
            entry['booktitle'] = booktitle
        
        if 'series' in entry:
            if "'" in entry['series']:
                print("removing ' from series")
                entry['series'] = entry['series'].split("'")[0].strip()
        
        entry['title'] = remove_curly_brackets(entry['title'])
        for key_to_delete in ['keywords', 'file', 'mendeley-tags']:
            entry.pop(key_to_delete, None)
        change_month(entry)
        lower_case_id(entry)

        # Set 'publisher'
        for publisher in ['ACM', 'IEEE']:
            for key in ['journaltitle', 'booktitle']:
                if publisher in entry.get(key, ''):
                    if key == 'booktitle':
                        print("removing", publisher, "from", key)
                        entry[key] = entry[key].replace(publisher + ' ', '')
                
                    if 'publisher' not in entry:
                        entry['publisher'] = publisher
                        print("setting publisher to", publisher, "for ID", entry['ID'])

        # Remove 'year' and 'month' and use 'date' instead
        if 'date' not in entry:
            if 'year' in entry:
                year = entry.pop('year')
                entry['date'] = year
            if 'month' in entry:
                month = entry.pop('month')
                entry['date'] += '-{:02d}'.format(int(month))

        if 'date' in entry:
            date = entry['date']
            def bad_date():
                print("********##### BAD DATE", entry['date'], entry['ID'])
                
            if len(date) > 7 and not re.match(r"""\d{4}-\d{2}-\d{2}""", date):
                bad_date()
            elif len(date) > 4 and not re.match(r"""\d{4}-\d{2}""", date):
                bad_date()
            elif not re.match(r"""\d{4}""", date):
                bad_date()
                
        page_double_hyphen(entry)
        
        if 'doi' in entry:
            entry['doi'] = entry['doi'].replace('http://dx.doi.org/', '')
        
            for key in ['url', 'eprint', 'eprinttype']:
                if key in entry:
                    entry.pop(key)
                    print("removing", key)
                                
        if entry.get('publisher', '') == 'IEEE':
            entry['title'] = titlecase(entry['title'])
            
        if 'abstract' in entry:
            entry['abstract'] = entry['abstract'].replace("""\{""", "")
            entry['abstract'] = entry['abstract'].replace("""\}""", "")
                        
    return bib_database

In [45]:
bibtex = """
@INPROCEEDINGS{7001824, 
author={Kalogridis, G. and Dave, S.}, 
booktitle={e-Health Networking, Applications and Services (Healthcom), 2014 IEEE 16th International Conference on}, 
title={Privacy and eHealth-enabled smart meter informatics}, 
year={2014}, 
pages={116-121}, 
abstract={The societal need for better public healthcare calls for granular, continuous, nationwide instrumentation and data fusion technologies. However, the current trend of centralised (database) health analytics gives rise to data privacy issues. This paper proposes sensor data mining algorithms that help infer health/well-being related lifestyle patterns and anomalous (or privacy-sensitive) events. Such algorithms enable a user-centric context awareness at the network edge, which can be used for decentralised eHealth decision making and privacy protection by design. The main hypothesis of this work involves the detection of atypical behaviours from a given stream of energy consumption data recorded at eight houses over a period of a year for cooking, microwave, and TV activities. Our initial exploratory results suggest that in the case of an unemployed single resident, the day-by-day variability of TV or microwave operation, in conjunction with the variability of the absence of other cooking activity, is more significant as compared with the variability of other combinations of activities. The proposed methodology brings together appliance monitoring, privacy, and anomaly detection within a healthcare context, which is readily scalable to include other health-related sensor streams.}, 
keywords={data mining;data privacy;decision making;health care;sensor fusion;smart meters;ubiquitous computing;anomaly detection;appliance monitoring;centralised health analytics;data fusion technologies;data privacy issues;decentralised e-health decision making;e-health-enabled smart meter informatics;energy consumption data;health-related sensor streams;public healthcare;sensor data mining algorithms;user-centric context awareness;Data privacy;Home appliances;Microwave theory and techniques;Monitoring;Privacy;TV}, 
doi={10.1109/HealthCom.2014.7001824}, 
month={Oct},}

"""

bibtex = bibtex.replace("""abstract = "Abstract """, """abstract = {""")
bibtex = bibtex.replace("""abstract = \"""", """abstract = {""")
bibtex = bibtex.replace(""" "
""", """}
""")
stringio = StringIO(bibtex)
bib_database = load_bib(stringio=stringio)
# print(stringio.read())

In [46]:
print("----------------LOG----------------\n")
bib_database = process(bib_database)
print(bib_database.entries[0])
print("\n\n")

print("----------------ORIGINAL-----------\n")
print(bibtex)

print("\n\n")
print("----------------PROCESSED----------\n")
bibstr = bibtexparser.dumps(bib_database)
print(bibstr)

----------------LOG----------------

replacing 'e-Health Networking, Applications and Services (Healthcom), 2014 IEEE 16th International Conference on' with 'Applications and Services (Healthcom) e-Health Networking'
setting 'series'
removing series from booktitle
no month
{u'doi': u'10.1109/HealthCom.2014.7001824', u'title': u'Privacy and eHealth-enabled smart meter informatics', 'series': u'Healthcom', u'booktitle': u'Applications and Servicese-Health Networking', u'author': u'Kalogridis, G. and Dave, S.', u'abstract': u'The societal need for better public healthcare calls for granular, continuous, nationwide instrumentation and data fusion technologies. However, the current trend of centralised (database) health analytics gives rise to data privacy issues. This paper proposes sensor data mining algorithms that help infer health/well-being related lifestyle patterns and anomalous (or privacy-sensitive) events. Such algorithms enable a user-centric context awareness at the network edg