In [1]:
from bibutils import load_bib, write_bib
import bibtexparser
from bibtexparser.customization import author, page_double_hyphen
from copy import deepcopy
from titlecase import titlecase
import calendar
import re
from StringIO import StringIO

In [2]:
def change_journal_to_journaltitle(entry):
    """Rename the 'journal' field of ``entry`` to 'journaltitle', in place.

    The 'journal' key is always removed.  'journaltitle' is only written (and
    the change logged) when the removed value is truthy, so an empty 'journal'
    field is simply dropped.
    """
    old_value = entry.pop('journal', None)
    if not old_value:
        return
    message = u"Changing 'journal={}' to 'journaltitle' from ID:{}"
    print(message.format(old_value, entry['ID']))
    entry['journaltitle'] = old_value

def remove_curly_brackets(string):
    """Return ``string`` with every '{' and '}' character removed."""
    for bracket in ('{', '}'):
        string = string.replace(bracket, '')
    return string

def change_month(entry):
    """Replace a month name in ``entry['month']`` with its month number, in place.

    Only the first three characters of the stripped, lower-cased value are
    compared against ``calendar.month_abbr``, so 'June', 'jun' and 'jun~19'
    all become '6'.  Values that are not recognised (including ones that are
    already numeric, or empty) are left untouched.  Entries without a 'month'
    key are skipped after logging "no month".
    """
    try:
        month = entry['month']
    except KeyError:
        print("no month")
        return
    month = month.strip().lower()[:3]
    print("Month", month)
    month_names = [month_name.lower() for month_name in calendar.month_abbr]
    # calendar.month_abbr[0] is the empty string (a placeholder so that
    # January sits at index 1).  Without the truthiness check an empty month
    # would match that placeholder and be rewritten to the bogus value '0'.
    if month and month in month_names:
        month_i = str(month_names.index(month))
        print(u"Changing month from", month, u"to", month_i)
        entry['month'] = month_i
    
def lower_case_id(entry):
    """Lower-case the first character of ``entry['ID']`` when the second is lower case.

    IDs shorter than two characters, or whose second character is not a
    lower-case letter (e.g. 'IEEE2014'), are left unchanged.
    """
    key = entry['ID']
    if len(key) < 2 or not key[1].islower():
        return
    head, tail = key[0], key[1:]
    entry['ID'] = head.lower() + tail
        
def fix_IEEE_journals(s):
    """Undo the inverted "<topic>, IEEE <kind> on" form of a title.

    'Power Systems, IEEE Transactions on' becomes
    'IEEE Transactions on Power Systems'.  The string is split on its *last*
    comma only, so topics that themselves contain commas are kept whole:
    'Acoustics, Speech and Signal Processing, IEEE Transactions on' becomes
    'IEEE Transactions on Acoustics, Speech and Signal Processing'.

    Strings without a comma, or whose part after the last comma does not
    contain 'IEEE', are returned unchanged.  Each rewrite is logged.
    """
    topic, comma, suffix = s.rpartition(',')
    if comma and 'IEEE' in suffix:
        fixed = suffix.strip() + ' ' + topic.strip()
        print("replacing '{}' with '{}'".format(s, fixed))
        return fixed
    return s

In [3]:
def _tidy_booktitle(entry):
    """Strip the series, 'Proceedings of the ' and a leading year from entry['booktitle']."""
    booktitle = entry['booktitle']

    # Extract 'series' from 'booktitle'.  Search for the ')' *after* the '('
    # so that a stray earlier ')' cannot produce a reversed slice.
    start = booktitle.find('(')
    end = booktitle.find(')', start + 1) if start != -1 else -1
    if end != -1:
        if 'series' not in entry:
            entry['series'] = booktitle[start+1:end]
            print("setting 'series'")
        print("removing series from booktitle")
        # Join with a single space so the words on either side of the
        # parentheses do not run together.
        before, after = booktitle[:start].strip(), booktitle[end+1:].strip()
        booktitle = ' '.join(part for part in (before, after) if part)

    # Extract 'Proceedings of the ' from 'booktitle'
    booktitle = booktitle.replace('Proceedings of the ', '')

    # Remove date from 'booktitle': a leading four-digit year plus the single
    # separator character that follows it.
    if re.match(r"""\d{4}""", booktitle):
        print("removing date from booktitle")
        booktitle = booktitle[5:]

    entry['booktitle'] = booktitle


def _set_publisher(entry):
    """Move an 'ACM' / 'IEEE' prefix out of the journal or book title into 'publisher'."""
    for publisher in ['ACM', 'IEEE']:
        for key in ['journaltitle', 'booktitle']:
            if publisher not in entry.get(key, ''):
                continue
            print("removing", publisher, "from", key)
            entry[key] = entry[key].replace(publisher + ' ', '')
            # Only infer the publisher when its name was actually found in a
            # title; an existing 'publisher' field is never overwritten.
            if 'publisher' not in entry:
                entry['publisher'] = publisher
                print("setting publisher to", publisher, "for ID", entry['ID'])


def _merge_year_and_month_into_date(entry):
    """Replace 'year' (and a numeric 'month') with a single 'date' field."""
    if 'date' in entry:
        return
    if 'year' in entry:
        entry['date'] = entry.pop('year')
    if 'date' in entry and 'month' in entry:
        try:
            month_number = int(entry['month'])
        except ValueError:
            # change_month() leaves unrecognised months as they are; keep the
            # field rather than aborting the whole run.
            print("could not convert month", entry['month'], "to a number for ID", entry['ID'])
        else:
            entry.pop('month')
            entry['date'] += '-{:02d}'.format(month_number)


def _check_date(entry):
    """Log a warning when entry['date'] does not start with YYYY, YYYY-MM or YYYY-MM-DD."""
    date = entry['date']
    # The expected pattern is chosen from the length of the value.
    if len(date) > 7:
        pattern = r"""\d{4}-\d{2}-\d{2}"""
    elif len(date) > 4:
        pattern = r"""\d{4}-\d{2}"""
    else:
        pattern = r"""\d{4}"""
    if not re.match(pattern, date):
        print("********##### BAD DATE", entry['date'], entry['ID'])


def process(bib_database):
    """Normalise every entry of ``bib_database`` in place and return the database.

    For each entry in ``bib_database.entries`` this:

    * sets 'ID' to '<surname of first author><year>';
    * renames 'journal' to 'journaltitle', un-inverts IEEE titles and escapes
      ' &' in 'journaltitle' / 'booktitle';
    * moves a parenthesised series out of 'booktitle', drops
      'Proceedings of the ' and a leading year from it;
    * cuts 'series' at the first apostrophe;
    * removes curly brackets from 'title' and title-cases it;
    * deletes 'keywords', 'file' and 'mendeley-tags';
    * converts a month name to a number;
    * moves 'ACM' / 'IEEE' from the titles into 'publisher';
    * merges 'year' and 'month' into 'date' and logs dates in an unexpected format;
    * applies ``page_double_hyphen``;
    * strips the resolver URL from 'doi' and drops 'url' when a DOI exists.

    Progress is reported with ``print``.

    Raises:
        KeyError: if an entry has no 'title', or no usable 'author' field.
    """
    for entry in bib_database.entries:
        # Change ID.  author() is run on a copy because the code below indexes
        # its 'author' result as a list, while the entry itself must keep the
        # original string.
        surname_of_first_author = author(deepcopy(entry))['author'][0].split(',')[0].strip().lower()
        # Entries that already use 'date' may have no 'year'; fall back to the
        # first four characters of 'date' instead of raising KeyError.
        year = entry.get('year') or entry.get('date', '')[:4]
        entry['ID'] = '{:s}{}'.format(surname_of_first_author, year)

        # journaltitle and booktitle
        change_journal_to_journaltitle(entry)
        for key in ['journaltitle', 'booktitle']:
            if key in entry:
                entry[key] = fix_IEEE_journals(entry[key])
                # Escape ampersands for LaTeX (the backslash is doubled so the
                # literal is a valid Python escape; the value is ' \&').
                entry[key] = entry[key].replace(' &', ' \\&')

        if 'booktitle' in entry:
            _tidy_booktitle(entry)

        # Keep only the part of the series before an apostrophe,
        # e.g. "BuildSys '14" -> "BuildSys".
        if 'series' in entry and "'" in entry['series']:
            print("removing ' from series")
            entry['series'] = entry['series'].split("'")[0].strip()

        entry['title'] = titlecase(remove_curly_brackets(entry['title']))
        for key_to_delete in ['keywords', 'file', 'mendeley-tags']:
            entry.pop(key_to_delete, None)
        change_month(entry)
        lower_case_id(entry)

        # Set 'publisher'
        _set_publisher(entry)

        # Remove 'year' and 'month' and use 'date' instead
        _merge_year_and_month_into_date(entry)
        if 'date' in entry:
            _check_date(entry)

        page_double_hyphen(entry)

        if 'doi' in entry:
            # Store the bare DOI, whichever resolver URL it was wrapped in.
            doi = entry['doi']
            for resolver in ('http://dx.doi.org/', 'https://dx.doi.org/',
                             'http://doi.org/', 'https://doi.org/'):
                doi = doi.replace(resolver, '')
            entry['doi'] = doi

            # The DOI makes the URL redundant.
            if 'url' in entry:
                entry.pop('url')
                print("removing URL")

    return bib_database

In [16]:
# Sample BibTeX record used to exercise process() in the next cell.
bibtex = """
@misc{majewski2014equipment,
  title={Equipment fault detection, diagnostics and disaggregation system},
  author={Majewski, J. and Fisera, R. and Anglin, M. and Gall, T.},
  url={https://www.google.com/patents/US20140172400},
  year={2014},
  month=jun # "~19",
  publisher={Google Patents},
  note={US Patent App. 13/715,511}
}
"""

# Rewrite a double-quoted field that starts `abstract = "Abstract ` and ends
# with ` "` at the end of a line into brace-delimited form.
# NOTE(review): presumably a clean-up for records pasted from a publisher's
# export; confirm which source needs it. Neither pattern occurs in the sample
# entry above, so both calls leave it unchanged.
bibtex = bibtex.replace("""abstract = "Abstract """, """abstract = {""")
bibtex = bibtex.replace(""" "
""", """}
""")
# Wrap the text in a file-like object and hand it to bibutils.load_bib; the
# result is passed to process() in the next cell.
stringio = StringIO(bibtex)
bib_database = load_bib(stringio=stringio)
# Debugging aid, disabled:
# print(stringio.read())

In [17]:
# Run the clean-up; everything process() prints appears under the LOG banner.
# process() modifies the entries in place, so re-running this cell without
# re-running the cell that builds `bib_database` processes the entries again.
print("----------------LOG----------------\n")
bib_database = process(bib_database)
print(bib_database.entries[0])
print("\n\n")

# `bibtex` here is the input text after the .replace() calls of the previous
# cell, i.e. exactly what was parsed.
print("----------------ORIGINAL-----------\n")
print(bibtex)

print("\n\n")
# Serialise the processed database back to BibTeX text.
print("----------------PROCESSED----------\n")
bibstr = bibtexparser.dumps(bib_database)
print(bibstr)

----------------LOG----------------

Month jun
Changing month from jun to 6
{u'publisher': u'Google Patents', u'author': u'Majewski, J. and Fisera, R. and Anglin, M. and Gall, T.', u'url': u'https://www.google.com/patents/US20140172400', u'title': u'Equipment Fault Detection, Diagnostics and Disaggregation System', u'note': u'US Patent App. 13/715,511', 'date': u'2014-06', 'ID': 'majewski2014', 'ENTRYTYPE': u'misc'}



----------------ORIGINAL-----------


@misc{majewski2014equipment,
  title={Equipment fault detection, diagnostics and disaggregation system},
  author={Majewski, J. and Fisera, R. and Anglin, M. and Gall, T.},
  url={https://www.google.com/patents/US20140172400},
  year={2014},
  month=jun # "~19",
  publisher={Google Patents},
  note={US Patent App. 13/715,511}
}




----------------PROCESSED----------

@misc{majewski2014,
 author = {Majewski, J. and Fisera, R. and Anglin, M. and Gall, T.},
 date = {2014-06},
 note = {US Patent App. 13/715,511},
 publisher = {Google Pa