# Parse bibtex to produce posts

- Depends: https://github.com/aclements/biblib

**Note:** use the version from the GitHub repository linked above, not the version available on PyPI.

In [1]:
import biblib.bib
import sys
import yaml
from collections import Counter
import re

In [2]:
# BibTeX file holding the publication list; the path is relative to the
# current working directory.
bib_file = "fruehwald.bib"

# Parse the file with biblib; parser messages are logged to stderr.
# get_entries() returns an ordered mapping from citation key to entry,
# which later cells read as `db`.
with open(bib_file, 'r') as fp:
    db = biblib.bib.Parser().parse(fp, log_fp=sys.stderr).get_entries()

In [3]:
# Show the citation keys of every parsed entry, in file order.
db.keys()

odict_keys(['fruehwald2010', 'fruehwald2013b', 'fave_1.2.2', 'fruehwald2012', '100years', 'fruehwald_filled_2016', 'fruehwald_phonological_2013', 'wieling_variation_2016', 'fruehwald2017c', 'sneller_using_2019', 'boyd_evaluation_2015', 'boyd_crosslinguistic_2021', 'tanner_toward_2020', 'fruehwald_is_2019', 'mielke_age_2019', 'fruehwald_study_2022', 'fruehwald_im_2015', 'fruehwald_prolegomena_2016', 'fruehwald_response_2017', 'fruehwald2011', 'fruehwald_phonetic_2011', 'fruehwald_role_2017', 'fruehwald_simulation_2008', 'fruehwald2016a', 'fruehwald2007', 'fruehwald_evaluation_2008', 'fruehwald_spread_2008', 'fruehwald2013a', 'fruehwald2014', 'purse_frequency_2022'])

In [4]:
# Check that one entry's comma-separated `keywords` field splits into a
# clean list of trimmed strings (make_post does the same, plus lower-casing).
[x.strip() for x in db["fruehwald_spread_2008"]["keywords"].split(",")]

['language change',
 'phonology',
 'sound change',
 'phonetics',
 'lexical diffusion',
 'opcacity']

In [5]:
# Inspect what authors() returns for one entry: a list of Name tuples with
# first / von / last / jr parts.
db["fruehwald_spread_2008"].authors()

[Name(first='Josef', von='', last='Fruehwald', jr='')]

In [6]:
# Lookup table from full English month name to its unpadded month number,
# stored as a string ("1" through "12") so it can be spliced straight into
# a date string.
m2n = {
    month_name: str(month_number)
    for month_number, month_name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}

m2n["January"]

'1'

In [7]:
def _month_number(raw_month):
    """Return ``raw_month`` as an unpadded month-number string ("1" to "12").

    The value is looked up in the module-level ``m2n`` table first. If that
    fails, an in-range decimal string is accepted as is, and finally the
    first three letters are matched case-insensitively against the ``m2n``
    keys, so "jan", "Sept." and "OCTOBER" all resolve.

    Raises:
        KeyError: if the value cannot be mapped to a month.
    """
    text = raw_month.strip()
    if text in m2n:
        return m2n[text]
    if text.isdecimal() and 1 <= int(text) <= 12:
        return str(int(text))
    prefix = text[:3].lower()
    for name, number in m2n.items():
        if name[:3].lower() == prefix:
            return number
    raise KeyError(f"unrecognised month value: {raw_month!r}")


def make_post(db, out_dir="research/_posts/"):
    """Write one front-matter-only Markdown post per paper entry in ``db``.

    Only entries whose type is ``article``, ``inproceedings`` or
    ``incollection`` are written. Each file is named
    ``<year>-<month>-<n>-<first five title words>.md`` and contains a YAML
    block delimited by ``---`` lines.

    The date in the file name is synthetic: the day ``n`` is a running count
    of papers already written for that year, which keeps names unique and
    preserves input order within a year. Entries without a ``year`` field are
    filed under 2099 and labelled ``forthcoming``.
    NOTE(review): a year with more than 28 papers yields a day number that is
    not a real calendar date -- confirm the site generator tolerates that.

    Args:
        db: Mapping from citation key to biblib entry, as returned by
            ``Parser.parse(...).get_entries()``.
        out_dir: Directory that receives the post files. It is created if it
            does not exist. Defaults to the original hard-coded location.

    Raises:
        KeyError: if a paper entry has no ``title`` or has a ``month`` value
            that cannot be mapped to a month number.
    """
    import os  # local import: the notebook's import cell does not provide os

    paper_types = ("article", "inproceedings", "incollection")
    optional_fields = ("volume", "doi", "number", "pages", "abstract")
    # Compiled once; both patterns are reused for every entry.
    scrub = re.compile(r'[^\w ]+')
    whitespace = re.compile(r'\s+')

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    posts_per_year = {}
    for entry in db.values():
        if entry.typ not in paper_types:
            continue

        front_matter = {"layout": "publication"}
        if "year" in entry:
            year = entry["year"]
            front_matter["year"] = year
            month = _month_number(entry["month"]) if "month" in entry else "1"
        else:
            # Undated work sorts after everything else under a far-future year.
            year, month = "2099", "1"
            front_matter["year"] = "forthcoming"

        day = posts_per_year.get(year, 0) + 1
        posts_per_year[year] = day
        fake_date = f"{year}-{month}-{day}"

        # Drop the braces BibTeX uses to protect capitalisation.
        title = entry["title"].replace("{", "").replace("}", "")
        front_matter["title"] = title
        slug = whitespace.sub("_", scrub.sub("", title)).lower()
        short_slug = "_".join(slug.split("_")[:5])
        post_name = f"{fake_date}-{short_slug}.md"

        # Name parts may be empty strings; collapse the gaps they leave and
        # trim both ends so a missing first name leaves no leading space.
        front_matter["author"] = [
            whitespace.sub(" ", " ".join(author)).strip()
            for author in entry.authors()
        ]

        if "journal" in entry:
            front_matter["published"] = scrub.sub("", entry["journal"])
        elif "booktitle" in entry:
            front_matter["published"] = scrub.sub("", entry["booktitle"])
        else:
            front_matter["published"] = ""

        if "url" in entry:
            front_matter["p_url"] = entry["url"]

        if "keywords" in entry:
            front_matter["tags"] = [
                keyword.strip().lower()
                for keyword in entry["keywords"].split(",")
            ]

        front_matter.update(
            {field: entry[field] for field in optional_fields if field in entry}
        )
        front_matter["category"] = "paper"

        out_path = os.path.join(out_dir, post_name)
        with open(out_path, "w", encoding="utf-8") as file:
            file.write("---\n")
            yaml.dump(front_matter, file)
            file.write("---")

In [8]:
# Generate the post files for every paper entry parsed above.
make_post(db)

In [9]:
# Inspect the raw fields stored for a single entry.
db['purse_frequency_2022']

Entry([('title', 'Frequency and morphological complexity in variation'),
       ('volume', '7'),
       ('issn', '2397-1835'),
       ('url', 'https://www.glossa-journal.org/article/id/5839/'),
       ('doi', '10.16995/glossa.5839'),
       ('abstract',
        'Broad interest in probabilistic aspects of language has reignited debates about a potential delineation between the shape of an abstract grammar and patterns of language in use. A central topic in this debate is the relationship between measures capturing aspects of language use, such as word frequency, and patterns of variation. While it has become common practice to attend to frequency measures in studies of linguistic variation, fundamental questions about exactly what linguistic unit’s frequency it is appropriate to measure in each case, and what this implies about the representations or processing mechanisms at play, remain underexplored. In the present study, we compare how three frequency measures account for variance in