In [1]:
import os
import yaml

import polars as pl
import markdown

from io import StringIO

from bs4 import BeautifulSoup

In [2]:
# Walk the corpus directory tree and record the full path of every file found.
paths = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk("poki/plaintext")
    for filename in filenames
]

len(paths)

1502

In [3]:
def markdown_to_text(md: str) -> str:
    """Render Markdown to HTML, then strip the markup to obtain plain text.

    Any backslash that is immediately followed by a newline in the extracted
    text is dropped, leaving just the newline.

    Args:
        md: Markdown source.

    Returns:
        The text content of the rendered HTML.
    """
    rendered = markdown.markdown(md)
    plain = BeautifulSoup(rendered, "html.parser").get_text()
    return plain.replace("\\\n", "\n")

In [4]:
def load_md(path):
    """Load one post file and return its front-matter fields plus its plain text.

    The file is expected to start with a YAML front-matter section enclosed
    between two ``---`` delimiter lines, followed by a Markdown body.

    Args:
        path: Path of the file to read.

    Returns:
        dict: The parsed front-matter mapping with an added ``"text"`` key
        holding the body converted by ``markdown_to_text``.

    Raises:
        IndexError: If the file contains fewer than two ``---`` delimiters.
    """
    # Read as UTF-8 explicitly: the corpus contains non-ASCII text and the
    # platform default encoding is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        content = f.read()

    # Split at the first two delimiters only, so a later "---" line inside the
    # body (e.g. a Markdown horizontal rule) does not truncate the text.
    parts = content.split("---\n", 2)

    # Empty front matter parses to None; fall back to an empty mapping so the
    # merge below still works.
    meta = yaml.safe_load(parts[1]) or {}

    text = markdown_to_text(parts[2])
    return meta | {"text": text}

In [5]:
all_keys = set()
data = []

# Load every post and record the union of metadata keys seen across all posts.
for path in paths:
    post = load_md(path)
    all_keys.update(post)
    data.append(post)

for row in data:
    # Give every row the same set of keys; fields a post lacks become None.
    for key in all_keys:
        row.setdefault(key, None)

    # Coerce non-string dates to str so the column holds a single type.
    # A missing/null date must stay None: str(None) would store the literal
    # text "None" in the dataset.
    date = row.get("date")
    if date is not None and not isinstance(date, str):
        row["date"] = str(date)

In [6]:
# Let schema inference look at every record rather than only a leading sample.
schema_scan_rows = len(data)
df = pl.DataFrame(data, infer_schema_length=schema_scan_rows)
df

title,description,authors,proofreaders,date,date-precision,tags,original,license,sources,archives,preprocessing,accessibility-notes,notes,text
str,str,list[str],list[str],str,str,list[str],struct[2],str,list[str],list[str],str,str,str,str
"""lipu Tili: lipu nanpa wan""",,"[""soweli Peka""]",,"""2013-04-18""","""day""",,,,"[""https://tokisoweli.blogspot.com/2013/04/pu-tili-pi-pu-lili-wan.html""]",,,,,"""lon li ike tawa jan mute. kulu…"
"""lipu Tili: lipu nanpa tu""",,"[""soweli Peka""]",,"""2013-04-21""","""day""",,,,"[""https://tokisoweli.blogspot.com/2013/04/pu-tili-pu-lili-tu.html""]",,,,,"""suno li kama mute li tawa mute…"
"""lipu Tili: lipu nanpa tu wan""",,"[""soweli Peka""]",,"""2013-04-22""","""day""",,,,"[""https://tokisoweli.blogspot.com/2013/04/pu-lili-tu-wan.html""]",,,,,"""jan Tili li jo e sona utala ta…"
"""kalama musi pi jan sewi meli -…","""kalama musi Pekan li pona mute…","[""jan Minasa""]",,"""2013-10-21""","""day""","[""music""]",,,"[""https://www.youtube.com/watch?v=IW1GkHioQ6Y""]",,,,,"""mama pimeja, mama suno ma pi l…"
"""telo nasa""",,"[""tobiah""]",,"""2013-10-10""","""day""","[""original"", ""lyrics"", … ""synth""]",,"""CC BY-SA 3.0""","[""https://tobiah.bandcamp.com/track/telo-nasa""]",,,,,"""telo nasa li pona. telo nasa…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""kili lili""",,"[""jan Pije""]",,"""2002-10-10""","""day""","[""original"", ""poetry""]",,"""CC0 1.0""","[""http://tokipona.net/tp/janpije/kililili.php"", ""http://forums.tokipona.org/viewtopic.php?t=71#p177""]","[""https://web.archive.org/web/20151001204355/http://tokipona.net/tp/janpije/kililili.php"", ""https://wikisource.org/wiki/Kili_lili""]",,,,"""mi jo e kili. ona li pona li…"
"""wan taso""","""Even angst-ridden, mediocre te…","[""jan Sonja""]",,"""2003-04-23""","""day""","[""original"", ""poetry"", ""angst""]",,"""All Rights Reserved""","[""http://www.tokipona.org/lit-angst.php""]","[""https://archive.is/b9Sjt"", ""http://web.archive.bibalex.org/web/20030423115357/http://www.tokipona.org/lit-angst.php"", ""https://archive.is/o/b9Sjt/web.archive.bibalex.org/web/20030423115357/http://www.tokipona.org/kalama/wantaso.mp3""]",,,,"""wan taso ijo li moku e mi. m…"
"""pilin ike""","""Even angst-ridden, mediocre te…","[""jan Sonja""]",,"""2003-04-23""","""day""","[""original"", ""poetry"", ""angst""]",,"""All Rights Reserved""","[""http://www.tokipona.org/lit-angst.php""]","[""https://archive.is/b9Sjt"", ""http://web.archive.bibalex.org/web/20030423115357/http://www.tokipona.org/lit-angst.php"", ""https://archive.is/o/b9Sjt/web.archive.bibalex.org/web/20030423115357/http://www.tokipona.org/kalama/pilinike.mp3""]",,,,"""pilin ike mi lon pimeja. was…"
"""ma tomo Pape""",,"[""Damian Yerrick""]",,"""2005-07-15""","""day""",,"{""Tower of Babel"",[""unknown""]}","""CC BY-SA 3.0""","[""https://en.wikipedia.org/w/index.php?title=Toki_Pona&oldid=1198391""]",,,,,"""ma ale li jo e toki wan en sam…"


In [7]:
# Show only the rows whose "original" field is populated.
has_original = df["original"].is_not_null()
df.filter(has_original)

title,description,authors,proofreaders,date,date-precision,tags,original,license,sources,archives,preprocessing,accessibility-notes,notes,text
str,str,list[str],list[str],str,str,list[str],struct[2],str,list[str],list[str],str,str,str,str
"""pakala - kalama musi 'When I'm…","""kalama musi 'When I'm Gone' li…","[""jan Minasa""]",,"""2013-10-27""","""day""","[""music""]","{""Cups"",[""Anna Kendrick""]}",,"[""https://www.youtube.com/watch?v=Lm87rp2Zq_s""]",,,,,"""mi pilin e ni: kalama musi 'te…"
"""jan lawa lili""","""A translation of Antoine de Sa…","[""Micheal F.""]",,"""2013-01-20""","""day""","[""translation"", ""story"", ""prose""]","{""The Little Prince"",[""Antoine de Saint-Exupéry""]}","""CC BY-NC 4.0""","[""http://failbluedot.com/toki_pona/jan_lawa_lili/chap01""]","[""https://web.archive.org/web/20151009153200/http://failbluedot.com/toki_pona/jan_lawa_lili/chap01""]",,,"""The older translation""","""jan lawa lili 01 tenpo pi lili…"
"""wile""","""Performed for my NCEA Level 1 …","[""jan Minasa""]",,"""2014-10-23""","""day""","[""music""]","{""Życzenie"",[""Stefan Witwicki""]}",,"[""https://www.youtube.com/watch?v=TRTKqhyvxNw""]",,,,,"""mi suno jelo pi lon sewi laso …"
"""PoPiPo""","""if the quality of this cover s…","[""flower radio""]",,"""2022-03-06""","""day""","[""music""]","{""PoPiPo"",[""Lamaze-P""]}",,"[""https://www.youtube.com/watch?v=lHTNpca_mxM""]",,,,,"""li moku sina pilin pona telo l…"
"""o pona e ijo pakala""","""len pi jan Sijan li kama pakal…","[""jan Kita""]",,"""2022-05-14""","""day""","[""StoryWeaver level 4""]","{""A Stitch in Time"",[""Himadri Das"", ""Veena Prasad""]}","""CC-BY-4.0""","[""https://storyweaver.org.in/en/stories/442898-o-pona-e-ijo-pakala""]",,"""removed manual line breaks""",,,""" *mu pakala* ""ike a"" · jan Sij…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Toki Sewi Telo Suli (Prayer to…","""My first music video for my so…","[""Marine Guitars""]",,"""2016-10-23""","""day""","[""music""]","{null,null}",,"[""https://www.youtube.com/watch?v=61O1u0Igbcw""]",,,,,"""Suli telo li moku Suli telo li…"
"""ilo toki""",,"[""jan Tepan""]",,"""2016-09-25""","""day""","[""translation"", ""poetry""]","{""Телефон"",[""Korney Chukovsky""]}",,"[""https://github.com/stefichjo/toki-pona/blob/master/musi/ilo-toki.md""]","[""https://web.archive.org/web/20191109180014/https://github.com/stefichjo/toki-pona/blob/master/musi/ilo-toki.md""]",,,,"""ilo toki ilo toki mi li kama …"
"""ma tomo Pape""",,"[""jan Pije""]",,"""2005-07-15""","""day""",,"{""Tower of Babel"",null}","""CC BY-SA 3.0""","[""https://en.wikipedia.org/w/index.php?title=Toki_Pona&oldid=18194572"", ""https://olukin.blogspot.com/2011/04/ma-tomo-pape.html""]",,,,,"""jan ali li kepeken e toki sama…"
"""ma tomo Pape""",,"[""Damian Yerrick""]",,"""2005-07-15""","""day""",,"{""Tower of Babel"",[""unknown""]}","""CC BY-SA 3.0""","[""https://en.wikipedia.org/w/index.php?title=Toki_Pona&oldid=1198391""]",,,,,"""ma ale li jo e toki wan en sam…"


In [10]:
# List the distinct values found in the "license" column.
license_column = df["license"]
license_column.unique().to_list()

['CC BY-SA-NC 4.0',
 'CC BY-ND 4.0',
 'CC BY-SA-NC 3.0',
 'CC-BY 4.0',
 'CC BY 4.0',
 'CC BY-NC 4.0',
 'CC BY-NC-ND 4.0',
 'CC BY-SA 3.0',
 'CC BY',
 'CC-BY-SA-3.0',
 'CC-BY-3.0',
 'Unknown license',
 'CC-BY-NC-3.0',
 'CC-BY-NC-SA-3.0',
 'LicenseRef-AllRightsReserved',
 'CC0 1.0',
 'CC0-1.0',
 'CC-BY-NC-SA-3.0 AND LicenseRef-AllRightsReserved',
 'MIT OR CC-BY-SA-3.0 OR CC-BY-SA-4.0',
 'CC-BY-NC 4.0',
 'All Rights Reserved',
 'CC-BY-3.0 AND LicenseRef-AllRightsReserved',
 'CC BY-NC-SA 3.0',
 'CC-BY-SA-4.0',
 'CC-BY-SA 4.0',
 'CC BY-NC 3.0',
 None,
 'CC-BY-NC-SA 3.0',
 'CC BY-NC-SA 4.0',
 'CC-BY-3.0 OR CC-BY-SA-4.0',
 'CC-BY-4.0',
 'CC-BY-SA-3.0 OR CC-BY-SA-4.0',
 'CC BY-SA 4.0',
 'CC0',
 'CC-BY-SA',
 'MIT']

In [8]:
# Persist the assembled frame as a Parquet file in the working directory.
output_path = "poki.pq"
df.write_parquet(output_path)