In [1]:
import xml.etree.ElementTree as ET
import sqlite3
import pandas as pd

# See https://github.com/sillsdev/lift-standard/blob/master/lift_13.pdf

In [2]:
# SQLite target; ":memory:" keeps the whole database in RAM for this session.
DATABASE_NAME = ":memory:"

# Schema overview:
#   lexemes                 one row per lexicon entry (lemma + morpheme type)
#   spellings               written forms of a lexeme (references lexemes)
#   senses                  gloss + part of speech of a lexeme (references lexemes)
#   sense_grammatical_info  name/value traits attached to a sense
#   texts                   one row per word of a text, ordered by narrative_order
#   text_morphemes          links a word in a text to the spellings it is made of
SCHEMA_SQL = """
    PRAGMA foreign_keys = ON;
    
    CREATE TABLE IF NOT EXISTS lexemes(
        rowid INTEGER PRIMARY KEY,
        lemma TEXT,
        morpheme_type TEXT
    );
    
    CREATE TABLE IF NOT EXISTS spellings(
        rowid INTEGER PRIMARY KEY,
        form TEXT,
        lexeme INTEGER,
        FOREIGN KEY(lexeme) REFERENCES lexemes(rowid)
    );

    CREATE TABLE IF NOT EXISTS senses(
        rowid INTEGER PRIMARY KEY,
        gloss TEXT,
        lexeme INTEGER,
        part_of_speech TEXT,
        FOREIGN KEY(lexeme) REFERENCES lexemes(rowid)
    );

    CREATE TABLE IF NOT EXISTS sense_grammatical_info(
        name TEXT,
        value TEXT,
        sense INTEGER,
        FOREIGN KEY(sense) REFERENCES senses(rowid)
    );

    CREATE TABLE IF NOT EXISTS texts(
        rowid INTEGER PRIMARY KEY,
        text_name TEXT NOT NULL,
        narrative_order INTEGER NOT NULL,
        word TEXT,
        UNIQUE(text_name, narrative_order)
    );

    CREATE TABLE IF NOT EXISTS text_morphemes(
        spellingid INTEGER,
        textid INTEGER,
        morpheme_order INTEGER NOT NULL,
        FOREIGN KEY(spellingid) REFERENCES spellings(rowid),
        FOREIGN KEY(textid) REFERENCES texts(rowid),
        PRIMARY KEY(spellingid, textid, morpheme_order)
    );
"""

# Open the connection, grab a cursor and create every table in one script run.
con = sqlite3.connect(DATABASE_NAME)
cur = con.cursor()
cur.executescript(SCHEMA_SQL)

<sqlite3.Cursor at 0x1eeeeb19ec0>

In [3]:
# Parse the lexicon XML file and keep its root element for the loading cell.
LEXICON_XML = "xml/xml.lift"
with open(LEXICON_XML, "rb") as lexicon_file:
    tree = ET.parse(lexicon_file)
root = tree.getroot()
root

<Element 'lift' at 0x000001EEEEB23BA0>

In [4]:
%%time

# Load every <entry> of the lexicon into lexemes / spellings / senses /
# sense_grammatical_info.
#
# * All statements use "?" placeholders, so lemmas or glosses containing
#   quotes cannot break the SQL and missing values are stored as NULL
#   rather than the literal string 'None'.
# * Optional elements (lexical-unit, gloss, grammatical-info, morph-type
#   trait) may be absent; they are read defensively and become NULL.
# * The whole load runs in one transaction: `with con` commits on success
#   and rolls back if any entry fails, so a re-run never sees a half-loaded
#   lexicon.
with con:
    for entry in root.findall('entry'):
        lemma = entry.findtext('./lexical-unit/form/text')
        morph_type_trait = entry.find('./trait[@name="morph-type"]')
        morpheme_type = morph_type_trait.get('value') if morph_type_trait is not None else None

        cur.execute("INSERT INTO lexemes VALUES (NULL, ?, ?)", (lemma, morpheme_type))
        # lastrowid is only updated by execute(), so capture it before executemany().
        lexeme_id = cur.lastrowid

        # The lemma itself is the first spelling; variants follow in document order.
        spelling_forms = [lemma]
        spelling_forms.extend(variant.text for variant in entry.findall('./variant/form/text'))
        cur.executemany(
            "INSERT INTO spellings VALUES (NULL, ?, ?)",
            [(form, lexeme_id) for form in spelling_forms],
        )

        for sense in entry.findall('sense'):
            gloss = sense.findtext('./gloss/text')
            grammatical_info = sense.find('grammatical-info')
            part_of_speech = grammatical_info.get('value') if grammatical_info is not None else None

            cur.execute(
                "INSERT INTO senses VALUES (NULL, ?, ?, ?)",
                (gloss, lexeme_id, part_of_speech),
            )
            sense_id = cur.lastrowid

            # sense_grammatical_info.sense references senses(rowid), so the
            # traits must be keyed by the sense id, not by the lexeme id.
            trait_rows = [
                (trait.get('name'), trait.get('value'), sense_id)
                for trait in sense.findall('./grammatical-info/trait')
            ]
            cur.executemany("INSERT INTO sense_grammatical_info VALUES (?, ?, ?)", trait_rows)

lexemes_df = pd.read_sql_query("SELECT * FROM lexemes", con)
spellings_df = pd.read_sql_query("SELECT * FROM spellings", con)
senses_df = pd.read_sql_query("SELECT * FROM senses", con)
gramm_df = pd.read_sql_query("SELECT * FROM sense_grammatical_info", con)

display(lexemes_df)
display(spellings_df)
display(senses_df)
display(gramm_df)

Unnamed: 0,rowid,lemma,morpheme_type
0,1,r-,prefix
1,2,sayù-,prefix
2,3,*eʔ,bound stem
3,4,e-,prefix
4,5,*ǫ:méh,bound stem
5,6,*werǫ́hs,bound stem


Unnamed: 0,rowid,form,lexeme
0,1,r-,1
1,2,sayù-,2
2,3,*eʔ,3
3,4,e-,4
4,5,í:-,4
5,6,*ǫ:méh,5
6,7,*werǫ́hs,6


Unnamed: 0,rowid,gloss,lexeme,part_of_speech
0,1,3.m.sg,1,Verb
1,2,3.m.sg.3.n.sg,2,Verb
2,3,go,3,Verb
3,4,PROTHETIC,4,Verb
4,5,person,5,Verb
5,6,trick,6,Verb


Unnamed: 0,name,value,sense
0,type,inflAffix,1
1,inflection-feature,{sbj}[sbj:[gen:m pers:3 num:sg]],1
2,type,inflAffix,2
3,inflection-feature,{obj}[sbj:[num:sg pers:3 gen:m] obj:[gen:n per...,2
4,type,inflAffix,4


CPU times: total: 15.6 ms
Wall time: 34.5 ms


In [5]:
%%time
# Load every <interlinear-text> into `texts` (one row per word) and link each
# glossed morpheme to a lexicon spelling in `text_morphemes`.
TEXTS_XML = "xml/texts.flextext"
tree = ET.parse(TEXTS_XML)
root = tree.getroot()

cur = con.cursor()

# Finds the spelling whose form matches the morph text and whose lexeme has a
# sense with the morph's gloss. ORDER BY / LIMIT make the pick deterministic
# if several spellings qualify.
SPELLING_LOOKUP_SQL = """
    SELECT spellings.rowid AS spellingid
    FROM spellings
    JOIN senses ON spellings.lexeme = senses.lexeme
    WHERE senses.gloss = ? AND spellings.form = ?
    ORDER BY spellings.rowid
    LIMIT 1
"""

# (title, narrative_order, spelling, gloss) of morphs with no lexicon match.
unmatched_morphs = []

# One transaction for the whole load: committed on success, rolled back on error.
with con:
    for text in root.findall('interlinear-text'):
        # Read the title from this text, not from the document root; otherwise
        # every text gets the first title and the second text violates
        # UNIQUE(text_name, narrative_order).
        title = text.findtext(".//item[@type='title']")
        if title is None:
            raise ValueError("interlinear-text element has no title item")

        morpheme_values = []
        for narrative_order, word in enumerate(text.findall('.//word')):
            cur.execute(
                "INSERT INTO texts VALUES (NULL, ?, ?, ?)",
                (title, narrative_order, word.findtext('./item')),
            )
            # The row id of the word just inserted; no follow-up SELECT needed.
            textid = cur.lastrowid

            # morpheme_order counts every <morph>, including skipped ones, so
            # it reflects the morph's position within the word.
            for morpheme_order, morph in enumerate(word.findall('.//morph')):
                gloss = morph.findtext("./item[@type='gls']")
                spelling = morph.findtext("./item[@type='txt']")
                if gloss is None or spelling is None:
                    continue
                match = cur.execute(SPELLING_LOOKUP_SQL, (gloss, spelling)).fetchone()
                if match is None:
                    # Not in the lexicon: record it instead of crashing on None[0].
                    unmatched_morphs.append((title, narrative_order, spelling, gloss))
                    continue
                morpheme_values.append((match[0], textid, morpheme_order))
        cur.executemany("INSERT INTO text_morphemes VALUES (?, ?, ?)", morpheme_values)

if unmatched_morphs:
    print(f"{len(unmatched_morphs)} morpheme(s) had no matching lexicon spelling and were skipped")

texts_df = pd.read_sql_query("SELECT * FROM texts", con)
morphs_df = pd.read_sql_query("SELECT * FROM text_morphemes", con)
con.close()

display(texts_df)
display(morphs_df)

Unnamed: 0,rowid,text_name,narrative_order,word
0,1,The Trickster and the Witch,0,erǫ:méh
1,2,The Trickster and the Witch,1,í:reʔ
2,3,The Trickster and the Witch,2,sayùwerǫ́hs
3,4,The Trickster and the Witch,3,nę
4,5,The Trickster and the Witch,4,hàyę́ʔ
...,...,...,...,...
486,487,The Trickster and the Witch,486,déʔšaʔ
487,488,The Trickster and the Witch,487,dèhsayuwerǫ́h
488,489,The Trickster and the Witch,488,tutiwáhę̀hteʔ
489,490,The Trickster and the Witch,489,tú:h


Unnamed: 0,spellingid,textid,morpheme_order
0,4,1,0
1,1,1,1
2,6,1,2
3,5,2,0
4,1,2,1
5,3,2,2
6,2,3,0
7,7,3,1
8,2,76,0
9,7,76,1


CPU times: total: 31.2 ms
Wall time: 37.3 ms
