In [1]:
import re
import unicodedata

import pandas as pd

def _is_word_char(char):
    """Return True if ``char`` is part of a word: letter, digit, underscore or combining mark."""
    # ``isalnum`` plus underscore mirrors what the regex class ``\w`` accepts;
    # combining marks (Unicode categories Mn/Mc/Me) are added because a tone or
    # nasalisation diacritic belongs to the preceding letter.
    return char == '_' or char.isalnum() or unicodedata.category(char).startswith('M')


def word_in_sentence(word, sentence):
    """Tell whether ``word`` occurs in ``sentence`` as a whole word.

    The comparison is case-insensitive and diacritic-exact: both strings are
    lower-cased and normalised to NFD, so precomposed and decomposed spellings
    of the same syllable compare equal, while a bare base letter does not
    match a syllable that carries extra diacritics.

    Parameters
    ----------
    word : str
        The form to look for; it is matched literally, not as a pattern.
    sentence : str
        The text to search.

    Returns
    -------
    bool
        True if some occurrence of ``word`` is neither preceded nor followed
        by a word character (letter, digit, underscore or combining mark).
    """
    sentence = unicodedata.normalize('NFD', sentence.lower())
    word = unicodedata.normalize('NFD', word.lower())

    start = sentence.find(word)
    while start != -1:
        end = start + len(word)
        boundary_before = start == 0 or not _is_word_char(sentence[start - 1])
        boundary_after = end == len(sentence) or not _is_word_char(sentence[end])
        if boundary_before and boundary_after:
            return True
        # Step by one character so overlapping candidates are not skipped.
        start = sentence.find(word, start + 1)
    return False


def _after_marker(text, marker):
    r"""Return the piece of ``text`` that follows ``marker``.

    The text is split on ``marker`` plus one space; if the marker is never
    followed by a space (for example it ends the line) the split falls back to
    the bare marker, so a piece after the marker always exists. The caller
    must have checked that ``marker`` occurs in ``text``.
    """
    pieces = text.split(marker + ' ')
    if len(pieces) < 2:
        pieces = text.split(marker)
    return pieces[1]


# Read the LaTeX source, one list entry per physical line (newline kept).
# NOTE(review): assumes texts.tex is UTF-8 encoded; the platform default
# encoding is not guaranteed to handle the IPA characters and diacritics.
with open('texts.tex', 'r', encoding='utf-8') as f:
    texdoc = f.readlines()

speakers = ['Adama:', 'Karim:', 'Emile:']

# Values used until the first \section / speaker label is seen
section = 'Unknown'
speaker = 'Emile'
sentences = []

for i, line in enumerate(texdoc):

    # get section
    if r'\section' in line:
        section = line.split(r'\section')[1]

    # get speaker
    for s in speakers:
        if s in line:
            speaker = line.split(':')[0]

    # get three lines: \gll line (text), the next line (gloss), then \glt (translation)
    if r'\gll' in line:
        # A \gll in the last two lines of the file has no following lines;
        # treat the missing ones as empty instead of raising IndexError.
        gloss_line = texdoc[i + 1] if i + 1 < len(texdoc) else ''
        glt_line = texdoc[i + 2] if i + 2 < len(texdoc) else ''

        if r'\glt' in glt_line:
            glt = _after_marker(glt_line, r'\glt').strip('\n').strip('\\').strip("'").strip('`')
        else:
            glt = ''

        # add a sentence: [section, speaker, text, gloss, translation]
        sentences.append([section.split(' %')[0].replace('{', '').replace('}', ''),
                          speaker.strip(' '),
                          _after_marker(line, r'\gll').strip('\n').strip('\\'),
                          gloss_line.strip('\n').strip('\\').strip(' '),
                          glt
                          ])

In [2]:
# Table of every parsed sentence.
# The labels follow the field order of each record in `sentences`.
columns = [
    'Section',
    'Speaker',
    'Toussian',
    'Gloss',
    'English',
]
df = pd.DataFrame(data=sentences, columns=columns)
df.head()

Unnamed: 0,Section,Speaker,Toussian,Gloss,English
0,History 0080,Emile,á pɔ̄=ǹ kūr kə̀pɛ̂ kɔ̰̌,\textsc{2sg} \textsc{ss}=\textsc{ipfv} village...,"You will give thanks to the village's ancestors,"
1,History 0080,Emile,nə̀ kūr síŋplɛ̰̄ dáríkɛ́,\textsc{ss} village ancestors {ask pardon},and ask pardon to the village's ancestors.
2,History 0080,Emile,ńtɛ̌ntò pɔ̄ pé kə́=rɔ̌ a᷇n pɔ̄ ɟā-sáɣ nə̀...,\textsc{rel.pro.sg} \textsc{is} \textsc{cop} \...,"What was in it, when we grew, what we heard/un..."
3,History 0080,Emile,a᷇n mə̌nə̀ wṵ̌ɲīŋ m̀byɛ́kɛ̄ˁ mə̌nə̀ wṵ̌ɲīŋ...,\textsc{1pl} and Wṵ̌ɲīŋ M̄bíɛ́kɛ̄ˁ and Wṵ̌...,"Us and Wṵ̌ɲīŋ---M̄bíɛ́kɛ̄ˁ and Wṵ̌ɲīŋ, th..."
4,History 0080,Emile,bɔ̰ ḿpra᷇ˁ pə᷇n sā̰ˁ ꜜtyā dô,okay Djigouera 3pl.hum put place \textsc{excl},"Djigouera, the moment when they founded Djigou..."


In [3]:
# word searching

'''
á	PST
pī	FUT
wú	EVID~REP
sə́	DUB~IRR~infr
à	COND
pə́	COND
yē	truly
rí	JUSS~SBJV
kə́	neg
kə̀pə́	NEG.IMP~NEG.SBJV
mɛ̄	then, (no) longer
pɔ̄	IS
tó	again
(ꜜ)pə́~pə̄	PROG
=n~=ǹ~nə̀~ǹ=~n=	IPFV
pwó/pī	come.pfv/ipfv
kɛ́y/tyu᷇ (also tyo᷇, tya᷇)	go.pfv/ipfv
fáná	also
fā̰(ˤ)/k͡pɔ́	can
'''

aux, glo = 'pī', 'FUT' # INPUT HERE

# Keep the rows whose gloss contains `glo` AND whose Toussian line contains `aux`.
# Both masks are computed on the full frame and cast to bool: filtering in two
# steps breaks when the first step leaves no rows, because `.apply` on an empty
# column returns a non-boolean Series that pandas reads as a column selection,
# which silently drops every column from the result.
gloss_mask = df['Gloss'].apply(lambda gloss: word_in_sentence(glo, gloss)).astype(bool)
form_mask = df['Toussian'].apply(lambda form: word_in_sentence(aux, form)).astype(bool)
filtered_df = df[gloss_mask & form_mask]
filtered_df.head()

Unnamed: 0,Section,Speaker,Toussian,Gloss,English
17,History 0080,Emile,pə̂r wú=n ɟín blɛ̰́ nə̀ pī=n pə́ krúꜜmɔ́ n...,\textsc{log.emph} \textsc{evid}=\textsc{ipfv} ...,
142,Marriage traditions 0084,Karim,dɔ̰̄ kə̀ fwɔ̌\textasciimacron tà tɛ̰̌ nə̀sɛ̰᷇...,so \textsc{3sg.nh} good begin grab how until \...,"Therefore, how does the tradition work from th..."
308,The rabbit and the agama lizard (0182),Adama,pə́=n pī=n yɔ́ pɛ̄y pə̀=à pə̂r ɲìŋ,\textsc{log=ipfv} \textsc{fut}=\textsc{ipfv} s...,he will say (it) if they bury him
317,The rabbit and the agama lizard (0182),Adama,fɔ́ nə̀=n pī=ǹ dyá̰ ɟín twi᷇ kə́=n mɛ̄=n p...,until \textsc{ss=ipfv} \textsc{fut=ipfv} \text...,"Just until the end of the world, the rain will..."
318,The rabbit and the agama lizard (0182),Adama,mais nīŋ à=n pī=n k͡pé twi᷇ à=n pɛ́y pə̂r...,but water \textsc{cond} \textsc{fut=ipfv} pour...,if the water will fall. If it will rain. After...


In [None]:
# Print each match as a three-line block (text, gloss, translation)
# followed by blank lines, for easy reading and copying.

for _, row in filtered_df.iterrows():
    print(
        f"{row['Toussian']}\n{row['Gloss']}\n{row['English']}\n\n"
        )

In [9]:
# WARNING: MAY OVERWRITE
# Start search_result.csv afresh: a header row only, no data rows.

csv_header = ['Marker', *columns]
empty_results = pd.DataFrame(columns=csv_header)
empty_results.to_csv('search_result.csv', index=False, header=True)

In [24]:
# WARNING: re-running this cell appends the same rows again
# append the dataframe to an existing csv

# The header of search_result.csv starts with a 'Marker' column that
# filtered_df does not have; without it every appended value lands one
# column too far to the left.
# NOTE(review): assumes 'Marker' should hold the searched form (`aux`) — confirm.
search_rows = filtered_df.assign(Marker=aux).reindex(columns=['Marker'] + columns)
search_rows.to_csv('search_result.csv', mode='a', index=False, header=False)