This script reads a txt file created with the *Tagger* function of *Collatinus 11*, builds a horizontal dataframe with one token per line containing the following information, and saves it as a csv file.
- sentence_nr (Satznummer; 1:1, 1:2, ...n:n)
- Sentence (Kontext),
- morphological annotations,
- Lemma,
- Translation (German or English) and
- Future (yes|no)

In [None]:
import os
import pandas as pd
import re

In [None]:
# reads the annotated file into a list of lines; the context manager closes
# the handle as soon as the lines are in memory (it used to stay open)
with open('Collatinus_annotated_files/luk_2_annotated.txt', 'r', encoding="UTF-8") as file1:
    Lines = file1.readlines()

In [None]:
# drops every line that begins with one of the listed prefixes (a comma, an
# empty or blank line, "snt", or one of the two probability headers)
UsefulLines = [
    raw_line
    for raw_line in Lines
    if not raw_line.startswith(
        (",", "\n", " \n", "snt", "avec la proba", "Deuxième choix avec la proba")
    )
]

In [None]:
# Walks through the filtered lines and sorts their content into parallel
# lists that later become dataframe columns. The lists must grow in lockstep
# (one entry per token), because they are zipped together row by row.
tokList = []        # token line as read from the file
tokCleanList = []   # first whitespace-separated field of the token line
posList = []        # the "—>" annotation line, or "NON TROUVÉ"
postagList = []     # rest of the token line after its first field

ZeilenCount = 0  # not used below; kept because it is a notebook-level name
for line in UsefulLines:
    if line.endswith("non trouvé\n"):
        # Token without annotation: cut off the 11-character suffix
        # "non trouvé\n" and record the remainder as the token.
        tokList.append(line[:-11])
        tokCleanList.append(line[:-11])
        posList.append("NON TROUVÉ")
        # Such a line has no tag part. Without a placeholder postagList
        # falls one entry behind the other lists, which shifts the pos_Tag
        # column for every following token and makes zip() drop rows.
        postagList.append("-")

    elif line.startswith("—>"):
        # Annotation line; the token lists are filled by the other two
        # branches, so only posList grows here.
        posList.append(line)

    else:
        parts = line.split()
        if not parts:
            # whitespace-only line that slipped through the prefix filter
            continue
        tokList.append(line)
        tokCleanList.append(parts[0])
        postagList.append(' '.join(parts[1:]))


In [None]:
# Breaks every annotation entry into lemma, translation and morphological
# annotation; entries that do not split into exactly three fields get "-"
# in all three columns.
morphList = []
lemmaList = []
transList = []

for annotation in posList:
    # skip the first three characters, then cut at every ":" or "—"
    fields = re.split(":|—", annotation[3:])
    if len(fields) != 3:
        lemma = translation = morph = "-"
    else:
        lemma_field, translation, morph_field = fields
        lemma = lemma_field.split(",")[0]  # text before the first comma
        morph = morph_field[:-2]           # without the last two characters
    morphList.append(morph)
    lemmaList.append(lemma)
    transList.append(translation)

In [None]:
# builds the column Futur: "FUTUR" when the annotation entry contains the
# substring "futur", "---" otherwise
# NOTE(review): the test runs on the whole annotation entry, not only on its
# morphology part - confirm that lemma/translation text cannot contain "futur"
futList = ["FUTUR" if "futur" in annotation else "---" for annotation in posList]

In [None]:
# builds the dataset from the lists (variant left commented out)
#data_tuples = list(zip(sentList, sentsentList,tokList, tokCleanList,posList, morphList, lemmaList, lemmaCleanList, transList, futList))

# df = pd.DataFrame(data_tuples, columns=['SatzNr','Satz', 'Token', 'Token_clean','Annotation', 'MorphAnno', 'Lemma', 'Lemma_clean', 'Translation', 'Futur'])

In [None]:
# assembles the token-level dataframe, one row per zipped tuple; sentence
# text and sentence numbers are not part of it and are attached afterwards
data_tuples = list(
    zip(
        tokList,
        tokCleanList,
        postagList,
        posList,
        morphList,
        lemmaList,
        transList,
        futList,
    )
)

df = pd.DataFrame(
    data_tuples,
    columns=[
        'Token',
        'Token_clean',
        'pos_Tag',
        'Annotation',
        'MorphAnno',
        'Lemma',
        'Translation',
        'Futur',
    ],
)

In [None]:
# reads the non-annotated file (source of the line numbers); the context
# manager closes the handle once the lines are in memory
with open('txt_files_vulgata/Luk_2.txt', 'r', encoding="UTF-8") as file1:
    Lines = file1.readlines()

In [None]:
# For every non-blank line: the first whitespace-separated field is taken as
# the verse number and the remaining fields as the sentence. Sentence, verse
# number and a running line counter are repeated once per remaining field,
# so the three lists hold one entry per whitespace-separated word.
counter = 0
sentList = []
sentNrList = []
verseNrList = []
for element in Lines:
    splitel = element.split()
    if not splitel:
        # Blank line. This also covers whitespace-only lines other than
        # exactly "\n" / " \n", which previously raised IndexError below.
        continue
    counter += 1
    versenr = splitel[0]
    sentence = ' '.join(splitel[1:])
    word_count = len(splitel) - 1
    sentList.extend([sentence] * word_count)
    verseNrList.extend([versenr] * word_count)
    sentNrList.extend([counter] * word_count)
        

In [None]:
# sentence-level columns as their own dataframe: running counter ('ID'),
# verse number ('Satznummer') and sentence text ('Satz')
data_tuples = list(
    zip(
        sentNrList,
        verseNrList,
        sentList,
    )
)
df1 = pd.DataFrame(
    data_tuples,
    columns=['ID', 'Satznummer', 'Satz'],
)

In [None]:
# puts the sentence columns (df1) and the token columns (df) side by side
# NOTE(review): rows are paired by index position, so this presumably relies
# on both frames listing the tokens in the same order and number - confirm
df2 = pd.concat([df1, df], axis="columns")

In [None]:
# saves the combined dataframe as a csv file, without the index column
df2.to_csv(
    'Luk_2_pos.csv',
    encoding='utf-8',
    index=False,
)