In [2]:
import os
import pandas as pd
from time import time

### Parsing the `.conllu` file line by lines
`records` be an empty list of lists to hold tagged information

In [3]:
start = time()
file_path = 'hi_pud-ud-test.conllu' # path where your .conllu file is located
with open(file_path, 'r', encoding='utf-8') as file:
    file_prefix = file_path.split('.')[0] + '_'
    doc_id = ''
    sent_id = ''
    records = list()
    for line in file:
        if len(line) > 1 :
            if line[0] == '#':
                line = line.split('=')
                if 'newdoc' in line[0]:
                    doc_id = file_prefix + line[1].strip()
                elif 'sent_id' in line[0]:
                    sent_id = line[1].strip()
            else:
                info = line.split('\t')
                if len(info) == 10:
                    records.append([doc_id, sent_id] + [x.strip() for x in info])
end = time()
print("Time elapsed:", end-start, "seconds")

Time elapsed: 0.07933378219604492 seconds


### Create a dataframe to copy the parsed records

In [4]:
df = pd.DataFrame(records, columns=['DOC_NO', 'SENT_NO', 'ID', 'FORM','LEMMA','UPOS', 'XPOS', 'FEAT','HEAD', 'DEPREL', 'DEPS', 'MISC'])

### Saving parsed records in tsv (tab separated view) file

In [6]:
df.to_csv('hindi-output.tsv', header=True, index=None, sep='\t')
df.head()

Unnamed: 0,DOC_NO,SENT_NO,ID,FORM,LEMMA,UPOS,XPOS,FEAT,HEAD,DEPREL,DEPS,MISC
0,hi_pud-ud-test_n01001,n01001011,1,“,_,PUNCT,``,_,16,punct,_,SpaceAfter=No|Translit=“
1,hi_pud-ud-test_n01001,n01001011,2,संयुक्त,_,ADJ,JJ,_,4,amod,_,Proper=True|Translit=saṁyukta
2,hi_pud-ud-test_n01001,n01001011,3,राज्य,_,NOUN,NN,Animacy=Inan|Gender=Masc|Number=Sing,4,compound,_,Proper=True|Translit=rājya
3,hi_pud-ud-test_n01001,n01001011,4,अमेरिका,_,PROPN,NNP,Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing,16,obl,_,Translit=amerikā
4,hi_pud-ud-test_n01001,n01001011,5,में,_,ADP,IN,Case=Loc,4,case,_,Translit=meṁ


### Iterating over a folder of .coNLLu files

In [16]:
directory = '../gdex-word-embeddings/ud/'
records = list()
for filename in os.listdir(directory):
    if '-set-' in filename and filename.endswith(".conllu"):
        start = time()
        file_path = os.path.join(directory, filename) # path where your .conllu file is located
        #print(file_path)
        with open(file_path, 'r') as file:
            file_prefix = filename.split('.')[0] + '_'
            doc_count = 1
            sent_count = 1
            par_count = 1
            doc_id = file_prefix + str(doc_count)
            sent_id = sent_count
            par_id = par_count            
            for line in file:
                if len(line) > 1 :
                    # if it is related to doc, paragraph or sentence 
                    if line[0] == '#':
                        if 'newdoc' in line:
                            if line.strip() != '# newdoc':
                                if '=' in line:
                                    doc_id = file_prefix + line.split('=')[1].strip()
                                    doc_count += 1
                                else:
                                    doc_id = file_prefix + str(doc_count)
                                    doc_count =+ 1
                            elif line.strip() == '# newdoc':
                                doc_id = file_prefix + str(doc_count)
                                doc_count =+ 1
                                
                        if 'newpar' in line:
                            if line.strip() != '# newpar':
                                if '=' in line:
                                    par_id = file_prefix + line.split('=')[1].strip()
                                    par_count += 1
                                else:
                                    par_id = file_prefix + str(par_count)
                                    par_count =+ 1
                            elif line.strip() == '# newpar':
                                par_id = file_prefix + str(par_count)
                                par_count =+ 1
                        
                        if '# sent_id' in line:
                            if '=' in line:
                                sent_id = file_prefix + line.split('=')[1].strip()
                                sent_count += 1
                            else:
                                sent_id = file_prefix + str(sent_count)
                                sent_count =+ 1
                    # if it is related to coNLLu data
                    elif line[0] != '#':
                        info = line.split('\t')
                        if len(info) == 10:
                            temp = [doc_id, par_id, sent_id] + [x.strip() for x in info]
                            if len(temp) == 13:
                                records.append(temp)
                else:
                    doc_count += 1
                    par_count += 1
                    sent_count += 1
                    
        end = time()
        print(filename, "Time elapsed:", end-start, "seconds")
    else:
        continue
df = pd.DataFrame(records, columns=['DOC_ID', 'PAR_ID', 'SENT_ID', 'ID', 'FORM','LEMMA','UPOS', 'XPOS', 'FEAT','HEAD', 'DEPREL', 'DEPS', 'MISC'])
df.to_csv('aq-all-conllu.tsv', header=True, index=None, sep='\t')
df.head()

aq-set-10.conllu Time elapsed: 0.12796545028686523 seconds
aq-set-13.conllu Time elapsed: 0.08791232109069824 seconds
aq-set-4.conllu Time elapsed: 0.04593944549560547 seconds
aq-set-14.conllu Time elapsed: 0.6223409175872803 seconds
aq-set-7.conllu Time elapsed: 0.07025718688964844 seconds
aq-set-6.conllu Time elapsed: 0.07943248748779297 seconds
aq-set-2.conllu Time elapsed: 0.010993719100952148 seconds
aq-set-8.conllu Time elapsed: 0.43564367294311523 seconds
aq-set-9.conllu Time elapsed: 0.07404804229736328 seconds
aq-set-12.conllu Time elapsed: 0.08122825622558594 seconds
aq-set-3.conllu Time elapsed: 0.015195369720458984 seconds
aq-set-1.conllu Time elapsed: 0.006630897521972656 seconds
aq-set-11.conllu Time elapsed: 0.07516884803771973 seconds
aq-set-5.conllu Time elapsed: 0.07778739929199219 seconds


Unnamed: 0,DOC_ID,PAR_ID,SENT_ID,ID,FORM,LEMMA,UPOS,XPOS,FEAT,HEAD,DEPREL,DEPS,MISC
0,aq-set-10_1,aq-set-10_1,aq-set-10_1,1,The,the,DET,DT,Definite=Def|PronType=Art,2,det,_,_
1,aq-set-10_1,aq-set-10_1,aq-set-10_1,2,development,development,NOUN,NN,Number=Sing,13,nsubj,_,_
2,aq-set-10_1,aq-set-10_1,aq-set-10_1,3,of,of,ADP,IN,_,4,case,_,_
3,aq-set-10_1,aq-set-10_1,aq-set-10_1,4,telephony,telephony,NOUN,NN,Number=Sing,2,nmod,_,_
4,aq-set-10_1,aq-set-10_1,aq-set-10_1,5,in,in,ADP,IN,_,8,case,_,_
