In [2]:
from io import open
from conllu import parse_incr

# Lookup table built from the treebank:
#   lowercased word form -> {POS tag: lemma}
word_lemma_dict = {}

# Load every parsed sentence into a list. The `with` block guarantees the
# file is closed even if parsing raises part-way through.
with open('ru_syntagrus-ud-dev1.conllu', "r", encoding="utf-8") as data_file:
    ud_files = list(parse_incr(data_file))

# For each token, record the lemma under its lowercased form and POS tag.
for sentence in ud_files:
    for token in sentence:
        postag = token['upostag']
        # Numbers are annotated weirdly in the data, so they are skipped.
        if postag == "NUM":
            continue
        # The form is lowercased to reduce ambiguity; the lemma is kept
        # intact because some lemmas are proper names.
        form = token['form'].lower()
        # setdefault never overwrites, so the first lemma seen for a given
        # (form, POS) pair is the one that is kept.
        word_lemma_dict.setdefault(form, {}).setdefault(postag, token['lemma'])


In [9]:
def lemmatize(word, pos):
    """Return the lemma stored for *word* under the POS tag *pos*.

    The lookup is done in the module-level ``word_lemma_dict``, whose keys
    are lowercased word forms mapping to ``{pos: lemma}`` dictionaries.

    Args:
        word: Word form to lemmatize; matched case-insensitively.
        pos: POS tag used to pick the lemma for that form.

    Returns:
        The stored lemma when both the form and the tag are known,
        otherwise *word* exactly as it was passed in.
    """
    # Keys were lowercased when the table was built, so normalise the query
    # the same way; otherwise capitalised input could never match.
    lemmas_by_pos = word_lemma_dict.get(word.lower(), {})
    return lemmas_by_pos.get(pos, word)

In [14]:
# Sample (word form, POS tag) pairs to run through the lemmatizer.
words = [('Алгоритмы', 'NOUN'), ('оставаться', 'VERB'),('Моя','DET'), ('родилась','VERB')]
for raw_form, pos in words:
    # The lookup table is keyed by lowercased forms, so lowercase first.
    word = raw_form.lower()
    print(lemmatize(word, pos))

алгоритм
оставаться
мой
родиться
