<a href="https://colab.research.google.com/github/LUMII-AILab/NLP_Course/blob/main/notebooks/ParsingUD.ipynb" target="_new"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

# REST APIs for UD parsing

In [12]:
import requests
import urllib
import json

In [1]:
text_lv = "Ģertrūdes ielas teātris aicina uz Baltijas neatkarīgo teātru viesizrādēm, kas notiks 3. aprīlī."
text_en = "Gertrude Street Theater invites you to guest performances of Baltic independent theaters, which will take place on April 3."

## LV-PIPE

https://nlp.ailab.lv

In [None]:
# Calls the LV-PIPE REST API
def process_lvpipe(text, steps=None, api_url="https://nlp.ailab.lv/api/nlp"):
    """Send `text` to the LV-PIPE REST API and return the annotated document.

    Args:
        text: Raw text to annotate.
        steps: List of pipeline step names; defaults to
            ['tokenizer', 'morpho', 'parser'].
        api_url: Endpoint that receives the POST request.

    Returns:
        The value stored under the 'data' key of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    steps = steps or ['tokenizer', 'morpho', 'parser'] # +ner

    response = requests.post(api_url, json={'data': {'text': text}, 'steps': steps})
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError further down.
    response.raise_for_status()
    return response.json()['data']

In [None]:
doc = process_lvpipe(text_lv)

# ensure_ascii=False writes the Latvian letters verbatim, so the file has to be
# opened as UTF-8 explicitly; the platform default encoding may not be UTF-8,
# and json_to_conll() reads this file back as UTF-8.
with open("lvpipe_output.json", 'w', encoding="utf-8") as f:
    json.dump(doc, f, indent=4, ensure_ascii=False)

with open("lvpipe_output.json", 'r', encoding="utf-8") as f:
    print(f.read())

In [None]:
# Most of the work in NLP is to convert data from one format to another ;)

# This code block allows for customisation and format conversion,
# since other NLP pipelines might use different data formats and naming conventions.

# Each constant is named after a CoNLL-U column and holds the key under which
# that value is looked up in a token dictionary (see token_features below).
ID     = "index"
FORM   = "form"
LEMMA  = "lemma"
UPOS   = "upos"
XPOS   = "tag"
FEATS  = "ufeats"
HEAD   = "parent"
DEPREL = "deprel"
DEPS   = "deps"
MISC   = "misc"

# Order in which the columns are written on a token line.
FORMAT = [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]


# For each token, creates a TAB-separated list of its UD features
def token_features(token, cols=None):
    """Render one token as a TAB-separated line terminated by a newline.

    Args:
        token: Dictionary holding the token's annotations.
        cols: Keys to emit, in output order. Defaults to the module-level
            FORMAT list.

    Returns:
        The values joined by TABs (no trailing TAB) followed by '\\n'.
        A key that is absent, None, or an empty string/container is
        written as '_'.
    """
    if cols is None:
        cols = FORMAT

    fields = []
    for key in cols:
        value = token.get(key)
        # Only None and empty strings/containers count as missing. A numeric 0
        # is a real value (HEAD of the root token) and must not become '_'.
        is_missing = value is None or (hasattr(value, "__len__") and len(value) == 0)
        fields.append("_" if is_missing else str(value))

    return '\t'.join(fields) + '\n'


# De-tokenizes a tokenized sentence
def detokenize_sentence(sentence):
    """Rebuild an approximate sentence text from its tokens.

    Args:
        sentence: Dictionary with a "tokens" list; each token carries its
            surface form under the FORM key.

    Returns:
        The token forms separated by single spaces, with no leading or
        trailing space.
    """
    # Naive gluing: punctuation is not re-attached to the neighbouring word.
    return ' '.join(token[FORM] for token in sentence["tokens"])


# Reads a JSON file produced by LV-PIPE and converts it to the CoNLL-U format
def json_to_conll(json_file, conll_file):
    """Convert a JSON document with a "sentences" list into a CoNLL-U style file.

    Args:
        json_file: Path of the UTF-8 JSON file to read.
        conll_file: Path of the UTF-8 file to write (overwritten).

    Each sentence is written as a "# text = ..." comment, one line per
    token, and a blank separator line.
    """
    with open(json_file, encoding="utf-8") as source:
        document = json.load(source)

    with open(conll_file, 'w', encoding="utf-8") as target:
        for sent in document["sentences"]:
            target.write("# text = {}\n".format(detokenize_sentence(sent)))
            target.writelines(token_features(tok) for tok in sent["tokens"])
            # Blank line marks the end of the sentence
            target.write('\n')

In [15]:
# Pretty-prints a CoNLL-U file
def print_conllu(conll_file):
    """Print a TAB-separated CoNLL-U style file with aligned columns.

    Args:
        conll_file: Path of the UTF-8 file to print.

    Comment lines (starting with '#') are printed as they are, token rows
    are padded so that every column lines up, and each blank line (sentence
    break) is printed as two empty lines. Files without token rows, or with
    rows of differing length, are handled without raising.
    """
    structured_lines = []

    with open(conll_file, 'r', encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if line.startswith("#"):
                # Comment lines
                structured_lines.append(stripped)
            elif stripped:
                # Non-empty non-comment lines
                structured_lines.append(stripped.split("\t"))
            else:
                # Empty lines - sentence breaks
                structured_lines.append('')

    # Find the maximum width of each column; rows may differ in length,
    # so the list grows as wider rows are seen.
    max_widths = []
    for row in structured_lines:
        if not isinstance(row, list):
            continue
        for i, cell in enumerate(row):
            if i == len(max_widths):
                max_widths.append(len(cell))
            elif len(cell) > max_widths[i]:
                max_widths[i] = len(cell)

    for line in structured_lines:
        if isinstance(line, list):
            # A data row
            print(' '.join(word.ljust(max_widths[i]) for i, word in enumerate(line)))
        else:
            # A comment or empty line
            print(line)
            if line == "": print()

In [None]:
# Convert the saved LV-PIPE JSON into a CoNLL-U file, then show it as a table.
json_to_conll(json_file="lvpipe_output.json", conll_file="lvpipe_output.conll")

print_conllu(conll_file="lvpipe_output.conll")

## UDPipe

https://ufal.mff.cuni.cz/udpipe/2

In [13]:
# Calls the UDPipe REST API
def process_udpipe(text, model, steps=None, api_url="https://lindat.mff.cuni.cz/services/udpipe/api/process"):
    """Send `text` to the UDPipe REST API and return the service's result.

    Args:
        text: Raw text to annotate.
        model: Model name passed as the `model` query parameter (e.g. "lv").
        steps: Query-string fragment naming the processing steps, joined by
            '&'; defaults to "tokenizer&tagger&parser".
        api_url: Endpoint that receives the GET request.

    Returns:
        The value stored under the 'result' key of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    model = "model=" + quote(model)
    steps = steps or "tokenizer&tagger&parser"
    text = "data=" + quote(text)

    response = requests.get(api_url + '?' + model + '&' + steps + '&' + text)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError further down.
    response.raise_for_status()
    return response.json()['result']

In [None]:
# Parse the Latvian and the English sample with the matching UDPipe models.
doc_lv = process_udpipe(text_lv, model="lv")
doc_en = process_udpipe(text_en, model="en")

# Both results go into one file, Latvian first.
with open("udpipe_output.conll", mode='w', encoding="utf-8") as c_file:
    c_file.writelines([doc_lv, doc_en])

print_conllu("udpipe_output.conll")

In [None]:
# This code block allows for the conversion from CoNLL-U to LV-PIPE's JSON.
# Again, most of the code is for converting data back and forth.

def create_token(data_row):
    """Build a token dictionary from the first ten columns of a token row.

    Args:
        data_row: Sequence of column values, indexed 0-9.

    Returns:
        A dictionary mapping each key below to the column at the same
        position. Raises IndexError when the row has fewer than ten columns.
    """
    # Position in this tuple == column index in the row.
    keys = (
        "index",
        "form",
        "lemma",
        "upos",
        "pos",       # XPOS
        "features",  # FEATS
        "parent",    # HEAD
        "deprel",
        "deps",      # +DEPS
        "misc",      # +MISC
    )
    return {key: data_row[position] for position, key in enumerate(keys)}


def create_sentence(text, tokens):
    """Wrap a token list and its whitespace-trimmed text into a sentence dictionary."""
    sentence = dict(tokens=tokens)
    sentence["text"] = text.strip()
    return sentence


def create_doc(sentences, full_text):
    """Wrap a sentence list and the whitespace-trimmed full text into a document dictionary."""
    document = dict(sentences=sentences)
    document["text"] = full_text.strip()
    return document


def conll_to_json(conll_file):
    """Read a CoNLL-U file and build a document dictionary from it.

    Args:
        conll_file: Path of the UTF-8 CoNLL-U file to read.

    Returns:
        The dictionary built by create_doc(): a list of sentences (each with
        its tokens and text) plus the texts of all sentences joined by spaces.

    Only rows whose first column is a plain integer become tokens, so
    multiword-token ranges ("1-2") and empty nodes ("1.1") are skipped.
    A sentence without a "# text = ..." comment gets its text from the
    token forms joined by spaces. The last sentence is kept even when the
    file does not end with a blank line.
    """
    text_marker = "# text ="

    with open(conll_file, 'r', encoding="utf-8") as c_file:
        lines = c_file.readlines()

    sentences = []
    sentence_texts = []
    tokens = []
    text = ""

    # The extra empty line closes a final sentence that has no blank line after it.
    for line in lines + [""]:
        line = line.strip()
        data_row = line.split('\t')

        if data_row[0].isdigit():
            tokens.append(create_token(data_row))
        elif line.startswith(text_marker):
            # Match the exact key so that e.g. "# text_en = ..." is not taken.
            text = line[len(text_marker):].strip()
        elif not line:
            if tokens:
                if not text:
                    text = ' '.join(token["form"] for token in tokens)
                sentences.append(create_sentence(text, tokens))
                sentence_texts.append(text)
            # Always start clean so nothing leaks into the next sentence.
            tokens = []
            text = ""

    return create_doc(sentences, ' '.join(sentence_texts))

In [None]:
# ensure_ascii=False writes non-ASCII letters verbatim, so the file is opened
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open("udpipe_output.json", 'w', encoding="utf-8") as f:
    json.dump(conll_to_json("udpipe_output.conll"), f, ensure_ascii=False, indent=4)

with open("udpipe_output.json", 'r', encoding="utf-8") as f:
    print(f.read())

# Python modules

## Stanza (revisited)

https://stanfordnlp.github.io/stanza/performance.html

In [None]:
!pip install stanza

import stanza

In [None]:
# Fetch the Stanza resources for both languages used below.
for lang_code in ('lv', 'en'):
    stanza.download(lang_code)

In [None]:
# Both pipelines run the same processor chain; only the language differs.
stanza_processors = 'tokenize,pos,lemma,depparse'
nlp_lv = stanza.Pipeline(lang='lv', processors=stanza_processors)
nlp_en = stanza.Pipeline(lang='en', processors=stanza_processors)

In [None]:
# Parse both sample texts, Latvian first.
docs = [nlp_lv(text_lv), nlp_en(text_en)]

with open("stanza_output.conll", 'w', encoding="utf-8") as s_file:
    # One comment line per sentence, then one TAB-separated row per word.
    # The row is a reduced column set (id, form, UPOS, XPOS, head id,
    # head form, relation), not the full ten CoNLL-U columns.

    for d in docs:
        for s in d.sentences:
            s_file.write(f'# text = {s.text}\n')
            for w in s.words:
                # Head ids are 1-based; 0 means the word has no head in the sentence.
                head_form = s.words[w.head-1].text if w.head > 0 else "_"
                row = (w.id, w.text, w.upos, w.xpos, w.head, head_form, w.deprel)
                s_file.write('\t'.join(str(field) for field in row) + '\n')
            s_file.write("\n")

print_conllu("stanza_output.conll")