# Data Preparation for Analysis
* parse ABSA data with semantic dependencies
* parse ABSA data with syntactic dependencies
* prepare data structure for graph (use `mtool`'s Graph) and support graph visualization

In [9]:
import sys, os, json 
from typing import List, Any, Dict, Callable, Iterable
from IPython.display import HTML, display
import tabulate

# Root of the LIBERT model directory inside the nlp-architect repo.
# Absolute path on the author's machine; the data/analysis paths below are built from it.
libert_dir = "/data/home/ayalklei/nlp-architect/nlp_architect/models/libert"
# Names of the ABSA domains processed in the cells below.
domains = ["restaurants", "laptops", "device"]

In [18]:
# At first we used the jsonl files, but those files hold the text before any tokenization is applied.
# Instead, take the raw text from the CoNLL-like formatted ABSA data,
# so that the parsed data has the same tokenization as the ABSA data.
conll_data_dir = f"{libert_dir}/data/conll/domains_all"
rest_source_file = f"{conll_data_dir}/restaurants.txt"
laptops_source_file = f"{conll_data_dir}/laptops.txt"

def extract_raw(src_fn: str, output_fn: str):
    """Extract raw sentences from a CoNLL-like file.

    The source file holds one token per line (the token is the first
    whitespace-separated field of the line); a sentence is a block of
    consecutive non-blank lines, and blocks are delimited by blank lines.
    Every sentence is written to ``output_fn`` as one line, with its tokens
    joined by single spaces.

    Args:
        src_fn: path of the CoNLL-like input file.
        output_fn: path of the raw-sentences file to write (overwritten).
    """
    sents = []
    current_sentence = []
    with open(src_fn) as fin:
        for line in fin:
            fields = line.split()
            if fields:
                # token line: keep only the token column
                current_sentence.append(fields[0])
            elif current_sentence:
                # blank line closes the current block; repeated blank lines
                # must not produce empty sentences
                sents.append(' '.join(current_sentence) + "\n")
                current_sentence = []
    # the last block is not necessarily followed by a blank line
    if current_sentence:
        sents.append(' '.join(current_sentence) + "\n")
    # write raw sents
    with open(output_fn, "w") as fout:
        fout.writelines(sents)

# Produce the raw-sentence files for the two domains that come in CoNLL-like
# format (restaurants and laptops).
extract_raw(
    src_fn=rest_source_file,
    output_fn=f"{libert_dir}/analysis/raw_sentences/restaurants.txt",
)
extract_raw(
    src_fn=laptops_source_file,
    output_fn=f"{libert_dir}/analysis/raw_sentences/laptops.txt",
)

In [2]:
# The device-domain sentences are already raw text, one per line, in
# $libert_dir/data/Wang2018/addsenti_device.txt. A line may carry a suffix that
# starts with '##' and lists the opinion terms; everything from the first '##'
# onward is cut off and the line is re-terminated with a newline.
orig_device_raw_sentences_fn = f"{libert_dir}/data/Wang2018/addsenti_device.txt"
target_device_raw_sentences_fn = f"{libert_dir}/analysis/raw_sentences/device.txt" 
with open(orig_device_raw_sentences_fn) as fin:
    lines = [
        line.partition("##")[0] + "\n" if "##" in line else line
        for line in fin
    ]
with open(target_device_raw_sentences_fn, "w") as fout:
    fout.write("".join(lines))


In [3]:
# Build the jsonl input files that the allennlp predictor of the HIT-SCIR parser reads
def prepare_allennlp_predictor_input(raw_fn: str, out_fn: str):
    """Turn a raw-sentences file into a jsonl file, one JSON object per line.

    Each input line becomes ``{"sentence": <line without trailing whitespace>,
    "id": <zero-based line index, zero-padded to 4 digits>}``.

    Args:
        raw_fn: path of the raw-sentences file (one sentence per line).
        out_fn: path of the jsonl file to write (overwritten).
    """
    records = []
    with open(raw_fn) as fin:
        for index, sentence in enumerate(fin):
            entry = {"sentence": sentence.rstrip(), "id": f"{index:04}"}
            records.append(json.dumps(entry) + "\n")
    with open(out_fn, "w") as fout:
        fout.writelines(records)

# Write one allennlp input file per domain from that domain's raw sentences.
for domain in domains:
    prepare_allennlp_predictor_input(
        raw_fn=f"{libert_dir}/analysis/raw_sentences/{domain}.txt",
        out_fn=f"{libert_dir}/analysis/input_for_allennlp/{domain}.jsonl",
    )


## Run HIT-SCIR parser
Use the following fish-shell command from the HIT-SCIR directory:
```fish
for frlsm in dm psd; for domain in restaurants laptops device; allennlp predict --output-file $libert_dir/analysis/HIT-SCIR-parses/$frlsm-$domain-output.mrp --predictor transition_predictor_sdp --include-package utils --include-package modules --batch-size 32  HIT-SCIR-CoNLL2019-model/$frlsm $libert_dir/analysis/input_for_allennlp/$domain.jsonl; end; end;
```

## SDP Visualization

We use the [MRP](http://mrp.nlpl.eu/2019/) data format, and leverage [mtool](https://github.com/cfmrp/mtool) for format-conversions and visualizations. 

In [4]:
from graph import Graph # of mtool package, available at HIT-SCIR .venv

def load_parsed_graph(formalism="dm", domain="restaurants", graph_id = 1) -> Graph:
    """Load a single parsed graph from an `.mrp` file.

    Args:
        formalism: "syndep" selects the spaCy syntactic parses
            (`spacy-syndep-parses/{domain}-syndep.mrp`); any other value is
            used as the file-name prefix of a HIT-SCIR output
            (`HIT-SCIR-parses/{formalism}-{domain}-output.mrp`).
        domain: domain name used in the file name.
        graph_id: zero-based index of the line (one JSON-encoded graph per
            line) to decode.

    Returns:
        The `Graph` decoded from the selected line.

    Raises:
        IndexError: if the file has no line at index `graph_id`.
    """
    # Compare by value: `is` on a string literal only works by accident of interning.
    if formalism == "syndep":
        parsed_graphs_dir=f"{libert_dir}/analysis/spacy-syndep-parses"
        parsed_fn=f"{parsed_graphs_dir}/{domain}-syndep.mrp"
    else:
        parsed_graphs_dir=f"{libert_dir}/analysis/HIT-SCIR-parses"
        parsed_fn=f"{parsed_graphs_dir}/{formalism}-{domain}-output.mrp"

    with open(parsed_fn) as fin:
        lines = fin.readlines()
    return Graph.decode(json.loads(lines[graph_id]))

def view_parsed_graph(formalism="dm", domain="restaurants", graph_id = 1, method="displacy", graph=None):
    """Visualize a parsed graph.

    Args:
        formalism, domain, graph_id: forwarded to `load_parsed_graph` when no
            `graph` is given.
        method: "dot" writes `dot_example.dot` in the working directory and
            returns a `graphviz.Source` built from it; "tikz" writes
            `tikz_example.tex` in the working directory and returns None;
            any other value renders with the graph's `displacy` method.
        graph: an already loaded graph; when None the graph is loaded from disk.

    Returns:
        A `graphviz.Source` for method "dot", otherwise None.
    """
    # load a parsed graph only when the caller did not supply one
    if graph is None:
        graph = load_parsed_graph(formalism, domain, graph_id)

    if method == "dot":
        # visualize using dot
        dot_fn = "dot_example.dot"
        # close (and therefore flush) the file before graphviz reads it back
        with open(dot_fn, "w") as dot_file:
            graph.dot(dot_file)
        # imported lazily so graphviz is only required for this method
        from graphviz import Source
        return Source.from_file(dot_fn)

    if method == "tikz":
        # visualize using tikz
        tikz_fn = "tikz_example.tex"
        with open(tikz_fn, "w") as tikz_file:
            graph.tikz(tikz_file)
        # I can't show it in notebook meantime since %load_ext tikzmagic not working
        return None

    graph.displacy(jupyter=True, options={"compact":True, "distance":100})

# Usage example: render graph 0 of the restaurants domain twice,
# first from the syntactic-dependency parses, then from the DM parses.
view_parsed_graph(
    formalism="syndep",
    domain="restaurants",
    graph_id=0,
    method="displacy",
)
view_parsed_graph(
    formalism="dm",
    domain="restaurants",
    graph_id=0,
    method="displacy",
)



In [7]:
# Load one graph with all defaults of load_parsed_graph (formalism "dm", domain "restaurants", graph_id 1).
g = load_parsed_graph()    # a working example

In [8]:
# !pip install git+git://github.com/mkrphys/ipython-tikzmagic.git

## Generate Syntactic Dependencies
Use SpaCy to parse the reviews data. Output to conll format, then use `mtool` to convert to `.mrp` format.   

(spacy is installed on the `.daan_venv` env)

In [6]:
# Load and prepare the spaCy model used for dependency parsing.
import spacy
spacy_model="en_core_web_lg"
# "ner", "vectors" and "textcat" are passed in `disable`, i.e. requested to be switched off for this pipeline.
nlp = spacy.load(spacy_model, disable=["ner", "vectors", "textcat"])

# Append the CoNLL formatter as the last component of the pipeline;
# parse_syn_dep below reads its result from `doc._.conll_str`.
from spacy_conll import ConllFormatter
conllformatter = ConllFormatter(nlp)
nlp.add_pipe(conllformatter, last=True)

# Prepend a SentenceSegmenter by newline to spacy's pipe
def set_no_sentence_segmentation(doc):
    """Pipeline component: flag every token as not starting a sentence.

    Sets `is_sent_start = False` on each token (addressed through its index
    `token.i`) and returns the same `doc`, so the doc is not split at all.
    """
    for position in (token.i for token in doc):
        doc[position].is_sent_start = False
    return doc
# Register the function under the name "no_sentence_segmentation", placed before the "parser" component.
# NOTE(review): the output recorded for this cell is "ValueError: [E001] No component 'parser' found in pipeline.
# Available names: ['conll_formatter']" — presumably raised by this call when `nlp` has no "parser" component;
# confirm the cell runs against the en_core_web_lg pipeline on a fresh kernel.
nlp.add_pipe(set_no_sentence_segmentation, name="no_sentence_segmentation", before="parser")

# Assuming sentence is pre-tokenized and merged with spaces; 
# so need to use a custom whitespace tokenizer.
# source: https://spacy.io/usage/linguistic-features#custom-tokenizer-example
from spacy.tokens import Doc
class WhitespaceTokenizer:
    """Tokenizer for pre-tokenized text: tokens are the whitespace-separated pieces."""

    def __init__(self, vocab):
        # vocabulary handed to every Doc this tokenizer builds
        self.vocab = vocab

    def __call__(self, text):
        """Build a Doc whose words are `text.split()`, each flagged as followed by a space."""
        tokens = text.split()
        trailing_space = [True for _ in tokens]
        return Doc(self.vocab, words=tokens, spaces=trailing_space)
# Replace the pipeline's tokenizer so that input text is split on whitespace only.
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)


# Parse syntactic dependencies with SpaCy
def parse_syn_dep(domain: str = "device"):
    """Parse the raw sentences of `domain` and write them out in CoNLL format.

    Reads `analysis/raw_sentences/{domain}.txt`, skips blank lines, runs every
    remaining line (trailing whitespace removed) through the module-level
    `nlp` pipeline, and writes the per-sentence `conll_str` values, joined by
    a newline, to `analysis/spacy-syndep-parses/{domain}-syndep.conll`.
    """
    raw_fn = f"{libert_dir}/analysis/raw_sentences/{domain}.txt"
    # read all sentences up front, while the file is open
    with open(raw_fn) as fin:
        sentences = [line.rstrip() for line in fin.readlines() if line.strip()]

    # one CoNLL block per parsed sentence
    conll_blocks = [parsed._.conll_str for parsed in nlp.pipe(sentences)]
    conll_str = "\n".join(conll_blocks)

    out_fn = f"{libert_dir}/analysis/spacy-syndep-parses/{domain}-syndep.conll"
    with open(out_fn, "w") as fout:
        fout.write(conll_str)

# Run the syntactic parser over every domain.
for domain in ("restaurants", "device", "laptops"):
    parse_syn_dep(domain=domain)
# parse_syn_dep("device")

Already downloaded a model for the 'en' language


ValueError: [E001] No component 'parser' found in pipeline. Available names: ['conll_formatter']

#### Convert conllu to mrp
After parsing syntactic dependencies into conll-u format using SpaCy, use `mtool` to convert it into `.mrp` format. (`mtool` needed some fixes in the code reading conll format.) 

Execute this on fish-shell:

```fish
set domains restaurants laptops device
set dir $libert_dir/analysis/spacy-syndep-parses
for domain in $domains; mtool --read ud --write mrp $dir/$domain-syndep.conll > $dir/$domain-syndep.mrp; end
```


### UD syntactic dependencies using UDPipe 
We use `spacy-udpipe` as a spacy wrapper of the UDPipe model.

In [19]:
# Load and prepare the UDPipe model (wrapped by spacy-udpipe) used for dependency parsing.
# NOTE: this rebinds the module-level `nlp`, replacing the pipeline created by the earlier spaCy cell.
import spacy
import spacy_udpipe
# spacy_udpipe.download("en") # download English model (in first run)
nlp = spacy_udpipe.load("en")


# Append the CoNLL formatter as the last component of the pipeline;
# parse_syn_dep below reads its result from `doc._.conll_str`.
from spacy_conll import ConllFormatter
conllformatter = ConllFormatter(nlp)
nlp.add_pipe(conllformatter, last=True)


# Parse syntactic dependencies with the UDPipe-backed pipeline
def parse_syn_dep(domain: str = "device"):
    """Parse the raw sentences of `domain` and write them out in CoNLL format.

    Reads `analysis/raw_sentences/{domain}.txt`, skips blank lines, splits
    every remaining line on whitespace, and passes the resulting list of token
    lists to the module-level `nlp` pipeline in a single call. The `conll_str`
    of the returned doc is written to
    `analysis/udpipe-syndep-parses/{domain}-syndep.conll`.
    """
    raw_fn = f"{libert_dir}/analysis/raw_sentences/{domain}.txt"
    tokenized_sentences = []
    with open(raw_fn) as fin:
        for line in fin.readlines():
            stripped = line.strip()
            if stripped:
                tokenized_sentences.append(stripped.split())
    # presumably spacy-udpipe treats a list of token lists as pre-tokenized,
    # pre-segmented input — confirm against the spacy-udpipe documentation
    doc = nlp(tokenized_sentences)

    out_fn = f"{libert_dir}/analysis/udpipe-syndep-parses/{domain}-syndep.conll"
    with open(out_fn, "w") as fout:
        fout.write(doc._.conll_str)

# for domain in ["restaurants", "laptops", "device"]:
#     parse_syn_dep(domain)
# NOTE(review): only the "laptops" domain is parsed by the call below, while the UD-enhancement cell
# further down reads udpipe-syndep-parses/{domain}-syndep.conll for every entry of `domains` —
# make sure the other domains have been parsed too (e.g. with the commented-out loop above).
parse_syn_dep("laptops")

#### Convert conllu to mrp
After parsing syntactic dependencies into conll-u format using UDPipe (through `spacy-udpipe`), use `mtool` to convert it into `.mrp` format. (`mtool` needed some fixes in the code reading conll format.) 

Execute this on fish-shell:

```fish
set domains restaurants laptops device
set dir $libert_dir/analysis/udpipe-syndep-parses
for domain in $domains; mtool --read ud --write mrp $dir/$domain-syndep.conll > $dir/$domain-syndep.mrp; end
```

### Generate Enhanced UD (EUD), EUD++, and BART

We will use [pyBART](https://github.com/allenai/pybart) to produce all these UD enhancements.

In [10]:
# Supported UD enhancement schemes. convert_ud_to_enhanced derives the converter flags from
# substrings of these names ('eud', 'eud_pp', 'bart').
ud_enhancement_formalisms = ["eud", "eud_pp", "bart", "eud_pp_bart"]
from pybart.api import convert_bart_conllu
def convert_ud_to_enhanced(conllu_formatted_file_in, conllu_formatted_file_out, formalism):
    """Convert a CoNLL-U file to one of the enhanced-UD schemes using pyBART.

    The converter flags are derived from substrings of `formalism`:
    `enhance_ud` when it contains 'eud', `enhanced_plus_plus` when it contains
    'eud_pp', and `enhanced_extra` when it contains 'bart'. Hence "eud_pp"
    switches on both `enhance_ud` and `enhanced_plus_plus`.

    Args:
        conllu_formatted_file_in: path of the CoNLL-U file to read.
        conllu_formatted_file_out: path of the converted file to write (overwritten).
        formalism: one of `ud_enhancement_formalisms`.
    """
    assert formalism in ud_enhancement_formalisms, f"unrecognizable formalism {formalism}."
    # read the whole CoNLL-U document as text
    with open(conllu_formatted_file_in) as src:
        conllu_text = src.read()
    # convert, selecting the enhancement scheme through the keyword flags
    converted = convert_bart_conllu(
        conllu_text,
        enhance_ud='eud' in formalism,
        enhanced_plus_plus='eud_pp' in formalism,
        enhanced_extra='bart' in formalism,
    )
    # write the textual output to a new CoNLL-U file
    with open(conllu_formatted_file_out, "w") as dst:
        dst.write(converted)

# Produce every enhancement scheme for every domain from the UDPipe parses.
for scheme in ud_enhancement_formalisms:
    target_dir = f"{libert_dir}/analysis/ud-enhancements/{scheme}"
    for domain in domains:
        inp_fn = f"{libert_dir}/analysis/udpipe-syndep-parses/{domain}-syndep.conll"
        converted_fn = f"{target_dir}/{domain}-syndep.conll"
        convert_ud_to_enhanced(
            conllu_formatted_file_in=inp_fn,
            conllu_formatted_file_out=converted_fn,
            formalism=scheme,
        )
        

#### Convert conllu to mrp
Execute this on fish-shell:

```fish
set schemes eud eud_pp bart eud_pp_bart
set domains restaurants laptops device
set dir $libert_dir/analysis/ud-enhancements
for scheme in $schemes; for domain in $domains; mtool --read eud --write mrp $dir/$scheme/$domain-syndep.conll > $dir/$scheme/$domain-syndep.mrp; end; end;
```