## Necessary imports

In [None]:
import torch
import nltk
from nltk import Tree

## Handle GPU issue

In case you have a weak GPU (e.g., 2 GB), it is highly recommended to use the CPU in order to prevent an out-of-memory exception caused by torch, which is used within benepar. For this, we just need to override the "cuda.is_available" function. However, if you have a more powerful GPU, just leave the following lines commented out.

In [None]:
# import torch 
# torch.cuda.is_available = lambda : False

### Import the library

In [None]:
from constituent_treelib import ConstituentTree, BracketedTree

## Create the NLP pipeline

To instantiate a ConstituentTree object, a spaCy-based NLP pipeline that incorporates the benepar component is required. Although you can set up this pipeline yourself, it is recommended (and more convenient) to let the library do it for you automatically via the create_pipeline() method. Given the desired language, this method creates the NLP pipeline and also downloads the corresponding spaCy and benepar models, if requested. 

In [None]:
# Settings for the pipeline: target language and size of the spaCy model.
language = ConstituentTree.Language.English
spacy_model_size = ConstituentTree.SpacyModelSize.Medium

# Let the library assemble the pipeline and download the required models.
nlp = ConstituentTree.create_pipeline(
    language,
    spacy_model_size,
    download_models=True,
)
# Alternative call that does not request a model download:
# nlp = ConstituentTree.create_pipeline(language, spacy_model_size)

## Test sentences

#### English

In [None]:
# English test sentences. Only the first entry is active; uncomment any of the
# others to try them (each entry ends with a comma so they can be enabled
# independently).
sentences = [
    'Stanley Getz was an American jazz saxophonist.',
    # 'It looks like the input tree may contain children with the same name. ',
    # 'The bridge was unfinished when it collapsed.',
    # 'The 2022 season is underway and there are a limited number of Single Game Tickets on sale now!',
    # 'And with no Wild Card possibilities for either team, the game is essentially a winner-take-all endeavor.',
    # 'You must construct additional pylons!',
]

#### German

In [None]:
# sentences = [
# 'In einer Gaspipeline in Litauen hat es eine Explosion gegeben.',
# 'Für Fragen zu Freiwilligendiensten, nutzen sie bitte unser Forum!',
# 'Der Künstler verlegt seit 30 Jahren Stolpersteine, die er zur Erinnerung an die Opfer des Nationalsozialismus Häusern platziert.',
# 'Damit erlangen schützenswürdige Kundendaten in den Geschäfts- und Serviceprozessen der Wertschöpfungskette im Bereich Automotive eine immer größere Bedeutung.',
# 'Die USA haben mit ihrem Investitionsprogramm für Klimaschutz reichlich Unmut der EU auf sich gezogen.',
# 'Die methodische Grundlage der Autorenerkennung bildet die Fehler- und Stilanalyse.',
# 'Ebenso empfehlenswert ist das Lesen einer Tageszeitung des Landes.', 
# 'Wie viel wird pro Jahr ungefähr weltweit benötigt?'
# ]

#### French

In [None]:
# sentences = [    
# 'Nous irons plus tard au théâtre.',
# 'Pablo Ruiz Picasso était un peintre, dessinateur, sculpteur et graphiste espagnol.', 
# 'Découvrez une belle sélection d’évènements pour fêter la nouvelle année en partenariat avec Party.',
# ]

#### Swedish

In [None]:
# sentences = [
# 'Vilken vacker skog!',
# 'Det var mycket åska och blixtar i går!',
# 'För den närmaste veckan finns ingen uppenbar risk för fjärrtransport.'
# ]

#### Polish

In [None]:
# sentences = [
# 'W dodatku szczerze wierzy, że w tej wojnie stawką jest istnienie Rosji.',
# 'Poproszę pięć kilo ziemniaków.',
# 'Przepraszam, ale nie rozumiem.',
# ]

#### Hungarian

In [None]:
# sentences = [
# 'A pizza tényleg kiváló volt!',
# 'Vannak kisebb és kiszámíthatatlan kivételek a szabály alól.',
# 'Ezt azért tesszük, hogy javítsuk és finanszírozzuk szolgáltatásainkat.' 
# ]

#### Chinese 

In [None]:
# sentences = [
# '你好吗？',
# '很高兴见到你。',
# '不好意思， 我没听懂。',
# '请再说一遍。',    
# ]

#### Korean

In [None]:
# sentences = [
# '말을 냇가에 끌고 갈 수는 있어도 억지로 물을 먹일 수는 없다',
# '반갑습니다', 
# '잘 지내세요?',
# '그 집은 한국에서 지어졌어요'
# ]

## Instantiate a ConstituentTree object

#### ... from a raw sentence

In [None]:
# Build the constituent tree for the first test sentence using the pipeline
# created earlier; `tree` is the object used by the remaining cells.
tree = ConstituentTree(sentences[0], nlp)

#### ... from a bracketed tree string (wrapped as a BracketedTree object)

In [None]:
# Bracketed notation of the parse; the adjacent literals are joined into one
# string, split here only for readability.
bracketed_tree_string = (
    '(S (NP (PRP You)) '
    '(VP (MD must) (VP (VB construct) (NP (JJ additional) (NNS pylons)))) '
    '(. !))'
)

# The raw string is wrapped in a BracketedTree before it is handed to the
# ConstituentTree constructor.
bracketed_tree = BracketedTree(bracketed_tree_string)
tree_from_bracketed = ConstituentTree(bracketed_tree, nlp)

#### ... from an nltk.Tree object

In [None]:
# Hand-built nltk.Tree for the sentence
# "It looks like the input tree may contain children with the same name ."
# laid out one constituent per line so the nesting is visible.
nltk_tree_obj = Tree('S', [
    Tree('NP', [Tree('PRP', ['It'])]),
    Tree('VP', [
        Tree('VBZ', ['looks']),
        Tree('SBAR', [
            Tree('IN', ['like']),
            Tree('S', [
                Tree('NP', [
                    Tree('DT', ['the']),
                    Tree('NN', ['input']),
                    Tree('NN', ['tree']),
                ]),
                Tree('VP', [
                    Tree('MD', ['may']),
                    Tree('VP', [
                        Tree('VB', ['contain']),
                        Tree('NP', [
                            Tree('NP', [Tree('NNS', ['children'])]),
                            Tree('PP', [
                                Tree('IN', ['with']),
                                Tree('NP', [
                                    Tree('DT', ['the']),
                                    Tree('JJ', ['same']),
                                    Tree('NN', ['name']),
                                ]),
                            ]),
                        ]),
                    ]),
                ]),
            ]),
        ]),
    ]),
    Tree('.', ['.']),
])

# An nltk.Tree can be passed to ConstituentTree in place of a raw sentence.
tree_from_nltk = ConstituentTree(nltk_tree_obj, nlp)

## Instantiate a compact ConstituentTree (without postag nodes)

In [None]:
# Same sentence as `tree`, but with remove_postag_nodes=True to request the
# compact variant without POS-tag nodes.
tree_compact = ConstituentTree(sentences[0], nlp, remove_postag_nodes=True)

In [None]:
# Write the compact tree to 'compact.png' (PNG export requires ImageMagick,
# per the "Export visualization" notes in this notebook).
tree_compact.export_tree('compact.png')

## Tree representations 

####  SVG representation

In [None]:
# Leaving the object as the cell's last expression makes the notebook display
# its rich representation (SVG, per this section's heading).
tree

#### Pretty-print bracketed tree string representation

In [None]:
# print() shows the string form of the tree (the pretty-printed bracketed
# notation, per this section's heading).
print(tree)

#### ASCII art

In [None]:
# pretty_print() belongs to the nltk.Tree held by the ConstituentTree, which is
# reached through its `nltk_tree` attribute; a bare `nltk_tree` name is not
# defined anywhere in this notebook and would raise NameError.
tree.nltk_tree.pretty_print()

#### LaTeX code

In [None]:
# Generate the LaTeX (qtree) code from the underlying nltk.Tree, accessed via
# the `nltk_tree` attribute; a bare `nltk_tree` name is not defined anywhere in
# this notebook and would raise NameError.
tree.nltk_tree.pformat_latex_qtree()

## Export visualization

Constituent Treelib relies on the following two open-source tools to export the constructed constituent tree into various file formats:

1.) To export the constituent tree into a PDF, the command line tool *wkhtmltopdf* is required: 
https://wkhtmltopdf.org/downloads.html
Once downloaded and installed, the path to the wkhtmltopdf binary must be passed to the export function. However, in case of a Windows OS, an attempt is made to locate the path of the wkhtmltopdf binary by looking up the default installation directory ("Program Files/wkhtmltopdf"). 

2.) To export the constituent tree into the file formats JPG, PNG, GIF, BMP, EPS, PSD, TIFF and YAML, the software suite *ImageMagick* is required: https://imagemagick.org/script/download.php. 

#### Supported file formats are: [.pdf, .svg, .ps, .png, .jpg, .gif, .bmp, .psd, .eps, .tiff, .txt, .tex, .json, .yaml]

In [None]:
# Export the tree to 'my_tree.svg'; verbose=True is passed so the export
# reports what it does (presumably a status message; confirm in the library).
tree.export_tree(destination_filepath='my_tree.svg', verbose=True)

## Extract phrases

#### Only phrasal categories 

In [None]:
# Show the phrasal categories found in the tree (displayed as the cell's
# output because it is the last expression).
tree.extract_all_phrasal_categories()

#### All phrases (including nested)

In [None]:
# Print every phrase (nested ones included) for each test sentence,
# grouped by phrasal category.
for sentence in sentences:
    # Parse the current sentence; reusing the single `tree` built earlier would
    # print the phrases of sentences[0] once per entry in the list.
    sentence_tree = ConstituentTree(sentence, nlp)
    all_phrases = sentence_tree.extract_all_phrases(avoid_nested_phrases=False, min_words_in_phrases=1)
    for phrasal_category, phrases in all_phrases.items():
        print(phrasal_category, phrases)
    # Blank line between sentences.
    print()

#### All phrases (without nested)

In [None]:
# Print the phrases of each test sentence with nested phrases avoided,
# grouped by phrasal category.
for sentence in sentences:
    # Parse the current sentence; reusing the single `tree` built earlier would
    # print the phrases of sentences[0] once per entry in the list.
    sentence_tree = ConstituentTree(sentence, nlp)
    all_phrases = sentence_tree.extract_all_phrases(avoid_nested_phrases=True, min_words_in_phrases=1)
    for phrasal_category, phrases in all_phrases.items():
        print(phrasal_category, phrases)
    # Blank line between sentences.
    print()

#### Only noun phrases

In [None]:
# Extract the phrases of the tree (nested phrases avoided) and keep only the
# noun phrases.
phrases = tree.extract_all_phrases(avoid_nested_phrases=True)
# A sentence without any noun phrase has no 'NP' entry; fall back to an empty
# list instead of raising KeyError.
noun_phrases = phrases.get('NP', [])

print(noun_phrases)

## Extract text units

#### Only text tokens

In [None]:
# Collect the leaves of the underlying nltk tree, requesting the Text content
# type (the word tokens, per this section's heading).
tree.extract_leaves_from_tree(tree.nltk_tree, content_type=tree.NodeContent.Text)

#### Only POS tags

In [None]:
# Collect the leaves of the underlying nltk tree, requesting the Pos content
# type (the POS tags, per this section's heading).
tree.extract_leaves_from_tree(tree.nltk_tree, content_type=tree.NodeContent.Pos)

#### Combination of both

In [None]:
# Collect the leaves of the underlying nltk tree, requesting the Combined
# content type (text and POS tag together, per this section's heading).
tree.extract_leaves_from_tree(tree.nltk_tree, content_type=tree.NodeContent.Combined)