# Tool for CS221: Natural Language Processing.
A gift for all of you. Good luck 🍻

Contact me at 14520146@gm.uit.edu.vn

---


## About


**Features:**
- Automatically generate DCG for [PROLOG](http://swish.swi-prolog.org/).

- Draw linguistic syntax trees from labelled bracket notation using [RSyntaxTree](https://yohasebe.com/rsyntaxtree/).

**Learn more about:**
1. [Labelled bracket notation](http://www.glottopedia.org/index.php/Labeled_bracketing)
2. [RSyntaxTree](https://yohasebe.com/rsyntaxtree/#documentation)

![alt text](https://i.gyazo.com/6bb68b0bdb35d7a10c4a11d5788d484f.png)

## Workplace


---

**Remember to run all 3 cells below!**

In [None]:
#@title ##Backend things { run: "auto", display-mode: "form" }
#@markdown **Be careful! Only edit when you know what you are doing!**

# regex
import re
# http request for check/draw
from urllib import request, parse
# draw to GG Colab
from IPython.display import HTML

# DCG container includes: rules and words
# rules: s(s(X,Y))-->np(X),vp(Y).
# words: nn(nn(word))-->[word]

class Container:
    """Accumulator for generated DCG text.

    rules collects grammar rules such as ``s(s(X,Y))-->np(X),vp(Y).`` and
    words collects lexicon entries such as ``nn(nn(word))-->[word].``.
    Both attributes start out as empty strings.
    """

    def __init__(self):
        self.rules, self.words = "", ""
        
# Clean-up table used by Convert: each key is a regex, each value its
# replacement. Entries are applied in insertion order, so whitespace control
# characters are removed before the spaces around brackets are collapsed.
# Raw strings are used because \^, \[ and \] are regex escapes, not valid
# Python string escapes (non-raw literals trigger invalid-escape warnings).
patterns = {
    r'[àáảãạăắằẵặẳâầấậẫẩ]': 'a',
    r'[đ]': 'd',
    r'[èéẻẽẹêềếểễệ]': 'e',
    r'[ìíỉĩị]': 'i',
    r'[òóỏõọôồốổỗộơờớởỡợ]': 'o',
    r'[ùúủũụưừứửữự]': 'u',
    r'[ỳýỷỹỵ]': 'y',
    # drop carriage returns, newlines, tabs and carets
    r'[\r\n\t\^]': '',
    # drop the spaces that directly follow or directly precede a bracket
    r'([\]\[]) +| +([\[\]])': r'\1\2',
}

def Convert(text):
    """
    Convert from 'Tiếng Việt có dấu' to 'Tieng Viet khong dau'.

    The text is normalised to NFC (so letters typed as base letter plus
    combining marks are recognised), lowercased, and then every entry of
    `patterns` is applied in order: Vietnamese letters lose their
    diacritics, CR/LF/TAB/'^' characters are removed, and spaces adjacent
    to '[' or ']' are removed. Spaces between words are kept.

    text: input string to be converted
    Return: string converted
    """
    # Local import keeps this block self-contained; only Convert needs it.
    import unicodedata

    # Compose decomposed characters first: the table above only lists
    # precomposed letters, so e.g. 'e' + U+0302 + U+0301 must become 'ế'.
    output = unicodedata.normalize('NFC', text).lower()
    for regex, replace in patterns.items():
        output = re.sub(regex, replace, output)
    return output
  
  
# Variable names for rules like s(s(X,Y))-->np(X),vp(Y).
# NOTE: this module-level name shadows the builtin vars(); it is kept because
# it is part of the notebook's global namespace.
vars = ["X", "Y", "Z", "T"]


def _dcg_variable(index):
  """Return the Prolog variable name for the child at position index."""
  if index < len(vars):
    return vars[index]
  # More children than predefined names: synthesise a distinct variable
  # (uppercase initial, so it is still a Prolog variable).
  return "V{0}".format(index + 1)


def _split_children(clean_input):
  """Return the bracket groups nested directly inside the outermost group.

  Example: [s[np[..]][vp[..]]] ==> ['[np[..]]', '[vp[..]]']
  """
  children = []
  depth = 0
  start = None
  for pos, char in enumerate(clean_input):
    if char == '[':
      depth += 1
      if depth == 2:
        # a direct child of the outermost group opens here
        start = pos
    elif char == ']':
      if depth == 2 and start is not None:
        # the direct child closes here; keep its closing bracket
        children.append(clean_input[start:pos + 1])
        start = None
      depth -= 1
  return children


def CreateDCG(clean_input, out):
  """
  Create DCG from labelled bracket notation.

  A group of the form [label[...]...] appends one rule to out.rules,
  `label(functor(X, Y, ...))-->child1(X), child2(Y), ....`, where functor is
  the leading a-z letters of label (or label itself when it has none), and
  then each child group is processed recursively. A group of the form
  [label word ...] appends `label(label(w1,w2))-->[w1,w2].` to out.words.
  Input matching neither form is ignored.

  clean_input: labelled bracket notation already cleaned by Convert
  out: container whose `rules` and `words` strings are appended to
  """

  # rule form: [{label}[...]...]
  rule_match = re.match(r'\[(\w+)\[.*\]\]', clean_input)

  if rule_match:
    label = rule_match[1]
    # Fall back to the whole label when it has no leading a-z letters,
    # instead of failing on a missing match.
    functor_match = re.match(r'[a-z]+', label)
    functor = functor_match[0] if functor_match else label

    children = _split_children(clean_input)

    # head arguments: one variable per child group
    head_args = ", ".join(_dcg_variable(i) for i in range(len(children)))

    # body terms: child label applied to the variable of the same position;
    # children without a recognisable label contribute no term
    body_terms = []
    for index, child in enumerate(children):
      child_match = re.match(r'\[(\w+)[\[ ]', child)
      if child_match:
        body_terms.append("{0}({1})".format(child_match[1], _dcg_variable(index)))

    # Joining (rather than trimming a trailing ", ") keeps the "-->" intact
    # even when no child produced a body term.
    out.rules += "{0}({1}({2}))-->{3}.\n".format(
        label, functor, head_args, ", ".join(body_terms))

    # solve all child groups by recursion
    for child in children:
      CreateDCG(child, out)

  # builds DCG words
  else:
    # word form: [label word word ...]
    word_match = re.match(r'\[(\w+) ([\w ]+)\]', clean_input)
    if word_match:
      # {label}({label}(words))-->[{words}] with words comma-separated
      out.words += "{0}({0}({1}))-->[{1}].\n".format(
          word_match[1], word_match[2].replace(' ', ','))
      
      
def CleanOutput(out):
  """Deduplicate and sort the DCG lines stored in out.rules and out.words.

  Each attribute is treated as newline-separated text: a single trailing
  newline (if present) is dropped, duplicate lines are collapsed, the
  remaining lines are sorted, and the result is stored back on `out` as a
  newline-joined string.
  """

  def normalise(text):
    # drop the final line break so it does not yield an empty last line
    if text.endswith('\n'):
      text = text[:-1]
    unique_lines = set(text.split('\n'))
    return '\n'.join(sorted(unique_lines))

  out.rules, out.words = normalise(out.rules), normalise(out.words)
  
  
# send http/https request to check/draw syntax
def SendRequest(url, data):
  """POST `data` as a URL-encoded form to `url` and return the response text.

  url: address to send the request to
  data: mapping (or sequence of pairs) accepted by urllib.parse.urlencode
  Return: response body decoded with the default codec of bytes.decode()
  Errors raised by urllib (e.g. on network failure) propagate to the caller.
  """
  payload = parse.urlencode(data).encode()

  # Supplying a body makes urllib issue a POST instead of a GET
  req = request.Request(url, data=payload)

  # The context manager closes the connection even if reading/decoding fails
  with request.urlopen(req) as resp:
    return resp.read().decode()


# check syntax
def CheckSyntax(syntax):
  """
  Ask the RSyntaxTree check endpoint whether the user's input is accepted.

  syntax: labelled bracket notation as entered by the user
  Return: True when the endpoint answers exactly "true", otherwise False
  """
  verdict = SendRequest('https://yohasebe.com/rsyntaxtree/check', {'data': syntax})
  return verdict == "true"


# get base64 image of parse tree
def DrawParseTree(syntax, leafstyle='auto', fontstyle='noto-sans', fontsize=10, margin=0, vheight=1.0, color=True, autosub=False, symmetrize=True):
  """Post the drawing options to RSyntaxTree's draw_png endpoint.

  syntax: labelled bracket notation to draw
  leafstyle: auto, triangle, bar or none (lowercased before sending)
  fontstyle: noto-sans or noto-serif (lowercased, spaces become '-')
  fontsize, margin: sent unchanged
  vheight: 0.5, 1.0, 1.5 or 2.0 (sent unchanged)
  color, autosub, symmetrize: truthy values are sent as 'on', others as 'off'
  Return: the response text from the endpoint
  """

  def as_switch(flag):
    # on/off options are posted as the strings 'on' and 'off'
    if flag:
      return 'on'
    return 'off'

  form = {
      'data': syntax,
      'leafstyle': leafstyle.lower(),
      'fontstyle': fontstyle.lower().replace(' ', '-'),
      'fontsize': fontsize,
      'margin': margin,
      'vheight': vheight,
      'color': as_switch(color),
      'autosub': as_switch(autosub),
      'symmetrize': as_switch(symmetrize),
  }

  return SendRequest('https://yohasebe.com/rsyntaxtree/draw_png', form)


# draw to GG Colab
def DisplayPNGTree(base64):
  """Wrap base64-encoded PNG data in an HTML <img> tag using a data URI."""
  return f"<img src='data:image/png;base64,{base64}'/>"

In [None]:
#@title ##Draw settings { run: "auto" }
#@markdown **Leave it default if you don't know what to do!** Learn more [here](https://yohasebe.com/rsyntaxtree/#documentation).
# Drawing options read by the editor cell, which forwards them to
# DrawParseTree. The trailing "#@param" comments are form annotations and
# must stay on the same line as their assignment.
# Forwarded as DrawParseTree's `leafstyle`
Connector_shape = "Auto" #@param ["Auto", "Triangle", "Bar", "None"]
# Forwarded as `fontstyle` (DrawParseTree lowercases it and turns spaces into '-')
Font = "Noto Sans" #@param ["Noto Sans", "Noto Serif"] {type:"string"}
# Forwarded as `fontsize`
Font_size = 10 #@param {type:"slider", min:6, max:26, step:2}
# Forwarded as `margin`
Margin = 0 #@param {type:"slider", min:0, max:120, step:20}
# Forwarded as `vheight`
Connecter_height = 1 #@param {type:"slider", min:0.5, max:2.0, step:0.5}
# Forwarded as `color`
Color = True #@param {type:"boolean"}
# Forwarded as `symmetrize`
Symmetrize = True #@param {type:"boolean"}
# Forwarded as `autosub`
Auto_subscript = False #@param {type:"boolean"}

In [None]:
#@title ##Editor { run: "auto", vertical-output: true, display-mode: "form" }
#@markdown Allow Unicode, multi-spacing.


# Labelled bracket notation to convert; the default is a sample sentence.
Syntax = "[S \t[NP \t\t[NNP Nam] \t] \t[VP \t\t[VB h\u1ECDc] \t\t[ADJP \t\t\t[ADVB \t\t\t\t[RB r\u1EA5t] \t\t\t] \t\t\t[JJ gi\u1ECFi] \t\t] \t] ]" #@param {type:"string"}
# When False, only the DCG text is shown and no tree image is requested.
Draw = True #@param {type:"boolean"}

# HTML shown when the input fails validation
error_message = '''
<div style='font-size: 250%; color: red;'><b><strong>Invalid syntax!!!</strong></b></div><br>
<img alt="Invalid syntax" class="border" src="http://memegen.link/custom/Come-on/boiiii.jpg?alt=https://i.imgur.com/CsCgN7Ll.png" title="Invalid syntax">'''

# HTML result template: {0} = rules, {1} = words, {2} = tree image (or "")
output_message = '''<table style="width:100%"><tr><td>
<textarea rows="10" cols="50" onclick="this.focus();this.select()" readonly>
{0}\n\n{1}</textarea></td><td>{2}</td></tr></table>'''

if Syntax:
  # Cheap local shape check first ("[label ... ]"), then the remote check.
  # The pattern is a raw string so \s, \[, \w and \] reach the regex engine
  # without going through Python's string-escape processing.
  if not re.match(r'^\s*\[\w+.*\]', Syntax) or not CheckSyntax(Syntax):
    display(HTML(error_message))

  else:
    clean_input = Convert(Syntax)
    out = Container()

    CreateDCG(clean_input, out)
    CleanOutput(out)
    
    # The tree is drawn from the original (uncleaned) input
    display(HTML(output_message.format(
        out.rules, out.words,
        DisplayPNGTree(DrawParseTree(
            Syntax, Connector_shape, Font,
            Font_size, Margin, Connecter_height,
            Color, Auto_subscript, Symmetrize)) if Draw else "")))

0,1
"adjp(adjp(X, Y))-->advb(X), jj(Y). advb(advb(X))-->rb(X). np(np(X))-->nnp(X). s(s(X, Y))-->np(X), vp(Y). vp(vp(X, Y))-->vb(X), adjp(Y). jj(jj(gioi))-->[gioi]. nnp(nnp(nam))-->[nam]. rb(rb(rat))-->[rat]. vb(vb(hoc))-->[hoc].",


0,1
"adjp(adjp(X, Y))-->advb(X), jj(Y). advb(advb(X))-->rb(X). np(np(X))-->nnp(X). s(s(X, Y))-->np(X), vp(Y). vp(vp(X, Y))-->vb(X), adjp(Y). jj(jj(gioi))-->[gioi]. nnp(nnp(nam))-->[nam]. rb(rb(rat))-->[rat]. vb(vb(hoc))-->[hoc].",
