<a href="https://colab.research.google.com/github/szubertpiotrek/Compiler/blob/master/compiler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parser

In [0]:
class Parser:
  """Recursive-descent recognizer for a small statement language.

  Grammar (terminals are the token types delivered by the scanner):

    start       -> program EOF
    program     -> statement program | eps
    statement   -> print_stmt | assign_stmt | if_stmt
    print_stmt  -> PRINT value END
    assign_stmt -> ID ASSIGN value END
    value       -> NUMBER | ID
    if_stmt     -> IF ID THEN program ENDIF END

  The parser only accepts or rejects its input: it builds no tree and
  prints "<rule> OK" after each statement it recognizes.  Any syntax
  error is reported by raising RuntimeError.
  """

  # Token types that may begin a statement (the FIRST set of `statement`).
  _STATEMENT_FIRST = ('PRINT', 'ID', 'IF')

  ##### Parser header #####
  def __init__(self, scanner):
    """Attach to *scanner* and read the first lookahead token.

    *scanner* must provide a zero-argument ``next_token()`` whose results
    expose a ``type`` attribute.
    """
    self.next_token = scanner.next_token
    self.token = self.next_token()

  def take_token(self, token_type):
    """Consume the lookahead token, which must be of type *token_type*.

    Raises RuntimeError naming both the token that was found and the one
    that was expected.  EOF is checked but never consumed, so the scanner
    is not asked for tokens beyond the end of the input.
    """
    if self.token.type != token_type:
      self.error("Unexpected token: %s (expected %s)"
                 % (self.token.type, token_type))
    if token_type != 'EOF':
      self.token = self.next_token()

  def error(self, msg):
    """Abort parsing by raising RuntimeError with *msg*."""
    raise RuntimeError('Parser error, %s' % msg)

  ##### Parser body #####

  # Starting symbol
  def start(self):
    """Parse a complete input: start -> program EOF."""
    if self.token.type == 'EOF' or self.token.type in self._STATEMENT_FIRST:
      self.program()
      self.take_token('EOF')
    else:
      self.error("Epsilon not allowed")

  def program(self):
    """Parse zero or more statements: program -> statement program | eps.

    The right-recursive rule is implemented as a loop so that the number
    of consecutive statements is not limited by Python's recursion depth.
    """
    while self.token.type in self._STATEMENT_FIRST:
      self.statement()

  def statement(self):
    """Dispatch on the lookahead token to the matching statement rule."""
    # statement -> print_stmt
    if self.token.type == 'PRINT':
      self.print_stmt()
    # statement -> assign_stmt
    elif self.token.type == 'ID':
      self.assign_stmt()
    # statement -> if_stmt
    elif self.token.type == 'IF':
      self.if_stmt()
    else:
      self.error("Epsilon not allowed")

  def print_stmt(self):
    """print_stmt -> PRINT value END"""
    if self.token.type == 'PRINT':
      self.take_token('PRINT')
      self.value()
      self.take_token('END')
      print("print_stmt OK")
    else:
      self.error("Epsilon not allowed")

  def assign_stmt(self):
    """assign_stmt -> ID ASSIGN value END"""
    if self.token.type == 'ID':
      self.take_token('ID')
      self.take_token('ASSIGN')
      self.value()
      self.take_token('END')
      print("assign_stmt OK")
    else:
      self.error("Epsilon not allowed")

  def value(self):
    """value -> NUMBER | ID"""
    # value -> NUMBER
    if self.token.type == 'NUMBER':
      self.take_token('NUMBER')
    # value -> ID
    elif self.token.type == 'ID':
      self.take_token('ID')
    else:
      self.error("Epsilon not allowed")

  def if_stmt(self):
    """if_stmt -> IF ID THEN program ENDIF END"""
    if self.token.type == 'IF':
      self.take_token('IF')
      self.take_token('ID')
      self.take_token('THEN')
      # The body is a nested program, so IF blocks may contain IF blocks.
      self.program()
      self.take_token('ENDIF')
      self.take_token('END')
      print("if_stmt OK")
    else:
      self.error("Epsilon not allowed")
       

# Scanner

In [0]:
import collections
import re

# One lexical token: its type name, the matched text, and its 1-based
# line and column in the input.
Token = collections.namedtuple('Token', ['type', 'value', 'line', 'column'])

class Scanner:
  """Regex-based tokenizer for JSON-like text.

  The whole input is tokenized eagerly by the constructor.  The resulting
  Token tuples are stored in ``tokens`` (always ending with an EOF token)
  and handed out one at a time by next_token().
  """

  def __init__(self, input):
    """Tokenize the string *input* and reset the read cursor.

    Raises RuntimeError if the text contains a character that no token
    pattern matches.
    """
    # `input` shadows the builtin; the name is kept because it is part of
    # the constructor's public signature.
    self.current_token_number = 0
    self.tokens = list(self.tokenize(input))

  def tokenize(self, input_string):
    """Yield the Tokens of *input_string*, followed by one EOF token.

    Spaces, tabs and newlines produce no tokens.  Lines and columns are
    both 1-based.  Raises RuntimeError at the first character that no
    pattern matches.
    """
    # An ID whose text is in this set gets that text as its token type.
    # ID text includes the surrounding double quotes, so of these entries
    # only '"$id"' can currently match; the unquoted names are inert.
    # NOTE(review): confirm whether the other keywords should be quoted.
    keywords = {'"$id"', '$schema', 'title', 'properties', 'description',
                'required', 'minimum', 'maximum', 'minLength', 'maxLength',
                'enum', 'definitions', '$ref'}
    # Order matters: at each position the first alternative that matches wins.
    token_specification = [
        ('NUMBER',  r'\d+(\.\d*)?'),      # Integer or decimal number
        ('ASSIGN',  r':'),                # Key/value separator
        ('COMMA',     r','),              # Element separator
        ('ID',      r'"(.*?)"'),          # Double-quoted string (alternative considered: [$A-Za-z]+)
        ('NEWLINE', r'\n'),               # Line endings
        ('SKIP',    r'[ \t]'),            # Skip over spaces and tabs
        ('BRACKET_START',    r'\['),      # Opening square bracket
        ('BRACKET_END',    r'\]'),        # Closing square bracket
        ('QUOTE',   r'\"'),               # Lone double quote that ID could not pair up
        ('BRACE_START',   r'\{'),         # Opening curly brace
        ('BRACE_END',   r'\}'),           # Closing curly brace
        # Unreachable as ordered: everything this matches starts with a
        # double quote, which ID or QUOTE (listed earlier) match first.
        ('EMPTY',   r'(\"\")|(\"\s\")'),

    ]
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    get_token = re.compile(tok_regex).match
    line_number = 1
    current_position = line_start = 0  # line_start: offset of the current line's first character
    match = get_token(input_string)
    while match is not None:
        kind = match.lastgroup
        if kind == 'NEWLINE':
            # The next line begins right after the newline character, so
            # columns are counted the same way on every line.
            line_start = match.end()
            line_number += 1
        elif kind != 'SKIP':
            value = match.group(kind)
            if kind == 'ID' and value in keywords:
                kind = value
            yield Token(kind, value, line_number, match.start() - line_start + 1)
        current_position = match.end()
        match = get_token(input_string, current_position)
    # The loop stops at the first position where nothing matches; anything
    # short of the end of the input is therefore an illegal character.
    if current_position != len(input_string):
        raise RuntimeError('Error: Unexpected character %r on line %d' % \
                              (input_string[current_position], line_number))
    yield Token('EOF', '', line_number, current_position - line_start + 1)

  def next_token(self):
    """Return the next unread Token and advance the cursor.

    Raises RuntimeError once every token, including EOF, has been returned.
    """
    self.current_token_number += 1
    if self.current_token_number-1 < len(self.tokens):
      return self.tokens[self.current_token_number-1]
    else:
      raise RuntimeError('Error: No more tokens')


# Validator

In [5]:
# Demo cell: tokenize a small JSON-Schema-like document, print the tokens,
# then run the parser over them.
# Based on "Simple example of parsing" by Bartosz Sawicki, 2014-03-13

# Presumably left over from a stand-alone version in which the two classes
# lived in separate modules; here they are defined in the cells above.
# from scanner import *
# from parser import *

# Earlier sample input, kept for reference:
#input_string = '''
#x := 5;
#y := x;
#PRINT 64;
#'''

# Current sample input. The leading newline means the document itself
# starts on line 2 of the string.
input_string = '''
{
  "$id": " ",
  "$schema": "",
  "title": "Person",
  "required": [ "name", "gender" ]
}
'''

print(input_string)
# Scanner tokenizes the whole string in its constructor.
scanner = Scanner(input_string)
print(scanner.tokens)

# NOTE(review): the recorded run of this cell ends in a RuntimeError.
# Parser.start() accepts only PRINT, ID, IF or EOF as the first token, while
# the first token of this input is BRACE_START, so the parser rejects it.
parser = Parser(scanner)
parser.start()
  


{
  "$id": " ",
  "$schema": "",
  "title": "Person",
  "required": [ "name", "gender" ]
}

[Token(type='BRACE_START', value='{', line=2, column=1), Token(type='"$id"', value='"$id"', line=3, column=3), Token(type='ASSIGN', value=':', line=3, column=8), Token(type='ID', value='" "', line=3, column=10), Token(type='COMMA', value=',', line=3, column=13), Token(type='ID', value='"$schema"', line=4, column=3), Token(type='ASSIGN', value=':', line=4, column=12), Token(type='ID', value='""', line=4, column=14), Token(type='COMMA', value=',', line=4, column=16), Token(type='ID', value='"title"', line=5, column=3), Token(type='ASSIGN', value=':', line=5, column=10), Token(type='ID', value='"Person"', line=5, column=12), Token(type='COMMA', value=',', line=5, column=20), Token(type='ID', value='"required"', line=6, column=3), Token(type='ASSIGN', value=':', line=6, column=13), Token(type='BRACKET_START', value='[', line=6, column=15), Token(type='ID', value='"name"', line=6, column=17), Token(

RuntimeError: ignored