In [1]:
import re

class Token: #Token Class
    def __init__(self, type, value):
        self.type = type
        self.value = value

    def __repr__(self):
        return f'({self.type}, {repr(self.value)})\n'

class JASONScanner:
    def __init__(self, text):
        self.text = text
        self.position = 0
        self.tokens = []
        self.token_regex = self.build_token_regex()

    def build_token_regex(self):
        token_specification = [
            ('Num',   r'\d+(\.\d*)?'),          #Integer or decimal number
            ('Identifier',    r'[A-Za-z_]\w*'), #Identifiers
            ('OP', r'[+\-*/=]'),                #Operators
            ('OpenBrace',   r'\{'),             #Open Brace
            ('ClosedBrace',   r'\}'),           #Closed Brace
            ('OpenBracket',   r'\('),           #Open Bracket
            ('ClosedBracket',   r'\)'),         #Closed Bracket
            ('Semicolon',r';'),                 #Semicolon
            ('Space',     r'[ \t]+'),           #Space
            ('NewLine',  r'\n'),                #End Line
            ('Other', r'.'),                    #Other
        ]
        regex_patterns = '|'.join(f'(?P<{name}>{pattern})' 
                                  for name, pattern in token_specification)
        return re.compile(regex_patterns)

    def tokenize(self):
        for match in re.finditer(self.token_regex, self.text):
            kind = match.lastgroup
            value = match.group()
            if kind == 'SKIP' or kind == 'NEWLINE': #Does not produce error for space or newline
                continue
            elif kind == 'MISMATCH': #If scanner finds unrecognized item it will produce and error
                raise RuntimeError(f'Illegal character {value!r} at position {self.position}/n')
            else:
                token = Token(kind, value)
                self.tokens.append(token)
            self.position = match.end()
        return self.tokens


#Input stated in document
code = """
Integer x;
Set x = 1;
"""
#Putting code through the scanner
Scan = JASONScanner(code)
tokens = Scan.tokenize()
print(tokens)


[(NewLine, '\n')
, (Identifier, 'Integer')
, (Space, ' ')
, (Identifier, 'x')
, (Semicolon, ';')
, (NewLine, '\n')
, (Identifier, 'Set')
, (Space, ' ')
, (Identifier, 'x')
, (Space, ' ')
, (OP, '=')
, (Space, ' ')
, (Num, '1')
, (Semicolon, ';')
, (NewLine, '\n')
]


In [2]:
import re

# Regex patterns for JSON tokens
token_patterns = {
    'WHITESPACE': r'[ \t\n]+',  # Ignore whitespace
    'LBRACE': r'\{',
    'RBRACE': r'\}',
    'LBRACKET': r'\[',
    'RBRACKET': r'\]',
    'COMMA': r',',
    'COLON': r':',
    'STRING': r'\"([^"\\]|\\.)*\"',
    'NUMBER': r'-?\d+(\.\d+)?([eE][+-]?\d+)?',
    'TRUE': r'true',
    'FALSE': r'false',
    'NULL': r'null'
}

# Combine into a single pattern with named groups
token_regex = '|'.join(f'(?P<{name}>{pattern})' for name, pattern in token_patterns.items())


In [3]:
def tokenize(text):
    for match in re.finditer(token_regex, text):
        kind = match.lastgroup
        value = match.group()
        if kind == 'WHITESPACE':
            continue
        elif kind == 'STRING':
            value = value[1:-1].replace('\\"', '"')  # Remove quotes and handle escaped quotes
        elif kind == 'NUMBER':
            value = float(value) if '.' in value or 'e' in value or 'E' in value else int(value)
        elif kind == 'TRUE':
            value = True
        elif kind == 'FALSE':
            value = False
        elif kind == 'NULL':
            value = None
        yield kind, value


In [4]:
def parse_value(tokens):
    kind, value = next(tokens)
    if kind in ['STRING', 'NUMBER', 'TRUE', 'FALSE', 'NULL']:
        return value
    elif kind == 'LBRACE':
        return parse_object(tokens)
    elif kind == 'LBRACKET':
        return parse_array(tokens)
    raise SyntaxError('Unexpected value')


def parse_pair(tokens):
    kind, key = next(tokens)
    if kind != 'STRING':
        raise SyntaxError('Expected string key')
    kind, sep = next(tokens)
    if kind != 'COLON':
        raise SyntaxError('Expected colon separator')
    value = parse_value(tokens)
    return key, value


def parse_object(tokens):
    obj = {}
    kind, sep = next(tokens)
    if kind == 'RBRACE':
        return obj  # Empty object
    while True:
        key, value = parse_pair(tokens)
        obj[key] = value
        kind, sep = next(tokens)
        if kind == 'RBRACE':
            break
        elif kind != 'COMMA':
            raise SyntaxError('Expected comma or closing brace')
    return obj

def parse_array(tokens):
    arr = []
    kind, sep = next(tokens)
    if kind == 'RBRACKET':
        return arr  # Empty array
    while True:
        tokens.append((kind, sep))  # Push back the last read token
        value = parse_value(tokens)
        arr.append(value)
        kind, sep = next(tokens)
        if kind == 'RBRACKET':
            break
        elif kind != 'COMMA':
            raise SyntaxError('Expected comma or closing bracket')
    return arr


In [5]:
def parse_json(text):
    tokens = tokenize(text)
    tokens = iter(tokens)  # Create an iterator over tokens
    value = parse_value(tokens)
    try:
        next(tokens)  # Ensure there are no extra tokens
        raise SyntaxError('Unexpected data after JSON object')
    except StopIteration:
        pass
    return value