In [9]:
import re  # Import the regular expression module

# Reserved words of the toy C-like language. The tokenizer reports an
# identifier found in this set as a KEYWORD token instead of an ID token.
keywords = {'int', 'return', 'void', 'if', 'else', 'while', 'for', 'break', 'continue', 'switch', 'case'}

# Ordered (token name, regex) pairs. ORDER MATTERS: the pairs are combined into
# a single alternation, and the regex engine takes the first alternative that
# matches at a position, not the longest one. A pattern that is a prefix of
# another must therefore come after it.
token_specification = [
    ('NUMBER', r'\d+(\.\d*)?'),             # Integer or decimal number
    # COMPARISON must precede ASSIGN, otherwise '==' is split into two '='
    # tokens and is never recognised as a comparison.
    ('COMPARISON', r'==|!=|<=|>=|<|>'),     # Comparison operators
    ('ASSIGN', r'='),                       # Assignment operator
    ('END', r';'),                          # Statement terminator
    ('ID', r'[A-Za-z_][A-Za-z_0-9]*'),      # Identifier (variable or function name)
    ('OP', r'[+\-*/%]'),                    # Arithmetic operators
    ('LPAREN', r'\('),                      # Left parenthesis
    ('RPAREN', r'\)'),                      # Right parenthesis
    ('LBRACE', r'\{'),                      # Left brace
    ('RBRACE', r'\}'),                      # Right brace
    ('NEWLINE', r'\n'),                     # Line break (used for line counting)
    ('SKIP', r'[ \t]+'),                    # Spaces and tabs
    ('MISMATCH', r'.'),                     # Any other character; must stay last
]

# One master pattern with a named group per token kind, so the name of the
# group that matched (Match.lastgroup) identifies the token type.
tok_regex = '|'.join(
    f'(?P<{name}>{pattern})' for name, pattern in token_specification
)

# Lexical analyzer: turns source text into a stream of tokens
def tokenize(code):
    """Lazily split *code* into ``(kind, value, line, column)`` tuples.

    The text is scanned with the module-level ``tok_regex``. Whitespace and
    newlines produce no tokens; newlines only advance the line counter.

    Yields:
        kind:   token type name; identifiers listed in the module-level
                ``keywords`` set are reported as ``'KEYWORD'``.
        value:  the matched text, except NUMBER tokens, which are converted
                to ``float`` when they contain a ``'.'`` and ``int`` otherwise.
        line:   line number of the token, starting at 1.
        column: offset of the token from the start of its line, starting at 0.

    Raises:
        RuntimeError: when a character matches no token pattern (MISMATCH).
    """
    current_line = 1
    line_origin = 0  # Offset in `code` at which the current line begins

    for match in re.finditer(tok_regex, code):
        token_type = match.lastgroup       # Name of the group that matched
        text = match.group(token_type)     # Raw matched text

        # Non-token matches first: bookkeeping, skipping, and errors.
        if token_type == 'NEWLINE':
            line_origin = match.end()      # Next line starts right after '\n'
            current_line += 1
            continue
        if token_type == 'SKIP':
            continue
        if token_type == 'MISMATCH':
            raise RuntimeError(f'{text!r} unexpected on line {current_line}')

        # Refine the token: numeric conversion or keyword promotion.
        if token_type == 'NUMBER':
            text = float(text) if '.' in text else int(text)
        elif token_type == 'ID' and text in keywords:
            token_type = 'KEYWORD'

        yield token_type, text, current_line, match.start() - line_origin

# Demo input: a tiny C-like program, one source line per list entry. The
# leading and trailing empty entries reproduce the blank first line and the
# final newline of the sample, so 'int main() {' sits on line 2. The stray '@'
# is not matched by any token rule and makes the tokenizer raise RuntimeError.
code = '\n'.join([
    '',
    'int main() {',
    '    int x = 10;',
    '    return x + 5;',
    '    @',
    '}',
    '',
])

# Run the lexer and print each token tuple as it is produced
token_stream = tokenize(code)
for token in token_stream:
    print(token)


('KEYWORD', 'int', 2, 0)
('ID', 'main', 2, 4)
('LPAREN', '(', 2, 8)
('RPAREN', ')', 2, 9)
('LBRACE', '{', 2, 11)
('KEYWORD', 'int', 3, 4)
('ID', 'x', 3, 8)
('ASSIGN', '=', 3, 10)
('NUMBER', 10, 3, 12)
('END', ';', 3, 14)
('KEYWORD', 'return', 4, 4)
('ID', 'x', 4, 11)
('OP', '+', 4, 13)
('NUMBER', 5, 4, 15)
('END', ';', 4, 16)


RuntimeError: '@' unexpected on line 5