In [None]:
import re

class SimpleLangLexer:
    """Regex-driven lexer that splits source text into ``(type, text)`` tokens.

    ``token_patterns`` is an ordered list of ``(token_type, regex)`` pairs.
    At every position the patterns are tried in list order and the first one
    that matches wins, so the order of the entries is significant.
    """

    def __init__(self):
        self.token_patterns = [
            ('Keyword', r'\b(int|word|bigint|char|dotie|bool|list|constant|tuple|if|otif|otw|for|while|get_out|go_on|return|void|try|catch|finally|display|input|start)\b'),
            # Must come before 'Identifier': otherwise True/False are
            # swallowed by the identifier pattern and never reported as
            # literals. Word boundaries keep names such as `Truex` intact.
            ('LogicalLiteral', r'\b(?:True|False)\b'),
            ('Identifier', r'\b[a-zA-Z_][a-zA-Z0-9_]*\b'),
            # The lookaheads stop `!` and `&` from stealing the first
            # character of the two-character binary operators `!=` and `&&`.
            # NOTE(review): the `sizeof\(\)` alternative cannot match here,
            # because 'Identifier' is tried first and consumes `sizeof`.
            ('UnaryOperator', r'-|!(?!=)|&(?!&)|sizeof\(\)'),
            ('BinaryOperator', r'==|!=|<=|>=|\|\||&&|[+\-*/=](?!=)'),
            ('StringLiteral', r'"([^"]*)"'),
            ('Whitespace', r'\s+'),
            ('Paranthesis', r'[()\{\}\[\]]'),
            ('Quotation', r'"'),
            ('Constant', r'[^"{}\s();]+'),
            ('EndOfStatement', r';')
        ]

    def tokenize(self, source_code):
        """Split ``source_code`` into a list of ``(token_type, text)`` tuples.

        A double-quoted string is emitted as three tokens: an opening
        ``Quotation``, a ``StringLiteral`` holding the text between the
        quotes, and a closing ``Quotation``. Whitespace is kept as tokens.

        Args:
            source_code: The text to tokenize.

        Returns:
            The tokens in source order; an empty list for empty input.

        Raises:
            Exception: If no pattern matches at the current position.
        """
        # Compile once per call instead of once per token. Reading
        # self.token_patterns here keeps later edits to the list effective.
        compiled_patterns = [
            (token_type, re.compile(pattern))
            for token_type, pattern in self.token_patterns
        ]

        tokens = []
        position = 0
        length = len(source_code)

        while position < length:
            for token_type, regex in compiled_patterns:
                match = regex.match(source_code, position)
                if match:
                    break
            else:
                # No pattern matched at this position.
                raise Exception(f"Unexpected character at position {position}: {source_code[position]}")

            if token_type == 'StringLiteral':
                tokens.append(('Quotation', '"'))
                tokens.append(('StringLiteral', match.group(1)))
                tokens.append(('Quotation', '"'))
            else:
                tokens.append((token_type, match.group(0)))
            position = match.end()

        return tokens


def main(filename="test4.kul"):
    """Tokenize a SimpleLang source file and print the resulting token list.

    Args:
        filename: Path of the source file to read. Defaults to
            ``"test4.kul"``, the path that was previously hard-coded.
    """
    # NOTE(review): the file is opened with the platform's default text
    # encoding; pass an explicit encoding if the sources are known to be
    # UTF-8 — confirm against the actual .kul files.
    with open(filename, 'r') as file:
        source_code = file.read()

    lexer = SimpleLangLexer()
    tokens = lexer.tokenize(source_code)

    print(tokens)


# Run the lexer demo only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

[('Keyword', 'int'), ('Whitespace', ' '), ('Keyword', 'start'), ('Paranthesis', '('), ('Paranthesis', ')'), ('Paranthesis', '{'), ('Whitespace', '\n'), ('Keyword', 'bool'), ('Whitespace', ' '), ('Identifier', 'a'), ('Whitespace', ' '), ('BinaryOperator', '='), ('Whitespace', ' '), ('Identifier', 'True'), ('Whitespace', ' '), ('UnaryOperator', '&'), ('UnaryOperator', '&'), ('Whitespace', ' '), ('Identifier', 'False'), ('EndOfStatement', ';'), ('Whitespace', '\n'), ('Keyword', 'display'), ('Paranthesis', '('), ('Identifier', 'a'), ('Paranthesis', ')'), ('EndOfStatement', ';'), ('Whitespace', '\n'), ('Keyword', 'word'), ('Whitespace', ' '), ('Identifier', 'str'), ('Whitespace', ' '), ('BinaryOperator', '='), ('Whitespace', ' '), ('Constant', '“Kulant”'), ('EndOfStatement', ';'), ('Whitespace', '\n'), ('Identifier', 'Word'), ('Whitespace', ' '), ('Identifier', 'str1'), ('Whitespace', ' '), ('BinaryOperator', '='), ('Whitespace', ' '), ('Constant', '“Pro”'), ('EndOfStatement', ';'), ('White