## Tokens
##### Keywords: if, else, elif, while, for, in, def, return, True, False, None
##### Operators: +, -, *, /, %, **, =, ==, !=, <, >, <=, >=, and, or, not
##### Delimiters: (, ), {, }, [, ], ',' , '.', ':', ';'
##### Literals: INTEGER, FLOAT, STRING
##### Other: IDENTIFIER, INDENT, DEDENT, NEWLINE, EOF

## Token and Error Class

In [17]:
import re
from typing import Generator, List, Optional, Tuple

class Token:
    def __init__(self, type: str, value: str, line: int, column: int):
        self.type = type
        self.value = value
        self.line = line
        self.column = column
    
    def __str__(self) -> str:
        return f"Token({self.type}, '{self.value}', line={self.line}, col={self.column})"
    
    def __repr__(self) -> str:
        return self.__str__()

class Error:
    def __init__(self, message: str, line: int, column: int):
        self.message = message
        self.line = line
        self.column = column
    
    def __str__(self) -> str:
        return f"Error: {self.message} at line {self.line}, column {self.column}"


## Lexer

In [18]:
class Lexer:
    # Defining regular expressions for token
    TOKEN_SPECS = [
        ('COMMENT', r'#.*'),                                  # Comments
        ('STRING', r'\"([^\\\"]|\\.)*\"|\'([^\\\']|\\.)*\''), # String literals
        ('FLOAT', r'\d+\.\d+'),                               # Float literals
        ('INTEGER', r'\d+'),                                  # Integer literals
        ('KEYWORD', r'(if|else|elif|while|for|in|def|return|True|False|None)\b'),  # Keywords
        ('IDENTIFIER', r'[a-zA-Z_]\w*'),                      # Identifiers
        # Operators (multi-character ones first)
        ('OP_EQ', r'=='),
        ('OP_NE', r'!='),
        ('OP_LE', r'<='),
        ('OP_GE', r'>='),
        ('OP_ASSIGN', r'='),
        ('OP_PLUS', r'\+'),
        ('OP_MINUS', r'-'),
        ('OP_MULT', r'\*\*|\/\/|\*|\/'),  # ** (power), // (floor div), * (mult), / (div)
        ('OP_MOD', r'%'),
        ('OP_LT', r'<'),
        ('OP_GT', r'>'),
        # Delimiters
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACKET', r'\['),
        ('RBRACKET', r'\]'),
        ('LBRACE', r'\{'),
        ('RBRACE', r'\}'),
        ('COMMA', r','),
        ('DOT', r'\.'),
        ('COLON', r':'),
        ('SEMICOLON', r';'),
        ('NEWLINE', r'\n'),
        ('WHITESPACE', r'[ \t]+'),                            # Whitespace
        # INDENT and DEDENT tokens are not matched by regex but generated based on whitespace
    ]
    
    def __init__(self, source_code: str):
        self.source_code = source_code
        self.tokens = []
        self.errors = []
        self.line = 1
        self.column = 1
        self.indent_levels = [0]  # Start with indent level 0
    
    def tokenize(self) -> Generator[Token, None, None]:
        """Tokenize the source code and yield tokens."""
        # Process the source code line by line to handle indentation properly
        lines = self.source_code.split('\n')
        
        for line_num, line in enumerate(lines):
            self.line = line_num + 1
            self.column = 1
            
            #calculating the amount of leading whitespace (indentation) at the beginning of the current line
            if line.strip():  # Non-empty line      #strip() is equivalent of trim() in JS
                indent_size = len(line) - len(line.lstrip())    #lstrip() will remove leading spaces in the line i.e. remove any indentation; and the difference in length tells you how much whitespace there was
                indent_tokens = self._handle_indentation(indent_size)
                for token in indent_tokens:
                    yield token
            else:   # Skip empty lines but still count them for line numbers
                yield Token('NEWLINE', '\\n', self.line, self.column)
                continue
            
            # Process the rest of the line
            i = indent_size if 'indent_size' in locals() else 0
            line_content = line
            
            while i < len(line_content):    #traversing a line character by character
                match = None
                
                ## Skip spaces and tabs (already handled indentation)
                ##if line_content[i].isspace():   #isspace() checks if the entire line is whitespace or not; here it checks whether a single character is whitespace or not since its applied on line_content[i]
                ##    i += 1
                ##    self.column += 1
                ##    continue
                
                # Try to match each token pattern
                for token_type, pattern in self.TOKEN_SPECS:
                    regex = re.compile(pattern) #compiling the pattern into a regex so that we can use it to find tokens in the line
                    match = regex.match(line_content, i)    #match() will check if the "starting" of the string matches the given regex or not, line_content gives the line whose starting it has to compare with the regex with i telling from which point in the line to assume as thhe starting of the line
                    #if a match is found, 'match' will contain an object with info about the found match (like start and end, its value and so on), otherwise it'll contain "none"
                    
                    if match:
                        value = match.group(0)  #group will give the value of the match object
                        
                        if token_type == 'WHITESPACE':
                            #whitespace is not yielded becuz parser receives stream of tokens without space and doesnt care about space
                            # Just update column and continue
                            self.column += len(value)
                            i += len(value)
                            continue
                        elif token_type == 'COMMENT':
                            # Ignore comments
                            i += len(value)
                            self.column += len(value)
                            break   # Break out of the current character processing loop after handling comments
                        else:
                            # For all other tokens
                            token = Token(token_type, value, self.line, self.column)
                            yield token
                        
                        i += len(value)
                        self.column += len(value)
                        break
                
                if not match:
                    # No token matched, raise an error
                    error_msg = f"Invalid character: '{line_content[i]}'"
                    error = Error(error_msg, self.line, self.column)
                    self.errors.append(error)
                    print(error)  # Print the error but continue
                    i += 1
                    self.column += 1
            
            # Add NEWLINE at the end of each line except the last one
            if line_num < len(lines) - 1 or self.source_code.endswith('\n'):
                yield Token('NEWLINE', '\\n', self.line, self.column)
        
        # Output any pending dedents at the end of the file
        #end of file means all indented blocks have been dedented so pop all elements of the indent_levels stack
        while len(self.indent_levels) > 1:
            self.indent_levels.pop()    
            yield Token('DEDENT', '', self.line, self.column)
        
        # End of file token
        yield Token('EOF', '', self.line, self.column)
    
    def _handle_indentation(self, indent_size: int) -> List[Token]:
        """Handle Python's indentation-based block structure. Returns a list of INDENT or DEDENT tokens as needed."""
        tokens = []
        previous_line_indent = self.indent_levels[-1]
        
        if indent_size > previous_line_indent:
            # This is an indentation (start of a new block)
            self.indent_levels.append(indent_size)  #when a new block is recognized through indentation its indentation level is pushed in the stack so that w ecan later check whether that block was dedented properly or not
            tokens.append(Token('INDENT', ' ' * (indent_size - previous_line_indent), self.line, 1))
        
        elif indent_size < previous_line_indent:
            # This is a dedentation (end of one or more blocks)
            while self.indent_levels and indent_size < self.indent_levels[-1]: # ensuring that the latest code block is being dedented
                self.indent_levels.pop()
                tokens.append(Token('DEDENT', '', self.line, 1))
            
            if indent_size != self.indent_levels[-1]:
                # Invalid indentation
                error_msg = f"Inconsistent indentation"
                error = Error(error_msg, self.line, 1)
                self.errors.append(error)
                print(error)  # Print the error but continue
                
                
                # and what about handling indent_size == self.indent_levels[-1]
        return tokens

## Test Script for Lexer

In [19]:
def test_lexer(source_code, test_name=""):
    print(f"\n=== Testing {test_name} ===")
    print(f"Source code:\n{source_code}")
    print("\nTokens:")
    
    lexer = Lexer(source_code)
    token_count = 0
    
    try:
        for token in lexer.tokenize():
            print(token)
            token_count += 1
        
        print(f"\nTotal tokens: {token_count}")
        if lexer.errors:
            print(f"\nErrors ({len(lexer.errors)}):")
            for error in lexer.errors:
                print(f"  {error}")
    except Exception as e:
        print(f"Exception occurred: {str(e)}")
        import traceback
        traceback.print_exc()

# Test cases
if __name__ == "__main__":
    # Test 1: Basic variable assignment
    test_lexer("x = 10", "Basic Assignment")

    # Test 2: Simple function definition
    test_lexer("""def add(a, b):
    return a + b""", "Function Definition")

    # Test 3: If statement with proper indentation
    test_lexer("""if x > 10:
    print("Greater than 10")
else:
    print("Less than or equal to 10")""", "If Statement")

    # Test 4: Various literals and operators
    test_lexer("""# Testing literals
x = 42
y = 3.14
name = "John"
flag = True
result = x + y * 2
equal = x == y""", "Literals and Operators")

    # Test 5: Loops
    test_lexer("""# Testing loops
for i in range(10):
    if i % 2 == 0:
        print("Even")
    else:
        print("Odd")

# While loop
count = 0
while count < 5:
    count += 1
""", "Loops")

    # Test 6: Test error handling
    test_lexer("x = 10 @", "Error Handling")

    # Test 7: Test inconsistent indentation
    test_lexer("""def test():
    x = 1
   y = 2  # Inconsistent indentation
""", "Inconsistent Indentation")

    # Interactive testing option
    do_interactive = input("\nDo you want to run an interactive test? (y/n): ")
    if do_interactive.lower() == 'y':
        print("\n=== Interactive Lexer Test ===")
        print("Enter Python code (type 'exit()' on a new line to finish):")
        
        lines = []
        while True:
            line = input("> ")
            if line.strip() == "exit()":
                break
            lines.append(line)
        
        source_code = "\n".join(lines)
        test_lexer(source_code, "Interactive Input")



=== Testing Basic Assignment ===
Source code:
x = 10

Tokens:
Token(IDENTIFIER, 'x', line=1, col=1)
Token(OP_ASSIGN, '=', line=1, col=3)
Token(INTEGER, '10', line=1, col=5)
Token(EOF, '', line=1, col=7)

Total tokens: 4

=== Testing Function Definition ===
Source code:
def add(a, b):
    return a + b

Tokens:
Token(KEYWORD, 'def', line=1, col=1)
Token(IDENTIFIER, 'add', line=1, col=5)
Token(LPAREN, '(', line=1, col=8)
Token(IDENTIFIER, 'a', line=1, col=9)
Token(COMMA, ',', line=1, col=10)
Token(IDENTIFIER, 'b', line=1, col=12)
Token(RPAREN, ')', line=1, col=13)
Token(COLON, ':', line=1, col=14)
Token(NEWLINE, '\n', line=1, col=15)
Token(INDENT, '    ', line=2, col=1)
Token(KEYWORD, 'return', line=2, col=1)
Token(IDENTIFIER, 'a', line=2, col=8)
Token(OP_PLUS, '+', line=2, col=10)
Token(IDENTIFIER, 'b', line=2, col=12)
Token(DEDENT, '', line=2, col=13)
Token(EOF, '', line=2, col=13)

Total tokens: 16

=== Testing If Statement ===
Source code:
if x > 10:
    print("Greater than 10")
else