# Lexical Analysis

## Experiment

This is an experiment of how to use `rply.LexerGenerator`

import the necessary libraries

- rply.LexerGenerator

In [2]:
from rply import LexerGenerator

In [3]:
lg = LexerGenerator()

Lexers are generated by adding rules with regular expression

In [4]:
lg.add('NUMBER', r'\d+')

In [10]:
lg.ignore(r"\s+")
lg.ignore(r"_")
lg.ignore(r";")

In [11]:
l = lg.build()

In [14]:
INPUT = "12_;1_;1_;1"
for token in l.lex(INPUT):
    print(token)

Token('NUMBER', '12')
Token('NUMBER', '1')
Token('NUMBER', '1')
Token('NUMBER', '1')


if there is no rule added, it will be an error

In [13]:
INPUT = "1_+++1_;1_;1"
for token in l.lex(INPUT):
    print(token)

Token('NUMBER', '1')


LexingError: (None, SourcePosition(idx=2, lineno=1, colno=1))

## Build a sample tokenizer

This part is how to construct a tokenizer for the following snippets

```
str txt = "Sample String";
```


In [31]:
INPUT = 'str txt = "Sample String"'

In [48]:
my_lg = LexerGenerator()

my_lg.add("dataType", r"str")
my_lg.add("string", r'".*"')
my_lg.add("variable", r"[a-zA-Z][a-zA-Z0-9_]*")
my_lg.add("assignment", r"=")

my_lg.ignore(r"\s+")

In [49]:
my_l = my_lg.build()

In [50]:
for token in my_l.lex(INPUT):
    print(token)

Token('dataType', 'str')
Token('variable', 'txt')
Token('assignment', '=')
Token('string', '"Sample String"')


## Build Rules based on BNF

<img src="img/ebnf_1.png" />
<img src="img/ebnf_2.png" />
<img src="img/ebnf_3.png" />
<img src="img/ebnf_4.png" />

In [131]:
RULES = {
    "singleLineComment": r"^#.*;$",
    "multiLineComment": r"^#-{2,}.*-{2,}#;$",
    "string": r"\".*\"|'.*'"
}

In [132]:
class MyLexerGenerator:
    def __init__(self):
        self.lg = LexerGenerator()
        self.is_rule_added = False
    def add_rules(self, rules):
        for key, value in rules.items():
            self.lg.add(key, value)
        self.lg.ignore(r"\s+")
        self.is_rule_added = True
    def build(self):
        if self.is_rule_added:
            return self.lg.build()

In [133]:
my_lexer_generator = MyLexerGenerator()
my_lexer_generator.add_rules(RULES)
my_lexer = my_lexer_generator.build()

In [134]:
TEST_CASES = {
    "singleLineComment#1": "# asdfsd;",
    "singleLineComment#2": "# ;",
    "singleLineComment#3": "#;",
    "singleLineComment#4": "#-;",
    "singleLineComment#5": "#--;",
    "singleLineComment#6": "#-- asddsf;",
    "singleLineComment#7": "#-- asddsf #;",
    "singleLineComment#8": "#-- asddsf -#;",
    "string#1": "\"\"",
    "string#2": "\" \"",
    "string#3": "\"sdfs\"",
    "string#4": "\"sdfs \"",
    "string#5": "\"sdfs fssf\"",
    "string#6": "\"2342\"",
    "string#7": "''",
    "string#8": "' '",
    "string#9": "'sdfs'",
    "string#10": "'sdfs '",
    "string#11": "'sdfs fssf'",
    "string#12": "'2342'",
    "multiLineComment#1": "#----#;",
    "multiLineComment#2": "#-- --#;",
    "multiLineComment#3": "#--a--#;",
    "multiLineComment#4": "#--afasddsf--#;",
    "multiLineComment#5": "#--afas13 dfsa ddsf--#;",
    
}

In [135]:
for key, value in TEST_CASES.items():
    try:
        for token in my_lexer.lex(value):
            print(token)
    except:
        print(f"there is a lexing error at {key}")
        

Token('singleLineComment', '# asdfsd;')
Token('singleLineComment', '# ;')
Token('singleLineComment', '#;')
Token('singleLineComment', '#-;')
Token('singleLineComment', '#--;')
Token('singleLineComment', '#-- asddsf;')
Token('singleLineComment', '#-- asddsf #;')
Token('singleLineComment', '#-- asddsf -#;')
Token('string', '""')
Token('string', '" "')
Token('string', '"sdfs"')
Token('string', '"sdfs "')
Token('string', '"sdfs fssf"')
Token('string', '"2342"')
Token('string', "''")
Token('string', "' '")
Token('string', "'sdfs'")
Token('string', "'sdfs '")
Token('string', "'sdfs fssf'")
Token('string', "'2342'")
Token('singleLineComment', '#----#;')
Token('singleLineComment', '#-- --#;')
Token('singleLineComment', '#--a--#;')
Token('singleLineComment', '#--afasddsf--#;')
Token('singleLineComment', '#--afas13 dfsa ddsf--#;')
