In [9]:
import re 
import json 
from typing import List 
from graphviz import Digraph

In [10]:
'''
Helper Functions 
 
'''
def split_input(input: str):
    split_list = []
    opening_match_found = False 
    match = ""
    for char in input: 
        if char == '[' :
            opening_match_found = True 
        elif char == ']': 
            opening_match_found = False 
            split_list.append('['+ match +']')
            match = "" 
        elif  opening_match_found == 0 :
            split_list.append (char) 
        else : 
            match =  match + char 
    return split_list

def visualize_nfa(nfa_json):
    # Create a new directed graph
    dot = Digraph(format='png')
    
    # Set global graph attributes for visual clarity
    dot.attr(rankdir='LR')  # Left to right layout

    # Add states to the graph
    start_state = nfa_json.get("startingState")
    for state_name, state_info in nfa_json.items():
        if state_name == "startingState":
            continue

        # Determine if this state is an accepting state
        is_accepting = state_info["isTerminatingState"]

        # Customize node style based on state type
        if state_name == start_state:
            dot.node(state_name, shape='circle', color='green', label=state_name)  # Start state
        elif is_accepting:
            dot.node(state_name, shape='doublecircle', color='blue', label=state_name)  # Accepting state
        else:
            dot.node(state_name, shape='circle', label=state_name)

    # Add transitions
    for state_name, state_info in nfa_json.items():
        if state_name == "startingState":
            continue

        # Iterate over transitions and add them as edges
        for input_symbol, destinations in state_info.items():
            if input_symbol == "isTerminatingState":
                continue
            
            if isinstance(destinations, list):
                # If there are multiple destinations, create separate edges for each
                for destination in destinations:
                    label = "ε" if input_symbol == "~" else input_symbol  # Represent epsilon with 'ε'
                    dot.edge(state_name, destination, label=label)
            else:
                # Single destination, normal case
                label = "ε" if input_symbol == "~" else input_symbol  # Represent epsilon with 'ε'
                dot.edge(state_name, destinations, label=label)

    # Render and view the graph
    dot.render('nfa_graph', view=True)



In [11]:


'''
This preporcessing is applied to the regex before applying shunting yard algorithm in order to : 
1) reduce all the regex to these operations only ( * , | , parenthes , concat )
2) add a concatination symbol between characters to be recognized by the algorithm as an operator 
3) replace every [match] with one char to be treated as other alphanumeric 
'''  
def preprocessing(input : str) :
# input= 'a?a((cd)|(a|b))b+bb' 
    #Step1: Replace zero or one symbol '?'
    step_1 = re.sub(r'(\w)\?', r'(\1|~)', input)
    #Step2: Replace one or more symbol '+'
    step_2 = re.sub(r'(\w)\+', r'\1\1*', step_1)
    #Step3: Add concat symbol before every [ or (  if they are not at the start of the regex and they are not preceded by ?
    pattern_before = re.compile(r'''
        (?<!^)      # Negative lookbehind assertion to ensure the position is not at the start of the string
        (?<!\?)     # Negative lookbehind assertion to ensure the position is not preceded by '?'
        (?=[\[\(])  # Positive lookahead assertion to match '[' or '(' without consuming them
    ''', re.VERBOSE)
    step_3 = pattern_before.sub('?', step_2) 
    #Step 4: Add concat symbol after every ] or ) if they are not the end of regex OR they are not followed by * and not followed by ? 
    pattern_after = re.compile(r'''
        (?<=[\]\)])  # Positive lookbehind assertion to match ']' or ')' without consuming them
        (?!$)        # Negative lookahead assertion to ensure the position is not at the end of the string
        (?![\*\?])   # Negative lookahead assertion to ensure the position is not followed by '*' or '?'
    ''', re.VERBOSE)
    step_4 = pattern_after.sub('?', step_3)
    #Step 5 : Add concat after every * if its not the end of regex and its not follwed by star 
    pattern_star = re.compile(r'''
        \*          # Match the '*' character
        (?!$)       # Negative lookahead assertion to ensure the position is not at the end of the string
        (?![\?])    # Negative lookahead assertion to ensure the position is not followed by '?'                      
    ''', re.VERBOSE)
    step_5 = pattern_star.sub('*?', step_4)
    #Step 6 : Add concat after every alphanum or dot if its followed by alphanumeric or dot 
    pattern_alnum_dot = re.compile(r'''
        ([a-zA-Z0-9\.])  # Match any alphanumeric character or dot
        (?=[a-zA-Z0-9\.])  # Positive lookahead assertion to ensure it is followed by another alphanumeric character or dot
    ''', re.VERBOSE)
    step_6 = pattern_alnum_dot.sub(r'\1?', step_5)
    return split_input(step_6) 




In [12]:

'''
Postfix notation removes the need for parentheses and allows computer programs to read in 
mathematical expressions one symbol after the other, instead of worrying about operator precedence 
and parentheses during computation. 

'''
def shuntingYard(input) :
    precedence_dict = {'*': 3, '?': 2, '|': 1}
    out =[]
    operator_stack = []
    for char in input :
        # If the input is alphanumeric then append to the output regex 
        if char.isalnum() or char == '.' or len(char) > 1 :
            out.append(char)
        # If the input is an operator
        elif  char in precedence_dict.keys() :
            # The first operator in the stack 
            if len(operator_stack) == 0:
                operator_stack.append(char) 
            #  Any operator shouldn't be compared to an opening parenthes , if an opening parenthes is on the top of the stack Just add the char to the stack directly 
            elif operator_stack[-1] =='('or precedence_dict[ operator_stack[-1] ] < precedence_dict[char] :
                operator_stack.append(char)
            # If the operator on the top of the stack is the same as the current char then pop one to the output regex and leave the other in tha stack 
            # If two consecutive opening parenthes comes we need them both to be in the stack and not popped to the output because they will be deleted later 
            elif operator_stack[-1] != '(' and precedence_dict[ operator_stack[-1] ] == precedence_dict[char] : 
                out.append(char) 
            #If the operator at the top of the stack has higher precedence that the current operator then pop to the output until we can push the current operator to the stack 
            else : 
                while (len(operator_stack)>0 and operator_stack[-1] != '('and precedence_dict[ operator_stack[-1] ]  >= precedence_dict[char]) :
                    popped_operator = operator_stack.pop()
                    out.append(popped_operator)
                operator_stack.append(char) 
        elif char == '(' :
            operator_stack.append(char)
        # The current char is closing parenthes -> pop from the operator stack to the output until you reach an opening parenthes
        elif char == ')' :
            while operator_stack:
                operator = operator_stack.pop()
                if operator == '(':
                    break
                out.append(operator)
        # print ("parsing char :",char,"output:",out,"stack",operator_stack)
        # print("************************")
    out.extend(operator_stack[::-1]) 
    # print ("Final out :",out)
    return out 

    # print("Final stack :",operator_stack)


In [13]:
'''
Datastructures defined
 
'''
class Transition:
    def __init__(self, destination , input):
        self.destination = destination
        self.input = input
    def __repr__(self) -> str:
        return f"Transition(destination={self.destination.state_name}, input='{self.input}')"


class State:
    state_counter = 0  

    def __init__(self):
        self.state_name = 'S'+ str(State.state_counter) 
        State.state_counter += 1
        self.transitions = [] 
        self.isTerminatingState = False 

    def add_transition(self, destination, input):
        self.transitions.append(Transition(destination, input))

    def to_dict(self):
        state_dict = {
            "isTerminatingState": self.isTerminatingState
        }
        for transition in self.transitions:
            if transition.input in state_dict:
                # If key already exists, append to the list of destinations
                if isinstance(state_dict[transition.input], list):
                    state_dict[transition.input].append(transition.destination.state_name)
                else:
                    # If it's not already a list, make it a list
                    state_dict[transition.input] = [state_dict[transition.input], transition.destination.state_name]
            else:
                # If key doesn't exist, add it
                state_dict[transition.input] = transition.destination.state_name
        return state_dict

    def __repr__(self):
        return json.dumps(self.to_dict(), indent=2)
    
class NFA:
    def __init__(self):
        self.states = []
        self.start_state:State = None 
        #TODO : is this list or one element 
        self.accept_states : List [State] = []
    def add_states(self, states):
        self.states.extend(states)
        if not self.start_state and states:
            self.start_state = states[0]
        if states:
            self.accept_states.append(states[-1])
        
    def add_state(self, state , is_start = False , is_accept = False  ):
        self.states.append(state)
        if is_start:
            self.start_state = state
        if is_accept:
            self.accept_states.append(state)
        return state
    
    def to_dict(self):
        nfa_dict = {
            "startingState": self.start_state.state_name if self.start_state else None
        }
        for state in self.states:
            nfa_dict[state.state_name] = state.to_dict()
        return nfa_dict

    def __repr__(self):
        return json.dumps(self.to_dict(), indent=2)

In [14]:
'''
Finally : Thomsons construction Algorithm 
'''

def alphanumeric_nfa(char) : 
    start_state = State() 
    end_state = State() 
    end_state.isTerminatingState = True 
    start_state.add_transition(end_state , char)
    nfa = NFA() 
    nfa.add_state(start_state , is_start = True )
    nfa.add_state(end_state , is_start = False , is_accept = True) 
    
    return nfa 
def zero_or_more_nfa(operand: NFA) : 
    old_end_state = operand.accept_states[0]
    old_end_state.isTerminatingState = False 
    old_start_state = operand.start_state
    new_start_state = State() 
    new_end_state = State()  
    new_end_state.isTerminatingState = True 
    new_start_state.add_transition(old_start_state , '~')
    new_start_state.add_transition(new_end_state ,'~')
    old_end_state.add_transition(old_start_state ,'~')
    old_end_state.add_transition(new_end_state,'~')
    nfa = NFA ()
    nfa.add_states([new_start_state , old_start_state ,old_end_state ,new_end_state])
    return nfa

def union_nfa(operand1: NFA , operand2: NFA) :
    new_start_state = State() 
    new_end_state = State() 
    new_end_state.isTerminatingState = True
    new_start_state.add_transition(operand1.start_state , '~')
    new_start_state.add_transition(operand2.start_state , '~')
    operand1.accept_states[0].add_transition(new_end_state , '~')
    operand2.accept_states[0].add_transition(new_end_state , '~')
    nfa = NFA()
    nfa.add_states([new_start_state , new_end_state])
    nfa.add_states(operand1.states)
    nfa.add_states(operand2.states)
    return nfa
def constructNFA(input ) : 
    stack_NFA = []
    for char in input : 
        if char.isalnum() or char == '~' : 
            stack_NFA.append(alphanumeric_nfa(char))  
        elif char == '*':
            stack_NFA.append(zero_or_more_nfa(stack_NFA.pop()))
        elif char == '|': 
            operand2 = stack_NFA.pop() 
            operand1 = stack_NFA.pop()
            stack_NFA.append(union_nfa(operand1, operand2))
            
        # elif char == '?' :

    return stack_NFA 

def ThomsonsConstruction (input : str) -> NFA : 
    preprocessed = preprocessing(input) 
    shunting_yard = shuntingYard(preprocessed)
    return constructNFA(shunting_yard)  
nfa_result = ThomsonsConstruction('a|b')
for nfa in nfa_result: 
    # Get the JSON object from the `to_dict` method
    nfa_json = nfa.to_dict()

    # Print the JSON-formatted string
    print(json.dumps(nfa_json, indent=2))
    # visualize_nfa(nfa_json)
    print(nfa) 

{
  "startingState": "S4",
  "S4": {
    "isTerminatingState": false,
    "~": [
      "S0",
      "S2"
    ]
  },
  "S5": {
    "isTerminatingState": true
  },
  "S0": {
    "isTerminatingState": false,
    "a": "S1"
  },
  "S1": {
    "isTerminatingState": true,
    "~": "S5"
  },
  "S2": {
    "isTerminatingState": false,
    "b": "S3"
  },
  "S3": {
    "isTerminatingState": true,
    "~": "S5"
  }
}
{
  "startingState": "S4",
  "S4": {
    "isTerminatingState": false,
    "~": [
      "S0",
      "S2"
    ]
  },
  "S5": {
    "isTerminatingState": true
  },
  "S0": {
    "isTerminatingState": false,
    "a": "S1"
  },
  "S1": {
    "isTerminatingState": true,
    "~": "S5"
  },
  "S2": {
    "isTerminatingState": false,
    "b": "S3"
  },
  "S3": {
    "isTerminatingState": true,
    "~": "S5"
  }
}
