In [126]:
import re 

def split_input(input: str):
    split_list = []
    opening_match_found = False 
    match = ""
    for char in input: 
        if char == '[' :
            opening_match_found = True 
        elif char == ']': 
            opening_match_found = False 
            split_list.append('['+ match +']')
            match = "" 
        elif  opening_match_found == 0 :
            split_list.append (char) 
        else : 
            match =  match + char 
    return split_list
'''
This preporcessing is applied to the regex before applying shunting yard algorithm in order to : 
1) reduce all the regex to these operations only ( * , | , parenthes , concat )
2) add a concatination symbol between characters to be recognized by the algorithm as an operator 
3) replace every [match] with one char to be treated as other alphanumeric 
'''  
def preprocessing(input : str) :
# input= 'a?a((cd)|(a|b))b+bb' 
    #^Step1: Replace zero or one symbol '?'
    step_1 = re.sub(r'(\w)\?', r'(\1|~)', input)
    #^Step2: Replace one or more symbol '+'
    step_2 = re.sub(r'(\w)\+', r'\1\1*', step_1)
    #^Step3: Add concat symbol before every [ or (  if they are not at the start of the regex and they are not preceded by ?
    pattern_before = re.compile(r'''
        (?<!^)      # Negative lookbehind assertion to ensure the position is not at the start of the string
        (?<!\?)     # Negative lookbehind assertion to ensure the position is not preceded by '?'
        (?=[\[\(])  # Positive lookahead assertion to match '[' or '(' without consuming them
    ''', re.VERBOSE)
    step_3 = pattern_before.sub('?', step_2) 
    #^Step 4: Add concat symbol after every ] or ) if they are not the end of regex OR they are not followed by * and not followed by ? 
    pattern_after = re.compile(r'''
        (?<=[\]\)])  # Positive lookbehind assertion to match ']' or ')' without consuming them
        (?!$)        # Negative lookahead assertion to ensure the position is not at the end of the string
        (?![\*\?])   # Negative lookahead assertion to ensure the position is not followed by '*' or '?'
    ''', re.VERBOSE)
    step_4 = pattern_after.sub('?', step_3)
    #^Step 5 : Add concat after every * if its not the end of regex and its not follwed by star 
    pattern_star = re.compile(r'''
        \*          # Match the '*' character
        (?!$)       # Negative lookahead assertion to ensure the position is not at the end of the string
        (?![\?])    # Negative lookahead assertion to ensure the position is not followed by '?'                      
    ''', re.VERBOSE)
    step_5 = pattern_star.sub('*?', step_4)
    #^Step 6 : Add concat after every alphanum or dot if its followed by alphanumeric or dot 
    pattern_alnum_dot = re.compile(r'''
        ([a-zA-Z0-9\.])  # Match any alphanumeric character or dot
        (?=[a-zA-Z0-9\.])  # Positive lookahead assertion to ensure it is followed by another alphanumeric character or dot
    ''', re.VERBOSE)
    step_6 = pattern_alnum_dot.sub(r'\1?', step_5)
    return split_input(step_6) 




In [127]:
# Reeference : https://medium.com/@gregorycernera/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm-63d22ea1cf88
'''
Postfix notation removes the need for parentheses and allows computer programs to read in 
mathematical expressions one symbol after the other, instead of worrying about operator precedence 
and parentheses during computation. 

'''
def shuntingYard(input) :
    precedence_dict = {'*': 3, '?': 2, '|': 1}
    out =[]
    operator_stack = []
    for char in input :
        # If the input is alphanumeric then append to the output regex 
        if char.isalnum() or char == '.' or len(char) > 1 :
            out.append(char)
        # If the input is an operator
        elif  char in precedence_dict.keys() :
            # The first operator in the stack 
            if len(operator_stack) == 0:
                operator_stack.append(char) 
            #  Any operator shouldn't be compared to an opening parenthes , if an opening parenthes is on the top of the stack Just add the char to the stack directly 
            elif operator_stack[-1] =='('or precedence_dict[ operator_stack[-1] ] < precedence_dict[char] :
                operator_stack.append(char)
            # If the operator on the top of the stack is the same as the current char then pop one to the output regex and leave the other in tha stack 
            # If two consecutive opening parenthes comes we need them both to be in the stack and not popped to the output because they will be deleted later 
            elif operator_stack[-1] != '(' and precedence_dict[ operator_stack[-1] ] == precedence_dict[char] : 
                out.append(char) 
            #If the operator at the top of the stack has higher precedence that the current operator then pop to the output until we can push the current operator to the stack 
            else : 
                while (len(operator_stack)>0 and operator_stack[-1] != '('and precedence_dict[ operator_stack[-1] ]  >= precedence_dict[char]) :
                    popped_operator = operator_stack.pop()
                    out.append(popped_operator)
                operator_stack.append(char) 
        elif char == '(' :
            operator_stack.append(char)
        # The current char is closing parenthes -> pop from the operator stack to the output until you reach an opening parenthes
        elif char == ')' :
            while operator_stack:
                operator = operator_stack.pop()
                if operator == '(':
                    break
                out.append(operator)
        # print ("parsing char :",char,"output:",out,"stack",operator_stack)
        # print("************************")
    out.extend(operator_stack[::-1]) 
    # print ("Final out :",out)
    return out 

    # print("Final stack :",operator_stack)


In [128]:
import json 
from typing import List

class Transition:
    def __init__(self, destination, input):
        self.destination = destination
        self.input = input

class State:
    state_counter = 0  

    def __init__(self):
        self.state_name = 'S'+ str(State.state_counter) 
        State.state_counter += 1
        self.transitions = [] 
        self.isTerminatingState = False 

    def add_transition(self, destination, input):
        self.transitions.append(Transition(destination, input))

    def to_dict(self):
        state_dict = {
            "isTerminatingState": self.isTerminatingState
        }
        for transition in self.transitions:
            state_dict[transition.input] = transition.destination.state_name
        return state_dict 
    
class NFA:
    def __init__(self):
        self.states = []
        self.start_state:State = None 
        #TODO : is this list or one element 
        self.accept_states : List [State] = []

    def add_state(self, state , is_start = False , is_accept = False  ):
        self.states.append(state)
        if is_start:
            self.start_state = state
        if is_accept:
            self.accept_states.append(state)
        return state
    
    def to_dict(self):
        nfa_dict = {
            "startingState": self.start_state.state_name if self.start_state else None
        }
        for state in self.states:
            nfa_dict[state.state_name] = state.to_dict()
        return nfa_dict

    def __repr__(self):
        return json.dumps(self.to_dict(), indent=2)

In [None]:
def alphanumeric_nfa(char) : 
    start_state = State() 
    end_state = State() 
    end_state.isTerminatingState = True 
    start_state.add_transition(end_state , char)
    nfa = NFA() 
    nfa.add_state(start_state , is_start = True )
    nfa.add_state(end_state , is_start = False , is_accept = True) 
    
    return nfa 
def zero_or_more_nfa(operand: NFA) : 
    operand.accept_states[0].isTerminatingState = False 
    operand.accept_states[0].add_transition(operand.start_state , '~') 

    start_state = State() 
    end_state = State()  
    end_state.isTerminatingState = True 
    
    start_state.add_transition(operand.start_state , '~')
    operand.accept_states[0].add_transition(end_state , '~')
    start_state.add_transition(end_state , '~')
    operand.add_state(start_state , is_start = True) 
    operand.add_state(end_state , is_start = False , is_accept = True)
     
    return operand

    



def constructNFA(input ) : 
    stack_NFA = []
    for char in input : 
        if char.isalnum() or char == '~' : 
            stack_NFA.append(alphanumeric_nfa(char))  
        elif char == '*':
            stack_NFA.append(zero_or_more_nfa(stack_NFA.pop()))

        # elif char == '|': 
        # elif char == '?' :

    return stack_NFA 
def ThomsonsConstruction (input : str) : 
    preprocessed = preprocessing(input) 
    shunting_yard = shuntingYard(preprocessed)
    return constructNFA(shunting_yard)  
nfa_result = ThomsonsConstruction('c*')
for nfa in nfa_result:
    print(nfa) 

{
  "startingState": "S2",
  "S0": {
    "isTerminatingState": false,
    "c": "S1"
  },
  "S1": {
    "isTerminatingState": false,
    "~": "S3"
  },
  "S2": {
    "isTerminatingState": false,
    "~": "S3"
  },
  "S3": {
    "isTerminatingState": true
  }
}
