In [104]:
import json
import re
import pandas as pd
from enum import Enum

In [105]:
with open("courses.json", "r") as f: 
    data = json.load(f)

df = pd.DataFrame.from_records(data)

### Substitutions

In [106]:
parsed = []

useless_phrases = ["is strongly recommended", "is recommended", "Take", r"Prerequisite\(s\): "]

for text in df["prerequisites"].to_list():
    # Remove courses
    # temp = re.sub(r"\w{2,4}\*[0-9]{4}", "___", text)
    temp = text

    # Remove useless phrases
    temp = re.sub(r"|".join(useless_phrases), "", temp)

    parsed.append(temp)

print(len(set(parsed)))

set(parsed)

1124


{'',
 '(CIS*3750 or CIS*3760), (CIS*2460 or STAT*2040)',
 '10.00 credits including ONEH*2000',
 '12.50 credits including (1 of ANTH*2160, ANTH*2180, IDEV*2300, SOC*2080), (1 of IDEV*2100, SOAN*2120, WMST*3000)',
 'SART*2700',
 'THST*2050, THST*2190 and 2.00 credits at the 3000-level in Theatre Studies',
 'MCB*2050, PSYC*3270, (1 of NEUR*2000, NEUR*2100, PSYC*2410), (1 of BIOM*3200, HK*2810, ZOO*3600)',
 '15.00 credits including (STAT*2040 or STAT*2230), ZOO*2090',
 'SART*2200',
 'A minimum grade of 70% in MUSC*2410.',
 'CRWR*3100',
 'ENGG*1210, ENGG*1500, 0.50 credits in calculus',
 'HK*4371',
 'GERM*1100',
 '1 of ANTH*1150, IDEV*1000, SOC*1100',
 'BIOC*2580, ZOO*2090, (STAT*2040 or STAT*2230 )',
 'LARC*2230',
 'FRHD*2040',
 '9.00 credits including (2 of BIOL*1070, BIOL*1080, BIOL*1090)',
 'CTS*2000, CTS*2010, and CTS*3000',
 '(CIS*2030 or ENGG*3640), CIS*3110',
 'IES*1020',
 'ECON*3740',
 'FRHD*3400, NUTR*2050',
 'MCB*3010',
 '4.00 credits including (JLS*1000 or POLS*1150)',
 '10.00 c

In [107]:
# Tokenizer

class TokenType(Enum):
    N_OF = 1
    COURSE = 2
    AND = 3
    OR = 4
    OPEN_PAREN = 5
    CLOSE_PAREN = 6
    UNKNOWN = 7

class Token:
    def __init__(self, type: TokenType, value: int | str | None=None):
        self.type = type
        self.value = value
    
    def __repr__(self):
        if self.type == TokenType.UNKNOWN:
            return str(self.value)

        return f"-{self.type.name}({self.value | ''})-"

class Tokenizer:
    def __init__(self, text: str):
        self.text = text
        self.index = 0

        self.tokens = self.tokenize()

    def tokenize(self) -> list[Token]:
        tokens: Token = []

        while self.index < len(self.text):
            buffer = self._peak()

            # Parentheses
            if buffer == "[":
                tokens.append(Token(TokenType.OPEN_PAREN))
                self._next()
                continue
            if buffer == "]":
                tokens.append(Token(TokenType.CLOSE_PAREN))
                self._next()
                continue
            if buffer == "(":
                tokens.append(Token(TokenType.OPEN_PAREN))
                self._next()
                continue
            if buffer == ")":
                tokens.append(Token(TokenType.CLOSE_PAREN))
                self._next()
                continue

            # n of
            if buffer.isdigit():
                buffer += self._peak(1, 3)
                if re.match(r"[0-9] of", buffer):
                    tokens.append(Token(TokenType.N_OF, int(buffer[0])))
                    self._next(len(buffer))
                    continue

            # Default
            tokens.append(Token(TokenType.UNKNOWN, buffer))
            self._next(len(buffer))
    
        return tokens


    def _peak(self, n=0, length=1):
        return self.text[self.index + n: self.index + n + length + 1]
    
    def _next(self, n=1):
        self.index += n

In [109]:
for text in parsed[1]:
    print(Tokenizer(text).tokens)
    break

KeyboardInterrupt: 