In [None]:
import re
import numpy as np

In [None]:
def tokenizeText(text):
    """Split English text into tokens with a small rule-based regex tokenizer.

    Rules, tried in this order at every position:
      1. Dotted abbreviations ending in a capital letter (e.g. U.S.A) stay
         one token.
      2. Internally hyphenated words (e.g. ice-cream) stay one token.
      3. Contractions are split into stem + clitic (isn't -> is, n't).
      4. Any other run of word characters is one token.
      5. Every remaining non-space symbol is emitted as its own token, so no
         punctuation or special character is silently dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    pattern = r"""
    (?:[A-Za-z]\.){2,}[A-Z]                         # abbreviation: U.S.A
    | \w+(?:-\w+)+                                  # internal hyphenation
    | \b([A-Za-z]+)(n't|'s|'ll|'em|'ve|'re|'d)\b    # contraction: stem + clitic
    | \b\w+\b                                       # plain word or number
    | [^\w\s]                                       # any other single symbol
    """
    tokens = []
    for match in re.finditer(pattern, text, flags=re.VERBOSE):
        # Groups 1 and 2 are only populated by the contraction alternative.
        if match.group(1):
            tokens.extend([match.group(1), match.group(2)])
        else:
            tokens.append(match.group(0))
    return tokens

# Sample passage covering contractions, abbreviations and hyphenated words.
text = 'Implement a simple rule-based Text tokenizer for the English language using regular expressions. Your tokenizer should consider punctuations and special symbols as separate tokens. Contractions like "isn\'t" should be regarded as 2 tokens - "is" and "n\'t". Also identify abbreviations (eg, U.S.A) and internal hyphenation (eg. ice-cream), as single tokens.'
# Collapse to a set so each distinct token is reported once.
tokens = set(tokenizeText(text))
print(tokens)

{'Implement', 'symbols', 'be', 'Text', '2', 'regular', 'for', 'should', 'tokenizer', 'expressions', 'regarded', 'consider', 'identify', ',', 'simple', 'special', '(', ')', 'tokens', 'language', 'n', 'hyphenation', 'English', 'separate', 'ice-cream', 'internal', 'Also', 'is', 'a', 'punctuations', 'the', 'using', "n't", 'single', 'and', 't', 'as', 'like', '.', 'Contractions', 'rule-based', 'eg', 'abbreviations', '"', 'U.S.A', 'Your'}


In [None]:
def isProperPlural(word):
    """Check whether ``word`` ends in a lowercase vowel followed by "ys".

    The word is scanned right to left by a three-state automaton:
    ``start`` expects "s", ``plural`` expects "y", and ``end-Y`` accepts only
    if the next letter is one of "aeiou". Words that are too short to reach
    the accepting state are rejected instead of raising ``IndexError``.

    Args:
        word: The candidate plural form.

    Returns:
        True if the last three characters are vowel + "y" + "s", else False.
    """
    vowels = "aeiou"
    state = "start"
    # Only the last three characters matter; slicing is safe on short input.
    for letter in reversed(word[-3:]):
        if state == "start":
            if letter != "s":
                return False
            state = "plural"
        elif state == "plural":
            if letter != "y":
                return False
            state = "end-Y"
        else:  # state == "end-Y": final decision on the letter before "ys"
            return letter in vowels
    # Input ran out before the accepting state was reached.
    return False

# Well-formed and malformed plural candidates to run through the checker.
test = [
    "boys", "toys", "ponies", "skies", "puppies",
    "boies", "toies", "ponys", "boykys",
]
# Map each candidate to the checker's verdict, preserving list order.
results = dict(zip(test, map(isProperPlural, test)))
print(results)


{'boys': True, 'toys': True, 'ponies': False, 'skies': False, 'puppies': False, 'boies': False, 'toies': False, 'ponys': False, 'boykys': False}


In [None]:
def pluralize(word):
    """Resolve a trailing "^s#" plural marker into its surface spelling.

    Applies the e-insertion spelling rule: when the stem ends in "x", "s",
    "z", "ch" or "sh", the marker becomes "es"; otherwise it becomes "s".
    Only the marker at the very end of the word is rewritten. Words without
    a trailing marker are returned unchanged.

    Args:
        word: A stem followed by the "^s#" marker, e.g. "fox^s#".

    Returns:
        The surface form, e.g. "foxes".
    """
    marker = "^s#"
    if not word.endswith(marker):
        return word
    stem = word[: -len(marker)]
    # Sibilant-final stems need an epenthetic "e" before the plural "s".
    if stem.endswith(("x", "s", "z", "ch", "sh")):
        return stem + "es"
    return stem + "s"

# Marked-up stems covering both the "es" and the plain "s" outcome.
test = ["fox^s#", "boy^s#", "bus^s#", "quiz^s#", "dog^s#"]
# Pair every marked-up stem with its resolved surface form.
result = dict(zip(test, map(pluralize, test)))
print(result)


{'fox^s#': 'foxes', 'boy^s#': 'boys', 'bus^s#': 'buses', 'quiz^s#': 'quizes', 'dog^s#': 'dogs'}


In [None]:
def editDistance(source, target, substitutionCost=2):
    """Compute the minimum edit distance between two strings and one edit script.

    Insertions and deletions cost 1. Substituting one character for another
    costs ``substitutionCost`` (2 by default; pass 1 for classic Levenshtein
    distance). Matching characters cost nothing.

    Args:
        source: The string to transform.
        target: The string to transform ``source`` into.
        substitutionCost: Positive cost of replacing one character by a
            different one.

    Returns:
        A tuple ``(distance, operations)``. ``operations`` is a list of
        human-readable steps recovered by walking the table back from its
        bottom-right cell and then reversing. Positions are 1-based: deletions
        and substitutions index into ``source``, insertions into ``target``.
    """
    sourceLength = len(source)
    targetLength = len(target)

    # distanceTable[i][j] = cost of turning source[:i] into target[:j].
    distanceTable = [[0] * (targetLength + 1) for _ in range(sourceLength + 1)]
    for i in range(sourceLength + 1):
        distanceTable[i][0] = i  # delete every character of the prefix
    for j in range(targetLength + 1):
        distanceTable[0][j] = j  # insert every character of the prefix

    for i in range(1, sourceLength + 1):
        for j in range(1, targetLength + 1):
            if source[i - 1] == target[j - 1]:
                distanceTable[i][j] = distanceTable[i - 1][j - 1]
            else:
                distanceTable[i][j] = min(
                    distanceTable[i][j - 1] + 1,                     # Insertion
                    distanceTable[i - 1][j] + 1,                     # Deletion
                    distanceTable[i - 1][j - 1] + substitutionCost,  # Substitution
                )

    # Backtrace: prefer a free match, then deletion, then insertion, and fall
    # back to substitution only when neither cheaper move explains the cell.
    i, j = sourceLength, targetLength
    operations = []
    while i > 0 or j > 0:
        if i > 0 and j > 0 and source[i - 1] == target[j - 1]:
            i, j = i - 1, j - 1
        elif i > 0 and distanceTable[i][j] == distanceTable[i - 1][j] + 1:
            operations.append(f"Delete '{source[i - 1]}' from position {i}")
            i -= 1
        elif j > 0 and distanceTable[i][j] == distanceTable[i][j - 1] + 1:
            operations.append(f"Insert '{target[j - 1]}' at position {j}")
            j -= 1
        else:
            operations.append(f"Substitute '{source[i - 1]}' with '{target[j - 1]}' at position {i}")
            i, j = i - 1, j - 1
    operations.reverse()
    return distanceTable[sourceLength][targetLength], operations
# Worked example: transform "exclusive" into "invasive".
source, target = "exclusive", "invasive"
distance, operands = editDistance(source, target)

# Report the total cost first, then the edit script one step per line.
print("Edit Distance:", distance)
print("Operations:")
for op in operands:
    print(op)


Edit Distance: 9
Operations:
Insert 'i' at position 1
Insert 'n' at position 2
Insert 'v' at position 3
Insert 'a' at position 4
Delete 'e' from position 1
Delete 'x' from position 2
Delete 'c' from position 3
Delete 'l' from position 4
Delete 'u' from position 5
