In [16]:
import os
import pickle

import pygtrie

# Get the path of the script
# script_path = os.path.abspath(__file__)

# Base directory for the user dictionary. It is hard-coded to the current
# working directory; the __file__-based variant is kept commented out.
ROOT_DIR = '.'
# ROOT_DIR = os.path.dirname(script_path)

# Path of the pickled trie that saveDict writes and the loader below reads.
dict_path = ROOT_DIR + '/dict.pkl'

def saveDict():
	"""Pickle the module-level ``trie`` into ``dict_path``, overwriting the file."""
	with open(dict_path, 'wb') as dict_file:
		pickle.dump(trie, dict_file)

# First run: there is no dictionary file yet, so create an empty character
# trie and write it out; the load below can then always open the file.
if not os.path.exists(dict_path):
	trie = pygtrie.CharTrie()
	saveDict()

# Load the trie from the file using pickle
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a dict.pkl that this notebook wrote itself.
with open(dict_path, 'rb') as f:
	trie = pickle.load(f)

def add_word(word:str) -> None:
	"""Store ``word`` in the module-level trie and immediately persist the trie.

	The stored value is always ``True``; only key membership is used by
	``check_word``.
	"""
	trie[word] = True
	saveDict()
	
def check_word(word:str) -> bool:
	"""Return whether ``word`` is a key of the module-level trie."""
	is_known = word in trie
	return is_known

def del_word(word:str) -> None:
	"""Remove ``word`` from the module-level trie and persist the change.

	Deleting a word that is not in the trie is a silent no-op. Only the
	missing-key error is swallowed; any other failure propagates.
	"""
	try:
		del trie[word]
	except KeyError:
		# The word was never stored, so there is nothing to remove or save.
		return
	# Mirror add_word: without saving, the deleted word would reappear the
	# next time the pickle is loaded.
	saveDict()

In [4]:
from itertools import chain
from math import log2
import regex as re
import sys
from statistics import mode 


# Dataclass and typing helpers
from dataclasses import dataclass, field
from typing import Callable, Generator, List, Optional, Tuple

# from .UDict import add_word, check_word 
import Levenshtein
from language_tool_python import LanguageTool
from icecream import ic

# Initialize the LanguageTool tool
# A single en-US checker instance is created here and reused by normal_word,
# spell_word and normalize.
lang_tool = LanguageTool('en-US', config={ 'cacheSize': 1000, 'pipelineCaching': True })

# Get the path of the script
script_path = '.'
# script_path = os.path.abspath(__file__)

# ROOT_DIR = os.path.dirname(script_path)
# Base directory of the project data; parseRules reads from ROOT_DIR + '/rules' by default.
ROOT_DIR = '.'

def parseRules(name, ROOT_DIR=ROOT_DIR+'/rules') -> Generator[Tuple[str, str], None, None]:
   """Yield the first two tab-separated columns of every usable line of a rule file.

   Args:
      name: file name without the ``.tsv`` extension.
      ROOT_DIR: directory that contains the rule file.

   Yields:
      ``(first_column, second_column)`` pairs. Each line is stripped first;
      empty lines and lines with fewer than two columns are skipped, and
      columns after the second are ignored.
   """
   rules_path = ROOT_DIR + f'/{name}.tsv'
   with open(rules_path, 'r') as rules_file:
      for raw_line in rules_file:
         stripped = raw_line.strip()
         if not stripped:
            continue
         columns = stripped.split('\t')
         if len(columns) > 1:
            yield (columns[0], columns[1])
def compile_first(x:Tuple[str,str])->Tuple[re.Pattern[str],str]:
   """Compile the pattern half of a ``(pattern, replacement)`` rule.

   Args:
      x: pair whose first item is a regular-expression source and whose
         second item is the replacement text.

   Returns:
      ``(compiled_pattern, replacement)``.

   Raises:
      ValueError: if the rule cannot be compiled. The original error is
         attached as ``__cause__`` so the failing construct stays visible.
   """
   try:
      return (re.compile(x[0]),x[1])
   except Exception as exc:
      # Catch Exception rather than everything, so Ctrl-C is not turned
      # into a ValueError while a long rule file is being compiled.
      raise ValueError(f'uncompilable rule {x}') from exc
# Compiled (pattern, replacement) rule tables, loaded once from the .tsv rule files.
# The *_CORRECTION_RULES tables are used by corrections()/word_we_misspelled();
# WORD_RULES and KEYBOARD_RULES are used by the *_rules_scan functions.
WORD_CORRECTION_RULES = [
   compile_first(rule)
   for rule in chain(parseRules('anti.variant'), parseRules('anti.misspelling'))
]
KEYBOARD_CORRECTION_RULES = [compile_first(rule) for rule in parseRules('anti.keyboard')]
FAT_CORRECTION_RULES = [compile_first(rule) for rule in parseRules('fat.keyboard')]
WORD_RULES = [
   compile_first(rule)
   for rule in chain(parseRules('variant'), parseRules('grammatical'), parseRules('misspelling'))
]
KEYBOARD_RULES = [compile_first(rule) for rule in parseRules('keyboard')]

In [5]:
class FileWriter:
	"""Minimal append-mode text sink that flushes after every write.

	It offers the ``write``/``flush`` pair needed to stand in for
	``sys.stdout``, plus ``close`` and context-manager support so the
	underlying file handle can be released.
	"""

	def __init__(self, file_path):
		# Append mode: repeated runs accumulate in the same file.
		self.file = open(file_path, 'a')

	def write(self, msg):
		"""Write ``msg``, flush immediately, and return the number of characters written."""
		written = self.file.write(msg)
		self.flush()
		return written

	def flush(self):
		"""Flush the underlying file."""
		self.file.flush()

	def close(self):
		"""Close the underlying file; calling it more than once is harmless."""
		if not self.file.closed:
			self.file.close()

	def __enter__(self):
		return self

	def __exit__(self, exc_type, exc_value, traceback):
		self.close()

In [6]:
from typing import List, Tuple, Callable

class StringSpans:
   """Character-offset tokenization of a string.

   All span lists hold half-open ``(start, end)`` offsets into ``string``:

   * ``spans``      - every token in order; together they cover the whole string.
   * ``words``      - runs of ASCII letters, apostrophes and hyphens.
   * ``non_words``  - tokens that are neither words nor whitespace.
   * ``non_spaces`` - every token except the single-character whitespace tokens.
   """

   # Compiled once for the class instead of on every _get_words call.
   _WORD_REGEX = re.compile(r'[a-zA-Z\'\-]+')
   _SPACE_REGEX = re.compile(r'\s')

   def __init__(self, string = None):
      # With string=None an empty shell is created and the caller fills in
      # the attributes itself (replace_word_StringSpans relies on this).
      if string is not None:
         self.string = string
         self._set_spans(string)

   @staticmethod
   def _get_words(text: str,verbose=False) -> Tuple[List[Tuple[int,int]],List[Tuple[int,int]],List[Tuple[int,int]],List[Tuple[int,int]]]:
      """Tokenize ``text`` and return ``(spans, words, non_words, non_spaces)``.

      ``verbose`` is accepted for interface compatibility and is not used.
      """
      space_regex = StringSpans._SPACE_REGEX
      words = [match.span() for match in StringSpans._WORD_REGEX.finditer(text)]
      spaces = [match.span() for match in space_regex.finditer(text)]
      matched = sorted(words + spaces)

      # Walk the matched tokens in order and insert a filler token for every
      # stretch of text that is neither a word nor whitespace.
      result = []
      last = 0
      for start, end in matched:
         if start != last:
            result.append((last,start))
         result.append((start,end))
         last = end
      if last != len(text):
         result.append((last,len(text)))

      # Sets make the membership tests constant-time instead of scanning a list.
      word_set = set(words)
      space_set = set(spaces)
      non_words = [
         (start,end) for start,end in result
         if (start,end) not in word_set and space_regex.match(text[start:end]) is None
         ]
      non_spaces = [
         (start,end) for start,end in result
         if (start,end) not in space_set
         ]

      return result, words, non_words, non_spaces

   def _set_spans(self, string:str):
      """Tokenize ``string`` and store the four span lists on the instance."""
      spans, words,non_words, non_spaces = StringSpans._get_words(string)
      self.words = words
      self.spans = spans
      self.non_words = non_words
      self.non_spaces = non_spaces

   def _word_span(self, word_index: int) -> Tuple[int,int]:
      """Return the span of word ``word_index`` or raise ValueError if out of range."""
      if not (0 <= word_index < len(self.words)):
         raise ValueError(f"Invalid word index: {word_index}")
      return self.words[word_index]

   def replace_word(self, word_index: int, replacement: str) -> str:
      """Return a copy of the string with word ``word_index`` replaced.

      Raises:
         ValueError: if ``word_index`` is out of range.
      """
      start, end = self._word_span(word_index)
      return self.string[:start] + replacement + self.string[end:]

   def get_word(self, word_index: int) -> str:
      """Return the text of word ``word_index``.

      Raises:
         ValueError: if ``word_index`` is out of range.
      """
      start, end = self._word_span(word_index)
      return self.string[start:end]

   def get(self,span):
      """Return the text covered by ``span`` (a ``(start, end)`` pair)."""
      return self.string[span[0]:span[1]]

   def replace_word_StringSpans(self, word_index: int, replacement: str):
      """Return a new StringSpans for the string with word ``word_index`` replaced.

      The span lists are derived by shifting the existing spans instead of
      re-tokenizing: spans that end at or before the word keep their offsets,
      the word itself takes the replacement's length, and later spans move
      by the length difference.
      """
      ss = StringSpans()
      ss.string = self.replace_word(word_index, replacement)
      word_start, word_end = self.words[word_index]
      new_len = len(replacement)
      word_len_diff = new_len - (word_end - word_start)
      def shift(span):
         start,end = span
         if (start,end) == (word_start,word_end):
            return (word_start,word_start+new_len)
         # `<=`: a token that ends exactly where the word starts (for
         # example the space in front of it) must not move.
         if end <= word_start:
            return (start,end)
         return (start+word_len_diff,end+word_len_diff)
      ss.words = [shift(span) for span in self.words]
      ss.spans = [shift(span) for span in self.spans]
      ss.non_words = [shift(span) for span in self.non_words]
      ss.non_spaces = [shift(span) for span in self.non_spaces]
      return ss

   def get_words(self):
      """Return the text of every word, in order."""
      return [self.string[start:end] for start,end in self.words]

   def isw(self):
      """Iterate ``(index, start, end, word_text)`` for every word."""
      return ((i,start,end,self.string[start:end]) for i,(start,end) in enumerate(self.words))

   def iws(self):
      """Iterate ``(start, end, word_text)`` for every word."""
      return ((start,end,self.string[start:end]) for (start,end) in self.words)

In [7]:
def count_uppercase_letters(s: str) -> int:
   """Count the characters of ``s`` that are not lowercase ASCII letters.

   Despite the name, every character outside ``a``-``z`` is counted, so
   digits, punctuation and non-ASCII characters count as well as uppercase
   letters.
   """
   return sum(1 for ch in s if not ('a' <= ch <= 'z'))
def normal_word(word:str)->bool:
   """Return True when LanguageTool leaves ``word`` unchanged or the user dictionary contains it.

   The dictionary is only consulted when LanguageTool would change the word.
   """
   if lang_tool.correct(word) == word:
      return True
   return check_word(word)
def string_mutation_distance(str1: str, str2: str) -> int:
   """Return the Levenshtein distance between ``str1`` and ``str2``."""
   edit_count = Levenshtein.distance(str1, str2)
   return edit_count
def show_diff(a: str, b: str):
   """Print every position at which the word lists of ``a`` and ``b`` differ.

   Words are compared pairwise by index; the comparison stops at the end of
   the shorter word list.
   """
   words_a = StringSpans(a).get_words()
   words_b = StringSpans(b).get_words()
   for position, (word_a, word_b) in enumerate(zip(words_a, words_b)):
      if word_a != word_b:
         print(f'i:{position} a:"{word_a}" b:"{word_b}"')
def diff(a: str, b: str) -> List[Tuple[str,str]]:
   """Return the ``(word_of_a, word_of_b)`` pairs that differ at the same word index.

   Only the first ``min(len(words_a), len(words_b))`` positions are compared.
   """
   words_a = StringSpans(a).get_words()
   words_b = StringSpans(b).get_words()
   return [(word_a, word_b) for word_a, word_b in zip(words_a, words_b) if word_a != word_b]
def apply_match(text:str, match_result: Tuple[Tuple[int,int],str,re.Pattern], verbose: bool = False) -> str:
   """Apply one rule to a slice of ``text`` and return the rebuilt string.

   Args:
      text: string to modify.
      match_result: ``(span, replacement, pattern)``; the pattern's ``sub``
         is run only on ``text[span[0]:span[1]]``.
      verbose: print the text before and after the replacement.

   Returns:
      ``text`` with the slice replaced by the substitution result.
   """
   bounds, replacement, pattern = match_result
   begin, finish = bounds[0], bounds[1]
   if verbose:
      print(f"Before replace: {text}")
   rewritten_slice = pattern.sub(replacement, text[begin:finish])
   after_replace_text = ''.join([text[:begin], rewritten_slice, text[finish:]])
   if verbose:
      print(f"After replace: {after_replace_text}")
   return after_replace_text
def keyboard_rules_scan(text: str)->List[Tuple[Tuple[int, int], str, re.Pattern]]:
   """Collect every (overlapping) match of every KEYBOARD_RULES pattern in ``text``.

   Returns:
      ``(span, replacement, pattern)`` triples, grouped by rule in table order.
   """
   return [
      (hit.span(), replacement, pattern)
      for pattern, replacement in KEYBOARD_RULES
      for hit in pattern.finditer(text, overlapped=True)
   ]
def word_rules_scan(text: str)->List[Tuple[Tuple[int, int], str, re.Pattern]]:
   """Collect the WORD_RULES patterns that match at the start of ``text``.

   Returns:
      one ``(span, replacement, pattern)`` triple per matching rule, in
      table order.
   """
   found = []
   for pattern, replacement in WORD_RULES:
      hit = pattern.match(text)
      if hit is None:
         continue
      found.append((hit.span(), replacement, pattern))
   return found
def rules_scan(text: str)-> List[Tuple[Tuple[int, int], str, re.Pattern]]:
   """Return all word-rule and keyboard-rule matches in ``text``, ordered by span.

   Matches are sorted by ``(span, replacement)``. The compiled pattern is
   deliberately left out of the sort key: patterns do not support ``<``, so
   a plain tuple sort raises TypeError as soon as two matches share both
   span and replacement. Such ties keep their scan order.
   """
   result = word_rules_scan(text) + keyboard_rules_scan(text)
   result.sort(key=lambda slot: (slot[0], slot[1]))
   return result
def expand_span_to_word(words:List[Tuple[int,int]],span:Tuple[int,int])->Tuple[Tuple[int,int],Tuple[int,int],int]:
   """Map a match span onto a word span.

   Args:
      words: ``(start, end)`` spans of the words, in text order.
      span: ``(start, end)`` span of the match.

   Returns:
      ``(word_span, relative_span, word_index)`` where ``relative_span`` is
      the part of ``span`` that lies inside the word, expressed relative to
      the word start. It is always a valid range within the word: it is
      empty when the span does not actually touch the selected word.

   Raises:
      ValueError: if no word is selected for the span.
   """
   ss, se = span
   # Preferred case: the span lies completely inside one word.
   for i, (start,end) in enumerate(words):
      if start <= ss and se <= end:
         return (start,end),(ss-start,se-start),i
   # Fallback: first word for which the span starts before it and ends no
   # later than its end, or starts no earlier than its start and ends after it.
   for i, (start,end) in enumerate(words):
      if (start > ss and se <= end) or (start <= ss and se > end):
         # Clip both ends into the word. This yields (0, se-start) for a
         # span that begins before the word, not the absolute word start.
         rel_start = min(max(ss, start), end) - start
         rel_end = min(max(se, start), end) - start
         return (start,end),(rel_start,max(rel_end, rel_start)),i

   raise ValueError(f'sth is wrong {words} {span}')
def valid_matches(text:str, slots:List[Tuple[Tuple[int, int], str, re.Pattern]], verbose=False):
   """Filter rule matches down to those that yield a unique, recoverable typo.

   A slot is kept when applying it turns a normal word into a non-normal
   word that keeps the same first and last letter (case-insensitive) and
   whose most common correction is the original word. A slot is dropped when
   it produces the same mutated text as an earlier slot, or when its mutated
   word has no corrections or is empty.

   Args:
      text: the original text.
      slots: ``(span, replacement, pattern)`` candidates.
      verbose: print diagnostics.

   Returns:
      The accepted slots, in their original order.
   """
   texas = StringSpans(text)
   mutations: List[str] = [''] * len(slots)

   # Apply the match to the text and get the resulting strings
   for match_index, match_result in enumerate(slots):
      span, repl, regex = match_result
      ex_span,relative_span,ex_span_index = expand_span_to_word(texas.words,span)
      old_word = texas.get(ex_span)
      new_word = apply_match(old_word,(relative_span,repl,regex),verbose)
      new_word_corrections = corrections(new_word,verbose=verbose)
      # statistics.mode raises on an empty list; no votes means the typo
      # cannot be corrected back, so the slot is simply not usable.
      new_word_corrections_mode = mode(new_word_corrections) if new_word_corrections else None
      if new_word \
            and normal_word(old_word) \
            and not normal_word(new_word) \
            and new_word[0].lower() == old_word[0].lower() \
            and new_word[-1].lower() == old_word[-1].lower() \
            and new_word_corrections_mode == old_word:
         mutations[match_index] = texas.replace_word(ex_span_index,new_word)
      else:
         if verbose:
            print(f'rule undetectable or modify looks! new word "{new_word}" != "{old_word}" original and will be corrected to {new_word_corrections_mode} from {new_word_corrections}')


   # Print the list of matches and their mutations if verbose output is enabled
   if verbose:
      print('\nlist(zip(slots,mutations))'+('%'*20))
      for v in list(zip(slots, mutations)):
         print(v)

   # Keep a slot only if it produced a mutation that no earlier slot produced.
   seen_mutations = set()
   valid_slots = []
   for slot, new_string in zip(slots, mutations):
      if not new_string or new_string in seen_mutations:
         continue
      seen_mutations.add(new_string)
      valid_slots.append(slot)

   return valid_slots
def valid_rules_scan(text:str,verbose=False):
   """Scan ``text`` with all rules and return only the slots accepted by valid_matches.

   With ``verbose`` the proposed and the accepted slots are printed.
   """
   candidates = rules_scan(text)
   if verbose:
      print('proposed_slots: ',candidates)
   accepted = valid_matches(text,candidates,verbose=verbose)
   if verbose:
      print('valid_slots: ')
      for slot in accepted:
         print(slot)
   return accepted
def chunker(text:str,span_size = 6) -> List[Tuple[int,int]]:
   """Split ``text`` into character ranges of roughly ``span_size`` words each.

   Args:
      text: text to split.
      span_size: number of words per chunk.

   Returns:
      ``(start, end)`` character ranges. A text with fewer than
      ``span_size`` words is returned as the single range
      ``(0, len(text))``. Otherwise each chunk starts at the first
      character of its first word (the first chunk starts at 0) and ends at
      the end of its last word; the words left over after the last full
      boundary are folded into the final chunk, which therefore ends with
      the last word of the text.
   """
   words = StringSpans(text).words
   if len(words) < span_size:
      return [(0,len(text))]
   chunks = []
   last_start = 0
   for i in range(span_size,len(words),span_size):
      chunks.append((last_start,words[i-1][1]))
      last_start = words[i][0]

   # The last chunk ends with the last word. Chunks are tuples, so the final
   # one is rebuilt rather than assigned into; with exactly span_size words
   # the loop adds nothing and the single chunk is created here.
   final_end = words[-1][1]
   if chunks:
      chunks[-1] = (chunks[-1][0], final_end)
   else:
      chunks.append((last_start, final_end))
   return chunks
def word_we_misspelled(word:str,spelling:str,verbose=False):
   """Decide whether ``word`` looks like a typo this engine created from ``spelling``.

   The word qualifies only when all of the following hold:

   * it is exactly one edit away from ``spelling``;
   * it shares the first and last letter with it (case-insensitive);
   * ``count_uppercase_letters(word)`` is exactly 2 and smaller than the
     word length;
   * at least one FAT_CORRECTION_RULES rule rewrites ``word`` into ``spelling``.

   Returns:
      True when every condition holds, otherwise False.
   """
   if not word or not spelling:
      # Nothing to compare; this also protects the [0] / [-1] lookups below.
      return False
   uls = count_uppercase_letters(word)
   plausible = string_mutation_distance(spelling,word) == 1 \
     and spelling[0].lower() == word[0].lower() \
     and spelling[-1].lower() == word[-1].lower() \
     and uls == 2 \
     and uls < len(word)
   if not plausible:
      return False # speller is wrong since input is ai generated and the only source for bad spelling is us and it's probably a name of sth

   for regex,repl in FAT_CORRECTION_RULES:
      corrected = regex.sub(repl,word)
      # A rule must actually produce the suggested spelling. Testing `!=`
      # here would accept the word on the first rule that does not apply.
      if corrected == spelling:
         if verbose:
            print(f"FAT_CORRECTION_RULES ({regex}) ({repl}): {corrected} == {spelling}")
         return True
   return False
def spell_word(word:str,verbose=False) -> str:
   """Return the preferred spelling of ``word``.

   A normal word is returned unchanged. Otherwise the first replacement of
   LanguageTool's first match is returned, but only if word_we_misspelled
   accepts it. In every other case the word itself is returned, including
   when LanguageTool reports no match or offers no replacement.
   """
   if normal_word(word):
      return word
   matches = lang_tool.check(word)
   # Both lists may be empty: a word can be flagged without any suggestion.
   if not matches or not matches[0].replacements:
      return word
   spelling = matches[0].replacements[0]
   if spelling is None:
      return word
   return spelling if word_we_misspelled(word,spelling,verbose) else word
def normalize(text:str,verbose=False,learn=False):
   """Return ``text`` with the typos that can be attributed to this engine corrected.

   The text is split into chunks with chunker, and LanguageTool reports the
   offsets of suspicious spots. Per chunk:

   * more than one flagged offset: the chunk is left untouched (with
     ``learn`` the flagged words are added to the user dictionary);
   * exactly one flagged word without any correction vote: left untouched
     (with ``learn`` the word is added to the user dictionary);
   * exactly one flagged word with votes: that occurrence is replaced by the
     most common vote.

   Flagged offsets that do not coincide with the start of a word are ignored.

   Args:
      text: text to normalize.
      verbose: print diagnostics.
      learn: add uncorrectable flagged words to the user dictionary.

   Returns:
      The corrected text (equal to ``text`` when nothing was replaced).
   """
   chunks = chunker(text)
   if verbose:
      print(f'chunks={chunks}')
   offsets = [x.offset for x in lang_tool.check(text)]
   if verbose:
      print(f'offsets={offsets}')
   chunks_offsets = [
      [o for o in offsets if chunk_start <= o < chunk_end]
      for chunk_start,chunk_end in chunks]
   if verbose:
      print(f'chunks_offsets={chunks_offsets}')

   # Key the flagged words by their start offset. Looking them up by their
   # position in `offsets` would be wrong, because not every offset is the
   # start of a word and the two sequences are therefore not aligned.
   flagged = set(offsets)
   affected_words = {s: text[s:e] for s,e in StringSpans(text).words if s in flagged}
   if verbose:
      print(f"affected_words={list(affected_words.values())}")
   affected_words_corrections = {o: corrections(w) for o,w in affected_words.items()}
   if verbose:
      print(f"affected_words_corrections={list(affected_words_corrections.values())}")

   replacements = []
   for chunk_offsets in chunks_offsets:
      if verbose:
         print(f'os={chunk_offsets}')
      if len(chunk_offsets) > 1:
         if verbose:
            print(f'len({chunk_offsets})={len(chunk_offsets)} > 1')
         if learn:
            for o in chunk_offsets:
               if o in affected_words:
                  add_word(affected_words[o])
         continue
      if not chunk_offsets or chunk_offsets[0] not in affected_words:
         continue
      offset = chunk_offsets[0]
      typo = affected_words[offset]
      cs = affected_words_corrections[offset]
      if not cs:
         if verbose:
            print(f'no suggestions for {typo} added to dict')
         if learn:
            add_word(typo)
         continue
      suggestion = mode(cs)
      if verbose:
         print(f'typo={typo}\nsuggestion={suggestion}')
         print(f'votes={cs}')
      replacements.append((offset, offset + len(typo), suggestion))

   # Replace exactly the flagged occurrences, from the end of the text
   # backwards so that earlier offsets stay valid after a length change.
   to_be_original = text
   for start, end, suggestion in sorted(replacements, reverse=True):
      to_be_original = to_be_original[:start] + suggestion + to_be_original[end:]

   return to_be_original
def corrections (typo,verbose=False):
   """Collect the candidate corrections ("votes") for ``typo``.

   Votes are gathered in this order and may contain duplicates:

   1. the spell_word suggestion, if it is one edit away and a normal word;
   2. every overlapping FAT_CORRECTION_RULES match applied to the typo;
   3. every WORD_CORRECTION_RULES pattern matching at the start, substituted;
   4. every overlapping KEYBOARD_CORRECTION_RULES match applied to the typo.

   Returns:
      The votes that are normal words, in the order gathered.
   """
   suggestion = spell_word(typo)
   votes = []
   if string_mutation_distance(suggestion,typo) == 1 and normal_word(suggestion):
      votes.append(suggestion)

   for pattern, replacement in FAT_CORRECTION_RULES:
      for hit in pattern.finditer(typo,overlapped=True):
         votes.append(apply_match(typo,(hit.span(), replacement, pattern)))

   for pattern, replacement in WORD_CORRECTION_RULES:
      if pattern.match(typo) is None:
         continue
      votes.append(pattern.sub(replacement,typo))

   for pattern, replacement in KEYBOARD_CORRECTION_RULES:
      for hit in pattern.finditer(typo,overlapped=True):
         votes.append(apply_match(typo,(hit.span(), replacement, pattern)))

   if verbose:
      print(f'unfiltered votes {votes}')
   votes = [vote for vote in votes if normal_word(vote)]
   if verbose:
      print(f'filtered votes {votes}')
   return votes

In [None]:
@dataclass
class Typo:
   """Class for Typo Engine.

   Values are hidden in a text by applying rule matches ("slots") to it. The
   text is divided into chunks; ``spaces[i]`` is the number of slots in
   chunk ``i``. A value ``v`` for chunk ``i`` means: ``0`` leaves the chunk
   untouched, ``1..spaces[i]`` applies the ``v``-th slot of that chunk.
   """

   # The text to encode into; validated in __post_init__.
   text: str = field(repr=False)
   # Declared but never assigned by this class.
   _length: int = field(init=False, repr=False)
   # Cache for the `slots` property; may also be supplied by the caller.
   _slots: Optional[List[Tuple[Tuple[int, int], str, re.Pattern]]] = field(init=True, repr=False,default=None)
   # Cache for the `spaces` property; may also be supplied by the caller.
   _spaces: Optional[List[int]] = field(init=True, repr=False,default=None)
   # Print diagnostics while working.
   verbose: bool = field(init=True,repr=False,default=False)

   def __post_init__(self):
      # Reject texts that normalize() would change.
      if self.text != normalize(self.text,self.verbose):
         raise ValueError("Text isn't spelled correctly")

   def apply(self, space: int, offset: int, text: str) -> str:
      """Apply slot number ``offset`` of chunk ``space`` to ``text``.

      ``offset == 0`` returns ``text`` unchanged.
      """
      if self.verbose:
         print(f"apply: space={space}, offset={offset}, text={text}")
      if offset == 0:
         return text
      # NOTE(review): this index assumes every slot was counted in some
      # chunk by `spaces`; a slot outside all chunks would shift it - confirm.
      match_tuple = self.slots[sum(self.spaces[0:space]) + offset - 1]
      applied = apply_match(text, match_tuple,self.verbose)
      if self.verbose:
         print(f"applied: {applied}")
      return applied

   @property
   def slots(self):
      """The valid rule matches of the text, computed on first use."""
      if self._slots is None:
         self._slots = valid_rules_scan(self.text,self.verbose)
      return self._slots

   @property
   def length(self) -> int:
      """Number of slots."""
      return len(self.slots)

   @length.setter
   def length(self, length: int):
      # Assignment is accepted and ignored.
      pass

   @property
   def spaces(self) -> List[int]:
      """Number of slots per chunk, computed on first use and then cached."""
      if self._spaces is not None:
         return self._spaces

      sentence_ranges = chunker(self.text)

      # Initialize an empty list of buckets
      buckets: List[int] = [0 for _ in sentence_ranges]

      # Count each slot in the first chunk that fully contains its span
      for start, end in (span for span,_,_ in self.slots):
         for j, (sent_start, sent_end) in enumerate(sentence_ranges):
            if sent_start <= start < sent_end and sent_start < end <= sent_end:
               buckets[j] += 1
               break
      # Cache the result: apply() reads this once per encoded value, and
      # recomputing it re-tokenizes the text every time.
      self._spaces = buckets
      return buckets

   @spaces.setter
   def spaces(self, value):
      # Assignment is accepted and ignored.
      pass

   @property
   def bits(self):
      """Per chunk, the integer part of ``log2(spaces + 1)``."""
      return [int(log2(space + 1)) for space in self.spaces]

   @bits.setter
   def bits(self, bits: int):
      # Assignment is accepted and ignored.
      pass

   def encode(self, values:List[int]):
      """Return the text with ``values[i]`` encoded into chunk ``i``.

      Raises:
         ValueError: if there are more values than chunks, or a value is
            outside ``0..spaces[i]``.
      """
      spaces = self.spaces
      if len(values) > len(spaces):
         raise ValueError("Can't encode")
      for i in range(len(values)):
         # 0 means "no change" and 1..spaces[i] select a slot, so
         # spaces[i] itself is a legal value.
         if not 0 <= values[i] <= spaces[i]:
            raise ValueError("Won't fit")
      result = self.text
      # Work from the last chunk to the first.
      for i in range(len(values) - 1, -1, -1):
         result = self.apply(i, values[i], result)
      return result

   @staticmethod
   def decode(text:str,verbose=False,test_self=None) -> Tuple[str,List[int]]:
      """Recover ``(original_text, values)`` from an encoded ``text``.

      When ``test_self`` is given, the recovered original must equal
      ``test_self.text`` (checked with an assert) and ``test_self`` is used
      for the value search.
      """
      original = normalize(text)
      if test_self is not None:
         if original != test_self.text and verbose:
            print(f'original=\n{original}')
            print(f'test_self.text=\n{test_self.text}')
         assert original == test_self.text
      t = Typo(original)

      return original, t._decode(text,test_self)

   def _decode(self, text:str,test=None) -> List[int]:
      """Search, chunk by chunk, for the value whose encoding removes one word difference."""
      a_self = test if test is not None else self
      spaces = a_self.spaces
      cnt = len(diff(text,a_self.text))
      if a_self.verbose:
         print(f'cnt={cnt}')
         print(f'diff(text,a_self.text)={diff(text,a_self.text)}')
      values = [0 for s in spaces]
      for index, space in enumerate(spaces):
         isZero = True
         # Try every legal value, including `space` itself (see encode).
         for i in range(space + 1):
            values[index] = i
            dif = diff(text, a_self.encode(values))
            if len(dif) == cnt - 1:
               if a_self.verbose:
                  print(f'values={values}')
                  print(f'dif={dif}')
               cnt -= 1
               isZero = False
               break
         if isZero:
            values[index] = 0
            if a_self.verbose:
               print(f'chunk is empty values={values}')
      return  values

   def encode_encoder(self, bytes_str: str) -> Tuple[List[int], str]:
      """Cut a string of ``0``/``1`` characters into one value per chunk.

      For a chunk with ``b`` bits, ``b + 1`` bits are consumed when enough
      bits remain and their binary value does not exceed the chunk's slot
      count; otherwise ``b`` bits are consumed (if ``b > 0`` and enough bits
      remain); otherwise the value is 0 and nothing is consumed.

      Returns:
         ``(values, remaining_bits)``.

      Raises:
         ValueError: if ``bytes_str`` contains characters other than 0 and 1.
      """
      if not set(bytes_str) <= set('01'):
         raise ValueError(f"bytes_str isn't a bytes string : '{bytes_str}'")
      spaces = self.spaces
      bit_values = []
      remaining_bits = bytes_str
      for i, val in enumerate(self.bits):
         # The prefix must be read as base 2; plain int() reads it as decimal.
         if len(remaining_bits) >= val + 1 and int(remaining_bits[:val+1], 2) <= spaces[i]:
            bit_values.append(int(remaining_bits[:val+1], 2))
            remaining_bits = remaining_bits[val+1:]
         elif len(remaining_bits) >= val and val > 0:
            bit_values.append(int(remaining_bits[:val], 2))
            remaining_bits = remaining_bits[val:]
         else:
            bit_values.append(0)
      return bit_values, remaining_bits

In [None]:
# NOTE(review): Typo declares `text` as a required field, so this call raises
# TypeError as written; pass the text to analyse as the first argument.
dummy = Typo()

In [None]:
dummy.spaces

[36, 73, 60, 14, 90, 49]

In [None]:
dummy.verbose = False

In [None]:
def testTypoInstance(t,verbose=False,testName='test'):
   """Round-trip a few value vectors through encode/decode, logging to ``<testName>.txt``.

   While the test runs, ``sys.stdout`` is redirected to the log file; it is
   restored and the file is closed even if the test raises.

   Args:
      t: a Typo instance, or a string from which one is built.
      verbose: passed to Typo when ``t`` is a string.
      testName: base name of the log file.

   Returns:
      The Typo instance that was tested.
   """
   # patch
   ORIGINAL_STDOUT = sys.stdout
   log = FileWriter(testName+'.txt')
   sys.stdout = log
   try:
      if isinstance (t,str):
         t = Typo(t,verbose=verbose)
      spaces = t.spaces
      largest = max(spaces, default=0)

      print(f"t.spaces = {spaces}")
      print(f"t.bits = {t.bits}")
      print(f"max={largest}")
      print(f"len={len(spaces)}")

      # A chunk without slots can only carry 0; `i % 0` would raise.
      g = ([i % x if x else 0 for x in spaces] for i in range(largest))
      cnt = 10
      for v in g:
         print(f'{v}')
         encoded = t.encode(v)
         print(f"after encoding {v} {encoded}")
         org, x = Typo.decode(encoded,test_self=t)
         print(f'original text candidate "{org}"')
         if org == t.text and x == v:
            print(('>'*100)+" passed!")
         if not x == v:
            print(f'\nt.decode(t.encode(v)):{x}')
            print(f't.text:{t.text}')
         if cnt < 0:
            break
         cnt -=1
   finally:
      # restore
      sys.stdout = ORIGINAL_STDOUT
      log.file.close()
   return t

In [None]:
# Sample inputs for testTypoInstance, ordered from a short two-sentence text
# to a multi-sentence paragraph.
testTexts = [
   '''Hey, How are you? Did you see the last John Cena movie?'''
,'''Hi, How are you?'''
,'However, you may as well just use a function statement instead; the only advantage that a lambda offers is that you can put a function definition for a simple expression inside a larger expression.'
, '''However, you may as well just use a function statement instead; the only advantage that a lambda offers is that you can put a function definition for a simple expression inside a larger expression. But the above lambda is not part of a larger expression, it is only ever part of an assignment statement, binding it to a name. That's exactly what a statement would achieve.'''
, '''I’ve toyed with the idea of using GPT-3’s API to add much more intelligent capabilities to RC, but I can’t deny that I’m drawn to the idea of running this kind of language model locally and in an environment I control. I’d like to someday increase the speed of RC’s speech synthesis and add a speech-to-text translation model in order to achieve real-time communication between humans and the chatbot. I anticipate that with this handful of improvements, RC will be considered a fully-fledged member of our server. Often, we feel that it already is.'''
]
# NOTE(review): the disabled call below refers to `text`, which is not defined
# in this cell; it presumably should be one of the testTexts entries.
# LAST_TESTED_TYPO = testTypoInstance(text,testName='mon1208',verbose=True)

Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['tied', 'timed', 'tired', 'tiled', 'toyed', 'tided'], 'offsetInContext': 5, 'context': 'I’ve tiyed with the idea of using GPT-3’s API to a...', 'offset': 5, 'errorLength': 5, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': "I've tiyed with the idea of using GPT-3's API to add much more intelligent capabilities to RC"})