# Replacing words matching regular expressions

1. We often combine two words and write them in a short form (a contraction), e.g. "can't" for "can not".
2. When processing text, we may want to expand these contractions back into their full form.
3. We can do this with regular expressions.

In [1]:
import re
from nltk.corpus import wordnet

In [2]:
# Ordered (regex, replacement) pairs used to expand English contractions.
#
# Order matters: the irregular forms come first so they are rewritten before
# the generic suffix rules can split them incorrectly (e.g. the generic
# "n't" rule would turn "won't" into "wo not").
#
# Every string is a raw string so that neither the regex syntax nor the
# "\g<1>" group reference is interpreted as a Python string escape ("\g" is
# an invalid escape sequence in a normal string literal).
#
# NOTE: the patterns are case-sensitive, and the "'s" rule always expands to
# "is", so possessives such as "John's" are expanded as well.
replacement_pattern = [
    # Irregular contractions.
    (r"won't", r"will not"),
    (r"can't", r"can not"),
    (r"i'm", r"i am"),
    (r"ain't", r"is not"),
    # Generic suffix rules; group 1 is the word the suffix is attached to.
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
]

In [3]:
class RegexpReplacer:
    """Apply a sequence of regex substitutions to a piece of text.

    The constructor takes an iterable of ``(pattern, replacement)`` pairs and
    compiles each pattern once. ``replace`` then runs every substitution, in
    the order given, over the text.
    """

    def __init__(self, replacement_pattern=replacement_pattern) -> None:
        compiled_rules = []
        for regex_source, substitute in replacement_pattern:
            compiled_rules.append((re.compile(regex_source), substitute))
        # List of (compiled regex, replacement template) pairs.
        self.patterns = compiled_rules

    def replace(self, text):
        """Return ``text`` after applying every stored substitution in order."""
        result = text
        for regex, substitute in self.patterns:
            # Each rule sees the output of the previous one.
            result = regex.sub(substitute, result)
        return result

In [4]:
# Build a replacer that uses the default `replacement_pattern` rules.
replacer =RegexpReplacer()

In [5]:
# "can't" has its own dedicated rule in `replacement_pattern`.
replacer.replace("can't is a contraction")

'can not is a contraction'

In [6]:
# "should've" and "didn't" are covered by the generic "'ve" and "n't" rules.
replacer.replace("I should've done that thing I didn't do")

'I should have done that thing I did not do'

# Removing repeating characters

In [7]:
class RepeatReplacer:
    """Collapse repeated characters in a word until WordNet recognises it.

    Each substitution pass removes one character from a doubled pair
    (e.g. "loooove" -> "looove"). Passes continue until WordNet has an entry
    for the word or no doubled character is left.
    """

    def __init__(self) -> None:
        # Group 2 followed by the back-reference \2 is a doubled character;
        # groups 1 and 3 capture the word characters around it.
        self.repeat_regex = re.compile(r"(\w*)(\w)\2(\w*)")
        # Rebuild the match with only one copy of the doubled character.
        self.repl = r"\1\2\3"

    def replace(self, word):
        """Return ``word`` with surplus repeated characters removed.

        A word that WordNet already knows is returned unchanged, so
        legitimate doubles such as "goose" are preserved.

        Implemented as a loop rather than recursion: one character is
        removed per pass, so a recursive version hits Python's recursion
        limit on words with very long runs of a repeated character.
        """
        while not wordnet.synsets(word):
            reduced = self.repeat_regex.sub(self.repl, word)
            if reduced == word:
                # Nothing left to collapse; give up and return what we have.
                break
            word = reduced
        return word

In [8]:
# Rebind `replacer` to the repeat-collapsing variant and try an elongated word.
replacer = RepeatReplacer()
replacer.replace("loooove")

'love'

In [9]:
# A real word containing a double letter; replace() returns words that
# WordNet recognises without modifying them.
replacer.replace("goose")

'goose'

In [10]:
# The result keeps a double "o" -- presumably because WordNet has an entry
# for "ooh", which stops further collapsing (TODO confirm).
replacer.replace("ooooooh")

'ooh'

# Spelling correction with enchant

In [16]:
import enchant
from nltk.metrics import edit_distance

In [17]:
class SpellingReplacer:
    """Replace a misspelled word with the spell checker's first suggestion.

    The suggestion is only used when its edit distance from the original
    word is at most ``max_dist``; otherwise the word is returned unchanged.
    """

    def __init__(self, dict_name="en", max_dist=2) -> None:
        # enchant dictionary used for both checking and suggesting.
        self.spell_dict = enchant.Dict(dict_name)
        # Largest edit distance at which a suggestion is still accepted.
        self.max_dist = max_dist

    def replace(self, word):
        """Return a corrected spelling of ``word``, or ``word`` itself.

        ``word`` is returned as-is when it is empty, when the dictionary
        accepts it, when there are no suggestions, or when the first
        suggestion is more than ``max_dist`` edits away.
        """
        # enchant refuses to check an empty string (it raises ValueError),
        # so pass empty input straight through.
        if not word:
            return word

        if self.spell_dict.check(word):
            return word

        suggestions = self.spell_dict.suggest(word)
        # Only the first suggestion is considered.
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word

In [18]:
# Rebind `replacer` to the spelling corrector with its defaults
# ("en" dictionary, maximum edit distance 2).
replacer = SpellingReplacer()
replacer.replace("cookbok")

'cookbook'