In [0]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from nltk import tokenize
import nltk.data
import re
from collections import Counter
# Download the NLTK 'punkt' package (tokenizer data, per the download log below).
# NOTE(review): nothing in the visible cells uses it — confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# Restrict the 20 Newsgroups training split to four topics; the fixed
# random_state keeps the shuffled order the same on every run.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
data_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Independent list copy of the documents, in dataset order.
content = list(data_train['data'])

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
# Inputs fed to the spelling corrector further down. Most are misspelled
# English words; a few look like proper names (presumably probes for words
# that are absent from the corpus vocabulary — confirm).
misspells = [
    'milad',
    'molazadeh',
    'abouta',
    'accesories',
    'acadamy',
    'adquiring',
    'bewteen',
    'comminication',
    'dependancy',
    'effeciency',
    'formallize',
    'hierarcy',
    'intertaining',
    'nkow',
    'persued',
    'reversable',
    'acress',
    'navid',
    'mahdi',
]

In [0]:
def getWords(contents_list):
  """Flatten a collection of documents into one list of lowercase words.

  For each document, substrings matching the e-mail-address pattern are
  removed first; the remainder is split on every run of characters that are
  not ASCII letters, empty pieces are dropped, and each piece is lowercased.

  Args:
    contents_list: iterable of strings (one per document).

  Returns:
    A single list with the words of all documents, in document order.
  """
  # Raw strings: '\.' in a normal string literal is an invalid escape sequence
  # and triggers a warning on current Python versions. Compiling once per call
  # also avoids re-parsing both patterns for every document.
  email_pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
  non_letters = re.compile(r'[^a-zA-Z]+')

  words = []
  for document in contents_list:
    without_emails = email_pattern.sub('', document)
    words.extend(piece.lower() for piece in non_letters.split(without_emails) if piece)
  return words

# Tokenise the corpus a single time: `tokens` maps each word to its number of
# occurrences, and the vocabulary is the set of that Counter's keys.
tokens = Counter(getWords(content))
vocabulary = set(tokens)
print("here is some tokens: ", tokens.most_common(5))


here is some tokens:  [('the', 29871), ('of', 17679), ('to', 17225), ('a', 13579), ('and', 13020)]


In [0]:
# The five most frequent words in the corpus, paired with their counts.
print(tokens.most_common(n=5))

[('the', 29871), ('of', 17679), ('to', 17225), ('a', 13579), ('and', 13020)]


In [0]:
# p(word)
def P(word, N=sum(tokens.values())):
    """Relative frequency of `word` among all counted tokens.

    Args:
        word: the word to look up in the module-level `tokens` Counter;
            a word that was never counted yields 0.
        N: denominator. The default is the total token count, evaluated once
            when this `def` statement runs, so it does not follow later
            changes to `tokens` unless the cell is re-executed.

    Returns:
        tokens[word] divided by N.
    """
    occurrences = tokens[word]
    return occurrences / N

def getSuggestion(WordsList):
  """Return the entries of WordsList that are keys of `tokens`.

  The result is a new list that keeps the iteration order of WordsList
  (duplicates included).
  """
  return [candidate for candidate in WordsList if candidate in tokens]

def getCandidates(word):
  """Return the possible corrections for `word` as a list.

  Preference order:
    1. [word] itself, when it is a known token;
    2. the known tokens among the strings produced by edit(word);
    3. [word] unchanged, when neither of the above yields anything.
  """
  known = getSuggestion([word])
  if known:
    return known

  # edit() returns a set, whose iteration order for strings varies between
  # interpreter runs (hash randomisation). Sorting makes both the candidate
  # list and the tie-breaking of max() in callers reproducible.
  near_misses = getSuggestion(sorted(edit(word)))
  if near_misses:
    return near_misses

  return [word]

def getCorrection(word):
  """Return the candidate for `word` that scores highest under P.

  When several candidates share the top score, the one that comes first in
  the list returned by getCandidates wins.
  """
  options = getCandidates(word)
  best = max(options, key=P)
  return best

def edit(word):
  """Return the set of strings obtained from `word` by one simple edit.

  The edits are: deleting one character, swapping two adjacent characters,
  replacing one character with a lowercase ASCII letter, and inserting one
  lowercase ASCII letter at any position. Replacing a character with itself
  is included, so `word` is part of the result whenever it consists of at
  least one lowercase ASCII letter.

  Args:
    word: the string to edit.

  Returns:
    A set of strings (duplicates produced by different edits collapse).
  """
  letters = 'abcdefghijklmnopqrstuvwxyz'
  # Every way of cutting the word in two, including the empty head and tail.
  splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

  deletes = [left + right[1:] for left, right in splits if right]
  transposes = [left + right[1] + right[0] + right[2:]
                for left, right in splits if len(right) > 1]
  replaces = [left + let + right[1:]
              for left, right in splits if right
              for let in letters]
  inserts = [left + let + right
             for left, right in splits
             for let in letters]

  return set(deletes + transposes + replaces + inserts)


In [0]:
# Header row first, then one (misspelling, candidates, correction) row per
# word. The list is created once, outside the loop, so rows accumulate instead
# of being discarded on every iteration.
lib = [("miss", "Candidates", "Correction")]
for miss in misspells:
  # Compute each result once and reuse it for both the table and the printout.
  candidates = getCandidates(miss)
  correction = getCorrection(miss)
  lib.append((miss, candidates, correction))
  print('miss: {:30} Candidates: {:30} Correction: {:30}'.format(miss, ", ".join(candidates), correction))


miss: milad                          Candidates: milan, mild                    Correction: mild                          
miss: molazadeh                      Candidates: molazadeh                      Correction: molazadeh                     
miss: abouta                         Candidates: about                          Correction: about                         
miss: accesories                     Candidates: accessories                    Correction: accessories                   
miss: acadamy                        Candidates: academy                        Correction: academy                       
miss: adquiring                      Candidates: acquiring                      Correction: acquiring                     
miss: bewteen                        Candidates: between                        Correction: between                       
miss: comminication                  Candidates: communication                  Correction: communication                 
miss: dependancy