In [1]:
import re
from collections import Counter
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

In [2]:
df = pd.read_csv('papers.csv')

In [3]:
stop_words = set(stopwords.words('english'))
##Creating a list of custom stopwords
new_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]
stop_words = list(stop_words.union(new_words))

In [4]:

def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    ##Convert to list from string
    text = text.split()
    
    # remove stopwords
    text = [word for word in text if word not in stop_words]

    # remove words less than three letters
    text = [word for word in text if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]
    
    return ' '.join(text)

In [5]:
docs = df['paper_text'].iloc[:3000].apply(lambda x:pre_process(x))

In [6]:
docs.shape

(3000,)

In [7]:
sentences = docs.tolist()
len(sentences)

3000

In [8]:
sentences[:2]

['self organization associative database application hisashi suzuki suguru arimoto osaka university toyonaka osaka japan abstract efficient method self organizing associative database proposed together application robot eyesight system proposed database associate input output first half part discussion algorithm self organization proposed aspect hardware produce new style neural network latter half part applicability handwritten letter recognition autonomous mobile robot system demonstrated introduction let mapping given finite infinite set another finite infinite set learning machine observes set pair sampled randomly mean cartesian product computes estimate make small estimation error measure usually say faster decrease estimation error increase number sample better learning machine however expression performance incomplete since lack consideration candidate assumed preliminarily find good learning machine clarify conception let discus type learning machine let advance understanding 

In [9]:
text_data = ' '.join(sentences)
text_data[:1000]

'self organization associative database application hisashi suzuki suguru arimoto osaka university toyonaka osaka japan abstract efficient method self organizing associative database proposed together application robot eyesight system proposed database associate input output first half part discussion algorithm self organization proposed aspect hardware produce new style neural network latter half part applicability handwritten letter recognition autonomous mobile robot system demonstrated introduction let mapping given finite infinite set another finite infinite set learning machine observes set pair sampled randomly mean cartesian product computes estimate make small estimation error measure usually say faster decrease estimation error increase number sample better learning machine however expression performance incomplete since lack consideration candidate assumed preliminarily find good learning machine clarify conception let discus type learning machine let advance understanding s

In [10]:
def misc(file_name):
    words = []
    file_name = process_tweet(file_name)
    words = re.findall(r'\w+', file_name)
    return words

def process_tweet(tweet):
      tweet = re.sub(r'\$\w*', '', tweet)
      tweet = re.sub(r'^RT[\s]+', '', tweet)
      tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
      tweet = re.sub(r'#', '', tweet)

      return tweet    

In [11]:
words = misc(text_data)
vocab = set(words)
print(f"There are {len(vocab)} unique words in the vocabulary.")

There are 90669 unique words in the vocabulary.


In [12]:
# words = re.findall(r'\w+', text_data)
# print(len(words))
# vocab = set(words)
# print(len(vocab))

In [13]:
def get_count(word_l):
    """
    Input:
        word_l: a set of words representing the corpus.
    Output:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    """
    word_count_dict = {}
    word_count_dict = Counter(word_l)

    return word_count_dict

In [14]:
word_count_dict = get_count(words)

In [15]:
word_count_dict['infinite']

1484

In [16]:
def get_probs(word_count_dict):
    """
    Input:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    Output:
        probs: A dictionary where keys are the words and the values are the probability that a word will occur.
    """
    probs = {}  # return this variable correctly
    m = sum(word_count_dict.values())
    for key in word_count_dict.keys():
        probs[key] = word_count_dict.get(key, 0) / m

    return probs

In [17]:
probs = get_probs(word_count_dict)

In [18]:
# Part 2: String Manipulation

In [19]:
# delete_letter()
def delete_letter(word, verbose=False):
    delete_l = []
    split_l = []

    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    delete_l = [L + R[1:] for L, R in split_l if R]

    if verbose:
        print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")# printing implicitly.

    return delete_l

In [20]:
# checking the function
print(delete_letter(word="cans", verbose=True))

input word cans, 
split_l = [('', 'cans'), ('c', 'ans'), ('ca', 'ns'), ('can', 's')], 
delete_l = ['ans', 'cns', 'cas', 'can']
['ans', 'cns', 'cas', 'can']


In [21]:
# switch_letter()
def switch_letter(word, verbose=False):
    def swap(c, i, j):
        c = list(c)
        c[i], c[j] = c[j], c[i]
        return ''.join(c)

    switch_l = []
    split_l = []
    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    switch_l = [a + b[1] + b[0] + b[2:] for a, b in split_l if len(b) >= 2]

    if verbose:
        print(f"Input word = {word} \nsplit_l = {split_l} \nswitch_l = {switch_l}")

    return switch_l

In [22]:
print(switch_letter(word="eta", verbose=True))

Input word = eta 
split_l = [('', 'eta'), ('e', 'ta'), ('et', 'a')] 
switch_l = ['tea', 'eat']
['tea', 'eat']


In [23]:
# replace_letter()
def replace_letter(word, verbose=False):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    replace_l = []
    split_l = []

    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    replace_l = [a + l + (b[1:] if len(b) > 1 else '') for a, b in split_l if b for l in letters]
    replace_set = set(replace_l)
    replace_set.remove(word)
    # turn the set back into a list and sort it, for easier viewing
    replace_l = sorted(list(replace_set))

    if verbose:
        print(f"Input word = {word} \nsplit_l = {split_l} \nreplace_l {replace_l}")

    return replace_l

In [24]:
print(replace_letter(word='can', verbose=True))

Input word = can 
split_l = [('', 'can'), ('c', 'an'), ('ca', 'n')] 
replace_l ['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan']
['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'h

In [25]:
#  insert_letter()
def insert_letter(word, verbose=False):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    insert_l = []
    split_l = []
    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    insert_l = [ a + l + b for a, b in split_l for l in letters]

    if verbose:
        print(f"Input word {word} \nsplit_l = {split_l} \ninsert_l = {insert_l}")

    return insert_l


In [26]:
print(insert_letter(word='at', verbose=False))

['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt']


In [27]:
# Combining the edits:
# Now that you have implemented the string manipulations, you will create two functions that,
#  given a string, will return all the possible single and double edits on that string. These will
#  be edit_one_letter() and edit_two_letters().

In [28]:
#  Edit one letter
def edit_one_letter(word, allow_switches=True):

    edit_one_set = set()
    edit_one_set.update(delete_letter(word))
    if allow_switches:
        edit_one_set.update(switch_letter(word))
    edit_one_set.update(replace_letter(word))
    edit_one_set.update(insert_letter(word))

    return edit_one_set

In [29]:
# Edit two letters
def edit_two_letters(word, allow_switches=True):

    edit_two_set = set()
    edit_one = edit_one_letter(word, allow_switches=allow_switches)
    for w in edit_one:
        if w:
            edit_two = edit_one_letter(w, allow_switches=allow_switches)
            edit_two_set.update(edit_two)

    return edit_two_set


In [30]:
# proposed
edit_two_letters('propose')

{'proaqose',
 'pfrogose',
 'npriopose',
 'prorosk',
 'propogze',
 'propuope',
 'prophobe',
 'xroepose',
 'kproiose',
 'irfopose',
 'promposze',
 'propqome',
 'prdopwse',
 'puoiose',
 'pioposfe',
 'pnropoae',
 'piopofse',
 'prposn',
 'udpropose',
 'cronpose',
 'propoht',
 'pnropoie',
 'propoisv',
 'prbposse',
 'prbposz',
 'qpropohse',
 'pdoposee',
 'pcrolpose',
 'propsosm',
 'pqropoose',
 'prxopoase',
 'prtposj',
 'prypost',
 'propkonse',
 'prnhopose',
 'broposo',
 'pfopofe',
 'prcopodse',
 'pdropos',
 'phrbpose',
 'propoisue',
 'kropode',
 'proioqse',
 'prvoposn',
 'proposhh',
 'prmpnose',
 'puropos',
 'prordpose',
 'propvzse',
 'apripose',
 'prgolose',
 'promosf',
 'prbplse',
 'tropospe',
 'orpose',
 'proapyse',
 'propqtse',
 'prpposke',
 'pcoposb',
 'prowpose',
 'pirojpose',
 'paopsoe',
 'prwopiose',
 'bprxopose',
 'pripqose',
 'proxoqse',
 'plwopose',
 'pyrotpose',
 'proyposee',
 'lproposee',
 'uropopse',
 'fpoopose',
 'prhohose',
 'prhoposwe',
 'projpcose',
 'propoosh',
 'ppopoase'

In [31]:
# suggest spelling suggestions
def get_corrections(word, probs, vocab, verbose=False):
    """
    Input:
        word: a user entered string to check for suggestions
        probs: a dictionary that maps each word to its probability in the corpus
        vocab: a set containing all the vocabulary
        n: number of possible word corrections you want returned in the dictionary
    Output:
        n_best: a list of tuples with the most probable n corrected words and their probabilities.
    """

    suggestions = []
    n_best = []
    #suggestions = list((word in vocab) or edit_one_letter(word).intersection(vocab) or
    #                   edit_two_letters(word).intersection(vocab))
    suggestions = list(edit_two_letters(word).intersection(vocab))
    # suggestions = list(edit_two_letters(word, False).intersection(vocab))
    n_best = [[s, probs.get(s, -1)] for s in list(reversed(suggestions))]

    if verbose:
        print("suggestions = ", suggestions)

    return n_best

In [32]:
# Testing
my_word = 'propose'
tmp_corrections = get_corrections(my_word, probs, vocab, verbose=False)
for i, word_prob in enumerate(tmp_corrections):
    print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}")

word 0: roose, probability 0.000001
word 1: porpoise, probability 0.000000
word 2: propose, probability 0.000376
word 3: prose, probability 0.000001
word 4: prope, probability 0.000000
word 5: propoesd, probability 0.000000
word 6: proposed, probability 0.000829
word 7: promote, probability 0.000005
word 8: provost, probability 0.000004
word 9: apropos, probability 0.000000
word 10: proove, probability 0.000000
word 11: prognose, probability 0.000000
word 12: oppose, probability 0.000000
word 13: protos, probability 0.000000
word 14: proposal, probability 0.000129
word 15: promise, probability 0.000021
word 16: provoke, probability 0.000001
word 17: rpose, probability 0.000000
word 18: purpose, probability 0.000222
word 19: propoj, probability 0.000000
word 20: repose, probability 0.000000
word 21: proposer, probability 0.000000
word 22: propo, probability 0.000000
word 23: ppose, probability 0.000000
word 24: proposi, probability 0.000000
word 25: propoi, probability 0.000000
word 26: