# Building a Spell Corrector/Text Suggestor using fastText

### Importing the libraries

In [1]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import FastText
import io
import collections

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SURINDER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SURINDER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Reading the data

In [2]:
# `data` collects one whitespace-stripped line per corpus entry;
# `words` collects every whitespace-separated token across all lines.
data, words = [], []
with open('comments.txt', encoding="utf8") as corpus_file:
    for raw_line in corpus_file:
        comment = raw_line.strip()
        data.append(comment)
        words += comment.split()

FileNotFoundError: [Errno 2] No such file or directory: 'comments.txt'

### Checking for common terms in the data

In [None]:
# Fetch some basic information about the data: the most common words in the corpus.
# Counter maps every distinct token in `words` to its number of occurrences.
unique_words = collections.Counter(words)
unique_words.most_common(10)

In [None]:
len(data)

In [None]:
data[0]


In [None]:
# NOTE(review): hard-coded position; raises IndexError unless the corpus has at least 561808 lines.
data[561807]

In [None]:
data[0:7]

In [None]:
len(unique_words)

### Data Preprocessing

In [None]:
def text_clean(corpus):
    '''
    Purpose : Function to keep only alphabets and digits. Every other character (punctuation, qmarks etc.) is replaced
              by a single space and the text is lower-cased. Whitespace between words is normalised to one space

    Input : Takes a text corpus, 'corpus' (an iterable of strings), to be cleaned

    Output : Returns the cleaned text corpus as a list of strings, one per input row. Each replaced character leaves a
             space behind, so a cleaned row may contain leading, trailing or doubled spaces

    '''
    # Compile the pattern once instead of handing the pattern string to re.sub for every word.
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    cleaned_corpus = []
    for row in corpus:
        cleaned_words = [non_alphanumeric.sub(' ', word).lower() for word in row.split()]
        cleaned_corpus.append(' '.join(cleaned_words))
    return cleaned_corpus

In [None]:
def stopwords_removal(corpus):
    '''
    Purpose : Function to tokenise each row on whitespace and drop English stopwords, while retaining the wh-words
              (who, what, when, why, how, which, where, whom)

    Input : Takes a text corpus, 'corpus' (an iterable of strings)

    Output : Returns a list of token lists, one per input row

    '''
    wh_words = {'who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom'}
    # Set difference instead of remove(): a wh-word that is absent from the stopword list no longer raises KeyError.
    stop = set(stopwords.words('english')) - wh_words
    return [[token for token in row.split() if token not in stop] for row in corpus]

In [None]:
def lemmatize(corpus):
    '''
    Purpose : Function to lemmatize every token of every row with the WordNet lemmatizer, treating each token as a
              verb (pos = 'v')

    Input : Takes a tokenised corpus, 'corpus', where each row is an iterable of tokens

    Output : Returns a list of lists holding the lemmatized tokens, one inner list per input row

    '''
    lemmatizer = WordNetLemmatizer()
    lemmatized_rows = []
    for tokens in corpus:
        lemmatized_rows.append([lemmatizer.lemmatize(token, pos = 'v') for token in tokens])
    return lemmatized_rows

In [None]:
def stem(corpus, stem_type = None):
    '''
    Purpose : Function to stem every token of every row

    Input :
    'corpus' - Tokenised corpus where each row is an iterable of tokens
    'stem_type' - 'snowball' selects the English Snowball stemmer; any other value (including the default "None")
                  selects the Porter stemmer

    Output : Returns a list of lists holding the stemmed tokens, one inner list per input row

    '''
    # Pick the stemmer once, then apply the same stemming pass for both choices.
    stemmer = SnowballStemmer(language = 'english') if stem_type == 'snowball' else PorterStemmer()
    return [[stemmer.stem(token) for token in tokens] for tokens in corpus]

In [None]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input :
    'corpus' - Text corpus (an iterable of strings) on which pre-processing tasks will be performed
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus as a list of strings, each row's tokens joined by single spaces

    '''

    if cleaning:
        corpus = text_clean(corpus)

    # Both branches leave the corpus tokenised (a list of token lists) for the steps below.
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [row.split() for row in corpus]

    if lemmatization:
        corpus = lemmatize(corpus)

    if stemming:
        corpus = stem(corpus, stem_type)

    # Turn every token list back into one space-separated string.
    return [' '.join(tokens) for tokens in corpus]

In [None]:
# Preprocess the data using the preprocessing pipeline.
# Called with the defaults: cleaning and stopword removal on, stemming and lemmatization off.
data = preprocess(data)

### Converting the data into the format expected by fastText

In [None]:
# One token list per non-empty preprocessed line; empty lines are dropped.
preprocessed_data = [line.split() for line in data if line]

### Building the fastText model

In [None]:
# Create the fastText model without a corpus; vocabulary and training follow in later cells.
# min_n/max_n set the character n-gram lengths to 1 through 5.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size=` — confirm the installed version.
model = FastText(size=300, window=3, min_count=1, min_n=1, max_n=5)

In [None]:
# Pass the corpus positionally: the keyword is `sentences` in gensim 3.x but
# `corpus_iterable` in gensim 4.x, so the positional form works on both.
model.build_vocab(preprocessed_data)

In [None]:
# Number of entries in the model vocabulary.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; on gensim >= 4.0 use the commented alternative below — confirm the installed version.
len(model.wv.vocab)
#len(model.wv.index_to_key)

In [None]:
# Pass the corpus positionally: the keyword is `sentences` in gensim 3.x but
# `corpus_iterable` in gensim 4.x, so the positional form works on both.
model.train(preprocessed_data, total_examples=len(preprocessed_data), epochs=10)

### Checking for top 5 similar terms returned by the model for specific words (Can be spell corrections and suggestions)

In [None]:
model.wv.most_similar('eplain', topn=5)

In [None]:
model.wv.most_similar('reminder', topn=5)

In [None]:
model.wv.most_similar('relevnt', topn=5)

In [None]:
model.wv.most_similar('purse', topn=5)

## fastText and Word Mover's Distance

In [None]:
sentence_1 = "Biden speaks to the media in Illinois"
sentence_2 = "President greets the press in Chicago"
sentence_3 = "Apple is my favorite company"

In [None]:
sentence_4 = "Trump speaks to the media in Tallahassee"
sentence_5 = "Republicans meet the press in Florida"

In [None]:
# compute the WMD between sentence_1 and sentence_2 using fastText based vectors.
# wmdistance expects each document as a list of tokens; a raw string would be iterated character by character.
# Lower-casing matches the lower-cased text the model was trained on.
word_mover_distance = model.wv.wmdistance(sentence_1.lower().split(), sentence_2.lower().split())
word_mover_distance

In [None]:
# compute the distance between sentence_2 and sentence_3.
# wmdistance expects each document as a list of tokens; a raw string would be iterated character by character.
# Lower-casing matches the lower-cased text the model was trained on.
word_mover_distance = model.wv.wmdistance(sentence_2.lower().split(), sentence_3.lower().split())
word_mover_distance

In [None]:
# compute the distance between sentence_4 and sentence_5.
# wmdistance expects each document as a list of tokens; a raw string would be iterated character by character.
# Lower-casing matches the lower-cased text the model was trained on.
word_mover_distance = model.wv.wmdistance(sentence_4.lower().split(), sentence_5.lower().split())
word_mover_distance

# sent2Vec Model

In [None]:
from sent2vec.vectorizer import Vectorizer

# Three example sentences to embed.
sentences = [
    "This is an awesome book to learn NLP.",
    "DistilBERT is an amazing NLP model.",
    "We can interchangeably use embedding, encoding, or vectorizing.",
    ]
# NOTE(review): presumably `bert` encodes the sentences and stores the result on
# `vectorizer.vectors` — confirm against the installed sent2vec version.
vectorizer = Vectorizer()
vectorizer.bert(sentences)
vectors = vectorizer.vectors

In [None]:
vectors.shape

In [None]:
vectors

In [None]:
vectors[1]

In [None]:
from scipy import spatial

# Pairwise cosine distances between the three sentence vectors: (0, 1), (0, 2) and (1, 2).
# scipy's cosine distance is 1 - cosine similarity, so a smaller value means more similar vectors.
dist_1 = spatial.distance.cosine(vectors[0], vectors[1])
dist_2 = spatial.distance.cosine(vectors[0], vectors[2])
dist_3 = spatial.distance.cosine(vectors[1], vectors[2])

print(dist_1, dist_2, dist_3)