In [2]:
import pandas as pd
import numpy as np
import pickle
import re
import spacy

import nltk
from nltk.util import ngrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

In [5]:
nlp = spacy.load("en_core_web_sm",exclude=["tok2vec","parser", "ner"])

In [3]:
text_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/medquad.csv")

In [4]:
text_df.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


In [5]:
text = text_df['answer'].dropna().to_list()

In [6]:
text[0]

"Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60. How Glaucoma Develops  There are several different types of glaucoma. Most of these involve the drainage system within the eye. At the front of the eye there is a small space called the anterior chamber. A clear fluid flows through this chamber and bathes and nourishes the nearby tissues. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.) In glaucoma, for still unknown reasons, the fluid drains too slowly out of the eye. As the fluid builds up, the pressure inside the eye rises. Unless this pressure is controlled, it may cause damage to the optic nerve and other parts of the eye and result in loss of vision. Open-angle Glaucoma The most common type of glaucoma i

# Process text

In [47]:
test_text = " ".join(text)

In [None]:
test_text

In [21]:
token_list = []

In [22]:
for i,test_text in enumerate(text):
  temp_sent = []
  for token in  nlp(test_text.lower()):
    if not token.is_punct and token.text != " ":
      temp_sent.append(str(token))
  token_list.append(temp_sent)


In [None]:
token_list[0]

In [23]:
len(token_list)

16407

# Save processed text in text file

In [24]:
with open('/content/drive/MyDrive/Colab Notebooks/nlp/processed_texts.pkl', 'wb') as f:
    pickle.dump(token_list, f)

In [25]:
with open('/content/drive/MyDrive/Colab Notebooks/nlp/processed_texts.pkl', 'rb') as f:
    processed_texts = pickle.load(f)

In [26]:
len(processed_texts)

16407

# Create bigram

In [91]:
_2gram = [' '.join(e) for e in ngrams(token_list, 2)]

In [101]:
with open("/content/drive/MyDrive/Colab Notebooks/nlp/bigram_list", "wb") as f:   #Pickling
  pickle.dump(_2gram, f)

In [103]:
with open("/content/drive/MyDrive/Colab Notebooks/nlp/bigram_list", "rb") as f:   #Pickling
  bigram_list=pickle.load(f)

In [108]:
freq_bi = nltk.FreqDist(bigram_list)

In [109]:
print(freq_bi.most_common(5))

[('of the', 34914), ('in the', 18858), ('and symptoms', 10767), ('signs and', 10718), ('may be', 10229)]


In [28]:
n = 2
train_data, padded_sents = padded_everygram_pipeline(n, processed_texts)

In [29]:
model = MLE(n)

In [30]:
model.fit(train_data, padded_sents)

In [31]:
print(model.vocab.lookup(token_list[0]))

('glaucoma', 'is', 'a', 'group', 'of', 'diseases', 'that', 'can', 'damage', 'the', 'eye', "'s", 'optic', 'nerve', 'and', 'result', 'in', 'vision', 'loss', 'and', 'blindness', 'while', 'glaucoma', 'can', 'strike', 'anyone', 'the', 'risk', 'is', 'much', 'greater', 'for', 'people', 'over', '60', 'how', 'glaucoma', 'develops', 'there', 'are', 'several', 'different', 'types', 'of', 'glaucoma', 'most', 'of', 'these', 'involve', 'the', 'drainage', 'system', 'within', 'the', 'eye', 'at', 'the', 'front', 'of', 'the', 'eye', 'there', 'is', 'a', 'small', 'space', 'called', 'the', 'anterior', 'chamber', 'a', 'clear', 'fluid', 'flows', 'through', 'this', 'chamber', 'and', 'bathes', 'and', 'nourishes', 'the', 'nearby', 'tissues', 'watch', 'the', 'video', 'to', 'learn', 'more', 'about', 'glaucoma', 'to', 'enlarge', 'the', 'video', 'click', 'the', 'brackets', 'in', 'the', 'lower', 'right', 'hand', 'corner', 'to', 'reduce', 'the', 'video', 'press', 'the', 'escape', 'esc', 'button', 'on', 'your', 'keybo

In [33]:
print(model.vocab.lookup("What is glaucoma drawing to eyesdf".lower().split()))

('what', 'is', 'glaucoma', 'drawing', 'to', '<UNK>')


In [34]:
with open('/content/drive/MyDrive/Colab Notebooks/nlp/model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [35]:
print(model.generate(20, random_seed=7))

['factors', 'and', 'socks', '    ', 'nih', 'national', 'cancer', 'in', 'a', 'large', 'amounts', 'of', 'parkinson', 'disease', 'and', 'eventual', 'liver', 'transplant', 'is', 'enough']


In [47]:
model.score('a', 'glaucoma'.split())

0.017123287671232876

Create function

In [7]:
with open('/home/mark/Data/NLP/model/model.pkl', 'rb') as f:
    model=pickle.load(f)

In [12]:
user_input = "Paste text to get spelling suggestions for though terms such as glcoauma. Paste text to get spelling suggestions for though terms such as glcoauma. Paste text to get spelling suggestions for though terms such as glcoauma"

In [10]:
print(model.vocab.lookup(user_input.lower().split()))

('paste', 'text', 'to', 'get', 'spelling', 'suggestions', 'for', 'though', 'terms', 'such', 'as', '<UNK>')


In [16]:
def detect_spell_error(sentences):
    output = list(model.vocab.lookup(sentences.lower().split()))
    print(output)
    return output

In [17]:
temp_out=detect_spell_error(user_input)

['paste', 'text', 'to', 'get', 'spelling', 'suggestions', 'for', 'though', 'terms', 'such', 'as', '<UNK>', 'paste', 'text', 'to', 'get', 'spelling', 'suggestions', 'for', 'though', 'terms', 'such', 'as', '<UNK>', 'paste', 'text', 'to', 'get', 'spelling', 'suggestions', 'for', 'though', 'terms', 'such', 'as', '<UNK>']


TypeError: list indices must be integers or slices, not str