In [1]:
import sys, os, re, json
from collections import Counter, OrderedDict
import itertools
from numpy import *
import pandas as pd
from pprint import pprint as pp

In [2]:
def make_basic_features(df):
    """Compute basic features."""

    df['f_nchars'] = df['__TEXT__'].map(len)
    df['f_nwords'] = df['word'].map(len)

    punct_counter = lambda s: sum(1 for c in s
                                  if (not c.isalnum())
                                      and not c in
                                        [" ", "\t"])
    df['f_npunct'] = df['__TEXT__'].map(punct_counter)
    df['f_rpunct'] = df['f_npunct'] / df['f_nchars']

    df['f_ndigit'] = df['__TEXT__'].map(lambda s: sum(1 for c in s
                                  if c.isdigit()))
    df['f_rdigit'] = df['f_ndigit'] / df['f_nchars']

    upper_counter = lambda s: sum(1 for c in s if c.isupper())
    df['f_nupper'] = df['__TEXT__'].map(upper_counter)
    df['f_rupper'] = df['f_nupper'] / df['f_nchars']

    # fraction named entities recognized (ner) -- 'O' is not recognized
    df['f_nner'] = df['ner'].map(lambda ts: sum(1 for t in ts
                                              if t != 'O'))
    df['f_rner'] = df['f_nner'] / df['f_nwords']

    # Check standard sentence pattern:
    # if starts with capital, ends with .?!
    def check_sentence_pattern(s):
        ss = s.strip(r"""`"'""").strip()
        return s[0].isupper() and (s[-1] in '.?!\n')
    df['f_sentence_pattern'] = df['__TEXT__'].map(check_sentence_pattern)

    # Normalize any LM features
    # by dividing logscore by number of words
    lm_cols = {c:re.sub("_lmscore_", "_lmscore_norm_",c)
               for c in df.columns if c.startswith("f_lmscore")}
    for c,cnew in lm_cols.items():
        df[cnew] = df[c] / df['f_nwords']

    return df

In [3]:
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
digits = "([0-9])"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|me|edu)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "”" in text: text = text.replace(".”","”.")
    if "..." in text: text = text.replace("...","<prd><prd><prd>")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    #text = text.replace("  "," <stop>") #1
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [None]:
I hate wet and reiny days.  It rained a lot in 1816.... a lot - like everyday; there weather in Europe was abnormally wet because it rained in Switzerland on 130 out of the 183 days from April to September. If I was Mary Shelley I might decide to write a book too. Afterall, it was the onnly thing you could do without TV or anything. She said that she "passed the summer of 1816 in the environs of Geneva...we occasionally amused ourselves with some German stories of ghosts... These tales excited in us a playful desire of imitation"  So, people were stuck inside and bored. Mary Shelley decided to write a book becuase it was so awful outside. I can totally see her point, you know? I guess I would write a novel if there was nothing else to do.

In [4]:
input_string = input("Please paste your paragraph here: ")
newtext = split_into_sentences(input_string) 
#pp(newtext)

Please paste your paragraph here: I hate wet and reiny days.  It rained a lot in 1816.... a lot - like everyday; there weather in Europe was abnormally wet because it rained in Switzerland on 130 out of the 183 days from April to September. If I was Mary Shelley I might decide to write a book too. Afterall, it was the onnly thing you could do without TV or anything. She said that she "passed the summer of 1816 in the environs of Geneva...we occasionally amused ourselves with some German stories of ghosts... These tales excited in us a playful desire of imitation"  So, people were stuck inside and bored. Mary Shelley decided to write a book becuase it was so awful outside. I can totally see her point, you know? I guess I would write a novel if there was nothing else to do.


In [5]:
import spacy
import en_core_web_sm # or en_core_web_lg if need tokenization.
nlp = en_core_web_sm.load()

In [6]:
data = []
for idx,text in enumerate(newtext):
    doc = nlp(text)
    row = {"__TEXT__": text}
    row['ner'] = [i.pos_ for i in doc]
    row['sentiment'] = doc.sentiment
    row['word'] = [i.text for i in doc]
    data.append(row)
    
df = pd.DataFrame(data)

In [7]:
df = make_basic_features(df)

  import sys
  


In [8]:
cleaned = {}
dirty = {}
bad = 0
for idx,row in df.iterrows():
    if row['f_sentence_pattern'] and (row['f_npunct'] + row['f_nwords']) > 5 and row['f_nner'] > 0:
        cleaned[idx] = row['__TEXT__']
    else:
        dirty[idx] = row['__TEXT__']
        bad += 1

In [9]:
pp(cleaned)

{0: 'I hate wet and reiny days.',
 1: 'It rained a lot in 1816....',
 3: 'If I was Mary Shelley I might decide to write a book too.',
 4: 'Afterall, it was the onnly thing you could do without TV or anything.',
 5: 'She said that she "passed the summer of 1816 in the environs of '
    'Geneva...we occasionally amused ourselves with some German stories of '
    'ghosts... These tales excited in us a playful desire of imitation"  So, '
    'people were stuck inside and bored.',
 6: 'Mary Shelley decided to write a book becuase it was so awful outside.',
 7: 'I can totally see her point, you know?',
 8: 'I guess I would write a novel if there was nothing else to do.'}


In [9]:
pp(dirty)

{2: 'a lot - like everyday; there weather in Europe was abnormally wet because '
    'it rained in Switzerland on 130 out of the 183 days from April to '
    'September.'}


In [10]:
dirty_list=list(dirty.values())
pp(dirty_list)

['a lot - like everyday; there weather in Europe was abnormally wet because it '
 'rained in Switzerland on 130 out of the 183 days from April to September.']


In [11]:
clean_list=list(cleaned.values())
pp(clean_list)

['I hate wet and reiny days.',
 'It rained a lot in 1816....',
 'If I was Mary Shelley I might decide to write a book too.',
 'Afterall, it was the onnly thing you could do without TV or anything.',
 'She said that she "passed the summer of 1816 in the environs of Geneva...we '
 'occasionally amused ourselves with some German stories of ghosts... These '
 'tales excited in us a playful desire of imitation"  So, people were stuck '
 'inside and bored.',
 'Mary Shelley decided to write a book becuase it was so awful outside.',
 'I can totally see her point, you know?',
 'I guess I would write a novel if there was nothing else to do.']


In [12]:
def converttostr(input_seq, seperator):
   # Join all the strings in list
   final_str = seperator.join(input_seq)
   return final_str

In [13]:
seperator = ' '
df2 = (converttostr(clean_list, seperator))
df3 = df2.lower()

In [14]:
from spellchecker import SpellChecker
spell = SpellChecker()
text = df2.lower()
#splitwords = spell.split_words("what was the natureq of your relationshop with the candidate?  what was their title?  What was your role and titlem?")
splitwords = spell.split_words(text)
newlist = list(splitwords)
misspelled = spell.unknown(newlist)
print(misspelled)
for word in misspelled:
    # Get the one most likely answer
    print("corrected: " + spell.correction(word))

{'onnly', 'reiny', 'afterall', 'becuase'}
corrected: only
corrected: reins
corrected: after all
corrected: because


In [15]:
def convert(set): 
    return list(set) 
misspelled1 = (convert(misspelled))

In [16]:
seperator = ""
paragraph = (converttostr(clean_list, seperator))
d = split_into_sentences(paragraph)
#type(paragraph)

In [17]:
import re

bad = frozenset(map(str.lower, (misspelled1)))
words = lambda sentence: (m.group() for m in re.finditer('\w+', sentence))
data = []
for index, sentence in enumerate(d):
    if frozenset(words(sentence.lower())) & bad:
        data.append(sentence)
            #write to a dictonary 

#words = lambda sentence: (m.group() for m in re.finditer('\w+', sentence))

res = [] 
for i in data: 
    if i not in res: 
        res.append(i) 
        
res
    

['I hate wet and reiny days.',
 'Afterall, it was the onnly thing you could do without TV or anything.',
 'Mary Shelley decided to write a book becuase it was so awful outside.']

In [38]:
type(res)

list

In [18]:
seperator = ' '
dirt = (converttostr(dirty_list, seperator)) 
dirt 

'a lot - like everyday; there weather in Europe was abnormally wet because it rained in Switzerland on 130 out of the 183 days from April to September.'

In [19]:
res += [dirt] 
res 

['I hate wet and reiny days.',
 'Afterall, it was the onnly thing you could do without TV or anything.',
 'Mary Shelley decided to write a book becuase it was so awful outside.',
 'a lot - like everyday; there weather in Europe was abnormally wet because it rained in Switzerland on 130 out of the 183 days from April to September.']

In [20]:
bad_sentence = res

In [21]:
bad_sentence

['I hate wet and reiny days.',
 'Afterall, it was the onnly thing you could do without TV or anything.',
 'Mary Shelley decided to write a book becuase it was so awful outside.',
 'a lot - like everyday; there weather in Europe was abnormally wet because it rained in Switzerland on 130 out of the 183 days from April to September.']

In [1]:
import pylanguagetool

def wordcheck():
    while True:
        sentence = input('enter sentence, type end to exit word checker>')
        #sentence = bad_sentence
        res = pylanguagetool.api.check(sentence,"https://languagetool.org/api/v2/", lang ='en-US')
        for match in res['matches']:
            print(match['message'],match['replacements'])
        if sentence =="end":
            break
        


In [52]:
#wordcheck()

In [31]:
sentence = bad_sentence
res = pylanguagetool.api.check(sentence,"https://languagetool.org/api/v2/", lang ='en-US')
for match in res['matches']:
    print(match['message'],match['replacements'])
   

This sentence does not start with an uppercase letter [{'value': 'A'}]
Consider using an m-dash if you do not want to join two words. [{'value': '—'}]
'Everyday' is an adjective. Did you mean "every day"? [{'value': 'every day'}]


In [22]:

import pylanguagetool

def wordcheck():
    while True:
        #sentence = input('enter sentence, type end to exit word checker>')
        sentence = bad_sentence
        res = pylanguagetool.api.check(sentence,"https://languagetool.org/api/v2/", lang ='en-US')
        for match in res['matches']:
            print(match['message'],match['replacements'])
        if res != '':
            break

if __name__=="__main__":
    wordcheck()
    
print(misspelled)
for word in misspelled:
    # Get the one most likely answer
    print("corrected: " + spell.correction(word))    
    

This sentence does not start with an uppercase letter [{'value': 'A'}]
Consider using an m-dash if you do not want to join two words. [{'value': '—'}]
'Everyday' is an adjective. Did you mean "every day"? [{'value': 'every day'}]
Statistics suggests that 'their' (as in 'It’s not their fault.') might be the correct word here, not 'there' (as in 'Is there an answer?'). Please check. [{'value': 'their', 'shortDescription': "as in 'It’s not their fault.'"}]
{'onnly', 'reiny', 'afterall', 'becuase'}
corrected: only
corrected: reins
corrected: after all
corrected: because
