In [1]:
import collections
import pathlib
import pickle
from importlib import reload

import numpy as np
import pandas as pd

from textblob import TextBlob, Word, WordList

from nltk.corpus import stopwords
from nltk.metrics.distance import edit_distance

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

import nlp_util
import utils

In [2]:
# Resolve the data directories relative to the directory the kernel was started in.
data = pathlib.Path.cwd().joinpath('data')
pkls = data.joinpath('pkls')
npys = data.joinpath('npys')

# Load the pickled `descr` and `train` objects.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by this project.
descr = pd.read_pickle(pkls.joinpath('descr.pkl'))
train = pd.read_pickle(pkls.joinpath('train.pkl'))

In [3]:
# Pull the 'prompt' and 'essay' entries out of the loaded objects.
prompts = descr['prompt']
essays = train['essay']

# Saved array from data/npys.
corrections = np.load(npys.joinpath('train_corrections.npy'))

# NLTK's English stop-word list.
english_stop = stopwords.words('english')

In [4]:
# Run the pipeline on a single essay: the entry at key 0 of `essays`.
orig_str = nlp_util.lower_with_proper(essays[0])
orig_tb = TextBlob(orig_str)

# Spell-correct with TextBlob, keeping both the blob and its plain-string form.
corr_tb = orig_tb.correct()
corr_str = str(corr_tb)

# Token lists (orig_tb.tokenize() / corr_tb.tokenize()) are currently disabled.
lemmas = nlp_util.lemmatize(corr_tb)

In [5]:
# Tally the part-of-speech tags of the spell-corrected blob; the value displayed below is a
# Counter keyed by tag (the tags look like Penn Treebank labels — TODO confirm in nlp_util).
nlp_util.parts_of_speech(corr_tb)

Counter({'JJ': 18,
         'NN': 60,
         'VBP': 12,
         'NNS': 18,
         'IN': 53,
         'PRP': 26,
         'TO': 10,
         'VB': 23,
         'VBZ': 15,
         'DT': 20,
         'CC': 14,
         'RB': 23,
         'WRB': 4,
         'MD': 5,
         'PRP$': 12,
         'EX': 1,
         'NNP': 5,
         'VBG': 13,
         'RP': 3,
         'VBN': 4,
         'VBD': 1,
         'WP': 1,
         'POS': 1,
         'JJR': 1})

In [6]:
# Inspect the lemmatised tokens (a WordList; punctuation and stop words are still present here).
lemmas

WordList(['dear', 'local', 'newspaper', ',', 'i', 'think', 'effect', 'computer', 'have', 'on', 'people', 'are', 'great', 'learning', 'skill/affects', 'because', 'they', 'give', 'u', 'time', 'to', 'chat', 'with', 'friends/new', 'people', ',', 'help', 'u', 'learn', 'about', 'the', 'globe', '(', 'astronomy', ')', 'and', 'keep', 'u', 'out', 'of', 'trouble', '!', 'thing', 'about', '!', 'dont', 'you', 'think', 'so', '?', 'how', 'would', 'you', 'feel', 'if', 'your', 'teenager', 'is', 'always', 'on', 'the', 'phone', 'with', 'friend', '!', 'do', 'you', 'ever', 'time', 'to', 'chat', 'with', 'your', 'friend', 'or', 'business', 'partner', 'about', 'thing', '.', 'well', 'now', '-', 'there', "'s", 'a', 'new', 'way', 'to', 'chat', 'the', 'computer', ',', 'theirs', 'plenty', 'of', 'site', 'on', 'the', 'internet', 'to', 'do', 'so', ':', 'ORGANIZATION1', ',', 'ORGANIZATION2', ',', 'CAPS1', ',', 'facebook', ',', 'space', 'act', '.', 'just', 'think', 'now', 'while', 'your', 'setting', 'up', 'meeting', 'wi

In [7]:
# Inspect the spell-corrected essay as a plain string.
corr_str

"dear local newspaper, i think effects computers have on people are great learning skill/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of trouble! thing about! dont you think so? how would you feel if your teenager is always on the phone with friends! do you ever time to chat with your friends or business partner about things. well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: ORGANIZATION1, ORGANIZATION2, CAPS1, facebook, space act. just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. how did you learn about other country/states outside of yours? well i have by computer/internet, it's a new way to learn about what going on in our time! you might think your child spends a lot of time on the computer, but ask them so question about the economy, sea

In [8]:
# Show the punctuation constant from nlp_util: it is one string of characters, not a
# collection of tokens, so `x in PUNCTUATION` is a substring test.
nlp_util.PUNCTUATION

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# Drop punctuation tokens and stop words from the lemmatised essay.
#
# nlp_util.PUNCTUATION is a plain string, so `token in PUNCTUATION` is a substring test:
# it only matches tokens that appear as a contiguous run inside that string, and would keep
# tokens such as '...' or '!!'. Testing character by character removes every token made up
# solely of punctuation (including the empty token, which the substring test also removed).
punct_chars = set(nlp_util.PUNCTUATION)
no_stops = WordList(
    token
    for token in lemmas
    if not all(ch in punct_chars for ch in token)
    and token not in nlp_util.STOPWORDS
)
no_stops

WordList(['dear', 'local', 'newspaper', 'think', 'effect', 'computer', 'people', 'great', 'learning', 'skill/affects', 'give', 'u', 'time', 'chat', 'friends/new', 'people', 'help', 'u', 'learn', 'globe', 'astronomy', 'keep', 'u', 'trouble', 'thing', 'dont', 'think', 'would', 'feel', 'teenager', 'always', 'phone', 'friend', 'ever', 'time', 'chat', 'friend', 'business', 'partner', 'thing', 'well', "'s", 'new', 'way', 'chat', 'computer', 'plenty', 'site', 'internet', 'ORGANIZATION1', 'ORGANIZATION2', 'CAPS1', 'facebook', 'space', 'act', 'think', 'setting', 'meeting', 'bos', 'computer', 'teenager', 'fun', 'phone', 'rushing', 'get', 'cause', 'want', 'use', 'learn', 'country/states', 'outside', 'well', 'computer/internet', "'s", 'new', 'way', 'learn', 'going', 'time', 'might', 'think', 'child', 'spends', 'lot', 'time', 'computer', 'ask', 'question', 'economy', 'sea', 'floor', 'spreading', 'even', 'DATE1', "'S", "'ll", 'surprise', 'much', 'he/she', 'know', 'believe', 'computer', 'much', 'inte

In [12]:
# Rebuild a single space-separated string from the filtered tokens.
' '.join(no_stops)

"dear local newspaper think effect computer people great learning skill/affects give u time chat friends/new people help u learn globe astronomy keep u trouble thing dont think would feel teenager always phone friend ever time chat friend business partner thing well 's new way chat computer plenty site internet ORGANIZATION1 ORGANIZATION2 CAPS1 facebook space act think setting meeting bos computer teenager fun phone rushing get cause want use learn country/states outside well computer/internet 's new way learn going time might think child spends lot time computer ask question economy sea floor spreading even DATE1 'S 'll surprise much he/she know believe computer much interesting class day reading book child home computer local library 's better friend fresh perpressured something know right might know child CAPS2 forbidden hospital bed drive-by rather child computer learning chatting playing game safe sound home community place hope reached point understand agree computer great effect

In [36]:
# Development aid: re-import nlp_util after editing it on disk so the running kernel picks up
# the change. `reload` is imported with the other imports at the top of the notebook.
# NOTE(review): this cell's execution count (36) is far ahead of the cells above, so their
# saved outputs may predate the reload — re-run from a fresh kernel before trusting them.
reload(nlp_util);