In [114]:
import collections
import pathlib
import pickle

import numpy as np
import pandas as pd
from textblob import TextBlob, Word, WordList

from nltk.metrics.distance import edit_distance
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from scipy.spatial.distance import cosine

from utils import cosine_similarity

In [3]:
# All inputs are resolved relative to a `data` folder in the current working directory.
data = pathlib.Path.cwd().joinpath('data')
pkls, npys = (data.joinpath(subdir) for subdir in ('pkls', 'npys'))

# NOTE(review): read_pickle unpickles arbitrary objects; only load files you trust.
descr, train = (
    pd.read_pickle(pkls.joinpath(fname)) for fname in ('descr.pkl', 'train.pkl')
)
tr_corr = np.load(npys.joinpath('train_corrections.npy'))

In [13]:
# Build a small corpus: entry 0 of the 'prompt' column of `descr`, followed by
# a five-element slice of the 'essay' column of `train`.
# NOTE(review): whether `prompts[0]` is a label or a positional lookup depends on
# the index of `descr`, which is not visible here — confirm which is intended.
prompts = descr['prompt']
essays = train['essay']
corpus = pd.concat([pd.Series(prompts[0]), essays[:5]])

In [16]:
# Raw term counts (English stop words removed) for every document in `corpus`.
count_vec = CountVectorizer(stop_words='english')
X_count = count_vec.fit_transform(corpus)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
count_terms = (
    count_vec.get_feature_names_out()
    if hasattr(count_vec, 'get_feature_names_out')
    else count_vec.get_feature_names()
)
doc_term_mtx_count = pd.DataFrame(X_count.toarray(), columns=count_terms)

# NOTE(review): `similarity_essays_to_prompt` is neither defined nor imported in
# the visible cells — make sure it is available on a fresh kernel.
similarity_essays_to_prompt(doc_term_mtx_count, cosine_similarity)

[(3, 0.6434103964757235),
 (4, 0.4135763891545543),
 (5, 0.40486881513293227),
 (1, 0.2886150127292031),
 (2, 0.2875083056936687)]

In [17]:
# TF-IDF weights (English stop words removed) for every document in `corpus`.
tfidf_vec = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vec.fit_transform(corpus)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
tfidf_terms = (
    tfidf_vec.get_feature_names_out()
    if hasattr(tfidf_vec, 'get_feature_names_out')
    else tfidf_vec.get_feature_names()
)
doc_term_mtx_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)

# NOTE(review): `similarity_essays_to_prompt` is neither defined nor imported in
# the visible cells — make sure it is available on a fresh kernel.
similarity_essays_to_prompt(doc_term_mtx_tfidf, cosine_similarity)

[(3, 0.4589585009379219),
 (5, 0.22227805596964267),
 (4, 0.20122636625176427),
 (1, 0.19135359762819126),
 (2, 0.14588618973159329)]

# TextBlob Spellcheck

### Storage Size Tests

In [81]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [124]:
# Tokenize both blobs, corrected text first and original text second.
# NOTE(review): `tb_correct` and `tb_essay` are not created in any visible cell.
corr_words, orig_words = (blob.tokenize() for blob in (tb_correct, tb_essay))

In [118]:
# NOTE(review): this iterates the blobs themselves rather than their tokens, and
# the result (0.0003) is far below the token-level ratio computed from
# `orig_words`/`corr_words` (0.0496) — presumably this compares characters
# against the corrected text; confirm before relying on it.
sum(word not in tb_correct for word in tb_essay) / len(tb_essay)

0.0003156565656565657

In [119]:
# Share of original tokens that do not appear among the corrected tokens.
sum(word not in corr_words for word in orig_words) / len(orig_words)

0.049586776859504134

In [122]:
# Number of original tokens that do not appear among the corrected tokens.
sum(word not in corr_words for word in orig_words)

30

In [139]:
# Baseline timing: membership is tested against the result of tokenize() directly.
# `tb_correct.tokenize()` sits in the comprehension's element expression, so it is
# re-evaluated for every token of `tb_essay`; the loop cost is dominated by
# repeated tokenization, not by the lookup itself.
%timeit sum([not word in (tb_correct.tokenize()) for word in tb_essay.tokenize()])

2.69 s ± 54.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [138]:
# Same measurement with a frozenset. The frozenset (and the tokenize() call that
# feeds it) is rebuilt for every token of `tb_essay`, so hashed lookup cannot pay
# off here — the timing stays at roughly the same ~2.7 s as the baseline.
# NOTE(review): build the set once outside the comprehension to measure lookup cost.
%timeit sum([not word in frozenset(tb_correct.tokenize()) for word in tb_essay.tokenize()])

2.73 s ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [137]:
# Same measurement with a plain set. As with the frozenset variant, the set and
# its tokenize() call are rebuilt for every token of `tb_essay`, so this mostly
# times repeated tokenization rather than set membership.
%timeit sum([not word in set(tb_correct.tokenize()) for word in tb_essay.tokenize()])

2.8 s ± 28.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [145]:
# Cost of a single tokenize() call, for comparison with the ~2.7 s membership
# loops that call it once per token.
%timeit tb_correct.tokenize()

4.54 ms ± 91.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [143]:
# Tokenize the corrected blob once and keep the result for reuse in later cells.
tokens = tb_correct.tokenize()

In [146]:
# Time lemmatization of the already-tokenized words.
%timeit tokens.lemmatize()

2.81 ms ± 41.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# NOTE(review): scratch cell that only evaluates the bare name `TextBlob`; it was
# never executed and has no recorded output.
TextBlob

In [153]:
# Inspect which class tokenize().lemmatize() hands back.
type(tb_correct.tokenize().lemmatize())

textblob.blob.WordList

In [84]:
# Display the lemmatized tokens.
# NOTE(review): the output below contains 'wa' (apparently from 'was'), so the
# lemmatizer is altering some words in unintended ways — confirm this is acceptable.
tokens.lemmatize()

WordList(['Dear', '@', 'ORGANIZATION1', ',', 'The', 'computer', 'blinked', 'to', 'life', 'and', 'an', 'image', 'of', 'a', 'blonde', 'haired', 'girl', 'filled', 'the', 'screen', '.', 'It', 'wa', 'easy', 'to', 'find', 'out', 'how', 'life', 'wa', 'in', '@', 'LOCATION2', ',', 'thanks', 'to', 'the', 'actual', '@', 'CAPS1', 'girl', 'explaining', 'it', '.', 'Going', 'to', 'the', 'library', 'would', "n't", 'have', 'filled', 'one', 'with', 'this', 'priceless', 'information', 'and', 'human', 'interaction', '.', 'Computers', 'are', 'a', 'necessity', 'of', 'life', 'if', 'society', 'wish', 'to', 'grow', 'and', 'expand', '.', 'They', 'should', 'be', 'supported', 'because', 'they', 'teach', 'hand', 'eye', 'coordination', ',', 'give', 'people', 'the', 'ability', 'to', 'learn', 'about', 'faraway', 'place', ',', 'and', 'allow', 'people', 'to', 'talk', 'to', 'others', 'online', '.', 'Firstly', ',', 'computer', 'help', 'teach', 'hand', 'eye', 'coordination', '.', 'And-eye', 'coordination', 'is', 'a', 'use

In [104]:
def spelling_confidence(wordlist, agg_func=np.max):
    """Return the mean per-word spelling confidence of ``wordlist``.

    Each word's ``spellcheck()`` result is unpacked as ``(candidate, confidence)``
    pairs. The confidences of one word are reduced to a single score with
    ``agg_func``, and those scores are averaged over all words.

    Parameters
    ----------
    wordlist : sized iterable
        Objects exposing a ``spellcheck()`` method (e.g. the items of a
        TextBlob ``WordList``).
    agg_func : callable, default ``np.max``
        Reduces the list of candidate confidences of one word to one number.

    Returns
    -------
    float
        Mean aggregated confidence, or ``nan`` when ``wordlist`` is empty.
    """
    n_words = len(wordlist)
    if n_words == 0:
        # Nothing to score: report "undefined" instead of failing with
        # ZeroDivisionError, so the function can be mapped over empty essays.
        return float('nan')
    per_word_scores = (
        agg_func([conf for _, conf in word.spellcheck()])
        for word in wordlist
    )
    return sum(per_word_scores) / n_words

# Score `tokens` with the default aggregator (np.max over each word's candidates).
spelling_confidence(tokens)

0.9539807998465871

In [105]:
# Restrict to essay set 1, then pick the rows whose domain1_score is below 4
# (`low`) or above 10 (`high`).
set1 = train.loc[train['essay_set'].eq(1)]
low = set1.loc[set1['domain1_score'].lt(4)]
high = set1.loc[set1['domain1_score'].gt(10)]

In [57]:
# Look up a single row of `train` by index label 542.
train.loc[542]

essay_id                                                         545
essay_set                                                          1
essay              I think that computers are amazing. Computers ...
domain1_score                                                      2
domain2_score                                                    NaN
domain1_percent                                                    0
domain2_percent                                                  NaN
Name: 542, dtype: object