In [12]:
import pathlib

import numpy as np
import pandas as pd
from textblob import TextBlob

from nltk.metrics.distance import edit_distance
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.spatial.distance import cosine

import utils

In [2]:
def cosine_similarity(u, v, w=None):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    SciPy's ``cosine`` computes a cosine *distance*, so the result is
    subtracted from 1 to turn it back into a similarity. ``w`` is passed
    through unchanged as the optional per-component weights.
    """
    distance = cosine(u, v, w=w)
    return 1 - distance


def similarity_essays_to_prompt(doc_term_mtx, metric):
    """Rank every row after the first by its similarity to the first row.

    Parameters
    ----------
    doc_term_mtx : pandas.DataFrame
        Row 0 is treated as the prompt; all later rows are treated as essays.
    metric : callable
        Called as ``metric(essay_row, prompt_row)``; results are ordered
        with the largest value first.

    Returns
    -------
    list of tuple
        ``(index_label, score)`` pairs, highest score first.
    """
    prompt_row = doc_term_mtx.iloc[0]
    ranked = [
        (label, metric(essay_row, prompt_row))
        for label, essay_row in doc_term_mtx.iloc[1:].iterrows()
    ]
    # list.sort is stable, so ties keep their original row order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [3]:
# Pickled frames are read from ./data under the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pathlib.Path.cwd().joinpath('data')
descr = pd.read_pickle(data.joinpath('descr.pkl'))
train = pd.read_pickle(data.joinpath('train.pkl'))

In [4]:
# Prompt texts and essay texts as separate Series.
prompts, essays = descr['prompt'], train['essay']
# Small working corpus: the prompt at label 0 followed by the first five essays.
corpus = pd.concat([pd.Series(prompts[0]), essays.iloc[:5]])

In [5]:
# Bag-of-words baseline: raw term counts with English stop words removed.
count_vec = CountVectorizer(stop_words='english')
X_count = count_vec.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
try:
    count_terms = count_vec.get_feature_names_out()
except AttributeError:
    count_terms = count_vec.get_feature_names()
doc_term_mtx_count = pd.DataFrame(X_count.toarray(), columns=count_terms)

# Row 0 is the prompt; the remaining rows (essays) are ranked against it.
similarity_essays_to_prompt(doc_term_mtx_count, cosine_similarity)

[(3, 0.6434103964757235),
 (4, 0.4135763891545543),
 (5, 0.40486881513293227),
 (1, 0.2886150127292031),
 (2, 0.2875083056936687)]

In [6]:
# Same comparison as the count-based cell, but with TF-IDF weighted terms.
tfidf_vec = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vec.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
try:
    tfidf_terms = tfidf_vec.get_feature_names_out()
except AttributeError:
    tfidf_terms = tfidf_vec.get_feature_names()
doc_term_mtx_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)

# Row 0 is the prompt; the remaining rows (essays) are ranked against it.
similarity_essays_to_prompt(doc_term_mtx_tfidf, cosine_similarity)

[(3, 0.4589585009379219),
 (5, 0.22227805596964267),
 (4, 0.20122636625176427),
 (1, 0.19135359762819126),
 (2, 0.14588618973159329)]

# TextBlob Spellcheck

In [97]:
# Wrap the prompt at label 0 and a single essay in TextBlob objects.
tb_prompt = TextBlob(prompts[0])
# tb_essay = TextBlob(essays[1587]) # low score
tb_essay = TextBlob(essays[15])  # high score
# correct() produces a spell-corrected blob; the original is kept for comparison.
tb_correct = tb_essay.correct()
# Show the original and the corrected blob together as a tuple.
tb_essay, tb_correct

(TextBlob("Dear @ORGANIZATION1, The computer blinked to life and an image of a blonde haired girl filled the screen. It was easy to find out how life was in @LOCATION2, thanks to the actual @CAPS1 girl explaining it. Going to the library wouldn't have filled one with this priceless information and human interection. Computers are a nessessity of life if soceity wishes to grow and expand. They should be supported because they teach hand eye coordination, give people the ability to learn about faraway places, and allow people to talk to others online. Firstly, computers help teach hand eye coordination. Hand-eye coordination is a useful ability that is usod to excel in sports. In a recent survey, @PERCENT1 of kids felt their hand eye coordination improves after computer use. Even a simple thing like tying can build up this skill. Famous neurologist @CAPS2 @PERSON1 stated in an article last week that, "@CAPS3 and computer strength the @CAPS2. When on the computer, you automatically proces

In [98]:
# Lengths of the corrected vs. original blob (presumably character counts).
len(tb_correct), len(tb_essay)

(3162, 3168)

In [99]:
# Count-based similarity between the spell-corrected essay and the original.
# With only two documents, the corrected text occupies the "prompt" row (row 0),
# so the single score returned is corrected-vs-original similarity.
cv = CountVectorizer()
X_c = cv.fit_transform([str(tb_correct), str(tb_essay)])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
try:
    cv_terms = cv.get_feature_names_out()
except AttributeError:
    cv_terms = cv.get_feature_names()
doc_term_mtx_c = pd.DataFrame(X_c.toarray(), columns=cv_terms)

similarity_essays_to_prompt(doc_term_mtx_c, cosine_similarity)

[(1, 0.9891273653129933)]

In [100]:
# TF-IDF similarity between the spell-corrected essay and the original.
# With only two documents, the corrected text occupies the "prompt" row (row 0),
# so the single score returned is corrected-vs-original similarity.
tv = TfidfVectorizer()
X_t = tv.fit_transform([str(tb_correct), str(tb_essay)])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
try:
    tv_terms = tv.get_feature_names_out()
except AttributeError:
    tv_terms = tv.get_feature_names()
doc_term_mtx_t = pd.DataFrame(X_t.toarray(), columns=tv_terms)

similarity_essays_to_prompt(doc_term_mtx_t, cosine_similarity)

[(1, 0.9813155087358891)]

In [101]:
# Polarity / subjectivity sentiment of the spell-corrected essay.
tb_correct.sentiment

Sentiment(polarity=0.24684049800328872, subjectivity=0.47548911037283115)

In [102]:
# Tokenize the spell-corrected essay; punctuation comes back as separate tokens.
tokens = tb_correct.tokenize()
tokens

WordList(['Dear', '@', 'ORGANIZATION1', ',', 'The', 'computer', 'blinked', 'to', 'life', 'and', 'an', 'image', 'of', 'a', 'blonde', 'haired', 'girl', 'filled', 'the', 'screen', '.', 'It', 'was', 'easy', 'to', 'find', 'out', 'how', 'life', 'was', 'in', '@', 'LOCATION2', ',', 'thanks', 'to', 'the', 'actual', '@', 'CAPS1', 'girl', 'explaining', 'it', '.', 'Going', 'to', 'the', 'library', 'would', "n't", 'have', 'filled', 'one', 'with', 'this', 'priceless', 'information', 'and', 'human', 'interaction', '.', 'Computers', 'are', 'a', 'necessity', 'of', 'life', 'if', 'society', 'wishes', 'to', 'grow', 'and', 'expand', '.', 'They', 'should', 'be', 'supported', 'because', 'they', 'teach', 'hand', 'eye', 'coordination', ',', 'give', 'people', 'the', 'ability', 'to', 'learn', 'about', 'faraway', 'places', ',', 'and', 'allow', 'people', 'to', 'talk', 'to', 'others', 'online', '.', 'Firstly', ',', 'computers', 'help', 'teach', 'hand', 'eye', 'coordination', '.', 'And-eye', 'coordination', 'is', 'a'

In [103]:
# Lemmatize every token (note that the output turns 'was' into 'wa').
tokens.lemmatize()

WordList(['Dear', '@', 'ORGANIZATION1', ',', 'The', 'computer', 'blinked', 'to', 'life', 'and', 'an', 'image', 'of', 'a', 'blonde', 'haired', 'girl', 'filled', 'the', 'screen', '.', 'It', 'wa', 'easy', 'to', 'find', 'out', 'how', 'life', 'wa', 'in', '@', 'LOCATION2', ',', 'thanks', 'to', 'the', 'actual', '@', 'CAPS1', 'girl', 'explaining', 'it', '.', 'Going', 'to', 'the', 'library', 'would', "n't", 'have', 'filled', 'one', 'with', 'this', 'priceless', 'information', 'and', 'human', 'interaction', '.', 'Computers', 'are', 'a', 'necessity', 'of', 'life', 'if', 'society', 'wish', 'to', 'grow', 'and', 'expand', '.', 'They', 'should', 'be', 'supported', 'because', 'they', 'teach', 'hand', 'eye', 'coordination', ',', 'give', 'people', 'the', 'ability', 'to', 'learn', 'about', 'faraway', 'place', ',', 'and', 'allow', 'people', 'to', 'talk', 'to', 'others', 'online', '.', 'Firstly', ',', 'computer', 'help', 'teach', 'hand', 'eye', 'coordination', '.', 'And-eye', 'coordination', 'is', 'a', 'use

In [104]:
def spelling_confidence(wordlist, agg_func=np.max):
    """Average per-word spellcheck confidence over ``wordlist``.

    Parameters
    ----------
    wordlist : iterable
        Items exposing ``spellcheck()``, which returns ``(candidate, confidence)``
        pairs for that word.
    agg_func : callable, default ``np.max``
        Collapses one word's list of confidences into a single number.

    Returns
    -------
    float
        Mean of the aggregated confidences, or ``nan`` when ``wordlist`` is
        empty (this case used to raise ``ZeroDivisionError``).
    """
    # Materialise once so generators are accepted and the divisor always
    # matches the number of words that were actually scored.
    per_word = [
        agg_func([conf for _, conf in word.spellcheck()])
        for word in wordlist
    ]
    if not per_word:
        return float('nan')
    return sum(per_word) / len(per_word)

# Mean best-candidate spellcheck confidence over the corrected essay's tokens.
spelling_confidence(tokens)

0.9539807998465871

In [96]:
# Restrict to essay set 1, then split off the lowest- and highest-scoring essays.
set1 = train.loc[train['essay_set'].eq(1)]
low = set1.loc[set1['domain1_score'].lt(4)]
high = set1.loc[set1['domain1_score'].gt(10)]

high

Unnamed: 0,essay_id,essay_set,essay,domain1_score,domain2_score,domain1_percent,domain2_percent
15,16,1,"Dear @ORGANIZATION1, The computer blinked to l...",12,,100.0,
23,24,1,"Dear local newspaper, I've heard that not many...",11,,90.0,
36,37,1,"Dear @ORGANIZATION1, @CAPS1 has been brought t...",12,,100.0,
47,48,1,Have you ever had the time completely fly by w...,11,,90.0,
69,70,1,"Dear @CAPS1 @CAPS2, @CAPS3 life without comput...",12,,100.0,
91,92,1,Dear local Newspaper; @CAPS1 you know that com...,11,,90.0,
92,93,1,"Readers of ""@LOCATION2 @CAPS1"" should be well ...",11,,90.0,
97,98,1,"Dear @ORGANIZATION1, In this new digital age, ...",11,,90.0,
105,106,1,"Dear Newspaper @CAPS1, I firmly believe that c...",11,,90.0,
110,111,1,"@CAPS1 judgment comes from expirience, and exp...",11,,90.0,


In [56]:
# Print each low-scoring essay under its index label, followed by a 50-dash rule.
for idx, essay in low['essay'].items():
    print(idx, essay, '-'*50, sep='\n')

21
Dear local Newspaper @CAPS1 a take all your computer and given to the people around the world for the can stay in their houses chating with their family and friend. Computers help people around the world to connect with other people computer help kids do their homework and look up staff that happen around the world.
--------------------------------------------------
40
I think computers are good because you can talk to your friends and family on the computers. People needs computers to look for a job. Some people spend to much time on the computers then on homework people need to stop.
--------------------------------------------------
356
Being active has no limit, but technology does.
--------------------------------------------------
446
Computers are good because people can find what they need and their fun ask things find places get things for a good price.
--------------------------------------------------
542
I think that computers are amazing. Computers are an amazing way to

In [57]:
# Full record for index label 542, one of the low-scoring essays printed above.
train.loc[542]

essay_id                                                         545
essay_set                                                          1
essay              I think that computers are amazing. Computers ...
domain1_score                                                      2
domain2_score                                                    NaN
domain1_percent                                                    0
domain2_percent                                                  NaN
Name: 542, dtype: object