In [7]:
# Import Libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

# Import other functions
from process.averageWordLength import averageWordLength
from process.misspellings import nmisspelled
from process.wordcount import length
from process.averageSentenceLength import averageSentenceLength
from process.grammarchecker import grammarCheck
from process.keywords import keyWords
from process.sentcount import sentcount
from process.vocabulary import VocabCounter
from process.stopwords import stopWords

In [7]:
# clean essays
df = pd.read_feather('data/essays.feather')
tokenizer = RegexpTokenizer(r'\w+')
df['cleaned_essay'] = df['essay'].apply(lambda x: tokenizer.tokenize(x))

In [5]:
# Removing filler words
def filler(word):
    if (word.isupper() == True) and (any([char.isdigit() for char in word])):
        return True
    return False

df['cleaned_essay2'] = df['cleaned_essay'].apply(lambda i : [x for x in i if not filler(x)])

In [6]:
# remove stop words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
stop_words = set(stopwords.words('english'))
df['cleaned_essay2_no_sw'] = df['cleaned_essay2'].apply(lambda x : [w for w in x if not w.lower() in stop_words])

df['cleaned_essay2_no_sw']

0        [Dear, local, newspaper, think, effects, compu...
1        [Dear, believe, using, computers, benefit, us,...
2        [Dear, people, use, computers, everyone, agree...
3        [Dear, Local, Newspaper, found, many, experts,...
4        [Dear, know, computers, positive, effect, peop...
                               ...                        
12971    [stories, mothers, daughters, either, enemies,...
12972    [never, understood, meaning, laughter, shortes...
12973    [laugh, habit, cause, causes, laughing, even, ...
12974    [Trippin, fences, years, young, short, years, ...
12975    [Many, people, believe, laughter, improve, lif...
Name: cleaned_essay2_no_sw, Length: 12976, dtype: object

In [25]:
# Setting the prompts and normalizing the scores

def prompt(essay_set):
    if essay_set == 1:
        return "Write a letter to your local newspaper in which you state your opinion on the effects computers have on people"
    elif essay_set == 2:
        return "Write a persuasive essay to a newspaper reflecting your vies on censorship in libraries. Do you believe that certain materials, such as books, music, movies, magazines, etc., should be removed from the shelves if they are found offensive?"
    elif essay_set == 3:
        return "Write a response that explains how the features of the setting affect the cyclist"
    elif essay_set == 4:
        return "Write a response that explains why the author concludes the story with this paragraph"
    elif essay_set == 5:
        return "Describe the mood created by the author in the memoir"
    elif essay_set == 6:
        return "Based on the excerpt, describe the obstacles the builders of the Empire State Building faced in attempting to allow dirigibles to dock there"
    elif essay_set == 7:
        return "Do only one of the following: write a story about a time when you were patient OR write a story about a time when someone you know was patient OR write a story in your own way about patience"
    elif essay_set == 8:
        return "Tell a true story in which laughter was one element or part"

df['prompt'] = df['essay_set'].apply(prompt)

def normalizeScores(essay_set, score):
    if essay_set == 1:
        return round((score/12), 2)*100
    elif essay_set == 2:
        return round((score/6), 2)*100
    elif essay_set == 3:
        return round((score/3), 2)*100
    elif essay_set == 4:
        return round((score/3), 2)*100
    elif essay_set == 5:
        return round((score/4), 2)*100
    elif essay_set == 6:
        return round((score/4), 2)*100
    elif essay_set == 7:
        return round((score/30), 2)*100
    elif essay_set == 8:
        return round((score/60), 2)*100
    
df['normalized_score'] = df.apply(lambda x : normalizeScores(x.essay_set, x.domain1_score), axis=1)
df['normalized_score']

0        67.0
1        75.0
2        58.0
3        83.0
4        67.0
         ... 
12971    58.0
12972    53.0
12973    67.0
12974    67.0
12975    67.0
Name: normalized_score, Length: 12976, dtype: float64

Making features

In [7]:
# Making new features

# Average word length
df['Average Word Length'] = df['cleaned_essay2'].apply(averageWordLength)

In [10]:
# Number of misspellings
df['misspelled'] = df['cleaned_essay2'].apply(nmisspelled)

In [None]:
# word count
def length(essay):
    return len(essay)
df['word_count'] = df['cleaned_essay2'].apply(length)

In [12]:
# Average sentence length
df['average_sentence_length'] = df['essay'].apply(averageSentenceLength)

In [45]:
# ------Need to find a faster grammar checker-------
# df['grammar_errors'] = df['essay'].apply(grammarCheck)

In [27]:
# percentage of essay that uses key words
df['key_words_count'] = df.apply(lambda x: keyWords(x.prompt, x.cleaned_essay2_no_sw), axis=1)
df['key_words_count']

0         4.97
1         5.14
2        13.18
3         8.05
4         6.36
         ...  
12971     2.49
12972     2.06
12973     1.46
12974     2.60
12975     0.47
Name: key_words_count, Length: 12976, dtype: float64

In [28]:
# Sentence count
df['sentcount'] = df['essay'].apply(sentcount)

In [8]:
# vocabulary words
def vocabulary(li):
    vocab = VocabCounter()
    return round(vocab.CountVocab(li)*100, 2)
df['vocabulary'] = df['cleaned_essay2'].apply(vocabulary)
df['vocabulary']

0        0.87
1        1.69
2        1.09
3        1.02
4        0.85
         ... 
12971    1.49
12972    0.38
12973    0.77
12974    1.44
12975    1.08
Name: vocabulary, Length: 12976, dtype: float64

In [10]:
# score to quantify imagery/effective word choice of each essay
imagery = pd.read_feather('imagery.feather')
imagery.columns

Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6', 'cleaned_essay',
       'Average Word Length', 'score'],
      dtype='object')

In [11]:
df2 = pd.concat([df, imagery['score']], axis=1)
df2.columns

Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6', 'cleaned_essay',
       'cleaned_essay_no_sw', 'cleaned_essay2', 'cleaned_essay2_no_sw',
       'Average Word Length', 'misspelled', 'word_count',
       'average_sentence_length', 'prompt', 'normalized_score',
       'key_words_count', 'sentcount', 'vocabulary', 'score'],
      dtype='object')

In [13]:
# Percent stop words
df2['percent_stop_words'] = df['cleaned_essay2'].apply(stopWords)
df2['percent_stop_words']

0        53.33
1        48.18
2        53.26
3        46.63
4        53.09
         ...  
12971    55.20
12972    53.89
12973    55.91
12974    58.45
12975    54.21
Name: percent_stop_words, Length: 12976, dtype: float64

In [14]:
# Making grade column - (0 is middle school, 1 is high school)
def grade(essay_set):
    if (essay_set == 1) or (essay_set == 5) or (essay_set == 7):
        return 0
    else:
        return 1

df2['grade'] = df2['essay_set'].apply(grade)
df2.columns

Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6', 'cleaned_essay',
       'cleaned_essay_no_sw', 'cleaned_essay2', 'cleaned_essay2_no_sw',
       'Average Word Length', 'misspelled', 'word_count',
       'average_sentence_length', 'prompt', 'normalized_score',
       'key_words_count', 'sentcount', 'vocabulary', 'score',
       'percent_stop_words', 'grade'],
      dtype='object')

In [18]:
# Making topic column
def source(essay_set):
    if (essay_set == 1) or (essay_set == 2) or (essay_set == 7) or (essay_set == 8):
        return 1
    else:
        return 0
def pne(essay_set):
    if (essay_set == 3) or (essay_set == 4) or (essay_set == 5) or (essay_set == 6):
        return 1
    else:
        return 0

df2['Source Dependent Responses'] = df2['essay_set'].apply(source)
df2['Persuasive/Narrative/Expository'] = df2['essay_set'].apply(pne)
df2.columns

Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6', 'cleaned_essay',
       'cleaned_essay_no_sw', 'cleaned_essay2', 'cleaned_essay2_no_sw',
       'Average Word Length', 'misspelled', 'word_count',
       'average_sentence_length', 'prompt', 'normalized_score',
       'key_words_count', 'sentcount', 'vocabulary', 'score',
       'percent_stop_words', 'grade', 'Source Dependent Responses',
       'Persuasive/Narrative/Expository'],
      dtype='object')

In [19]:
# new dataset
data = df2[['essay_id', 'essay_set', 'grade', 'Source Dependent Responses', 'Persuasive/Narrative/Expository', 'essay', 'cleaned_essay2', 'cleaned_essay2_no_sw','prompt', 'Average Word Length', 'misspelled', 'word_count', 'average_sentence_length', 'key_words_count', 'sentcount', 'vocabulary', 'score', 'percent_stop_words', 'normalized_score']]
data

Unnamed: 0,essay_id,essay_set,grade,Source Dependent Responses,Persuasive/Narrative/Expository,essay,cleaned_essay2,cleaned_essay2_no_sw,prompt,Average Word Length,misspelled,word_count,average_sentence_length,key_words_count,sentcount,vocabulary,score,percent_stop_words,normalized_score
0,1,1,0,1,0,"Dear local newspaper, I think effects computer...","[Dear, local, newspaper, I, think, effects, co...","[Dear, local, newspaper, think, effects, compu...",Write a letter to your local newspaper in whic...,4.179710,11,345,33.800000,4.97,10,0.87,142.767857,53.33,67.0
1,2,1,0,1,0,"Dear @CAPS1 @CAPS2, I believe that using compu...","[Dear, I, believe, that, using, computers, wil...","[Dear, believe, using, computers, benefit, us,...",Write a letter to your local newspaper in whic...,4.273608,16,413,23.277778,5.14,18,1.69,195.047857,48.18,75.0
2,3,1,0,1,0,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...","[Dear, More, and, more, people, use, computers...","[Dear, people, use, computers, everyone, agree...",Write a letter to your local newspaper in whic...,4.293478,2,276,19.928571,13.18,14,1.09,134.204429,53.26,58.0
3,4,1,0,1,0,"Dear Local Newspaper, @CAPS1 I have found that...","[Dear, Local, Newspaper, I, have, found, that,...","[Dear, Local, Newspaper, found, many, experts,...",Write a letter to your local newspaper in whic...,4.652352,24,489,22.782609,8.05,23,1.02,236.191000,46.63,83.0
4,5,1,0,1,0,"Dear @LOCATION1, I know having computers has a...","[Dear, I, know, having, computers, has, a, pos...","[Dear, know, computers, positive, effect, peop...",Write a letter to your local newspaper in whic...,4.313433,13,469,15.500000,6.36,30,0.85,178.948286,53.09,67.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,21626,8,1,1,0,In most stories mothers and daughters are eit...,"[In, most, stories, mothers, and, daughters, a...","[stories, mothers, daughters, either, enemies,...",Tell a true story in which laughter was one el...,3.858911,1,808,38.545455,2.49,22,1.49,314.048286,55.20,58.0
12972,21628,8,1,1,0,I never understood the meaning laughter is th...,"[I, never, understood, the, meaning, laughter,...","[never, understood, meaning, laughter, shortes...",Tell a true story in which laughter was one el...,3.740038,5,527,26.095238,2.06,21,0.38,213.163714,53.89,53.0
12973,21629,8,1,1,0,"When you laugh, is @CAPS5 out of habit, or is ...","[When, you, laugh, is, out, of, habit, or, is,...","[laugh, habit, cause, causes, laughing, even, ...",Tell a true story in which laughter was one el...,4.158098,7,778,31.461538,1.46,26,0.77,209.613286,55.91,67.0
12974,21630,8,1,1,0,Trippin' on fen...,"[Trippin, on, fences, I, am, years, young, and...","[Trippin, fences, years, young, short, years, ...",Tell a true story in which laughter was one el...,3.922662,3,556,16.054054,2.60,37,1.44,174.643286,58.45,67.0


In [20]:
# Dataset to feather
data.to_feather('dataset.feather')

Test set

In [32]:
# Cleaning the test set (still needs to be done)

test = pd.read_csv('~/Documents/GitHub/AutomatedEssayGrader/essaygrader/data/test_set.tsv', sep='\t', encoding='ISO-8859-1')
test.to_feather('data/test_set.feather')

In [33]:
test = pd.read_feather('~/Documents/GitHub/AutomatedEssayGrader/essaygrader/data/test_set.feather')

In [38]:
test.columns

Index(['essay_id', 'essay_set', 'essay', 'domain1_predictionid',
       'domain2_predictionid'],
      dtype='object')

In [None]:
# clean essays
tokenizer = RegexpTokenizer(r'\w+')
test['cleaned_essay'] = test['essay'].apply(lambda x: tokenizer.tokenize(x))