In [1]:
# Import Libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

# Import other functions
from process.averageWordLength import averageWordLength
from process.misspellings import nmisspelled
from process.wordcount import length
from process.averageSentenceLength import averageSentenceLength
from process.grammarchecker import grammarCheck
from process.keywords import keyWords
from process.sentcount import sentcount

In [7]:
# Load the essays table and split every essay into word tokens.
# The r'\w+' pattern keeps runs of word characters only, so punctuation
# and whitespace never appear in the token lists.
df = pd.read_feather('data/essays.feather')
tokenizer = RegexpTokenizer(r'\w+')
df['cleaned_essay'] = df['essay'].apply(tokenizer.tokenize)

In [5]:
# Removing filler words
def filler(word):
    """Tell whether ``word`` is an all-uppercase token that contains a digit.

    Such tokens (e.g. ``"CAPS1"``) are treated as filler and removed from
    the essays. NOTE(review): presumably these are the dataset's
    anonymization placeholders -- confirm against the data.

    Returns:
        bool: True when ``word.isupper()`` holds and at least one character
        is a digit, otherwise False.
    """
    # Guard clause: anything that is not fully uppercase can never be filler.
    if not word.isupper():
        return False
    return any(ch.isdigit() for ch in word)

# Keep, in order, only the tokens that `filler` does not flag.
df['cleaned_essay2'] = df['cleaned_essay'].apply(
    lambda tokens: [tok for tok in tokens if not filler(tok)]
)

In [6]:
# remove stop words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
# Build the English stop-word set once, then filter each essay's tokens
# against it. Tokens are lower-cased for the membership test only; the
# surviving tokens keep their original casing.
stop_words = set(stopwords.words('english'))
df['cleaned_essay2_no_sw'] = df['cleaned_essay2'].apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in stop_words]
)

df['cleaned_essay2_no_sw']

0        [Dear, local, newspaper, think, effects, compu...
1        [Dear, believe, using, computers, benefit, us,...
2        [Dear, people, use, computers, everyone, agree...
3        [Dear, Local, Newspaper, found, many, experts,...
4        [Dear, know, computers, positive, effect, peop...
                               ...                        
12971    [stories, mothers, daughters, either, enemies,...
12972    [never, understood, meaning, laughter, shortes...
12973    [laugh, habit, cause, causes, laughing, even, ...
12974    [Trippin, fences, years, young, short, years, ...
12975    [Many, people, believe, laughter, improve, lif...
Name: cleaned_essay2_no_sw, Length: 12976, dtype: object

In [4]:
# Reload the essays table from disk and discard its 'Average Word Length'
# column, then show which columns remain.
# NOTE(review): this rebinds `df`, so any columns added in memory by earlier
# cells (e.g. 'cleaned_essay2') are gone after this cell runs -- confirm the
# intended cell order before a Restart & Run All.
# NOTE(review): absolute home-directory path, whereas other cells use the
# relative 'data/...' form -- verify both point at the same file.
df = (
    pd.read_feather('~/Documents/GitHub/AutomatedEssayGrader/essaygrader/data/essays.feather')
    .drop(columns='Average Word Length')
)
df.columns

Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6', 'cleaned_essay',
       'cleaned_essay_no_sw'],
      dtype='object')

In [5]:
# Re-read the essays table from disk, replacing whatever `df` held in memory.
# NOTE(review): later cells read df['cleaned_essay2'] and
# df['cleaned_essay2_no_sw']; confirm those columns exist in this file,
# otherwise this reload breaks a top-to-bottom run.
df = pd.read_feather('~/Documents/GitHub/AutomatedEssayGrader/essaygrader/data/essays.feather')

In [7]:
# Making new features
# averageWordLength (from process.averageWordLength) is applied to each
# essay's filler-free token list; one value per essay is stored.
df['Average Word Length'] = df['cleaned_essay2'].apply(averageWordLength)

In [10]:
# nmisspelled (from process.misspellings) is applied to each essay's
# filler-free token list.
# NOTE(review): presumably returns a count of misspelled words -- confirm
# against the helper.
df['misspelled'] = df['cleaned_essay2'].apply(nmisspelled)

In [None]:
def length(essay):
    """Return the number of items in ``essay``.

    For a token list this is the token count; for a string it is the
    character count.

    Note: this definition shadows the ``length`` imported from
    ``process.wordcount`` in the notebook's import cell.
    """
    n_items = len(essay)
    return n_items
# Word count per essay: the number of tokens left after filler removal.
df['word_count'] = df['cleaned_essay2'].apply(length)

In [12]:
# averageSentenceLength (from process.averageSentenceLength) receives the raw
# essay text rather than the token list.
df['average_sentence_length'] = df['essay'].apply(averageSentenceLength)

In [45]:
# TODO: find a faster grammar checker before enabling this feature.
# df['grammar_errors'] = df['essay'].apply(grammarCheck)

In [25]:
# Setting the prompts and normalizing the scores

# Essay-set id -> prompt text for that set.
_PROMPT_BY_ESSAY_SET = {
    1: "Write a letter to your local newspaper in which you state your opinion on the effects computers have on people",
    # Fixed typo: "vies" -> "views". This text is fed to keyWords, so the
    # misspelling changed which words could match.
    2: "Write a persuasive essay to a newspaper reflecting your views on censorship in libraries. Do you believe that certain materials, such as books, music, movies, magazines, etc., should be removed from the shelves if they are found offensive?",
    3: "Write a response that explains how the features of the setting affect the cyclist",
    4: "Write a response that explains why the author concludes the story with this paragraph",
    5: "Describe the mood created by the author in the memoir",
    6: "Based on the excerpt, describe the obstacles the builders of the Empire State Building faced in attempting to allow dirigibles to dock there",
    7: "Do only one of the following: write a story about a time when you were patient OR write a story about a time when someone you know was patient OR write a story in your own way about patience",
    8: "Tell a true story in which laughter was one element or part",
}


def prompt(essay_set):
    """Return the prompt text for an essay set.

    Args:
        essay_set: Essay-set identifier (1 through 8).

    Returns:
        str | None: The prompt for ``essay_set``, or None when the
        identifier is not one of the known sets.
    """
    return _PROMPT_BY_ESSAY_SET.get(essay_set)

# Attach the prompt text matching each row's essay set (None for any set
# that `prompt` does not cover).
df['prompt'] = df['essay_set'].apply(prompt)

# Essay-set id -> divisor used to scale that set's score.
# NOTE(review): presumably the maximum attainable domain1 score per set --
# confirm against the dataset description.
_SCORE_DIVISOR_BY_ESSAY_SET = {
    1: 12,
    2: 6,
    3: 3,
    4: 3,
    5: 4,
    6: 4,
    7: 30,
    8: 60,
}


def normalizeScores(essay_set, score):
    """Scale a raw score to a 0-100 value using its essay set's divisor.

    The fraction ``score / divisor`` is rounded to two decimals and then
    multiplied by 100, as before. The product is rounded again so binary
    floating-point noise does not leak into the result (previously
    ``round(7 / 12, 2) * 100`` gave ``57.99999999999999``, not ``58.0``).

    Args:
        essay_set: Essay-set identifier (1 through 8).
        score: Raw score for the essay.

    Returns:
        float | None: The normalized score, or None when ``essay_set`` is
        not one of the known sets.
    """
    divisor = _SCORE_DIVISOR_BY_ESSAY_SET.get(essay_set)
    if divisor is None:
        return None
    # Two-argument round keeps NaN scores as NaN instead of raising.
    return round(round(score / divisor, 2) * 100, 2)
    
# Row-wise: scale each essay's domain1_score according to its essay set.
df['normalized_score'] = df.apply(
    lambda row: normalizeScores(row.essay_set, row.domain1_score),
    axis=1,
)
df['normalized_score']

0        67.0
1        75.0
2        58.0
3        83.0
4        67.0
         ... 
12971    58.0
12972    53.0
12973    67.0
12974    67.0
12975    67.0
Name: normalized_score, Length: 12976, dtype: float64

In [27]:
from tqdm.notebook import tqdm
# Registers DataFrame.progress_apply / Series.progress_apply with pandas.
tqdm.pandas()

# Score each essay's stop-word-free tokens against its prompt with keyWords
# (from process.keywords). progress_apply is used instead of apply so the
# progress bar registered above is actually shown; the results are the same.
df['key_words_count'] = df.progress_apply(
    lambda row: keyWords(row.prompt, row.cleaned_essay2_no_sw),
    axis=1,
)
df['key_words_count']

0         4.97
1         5.14
2        13.18
3         8.05
4         6.36
         ...  
12971     2.49
12972     2.06
12973     1.46
12974     2.60
12975     0.47
Name: key_words_count, Length: 12976, dtype: float64

In [28]:
# sentcount (from process.sentcount) is applied to the raw essay text.
# NOTE(review): presumably returns the number of sentences -- confirm against
# the helper.
df['sentcount'] = df['essay'].apply(sentcount)

In [29]:
# To feather
# Persist the frame, including every column added above, for later stages.
# The path is relative to the notebook's working directory.
df.to_feather('data/prelimData.feather')

In [32]:
# Cleaning the test set

# Convert the tab-separated test set to feather. The TSV is decoded as
# ISO-8859-1.
# NOTE(review): the input is an absolute home-directory path while the output
# is relative to the working directory -- verify both resolve to the same
# data folder.
test = pd.read_csv(
    '~/Documents/GitHub/AutomatedEssayGrader/essaygrader/data/test_set.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)
test.to_feather('data/test_set.feather')

In [33]:
# Read the test set back from its feather copy, rebinding `test`.
# NOTE(review): absolute path; the previous cell wrote to the relative
# 'data/test_set.feather' -- confirm these are the same file.
test = pd.read_feather('~/Documents/GitHub/AutomatedEssayGrader/essaygrader/data/test_set.feather')

In [38]:
# Inspect the test-set schema.
test.columns

Index(['essay_id', 'essay_set', 'essay', 'domain1_predictionid',
       'domain2_predictionid'],
      dtype='object')

In [None]:
# Tokenize the test-set essays the same way as the training essays:
# r'\w+' keeps runs of word characters and drops everything else.
tokenizer = RegexpTokenizer(r'\w+')
test['cleaned_essay'] = test['essay'].apply(tokenizer.tokenize)