In [2]:
# Import Libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

# Import other functions
from process.averageWordLength import averageWordLength
from process.misspellings import nmisspelled
from process.wordcount import length
from process.averageSentenceLength import averageSentenceLength
from process.grammarchecker import grammarCheck
from process.keywords import keyWords
from process.sentcount import sentcount

In [3]:
# Load the essay table and discard the 'Average Word Length' column that is
# stored in the file (a column of that name is assigned again in a later cell).
# NOTE(review): this path points into one user's home directory, while other
# cells use the relative 'data/...' form - confirm which one is intended.
df = pd.read_feather('~/Documents/GitHub/AutomatedEssayGrader/essaygrader/data/essays.feather')
df = df.drop(columns=['Average Word Length'])
# Bare expression: the cell output lists the remaining column names.
df.columns

Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6', 'cleaned_essay',
       'cleaned_essay_no_sw'],
      dtype='object')

In [7]:
# Clean essays: read the table from the relative data path, then store in
# 'cleaned_essay' the tokens produced by a regexp tokenizer built on r'\w+'.
# NOTE(review): this rebinds `df` to a fresh read, so anything done to the
# previously loaded frame is not carried over.
df = pd.read_feather('data/essays.feather')
tokenizer = RegexpTokenizer(r'\w+')
# The bound method is passed directly; no wrapping lambda is needed.
df['cleaned_essay'] = df['essay'].apply(tokenizer.tokenize)

In [4]:
# Removing filler words
def filler(word):
    """Report whether ``word`` is an upper-case token that contains a digit.

    Returns ``True`` only when ``word.isupper()`` holds and at least one of
    its characters is a digit (for example ``'CAPS1'``); ``False`` otherwise.
    """
    return word.isupper() and any(ch.isdigit() for ch in word)

# For every row, keep only the tokens that filler() does not flag.
df['cleaned_essay2'] = df['cleaned_essay'].apply(
    lambda tokens: [tok for tok in tokens if not filler(tok)]
)

In [5]:
# Re-read the essay table only when there is no working frame yet, or when it
# lacks the derived 'cleaned_essay2' column. An unconditional reload at this
# point rebinds `df` to the file contents and throws away the columns built by
# the cells above, which the stop-word cell that follows reads.
# NOTE(review): the path points into one user's home directory - consider a
# shared, configurable data directory instead.
if 'df' not in globals() or 'cleaned_essay2' not in df.columns:
    df = pd.read_feather('~/Documents/GitHub/AutomatedEssayGrader/essaygrader/data/essays.feather')

In [5]:
# remove stop words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
# Build the English stop-word set once, then drop every token whose
# lower-cased form is in that set.
stop_words = set(stopwords.words('english'))
df['cleaned_essay2_no_sw'] = df['cleaned_essay2'].apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in stop_words]
)

# Bare expression: show the new column as the cell output.
df['cleaned_essay2_no_sw']

0        [Dear, local, newspaper, think, effects, compu...
1        [Dear, believe, using, computers, benefit, us,...
2        [Dear, people, use, computers, everyone, agree...
3        [Dear, Local, Newspaper, found, many, experts,...
4        [Dear, know, computers, positive, effect, peop...
                               ...                        
12971    [stories, mothers, daughters, either, enemies,...
12972    [never, understood, meaning, laughter, shortes...
12973    [laugh, habit, cause, causes, laughing, even, ...
12974    [Trippin, fences, years, young, short, years, ...
12975    [Many, people, believe, laughter, improve, lif...
Name: cleaned_essay2_no_sw, Length: 12976, dtype: object

In [6]:
# Making new features
# Apply averageWordLength (imported from process.averageWordLength) to each
# row's 'cleaned_essay2' token list and store the result per essay.
df['Average Word Length'] = df['cleaned_essay2'].apply(averageWordLength)

In [None]:
# df['misspelled'] = df['cleaned_essay2'].swifter.apply(nmisspelled)

In [7]:
# NOTE(review): this definition rebinds `length`, a name the imports cell also
# brings in from process.wordcount; once this cell runs, the local version is
# the one in use. Confirm which of the two is intended and drop the other.
def length(essay):
    """Return ``len(essay)``."""
    return len(essay)
# One value per essay: the number of items in its 'cleaned_essay2' list.
df['word_count'] = df['cleaned_essay2'].apply(length)

In [8]:
# Computed from the raw 'essay' text rather than the token lists, using
# averageSentenceLength (imported from process.averageSentenceLength).
df['average_sentence_length'] = df['essay'].apply(averageSentenceLength)

In [45]:
# Apply grammarCheck (imported from process.grammarchecker) to each raw essay
# and keep its return value in 'grammar_errors'.
# NOTE(review): this cell's execution count (45) is far out of sequence with
# its neighbours - re-run the notebook top to bottom before trusting the output.
df['grammar_errors'] = df['essay'].apply(grammarCheck)

In [None]:
# df['key_words_count'] = df['cleaned_essay2'].apply(keyWords)

In [9]:
# Apply sentcount (imported from process.sentcount) to each raw essay and store
# the result in a column of the same name.
df['sentcount'] = df['essay'].apply(sentcount)

In [10]:
# To feather
# Write the frame, with the feature columns added by the cells above, to a
# Feather file; the path is resolved against the current working directory.
df.to_feather('data/essaysPrelim.feather')