In [None]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TweetTokenizer
import enchant
import string
import os
import re
from g2p_en import G2p


In [None]:
# Load the training set and separate the target score from the features.
TrainX = pd.read_csv('training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
# The target is captured first, before any column is removed from the frame.
Trainy = TrainX['domain1_score']
# Remove every column that contains a missing value, then the two per-rater
# score columns.
TrainX = TrainX.dropna(axis=1).drop(columns=['rater1_domain1', 'rater2_domain1'])
# Only the last expression of a cell is rendered, so the random sample is what
# the notebook shows.
TrainX.head()
TrainX.sample()

In [None]:
def load_file(filename, essay_set_id=1):
    """Read the essay dataset and return the essays belonging to one essay set.

    Parameters
    ----------
    filename : str
        Path to a comma-separated file with a header row. It must contain the
        columns ``essay_set`` and ``essay``.
    essay_set_id : int, optional
        Value of the ``essay_set`` column to keep. Defaults to 1, which is the
        set this function previously selected unconditionally.

    Returns
    -------
    pandas.Series
        The ``essay`` column of the matching rows. The row labels from the
        file are preserved (the index is not reset).
    """
    # read_csv with sep="," is the same parser call as read_table with sep=",".
    dataset = pd.read_csv(filename, header=0, sep=",", encoding="unicode_escape")
    return dataset.loc[dataset['essay_set'] == essay_set_id, 'essay']



In [None]:
def word_tokenization(essay_set):
    """Build a word -> integer id vocabulary from the first essay.

    Only ``essay_set[0]`` is tokenized; the other essays are ignored. The
    essay is split with the module-level tokenizer ``tknzr``. Tokens are
    dropped when they consist solely of punctuation, or when the token minus
    its last character appears in the module-level ``proper_pronouns``
    collection.
    NOTE(review): ``tknzr`` and ``proper_pronouns`` are not defined in this
    function; the trailing-character strip presumably targets numbered
    placeholders such as ``@PERSON1`` -- confirm against their definitions.

    Parameters
    ----------
    essay_set : indexable of str
        Collection of essays; the element at key/index ``0`` is used.

    Returns
    -------
    dict
        Maps each distinct lower-cased word to an id. Ids start at 1 and are
        assigned in order of first appearance.
    """
    punctuation_chars = set(string.punctuation)

    word_tokens = {}
    for raw_token in tknzr.tokenize(essay_set[0]):
        # Placeholder check: compare the token without its final character.
        if raw_token[:-1] in proper_pronouns:
            continue
        # Test every character: a substring test against string.punctuation
        # lets multi-character tokens such as "..." or ":)" slip through.
        if all(ch in punctuation_chars for ch in raw_token):
            continue

        word = raw_token.lower()
        if word not in word_tokens:
            word_tokens[word] = len(word_tokens) + 1

    return word_tokens
    


In [None]:
def vocabulary_check(word_tokens):
    """Return the fraction of distinct words accepted by the spell checker.

    Each key of ``word_tokens`` is passed to ``d.check``, where ``d`` is a
    module-level object.
    NOTE(review): ``d`` is not defined in this function; presumably an
    enchant dictionary -- confirm where it is created.

    Parameters
    ----------
    word_tokens : dict
        Vocabulary mapping word -> id; only the keys are used.

    Returns
    -------
    float
        Number of accepted words divided by the vocabulary size, or ``0.0``
        for an empty vocabulary (previously a ``ZeroDivisionError``).
    """
    if not word_tokens:
        return 0.0

    correct = sum(1 for word in word_tokens if d.check(word))
    return correct / len(word_tokens)

In [None]:
def extract_feature_set1(word_tokens):
    """Section 3.3 - "beautiful words" feature set.

    All three features are computed over the distinct words (the keys of
    ``word_tokens``), not over every word occurrence in the essay.
    NOTE(review): relies on the module-level ``d`` (object with ``check``)
    and ``g2p`` (callable returning a sequence of phoneme symbols), neither
    defined here -- confirm where they are created.

    Parameters
    ----------
    word_tokens : dict
        Vocabulary mapping word -> integer id.

    Returns
    -------
    tuple
        ``(beautiful_words, characters, phoneme_dict)`` where

        * ``beautiful_words`` is the list of ids of words with at least six
          characters that ``d.check`` accepts;
        * ``characters`` maps each character to its share of all characters;
        * ``phoneme_dict`` maps each phoneme (digits stripped from the symbol)
          to its share of all phoneme symbols.
    """
    # Beautiful words: long enough and known to the dictionary.
    beautiful_words = [
        token_id
        for word, token_id in word_tokens.items()
        if len(word) >= 6 and d.check(word)
    ]

    # Letter frequencies. The first occurrence counts as 1; starting at 0
    # under-counted every letter by one.
    letter_counts = {}
    total_letters = 0
    for word in word_tokens:
        for ch in word:
            total_letters += 1
            letter_counts[ch] = letter_counts.get(ch, 0) + 1
    characters = {
        ch: count / total_letters for ch, count in letter_counts.items()
    }

    # Phoneme frequencies. Digits are removed from each symbol so that
    # variants differing only by a number share one bucket.
    phoneme_counts = {}
    total_phoneme = 0
    for word in word_tokens:
        for symbol in g2p(word):
            total_phoneme += 1
            phoneme = re.sub(r'\d+', '', symbol)
            phoneme_counts[phoneme] = phoneme_counts.get(phoneme, 0) + 1
    phoneme_dict = {
        phoneme: count / total_phoneme
        for phoneme, count in phoneme_counts.items()
    }

    return beautiful_words, characters, phoneme_dict



In [None]:
def extract_feature_set2(word_tokens, lexicon_path='subjclueslen1-HLTEMNLP05.tff'):
    """Section 3.4 - emotive effectiveness feature set.

    Looks every distinct word up in a subjectivity lexicon and reports, for
    each (subjectivity strength, prior polarity) pair, the share of the
    vocabulary that falls into it.

    The lexicon is read as whitespace-separated ``key=value`` fields, one
    entry per line; the fields ``type``, ``word1``, ``pos1`` and
    ``priorpolarity`` are used. Lines missing any of them are skipped. When a
    word occurs on several lines the last one wins.

    Parameters
    ----------
    word_tokens : dict
        Vocabulary mapping word -> id; only the keys are used.
    lexicon_path : str, optional
        Path of the lexicon file. Defaults to the file name that was
        previously hard-coded.

    Returns
    -------
    tuple of 8 floats
        ``(strong_positive, strong_negative, strong_neutral, strong_both,
        weak_positive, weak_negative, weak_neutral, weak_both)``, each divided
        by ``len(word_tokens)``. All zeros for an empty vocabulary
        (previously a ``ZeroDivisionError``).
    """
    categories = [
        (strength, polarity)
        for strength in ('strongsubj', 'weaksubj')
        for polarity in ('positive', 'negative', 'neutral', 'both')
    ]

    total_words = len(word_tokens)
    if total_words == 0:
        return tuple(0.0 for _ in categories)

    lexicon = {}
    with open(lexicon_path) as f:
        # Use the line yielded by the iterator. Calling f.readline() inside
        # this loop consumed a second line per pass, silently dropping half
        # of the lexicon.
        for line in f:
            fields = dict(
                item.split('=', 1) for item in line.split() if '=' in item
            )
            try:
                word = fields['word1']
                entry = (fields['type'], fields['pos1'], fields['priorpolarity'])
            except KeyError:
                # Blank or malformed line: nothing usable to record.
                continue
            lexicon[word] = entry

    counts = dict.fromkeys(categories, 0)
    for word in word_tokens:
        entry = lexicon.get(word)
        if entry is None:
            continue
        category = (entry[0], entry[2])
        # Unknown strength/polarity labels are ignored.
        if category in counts:
            counts[category] += 1

    return tuple(counts[category] / total_words for category in categories)

In [None]:
def extract_feature_set3(word_tokens, ratings_path='AoA Ratings.csv'):
    """Section 3.5 - learning maturity feature set.

    Reads a CSV ratings file (first row is a header), keeps the rows whose
    first column is a word of the vocabulary and uses column index 4 as that
    word's rating.
    NOTE(review): column 4 is presumably the mean age-of-acquisition rating
    -- confirm against the file's header.

    Rows that are too short or whose rating is not numeric are skipped. When
    a word occurs on several rows the last rating wins.

    Parameters
    ----------
    word_tokens : dict
        Vocabulary mapping word -> integer id.
    ratings_path : str, optional
        Path of the ratings file. Defaults to the file name that was
        previously hard-coded.

    Returns
    -------
    tuple
        ``(avg_maturity, top_tokens)``: the mean rating of the rated
        vocabulary words, and the ids of up to five words with the highest
        ratings, highest first. ``(0.0, [])`` when no vocabulary word has a
        rating (previously a ``ZeroDivisionError``).
    """
    maturity_tokens = {}
    with open(ratings_path, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        # Use the row yielded by the reader. Calling f.readline() inside the
        # loop consumed a second line per pass, dropping every other rating.
        for row in reader:
            if len(row) <= 4 or row[0] not in word_tokens:
                continue
            try:
                maturity_tokens[row[0]] = float(row[4])
            except ValueError:
                # Non-numeric rating: the word cannot contribute.
                continue

    if not maturity_tokens:
        return 0.0, []

    avg_maturity = sum(maturity_tokens.values()) / len(maturity_tokens)
    ranked_words = sorted(maturity_tokens, key=maturity_tokens.get, reverse=True)

    # Report the five most mature words by their vocabulary ids.
    top_tokens = [word_tokens[word] for word in ranked_words[:5]]

    return avg_maturity, top_tokens

In [None]:
# Run the pipeline on the first essay of essay set 1.
# NOTE(review): the functions called here read the module-level names tknzr,
# proper_pronouns, d and g2p, none of which is defined in the visible cells --
# confirm they are created before this cell runs.
essay_set = load_file("training_set.csv")
word_tokens = word_tokenization(essay_set)
correctness = vocabulary_check(word_tokens)

# Section 3.3: beautiful words, letter frequencies, phoneme frequencies.
beautiful_words, character_list, phoneme_dict = extract_feature_set1(word_tokens)

# Section 3.4: share of the vocabulary per subjectivity strength and polarity.
(
    strong_positive,
    strong_negative,
    strong_neutral,
    strong_both,
    weak_positive,
    weak_negative,
    weak_neutral,
    weak_both,
) = extract_feature_set2(word_tokens)

# Section 3.5: average maturity and the ids of the most mature words.
avg_maturity, top_tokens = extract_feature_set3(word_tokens)