### Importing Libraries

In [199]:
import numpy as np
import pandas as pd

import nltk 
import num2words

import re
import pickle
import os
import time
import string

In [200]:
from sklearn.model_selection import train_test_split

In [201]:
from nltk import wordpunct_tokenize, word_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag


---

### Loading Dataset

In [255]:
# Load from .csv file with complete dataset
data_essays = pd.read_csv('essays.csv', encoding = "ISO-8859-1")
data_essays['cEXT'] = np.where(data_essays['cEXT']=='y', 1, 0)
data_essays['cNEU'] = np.where(data_essays['cNEU']=='y', 1, 0)
data_essays['cAGR'] = np.where(data_essays['cAGR']=='y', 1, 0)
data_essays['cCON'] = np.where(data_essays['cCON']=='y', 1, 0)
data_essays['cOPN'] = np.where(data_essays['cOPN']=='y', 1, 0)

X_essays = data_essays['TEXT']
y_essays = data_essays[['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']]

data_essays['text length'] = data_essays['TEXT'].apply(len)

labels = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
X_train, X_test, y_train, y_test = train_test_split(X_essays, y_essays, test_size=0.05)

In [256]:
print(X_test)

503     Since I am already on the computer I am going ...
1025    my thoughts, feelings, and sensations?  right ...
2374         Oh man I'm so hungry. I can't wait for th...
873     I really wish people would be more efficient w...
524     I work at a daycare in the mornings on Teusday...
                              ...                        
2405         I'm writing this assignment feeling kind ...
310     This the third time that I am typing this thin...
121     Okay. I'm in the stupid SMURF lab. there are s...
70      College seems very stressful. It's not like hi...
1622          I am a little irritated right now. I jus...
Name: TEXT, Length: 124, dtype: object


---

### Preprocessing Data

#### Lower Casing

In [236]:
def convert_lower_case(data):
    data = data.str.lower()    

In [181]:
X_test = convert_lower_case(X_test)

#### Removing Stopwords

In [237]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

#### Removing Punctuation

In [262]:
punctuations = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

#### Cleaning Text

In [240]:
def clean_text(data):
    data = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", data)
    data = re.sub(r"what's", "what is ", data)
    data = re.sub(r"\'s", " ", data)
    data = re.sub(r"\'ve", " have ", data)
    data = re.sub(r"can't", "cannot ", data)
    data = re.sub(r"n't", " not ", data)
    data = re.sub(r"I'm", "I am ", data)
    data = re.sub(r"\'re", " are ", data)
    data = re.sub(r"\'d", " would ", data)
    data = re.sub(r"\'ll", " will ", data)
    
    # Convert numbers from digits to words
    data = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), data)

    #data = re.sub(r'[^\w\s]', '', data)
    #data = re.sub(r"(\d+)(k)", r"\g<1>000", data)
    return data    

#### Lemmatization

In [241]:
def lemmatize_token(token, tag):
    tag = {
        'N': nltk.corpus.wordnet.NOUN,
        'V': nltk.corpus.wordnet.VERB,
        'R': nltk.corpus.wordnet.ADV,
        'J': nltk.corpus.wordnet.ADJ
    }.get(tag[0], nltk.corpus.wordnet.NOUN)
    return WordNetLemmatizer().lemmatize(token, tag)

#### Preprocessing

In [263]:
def preprocess_text(X):
    """
    Returns a preprocessed version of a full corpus (ie. tokenization and lemmatization using POS taggs)
    """
    #X = ' '.join(X_corpus)
    lemmatized_tokens = []


    # Clean the text
    X = clean_text(X)


    # Break the text into sentences
    for sent in sent_tokenize(X):
        
        # Remove punctuation
        #sent = remove_punctuation(sent)

        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(word_tokenize(sent)):

            # Apply preprocessing to the token
            token = token.lower()
            token = token.strip()
            token = token.strip('_')
            token = token.strip('*')

            # If punctuation or stopword, ignore token and continue
            #if token in set(stopwords.words('english')) or all(char in set(string.punctuation) for char in token):

            # Removing stopwords, punctuation and one-letter words
            if token in set(stop_words) or token in punctuations or len(token) == 1:
                continue

            # Lemmatize the token
            lemma = lemmatize_token(token, tag)
            lemmatized_tokens.append(lemma)


    doc = ' '.join(lemmatized_tokens)

    return doc

---

#### Applying Preprocessing on Dataset

In [264]:
numberOfDocuments = len(X_test)
dataset = []
for i in range(numberOfDocuments):
    dataset.append(preprocess_text(X_test.iloc[i]))

In [274]:
print(dataset)



#### Calculating Document Frequency

In [275]:
DF = {}

#Set word as the key and the list of doc id’s as the value
for i in range(numberOfDocuments):
    tokens = dataset[i]

    for w in tokens.split():
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

# Replace the list of docs with its count
for i in DF:
    DF[i] = len(DF[i])            

In [276]:
total_vocab = [x for x in DF]

#### Calculating Term Frequency-Inverse Document Frequency

In [277]:
from collections import Counter

In [278]:
def doc_freq(text, s): 
        if text in s:
            loc = list(s).index(text)
            value_at_index = list(s.values())[loc]
            return value_at_index

In [279]:
tf_idf = {}
N = len(dataset)

for i in range(N):
    tokens = dataset[i].split()
    words_count = len(tokens)
    counter = Counter(tokens)
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token, DF)
        idf = np.log(N/(df+1))
        tf_idf[i, token] = tf*idf

In [280]:
tf_idf

{(0, 'already'): 0.019906477553675007,
 (0, 'also'): 0.0034725478934025853,
 (0, 'always'): 0.0029322375566957835,
 (0, 'anger'): 0.02116479171817996,
 (0, 'anybody'): 0.015530882545522985,
 (0, 'as'): 0.013451574298814448,
 (0, 'assignment'): 0.004358921292579051,
 (0, 'attention'): 0.014055589866283082,
 (0, 'away'): 0.005198046542742138,
 (0, 'back'): 0.004886934807706185,
 (0, 'bargain'): 0.02116479171817996,
 (0, 'become'): 0.00935666303615921,
 (0, 'believe'): 0.004975045968692196,
 (0, 'best'): 0.012403729773957052,
 (0, 'big'): 0.005802062110210772,
 (0, 'bitch'): 0.01908548347147142,
 (0, 'blow'): 0.013451574298814448,
 (0, 'bob'): 0.052830572376694564,
 (0, 'boy'): 0.011185765312768093,
 (0, 'bros'): 0.02116479171817996,
 (0, 'buds'): 0.02116479171817996,
 (0, 'cap'): 0.02116479171817996,
 (0, 'care'): 0.007277354789450675,
 (0, 'carry'): 0.017610190792231518,
 (0, 'civilized'): 0.02116479171817996,
 (0, 'clincher'): 0.02116479171817996,
 (0, 'close'): 0.007277354789450675,
 

#### Document Vectorization

In [281]:
# Document Vectorization
D = np.zeros((N, len(total_vocab)))
for i in tf_idf:
    ind = total_vocab.index(i[1])
    D[i[0]][ind] = tf_idf[i]

    

---