## Introduction
Text processing has a long history and is known under a couple of different names including Computational Linguistics and Natural Language Processing.  in this notebook we will look at some text processing options avaiable in the packages

* nltk
* gensim
* scikit-learn

### nltk - The Natural Language Toolkit
nltk is a very popular text processing tool for western languages.  It can be found at the [nltk site](http://www.nltk.org/).



In [6]:
import os
import nltk

nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords


# load training data
BASE_DIR = '../data'
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'SpookyData')

import pandas as pd

# read the training data
df = pd.read_csv(os.path.join(TEXT_DATA_DIR, 'train.csv'))
print(df.shape)
df.head()


[nltk_data] Downloading package stopwords to /home/joe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/joe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
(19579, 3)


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [7]:
# Read the text of the training examples  
corpus = df['text'].tolist()
print(corpus[0])
unique_labels = df['author'].unique().tolist()

This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.


In [8]:
from nltk.tokenize import word_tokenize

print(word_tokenize(corpus[0]))

['This', 'process', ',', 'however', ',', 'afforded', 'me', 'no', 'means', 'of', 'ascertaining', 'the', 'dimensions', 'of', 'my', 'dungeon', ';', 'as', 'I', 'might', 'make', 'its', 'circuit', ',', 'and', 'return', 'to', 'the', 'point', 'whence', 'I', 'set', 'out', ',', 'without', 'being', 'aware', 'of', 'the', 'fact', ';', 'so', 'perfectly', 'uniform', 'seemed', 'the', 'wall', '.']


In [9]:
# Remove english stopwords from the tokenized lists
stops = set(stopwords.words('english'))
modified_corpus = []
for sent in corpus:
    modified_sent = []
    for term in word_tokenize(sent):
        if term not in stops:
            modified_sent.append(term)
    modified_corpus.append(modified_sent)
print(modified_corpus[0])

labels = df['author'].tolist()
print(unique_labels)

labeled_corpus = list(zip(modified_corpus, labels))
print(labeled_corpus[0])

['This', 'process', ',', 'however', ',', 'afforded', 'means', 'ascertaining', 'dimensions', 'dungeon', ';', 'I', 'might', 'make', 'circuit', ',', 'return', 'point', 'whence', 'I', 'set', ',', 'without', 'aware', 'fact', ';', 'perfectly', 'uniform', 'seemed', 'wall', '.']
['EAP', 'HPL', 'MWS']
(['This', 'process', ',', 'however', ',', 'afforded', 'means', 'ascertaining', 'dimensions', 'dungeon', ';', 'I', 'might', 'make', 'circuit', ',', 'return', 'point', 'whence', 'I', 'set', ',', 'without', 'aware', 'fact', ';', 'perfectly', 'uniform', 'seemed', 'wall', '.'], 'EAP')


In [22]:
# create a labeled set of training features.
len(all_words)
# another way to say the same thing...
#all_words = set(word.lower() for passage in modified_corpus for word in passage[0])
all_data = []
for passage in labeled_corpus:
    d = {}
    for term in passage[0]:
        d[term] = True
    all_data.append((d, passage[1]))
    
print(all_data[0])
print(list(all_data[0][0].keys()))

({'however': True, 'means': True, ';': True, 'ascertaining': True, 'set': True, 'might': True, 'afforded': True, 'fact': True, 'wall': True, 'make': True, 'without': True, 'dungeon': True, ',': True, '.': True, 'return': True, 'circuit': True, 'dimensions': True, 'perfectly': True, 'seemed': True, 'uniform': True, 'This': True, 'process': True, 'point': True, 'whence': True, 'aware': True, 'I': True}, 'EAP')
['however', 'means', ';', 'ascertaining', 'set', 'might', 'afforded', 'fact', 'wall', 'make', 'without', 'dungeon', ',', '.', 'return', 'circuit', 'dimensions', 'perfectly', 'seemed', 'uniform', 'This', 'process', 'point', 'whence', 'aware', 'I']


In [19]:
#get a random set of features to use to create negative examples for training.
all_words = set()               # uniques terms
for passage in labeled_corpus:
    for word in passage[0]:
        all_words.add(word)

all_words = list(all_words)
print(all_words[:5])

all_words_idx = [i for i,_ in enumerate(all_words)]

# shuffle the indexes so that we can produce reamdom samples
from random import shuffle

shuffled_word_idxs = [i for i,_ in enumerate(all_words)]
shuffle(shuffled_word_idxs)
print(shuffled_word_idxs[:5])
print([all_words[shuffled_idx] for shuffled_idx in shuffled_word_idxs[:5]])

['refinery', 'humankind', 'epilepsy', 'execution', 'toothed']
[18749, 441, 12128, 5028, 25836]
['stores', 'wherewith', 'admonishing', 'darkening', 'abyss']


In [32]:
# Add some of the shuffled terms as negative examples for each of the data samples.
allw = len(all_words)
idx = 0 # we are going to loop trough the shuffled values.
for passage in all_data:
    sample = list(passage[0].keys())
    j = 0
    #print(sample) 
    while j < len(sample): #  add the same number of negative samples as positive.
        current = all_words[shuffled_word_idxs[idx]]
        #print(current)
        if current not in sample:
            #  add the current term as a negative sample
            passage[0][current] = False
            ## increment j
            j = j+1
        ## increment index counter
        idx = idx+1
        if idx == allw:
            idx = 0 # reset and go around again
                
print(all_data[1])



In [41]:
import math
# separate training and testing data
shuffle(all_data)
train_len = math.ceil(len(all_data)*.8)
train_data = all_data[:train_len]
test_data = all_data[train_len:]
test_data_stripped = list(test[0] for test in test_data)
print(test_data_stripped[0])

{'sagacity': False, 'toiled': False, 'gait': False, 'advancing': True, 'endures': False, 'Whisper': False, 'veneration': False, 'technical': False, 'resumed': True, 'stragglers': True, 'paralyzed': False, 'despotic': False, 'bran': False, 'progress': True, 'Chantilly': False, "'singular": False, 'slopes': False, 'thoroughfare': True, 'hurriedly': True, 'Organ': False, 'handkerchiefs': False, 'inspect': False, 'around': True, 'advised': False, 'got': False, 'harmonies': False, 'band': True, 'I': True, 'Eliot': True, 'attain': False, 'conqueror': False, 'satisfactorily': False, 'mine': False, 'Street': True, 'Bleu': False, 'incurred': False, 'kindred': False, 'stems': False, 'impose': False, 'still': True, 'sheltering': False, 'secured': False, 'felled': False, 'rove': False, 'emigrant': False, 'entombment': False, 'thanks': False, 'Adonis': False, 'darting': True, ';': True, 'When': True, 'corner': True, 'handling': False, 'along': True, 'precise': False, 'impossibilities': False, 'Flag

In [42]:
# learn by NaiveBayes
classifier = nltk.NaiveBayesClassifier.train(train_data)

In [43]:
classifier.show_most_informative_features()

Most Informative Features
                    West = True              HPL : EAP    =     83.8 : 1.0
                 Raymond = True              MWS : HPL    =     77.4 : 1.0
                  misery = True              MWS : EAP    =     71.3 : 1.0
                     Old = True              HPL : EAP    =     41.2 : 1.0
                 outside = True              HPL : MWS    =     40.8 : 1.0
                 curious = True              HPL : EAP    =     40.2 : 1.0
             endeavoured = True              MWS : EAP    =     38.9 : 1.0
             countenance = True              MWS : HPL    =     37.3 : 1.0
                sinister = True              HPL : EAP    =     34.6 : 1.0
                     Her = True              MWS : HPL    =     34.2 : 1.0


In [45]:
preds = [classifier.classify(test) for test in test_data_stripped]
print(preds[0])
print(test_data[0][1])

accuracy = 0.0
len_preds = len(preds)
for i in range(len_preds):
    accuracy += (preds[i] == test_data[i][1])
    
accuracy /= len(preds)
print(accuracy)

HPL
HPL
0.48071519795657724


In [15]:
classifier.labels()

['HPL', 'MWS', 'EAP']

In [16]:
# read the test data
dftest = pd.read_csv(os.path.join(TEXT_DATA_DIR, 'test.csv'))
print(dftest.shape)
dftest.head()

(8392, 2)


Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...
