In [1]:
# Download the 'gutenberg' package through NLTK's downloader and expose the
# corpus reader used by the rest of the notebook.
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

# Last expression of the cell: the list of available file ids is shown as the
# cell output.
gutenberg.fileids()

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [2]:
# Download the 'punkt' package (presumably what gutenberg.sents relies on for
# sentence splitting -- TODO confirm).
nltk.download('punkt')
# Author 1 (Austen): Emma + Persuasion form the training sentences.
author1_train = gutenberg.sents("austen-emma.txt") +gutenberg.sents("austen-persuasion.txt")
print(author1_train)
print(len(author1_train))

# Author 1 hold-out work: Sense and Sensibility.
author1_test = gutenberg.sents("austen-sense.txt")
print(author1_test)
print(len(author1_test))
# Author 2 (Shakespeare): Julius Caesar + Hamlet form the training sentences.
author2_train = gutenberg.sents("shakespeare-caesar.txt")+gutenberg.sents("shakespeare-hamlet.txt")
print(author2_train)
print(len(author2_train))
# Author 2 hold-out work: Macbeth.
author2_test = gutenberg.sents("shakespeare-macbeth.txt")
print(author2_test)
print(len(author2_test))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'], ['VOLUME', 'I'], ...]
11464
[['[', 'Sense', 'and', 'Sensibility', 'by', 'Jane', 'Austen', '1811', ']'], ['CHAPTER', '1'], ...]
4999
[['[', 'The', 'Tragedie', 'of', 'Julius', 'Caesar', 'by', 'William', 'Shakespeare', '1599', ']'], ['Actus', 'Primus', '.'], ...]
5269
[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ...]
1907


In [3]:
def statistics(gutenberg_data):
    """Print rounded summary ratios for each Gutenberg file id.

    For every file id one line is printed containing, in order:
    raw character count divided by token count, token count divided by
    sentence count, token count divided by the number of distinct
    lower-cased tokens, and the file id itself.

    Args:
        gutenberg_data: iterable of file ids understood by the ``gutenberg``
            corpus reader.
    """
    for work in gutenberg_data:
        # Ask the reader for the token view once and reuse it for both the
        # token count and the vocabulary instead of requesting it twice.
        words = gutenberg.words(work)
        num_chars = len(gutenberg.raw(work))
        num_words = len(words)
        num_sents = len(gutenberg.sents(work))
        num_vocab = len({w.lower() for w in words})

        print(round(num_chars / num_words),
              round(num_words / num_sents),
              round(num_words / num_vocab),
              work)
# The six works used in this notebook: three by Austen, three by Shakespeare.
gutenberg_data = ['austen-emma.txt','austen-persuasion.txt','austen-sense.txt','shakespeare-caesar.txt','shakespeare-hamlet.txt','shakespeare-macbeth.txt']
statistics(gutenberg_data)

5 25 26 austen-emma.txt
5 26 17 austen-persuasion.txt
5 28 22 austen-sense.txt
4 12 9 shakespeare-caesar.txt
4 12 8 shakespeare-hamlet.txt
4 12 7 shakespeare-macbeth.txt


In [4]:
import random
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit

# Pair every training sentence with its author label.
all_sents = [(sent,"austen") for sent in author1_train]
all_sents += [(sent,"shakespeare") for sent in author2_train]
print(f"Dataset size = {str(len(all_sents))} sentences")

# Labels only, used as the stratification target.
values =[author for (sent, author) in all_sents]
# One shuffled split, 20% held out as the "pretest" set, fixed seed.
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
strat_train_set = []
strat_pretest_set = []
# n_splits=1, so this loop body runs once and fills both lists by index.
for train_index, pretest_index in split.split(all_sents, values):
    strat_train_set = [all_sents[index] for index in train_index]
    strat_pretest_set = [all_sents[index] for index in pretest_index]
print(len(strat_train_set))
print(len(strat_pretest_set))

Dataset size = 16733 sentences
13386
3347


In [22]:
# Labelled hold-out set built from the works kept out of training:
# author1_test sentences tagged "austen", author2_test tagged "shakespeare".
test_set = [(sent, "austen") for sent in author1_test]
test_set += [(sent, "shakespeare")
for sent in author2_test]
print(len(test_set))

6906


In [5]:
# Labelled hold-out set: author1_test sentences tagged "austen" followed by
# author2_test sentences tagged "shakespeare".
# NOTE(review): the cell numbered In [22] builds the same list; one of the two
# constructions is redundant.
test_set = [(sent,"austen") for sent in author1_test]
test_set += [(sent,"shakespeare") for sent in author2_test]
def cat_proportions(data, cat):
    """Return the fraction of items in ``data`` labelled ``cat``.

    Args:
        data: sequence of items whose element at index 1 is the label.
        cat: label value to count.

    Returns:
        The share of matching items as a float. An empty ``data`` yields
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total = len(data)
    if total == 0:
        return 0.0
    matching = sum(1 for item in data if item[1] == cat)
    return float(matching) / float(total)

# Compare the class balance of every split in a small fixed-width text table.
categories = ["austen", "shakespeare"]
rows = []
# The fourth column reports strat_pretest_set, so its header reads "pretest".
rows.append(["Category", "Overall", "Stratified train", "Stratified pretest", "Test"])
for cat in categories:
    rows.append([cat,
                 f"{cat_proportions(all_sents, cat):.6f}",
                 f"{cat_proportions(strat_train_set, cat):.6f}",
                 f"{cat_proportions(strat_pretest_set, cat):.6f}",
                 f"{cat_proportions(test_set, cat):.6f}"])
columns = zip(*rows)
column_widths = [max(len(item) for item in col) for col in columns]
# Pad each cell to its column width AND join with a two-space gutter: padding
# alone leaves the widest cell of a column touching the next column.
for row in rows:
    print("  ".join(f"{cell:<{width}}" for cell, width in zip(row, column_widths)).rstrip())


Category   Overall Stratified trainStratified presetTest    
austen     0.6851130.685119        0.685091         0.723863
shakespeare0.3148870.314881        0.314909         0.276137


In [23]:
def get_features(text):
    """Turn one tokenised sentence into a token-presence feature map.

    Args:
        text: iterable of tokens.

    Returns:
        A dict with one key per distinct token, each mapped to ``True``.
        Keys appear in order of first occurrence.
    """
    return {token: True for token in text}
# Convert every (sentence, label) pair of each split into a
# (feature dict, label) pair.
train_features = [(get_features(sents),label) for (sents,label) in strat_train_set]
pretest_features =[(get_features(sents),label) for (sents, label) in strat_pretest_set]
test_features = [(get_features(sents),label) for (sents, label) in test_set]
print(len(train_features))
# Show the feature dicts of two training sentences as a sanity check.
print(train_features[0][0])
print(train_features[112][0])

13386
{'Why': True, 'aske': True, 'you': True, 'this': True, '?': True}
{'Bru': True, '.': True}


In [24]:
from nltk import NaiveBayesClassifier, classify

print(f"Training Set size ={str(len(train_features))} sentences.")
print(f"Pretest Set size ={str(len(pretest_features))} sentences.")
# Fit NLTK's Naive Bayes classifier on the token-presence features.
classifier = NaiveBayesClassifier.train(train_features)

# Accuracy on the data the model was trained on, then on the stratified
# pretest split.
print(f"Accuracy on the training set ={str(classify.accuracy(classifier,train_features))}")
print(f"Accuracy on the pretest set ={str(classify.accuracy(classifier,pretest_features))}")

# NOTE(review): test_features is rebuilt here with the same expression the
# feature-extraction cell already uses.
test_features = [(get_features(sents),label) for (sents, label) in test_set]
print(f"Accuracy on the test set ={str(classify.accuracy(classifier,test_features))}")
# List the 10 features the classifier reports as most informative.
classifier.show_most_informative_features(10)

Training Set size =13386 sentences.
Pretest Set size =3347 sentences.
Accuracy on the training set =0.9783355744808009
Accuracy on the pretest set =0.9611592470869436
Accuracy on the test set =0.8964668404286128
Most Informative Features
                    King = True           shakes : austen =    202.3 : 1.0
                    thou = True           shakes : austen =    192.2 : 1.0
                    been = True           austen : shakes =    154.5 : 1.0
                    only = True           austen : shakes =    125.2 : 1.0
                     own = True           austen : shakes =    105.6 : 1.0
                       d = True           shakes : austen =     65.2 : 1.0
                    doth = True           shakes : austen =     60.2 : 1.0
                   quite = True           austen : shakes =     52.1 : 1.0
                     Tis = True           shakes : austen =     50.0 : 1.0
                    Lord = True           shakes : austen =     48.9 : 1.0


In [7]:
def avg_chars(text):
    """Return the mean token length, in characters, of a tokenised sentence.

    Args:
        text: sequence of string tokens.

    Returns:
        Total characters over all tokens divided by the number of tokens,
        as a float. An empty sentence yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    token_count = len(text)
    if token_count == 0:
        return 0.0
    total_chars = sum(len(word) for word in text)
    return float(total_chars) / float(token_count)

def number_words(text):
    """Return how many tokens ``text`` holds, expressed as a float."""
    token_count = len(text)
    return float(token_count)

# Quick check of both helpers on one seven-token sentence.
print(avg_chars(["Not","so","happy",",","yet","much","happyer"]))
print(number_words(["Not","so","happy",",","yet","much","happyer"]))

3.5714285714285716
7.0


In [25]:
def initialize_dataset(source):
    """Build numeric features and integer targets from labelled sentences.

    Each sentence contributes the feature row
    ``[avg_chars(sent), number_words(sent)]``; the label "austen" maps to
    target 0 and any other label to 1.

    NOTE: a two-argument function with the same name is defined further down
    in this notebook and replaces this one once that cell has run.

    Args:
        source: iterable of ``(sentence, label)`` pairs.

    Returns:
        A ``(all_features, targets)`` tuple of two parallel lists.
    """
    all_features, targets = [], []
    for sentence, author in source:
        all_features.append([avg_chars(sentence), number_words(sentence)])
        targets.append(0 if author == "austen" else 1)
    return all_features, targets
# Feature rows and integer targets for each of the three splits.
train_data, train_targets = initialize_dataset(strat_train_set)
pretest_data, pretest_targets = initialize_dataset(strat_pretest_set)
test_data, test_targets = initialize_dataset(test_set)

# Each pair of numbers should match: one target per feature row.
print(len(train_data), len(train_targets))
print(len(pretest_data), len(pretest_targets))
print(len(test_data), len(test_targets))

13386 13386
3347 3347
6906 6906


In [26]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, fitted on the current train_data /
# train_targets, then applied to the pretest rows.
text_clf = DecisionTreeClassifier(random_state=42)
text_clf.fit(train_data,train_targets)
predicted = text_clf.predict(pretest_data)


In [27]:
import numpy as np
from sklearn import metrics

def evaluate(predicted, targets):
    """Print accuracy, confusion matrix and classification report.

    Args:
        predicted: predicted labels, one per sample.
        targets: true labels, in the same order as ``predicted``.
    """
    # Coerce both sides to arrays so ``==`` is always element-wise. Comparing
    # two plain lists would give a single bool and a meaningless mean.
    predicted = np.asarray(predicted)
    targets = np.asarray(targets)
    print(np.mean(predicted == targets))
    print(metrics.confusion_matrix(targets, predicted))
    print(metrics.classification_report(targets, predicted))
# Report on the pretest predictions, then predict and report on the
# hold-out test rows with the same fitted tree.
evaluate(predicted,pretest_targets)
t_predicted =text_clf.predict(test_data)
evaluate(t_predicted,test_targets)

0.804302360322677
[[2148  145]
 [ 510  544]]
              precision    recall  f1-score   support

           0       0.81      0.94      0.87      2293
           1       0.79      0.52      0.62      1054

    accuracy                           0.80      3347
   macro avg       0.80      0.73      0.75      3347
weighted avg       0.80      0.80      0.79      3347

0.8036490008688097
[[4576  423]
 [ 933  974]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      4999
           1       0.70      0.51      0.59      1907

    accuracy                           0.80      6906
   macro avg       0.76      0.71      0.73      6906
weighted avg       0.79      0.80      0.79      6906



In [28]:
def word_counts(text):
    """Count how often each token occurs in ``text``, ignoring case.

    Args:
        text: iterable of string tokens.

    Returns:
        A dict mapping each lower-cased token to its number of occurrences.
    """
    tally = {}
    for token in text:
        key = token.lower()
        if key in tally:
            tally[key] += 1
        else:
            tally[key] = 1
    return tally
def proportion_words(text, wordlist):
    """Return the share of tokens in ``text`` whose lower-cased form is in ``wordlist``.

    Args:
        text: sequence of string tokens.
        wordlist: container of lower-case words supporting ``in``.

    Returns:
        Matching tokens divided by total tokens, as a float. An empty
        ``text`` yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total = len(text)
    if total == 0:
        return 0.0
    hits = sum(1 for word in text if word.lower() in wordlist)
    return float(hits) / float(total)

In [30]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_md')

def intitalize_dataset_sp(source):
    """Build feature rows with length statistics and stop-word counts.

    Each row contains, in order: ``avg_chars(sent)``, ``number_words(sent)``,
    one count per stop word, and the share of tokens that are stop words.
    The label "austen" maps to target 0 and any other label to 1.

    Args:
        source: iterable of ``(sentence, label)`` pairs.

    Returns:
        A ``(all_features, targets)`` tuple of two parallel lists.
    """
    # STOP_WORDS is iterated as a set of strings, whose order can change from
    # one interpreter session to the next. Sorting pins the column order so
    # the feature layout (and a seeded model fitted on it) is reproducible.
    stop_words = sorted(STOP_WORDS)
    all_features = []
    targets = []
    for sent, label in source:
        feature_list = [avg_chars(sent), number_words(sent)]
        counts = word_counts(sent)
        # A stop word absent from the sentence contributes a 0 column.
        feature_list.extend(counts.get(word, 0) for word in stop_words)
        feature_list.append(proportion_words(sent, STOP_WORDS))
        all_features.append(feature_list)
        targets.append(0 if label == "austen" else 1)
    return all_features, targets
# Rebuild the three datasets with the stop-word features; this rebinds the
# train_data / pretest_data / test_data names.
train_data, train_targets = intitalize_dataset_sp(strat_train_set)
pretest_data, pretest_targets = intitalize_dataset_sp(strat_pretest_set)
test_data, test_targets = intitalize_dataset_sp(test_set)

# Each pair of numbers should match: one target per feature row.
print(len(train_data), len(train_targets))
print(len(pretest_data), len(pretest_targets))
print(len(test_data), len(test_targets))

13386 13386
3347 3347
6906 6906


In [31]:
# Fresh seeded decision tree fitted on the current train_data, evaluated on
# the pretest split first.
text_clf = DecisionTreeClassifier(random_state=42)
text_clf.fit(train_data, train_targets)
predicted = text_clf.predict(pretest_data)
evaluate(predicted, pretest_targets)

# Same fitted tree, now on the hold-out test rows (rebinds `predicted`).
predicted = text_clf.predict(test_data)
evaluate(predicted, test_targets)

0.8078876605915746
[[1974  319]
 [ 324  730]]
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      2293
           1       0.70      0.69      0.69      1054

    accuracy                           0.81      3347
   macro avg       0.78      0.78      0.78      3347
weighted avg       0.81      0.81      0.81      3347

0.8231972198088618
[[4352  647]
 [ 574 1333]]
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      4999
           1       0.67      0.70      0.69      1907

    accuracy                           0.82      6906
   macro avg       0.78      0.78      0.78      6906
weighted avg       0.83      0.82      0.82      6906



In [33]:
def preprocess(source):
    """Run the ``nlp`` pipeline over every sentence and index results by text.

    Tokens of each sentence are joined with single spaces; that string is
    both the input to ``nlp`` and the dictionary key of the result. A
    progress line is printed each time the running index reaches a multiple
    of 2000, and a final line once the whole source has been consumed.

    Args:
        source: iterable of ``(sentence, label)`` pairs; the label is unused.

    Returns:
        A dict mapping the space-joined sentence text to the ``nlp`` result.
    """
    source_docs = {}
    for index, (sent, _label) in enumerate(source):
        text = " ".join(sent)
        # Repeated sentences share one key, so parsing a repeat would only
        # overwrite the entry with a parse of the same text. Skip the
        # redundant call (assumes nlp gives an equivalent result for
        # identical input).
        if text not in source_docs:
            source_docs[text] = nlp(text)
        if index > 0 and (index % 2000) == 0:
            print(str(index) + "texts processed")
    print("Dataset processed")
    return source_docs
# Parse every split once up front; later feature extraction looks sentences
# up in these dicts by their space-joined text.
train_docs = preprocess(strat_train_set)
pretest_docs = preprocess(strat_pretest_set)
test_docs = preprocess(test_set)
# NOTE(review): this prints the entire text -> parse mapping for the training
# split; consider showing only its size or a small sample.
print(train_docs)

2000texts processed
4000texts processed
6000texts processed
8000texts processed
10000texts processed
12000texts processed
Dataset processed
2000texts processed
Dataset processed
2000texts processed
4000texts processed
6000texts processed
Dataset processed


In [38]:
from collections import Counter
# Single-letter tag groups; pos_counts matches these against the first
# character of each token's tag_.
pos_list = ["C", "D", "E", "F", "I", "J", "M","N", "P", "R", "T", "U", "V", "W"]
def pos_counts(text, source_docs, pos_list):
    """Count tokens of one parsed sentence by the first letter of their tag.

    Args:
        text: the tokenised sentence; its space-joined form is the lookup key.
        source_docs: mapping from space-joined sentence text to an iterable
            of tokens exposing a ``tag_`` attribute.
        pos_list: the tag initials to report.

    Returns:
        A dict with one entry per element of ``pos_list`` (in that order)
        giving how many tokens have a tag starting with that letter; 0 when
        none do.

    Raises:
        TypeError: if the sentence is not present in ``source_docs`` (the
            lookup yields ``None``, which cannot be iterated).
    """
    doc = source_docs.get(" ".join(text))
    # Slice with [:1] rather than index with [0]: a token with an empty tag
    # then contributes "" instead of raising IndexError.
    initials = Counter(str(word.tag_)[:1] for word in doc)
    # Counter returns 0 for initials that never occurred.
    return {pos: initials[pos] for pos in pos_list}
def initialize_dataset(source, source_docs):
    """Build feature rows from length, stop-word and tag-initial statistics.

    Each row contains, in order: ``avg_chars(sent)``, ``number_words(sent)``,
    one count per stop word, the share of tokens that are stop words, and
    for every entry of the module-level ``pos_list`` the tag-initial count
    divided by ``len(sent)``. The label "austen" maps to target 0 and any
    other label to 1.

    Args:
        source: iterable of ``(sentence, label)`` pairs.
        source_docs: mapping from space-joined sentence text to its parsed
            form, as consumed by ``pos_counts``.

    Returns:
        A ``(all_features, targets)`` tuple of two parallel lists.
    """
    # STOP_WORDS is iterated as a set of strings, whose order can change from
    # one interpreter session to the next. Sorting pins the column order so
    # the feature layout (and a seeded model fitted on it) is reproducible.
    stop_words = sorted(STOP_WORDS)
    all_features = []
    targets = []
    for sent, label in source:
        feature_list = [avg_chars(sent), number_words(sent)]
        counts = word_counts(sent)
        # A stop word absent from the sentence contributes a 0 column.
        feature_list.extend(counts.get(word, 0) for word in stop_words)
        feature_list.append(proportion_words(sent, STOP_WORDS))
        p_counts = pos_counts(sent, source_docs, pos_list)
        # Tag counts are normalised by the length of the token list in
        # `sent`, not by the number of tokens in the parsed document.
        sent_length = float(len(sent))
        feature_list.extend(float(p_counts[pos] / sent_length) for pos in p_counts)
        all_features.append(feature_list)
        targets.append(0 if label == "austen" else 1)
    return all_features, targets
            
        

In [39]:
def run():
    """Build the full feature sets, fit a decision tree and report results.

    Uses the module-level splits and their parsed documents. Prints the size
    of every dataset, a blank line, then the evaluation of the fitted tree
    on the pretest split followed by the test split.
    """
    splits = [
        (strat_train_set, train_docs),
        (strat_pretest_set, pretest_docs),
        (test_set, test_docs),
    ]
    # Order matters: train, pretest, test.
    datasets = [initialize_dataset(sentences, docs) for sentences, docs in splits]
    for data, targets in datasets:
        print (len(data), len(targets))
    print ()

    train_data, train_targets = datasets[0]
    text_clf = DecisionTreeClassifier(random_state=42)
    text_clf.fit(train_data, train_targets)
    # Evaluate on everything except the training split.
    for data, targets in datasets[1:]:
        evaluate(text_clf.predict(data), targets)
run()

13386 13386
3347 3347
6906 6906

0.8267104870032865
[[2017  276]
 [ 304  750]]
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      2293
           1       0.73      0.71      0.72      1054

    accuracy                           0.83      3347
   macro avg       0.80      0.80      0.80      3347
weighted avg       0.83      0.83      0.83      3347

0.8252244425137562
[[4404  595]
 [ 612 1295]]
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4999
           1       0.69      0.68      0.68      1907

    accuracy                           0.83      6906
   macro avg       0.78      0.78      0.78      6906
weighted avg       0.82      0.83      0.82      6906

