In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, neighbors, tree
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import RegexpTokenizer
import random
import timeit

In [0]:
nltk.download('gutenberg')
nltk.download('punkt')
books = ['austen-emma.txt', 'bible-kjv.txt', 'bryant-stories.txt', 'melville-moby_dick.txt', 'edgeworth-parents.txt',
             'chesterton-thursday.txt', 'milton-paradise.txt']


def preprocess():
    # Select seven books

    document_word = {}
    document_sent = {}

    count = 0

    for i in books:
        text = nltk.corpus.gutenberg.raw(i)

        count += 1

        # Transfer to lower characters
        text = text.lower()

        # Get the author name
        author = i.split('-')[0]

        print(author)

        # Tokenize sentences
        sent_tokens = sent_tokenize(text)

        # Shuffle sentences
        random.shuffle(sent_tokens)

        full_sent = ''
        # Append to a list
        for j in range(len(sent_tokens)):
            full_sent = full_sent + sent_tokens[j]

        # Remove all the punctuations and change to word list
        tokenizer = RegexpTokenizer(r'\w+')
        words_tokens = tokenizer.tokenize(full_sent)

        for k in range(200):
            key = " ".join(words_tokens[k * 200:k * 200 + 199])
            value = author
            document_sent[key] = value

    return(document_sent)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Using N-gram here
def ngram(document):
  sent = [k for k in document.keys()]
  author = [v for v in document.values()]
  ngram_vectorizer = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                   token_pattern = r'\b\w+\b',min_df=1, lowercase=False)
  vector = ngram_vectorizer.fit_transform(sent).toarray()
  return author, vector

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, neighbors, tree
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Give each book an index
book_index = {'austen':0,
        'bible':1,
        'bryant':2,
        'melville':3,
        'edgeworth':4,
        'chesterton':5,
        'milton':6}


def KNN_train(x, y):
    start = timeit.default_timer()
  
    clf = KNeighborsClassifier(n_neighbors = 3)
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print('The accuracy of KNN: ',scores)
    print('Mean is: ', scores.mean())

    stop = timeit.default_timer()
    print('Time: ', stop - start)
    print()


def regression(x, y):
    start = timeit.default_timer()

    y = [book_index.get(n, n) for n in y]
    clf = LinearRegression()
    scores = cross_val_score(clf, x, y, cv=10, scoring='r2')
    print('The accuracy of regression: ',scores)
    print('Mean is: ', scores.mean())

    stop = timeit.default_timer()
    print('Time: ', stop - start)
    print()


def MLP_train(x_train, x_test, y_train, y_test):
    y_train = [book_index.get(n, n) for n in y_train]
    y_test = [book_index.get(n, n) for n in y_test]
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(500,))
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)
    acc = metrics.accuracy_score(y_test, predicted)
    print('The accuracy of MLP: ', acc)


def binarytree(x, y):
    start = timeit.default_timer()

    y = [book_index.get(n, n) for n in y]
    clf = tree.DecisionTreeClassifier()
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print('The accuracy of decision tree: ', scores)
    print('Mean is: ', scores.mean())

    stop = timeit.default_timer()
    print('Time: ', stop - start)
    print()


def SVM_train(x,y ):
    start = timeit.default_timer()

    y = [book_index.get(n, n) for n in y]
    clf = SVC()
    scores = cross_val_score(C=3, clf, x, y, cv=10, scoring='accuracy')
    print('The accuracy of SVM: ', scores)
    print('Mean is: ', scores.mean())

    stop = timeit.default_timer()
    print('Time: ', stop - start)
    print()



def gaussianNB(x, y):
    start = timeit.default_timer()

    clf = GaussianNB()
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print('The accuracy of gaussianNB: ', scores)
    print('Mean is: ', scores.mean())

    stop = timeit.default_timer()
    print('Time: ', stop - start)
    print()


def bernoulliNB(x, y):
    clf = BernoulliNB()
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print("The accuracy of BernoulliNB is: ", scores)
    print(scores.mean())


from sklearn.naive_bayes import MultinomialNB
def multinomialNB(x, y):
    clf = MultinomialNB()
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print("The accuracy of MultinomialNB is: ", scores)
    print(scores.mean())


In [0]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold


d = preprocess()
author, vector = ngram(d)

c = list(zip(author, vector))

random.shuffle(c)

author, vector = zip(*c)


gaussianNB(vector, author)
KNN_train(vector,author)
regression(vector,author)
binarytree(vector, author)
SVM_train(vector,author)

austen
bible
bryant
melville
edgeworth
chesterton
milton
The accuracy of SVM:  [0.95714286 0.95       0.94285714 0.95714286 0.92857143 0.94285714
 0.92142857 0.92142857 0.97142857 0.95714286]
Mean is:  0.945
Time:  6785.369318087

