# dataset https://www.kaggle.com/gdberrio/spooky-authors-csv
Predict author of sentence (3 different authors)

In [1]:
from string import punctuation

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder

from nltk.corpus import stopwords
from nltk import wordpunct_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models, layers, callbacks, optimizers

In [2]:
df = pd.read_csv("datasets/authors.csv")
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
print(df['author'].value_counts())

texts = df['text']
le = LabelEncoder().fit(df['author'])
authors = le.transform(df['author'])
print(np.bincount(authors))

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64
[7900 5635 6044]


In [4]:
labels = to_categorical(authors)
print(authors[:3])
print(labels[:3])

[0 1 0]
[[1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [5]:
stemmer = PorterStemmer()
stopwords = set(stopwords.words('english'))

texts_lower_punct = []
for i in texts:
    texts_lower_punct.append(' '.join([stemmer.stem(x) for x in word_tokenize(i.lower())
                                       if x not in punctuation and x not in stopwords]))

In [6]:
texts_lower_punct[10]

"shall find feel injuri shall learn dread reveng '' day arriv"

In [7]:
X_train, X_test, y_train, y_test = train_test_split(texts_lower_punct, labels, stratify=labels, test_size=0.2)
print(len(X_train), len(X_test))

15663 3916


In [8]:
cv = CountVectorizer(decode_error='ignore', ngram_range=(1, 2), 
                     max_df=1.0, min_df=2, max_features=10000)  # will try bigrams as well

cv.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
x_train_transformed = cv.transform(X_train)
x_test_transformed = cv.transform(X_test)

In [10]:
x_train_transformed.shape

(15663, 10000)

In [11]:
model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(x_train_transformed.shape[1], )))
model.add(layers.Dropout(0.7))
model.add(layers.Dense(3, activation='softmax'))

model.compile(metrics=['acc'], loss='categorical_crossentropy', optimizer=optimizers.Adam(lr=1e-4))

In [12]:
cb = [callbacks.EarlyStopping(monitor='val_loss', patience=5)]

In [13]:
history = model.fit(x_train_transformed, y_train, epochs = 100, batch_size = 128, validation_split=0.2, callbacks=cb)

Train on 12530 samples, validate on 3133 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100


In [14]:
from sklearn.metrics import log_loss

In [15]:
log_loss(y_test, model.predict_proba(x_test_transformed))

0.4560514876877649

Not much better than 0.47 using sklearn