# Dataset: https://www.kaggle.com/gdberrio/spooky-authors-csv
Predict the author of a sentence (3 different authors).

In [1]:
from string import punctuation

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder

from nltk.corpus import stopwords
from nltk import wordpunct_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Fixed sequence length (in tokens) used as `maxlen` for pad_sequences and as the Embedding input_length.
maxlen = 100  # sentences with length > 100 'words' will be cropped

In [3]:
# Read the authors CSV into a DataFrame and preview its first rows.
authors_csv_path = "../datasets/authors.csv"
df = pd.read_csv(authors_csv_path)
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Class balance: how many sentences each author contributes.
author_column = df['author']
author_column.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [5]:
# Keep the raw sentences, and map the author names to integer class ids.
texts = df['text']
le = LabelEncoder()
authors = le.fit_transform(df['author'])
# Per-class sample counts, indexed by encoded class id.
np.bincount(authors)

array([7900, 5635, 6044])

In [6]:
# One-hot encode the integer class ids, then show the first three rows of
# both encodings side by side as a sanity check.
labels = to_categorical(authors)
for preview in (authors, labels):
    print(preview[:3])

[0 1 0]
[[1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [7]:
# Lowercase each sentence and re-join its NLTK tokens with single spaces,
# so punctuation marks end up as separate space-delimited tokens.
texts_lower_punct = [' '.join(word_tokenize(sentence.lower())) for sentence in texts]

In [8]:
# Spot-check one preprocessed sentence: lowercase, punctuation split off by spaces.
texts_lower_punct[1000]

'it was our plan to remain where we were and intercept the liner dacia , mentioned in information from agents in new york .'

In [9]:
# Hold out 20% of the sentences for the final evaluation, stratified on the
# labels so the author proportions are preserved in both parts.
# random_state is fixed so the split is identical on every Restart & Run All;
# without it the train/test membership (and every downstream number) changes
# from run to run. Note: this pins only the split, not the model's weight
# initialisation.
X_train, X_test, y_train, y_test = train_test_split(
    texts_lower_punct,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)
print(len(X_train), y_test.shape)

15663 (3916, 3)


In [10]:
# Build the vocabulary from the training sentences only. Restricting `filters`
# to tab/newline means punctuation marks are kept as tokens.
tokenizer = Tokenizer(filters='\t\n')
tokenizer.fit_on_texts(X_train)

# word -> index mapping, plus the reverse index -> word lookup for decoding.
wi = tokenizer.word_index
wir = {index: word for word, index in wi.items()}

# Encode the training sentences as integer sequences and bring each one to
# exactly `maxlen` entries.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen = maxlen)

In [11]:
# Decode training sample 10 back into words; ids that have no vocabulary
# entry (such as the 0 padding value) are shown unchanged.
[wir.get(token_id, token_id) for token_id in X_train[10]]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 'the',
 'students',
 'all',
 'attended',
 'the',
 'hasty',
 'funeral',
 'on',
 'the',
 'th',
 ',',
 'and',
 'bought',
 'an',
 'impressive',
 'wreath',
 ',',
 'though',
 'the',
 'latter',
 'was',
 'quite',
 'overshadowed',
 'by',
 'the',
 'tributes',
 'sent',
 'by',
 'wealthy',
 'arkham',
 'citizens',
 'and',
 'by',
 'the',
 'municipality',
 'itself',
 '.']

In [12]:
from tensorflow.keras import layers, models, callbacks
from sklearn.metrics import log_loss

In [13]:
# Embedding input size: the highest word index in the vocabulary plus one.
max_features = 1 + max(wi.values())

In [14]:
# Small averaging classifier: embed each token into 16 dimensions, average the
# embeddings over the sequence, apply dropout, and classify into the 3 authors.
model = models.Sequential([
    layers.Embedding(max_features, 16, input_length = maxlen),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.3),
    layers.Dense(3, activation = 'softmax'),
])

# One-hot targets -> categorical cross-entropy; accuracy is tracked as 'acc'.
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           372800    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 3)                 51        
Total params: 372,851
Trainable params: 372,851
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Stop training once val_loss has gone 5 epochs without improving.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Write a checkpoint only when val_loss improves, so the file always holds the best epoch.
best_checkpoint = callbacks.ModelCheckpoint(
    '../saved_models/authors_fast_text.hdf5',
    monitor='val_loss',
    save_best_only=True,
)

cb = [early_stopping, best_checkpoint]

In [16]:
# Train for at most 100 epochs in batches of 64, holding out 20% of the
# training data for validation; the callbacks decide when to stop and which
# epoch gets checkpointed.
fit_options = dict(epochs = 100, batch_size = 64, validation_split=0.2, callbacks=cb)
history = model.fit(X_train, y_train, **fit_options)

Train on 12530 samples, validate on 3133 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


In [17]:
# Evaluate the checkpointed model (trained with dropout = 0.3) on the held-out test set.
model.load_weights('../saved_models/authors_fast_text.hdf5')

# The test sentences go through the same tokenizer and padding as the training data.
X_test_check = tokenizer.texts_to_sequences(X_test)
X_test_check = pad_sequences(X_test_check, maxlen = maxlen)

# The softmax output layer already yields class probabilities, so `predict`
# returns what `Sequential.predict_proba` used to; `predict_proba` is
# deprecated and no longer exists in newer TensorFlow releases.
preds = model.predict(X_test_check)

# sklearn's log loss should agree with the loss reported by model.evaluate.
print(log_loss(y_test, preds))
print(model.evaluate(X_test_check, y_test))

0.3814878599256562
[0.38148786466991086, 0.8462717]


Test log loss is about 0.38 — far better than the 0.47 we got in authors MultinominalNB.ipynb.