In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
from keras.wrappers.scikit_learn import KerasClassifier

In [2]:
# Load the labelled training sentences and preview the first rows.
df = pd.read_csv("train.csv")
df.head(5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Upper bound on the number of vocabulary entries the vectorizer may keep.
max_features = 1_000_000

# Maps raw text to TF-IDF weighted vectors over n-grams up to length 2.
Vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode="tf_idf",
    ngrams=2,
)

In [4]:
# Fit the vectorizer's vocabulary and TF-IDF weights on every training sentence.
Vectorizer.adapt(df["text"].to_numpy())

In [5]:
# Token list held by the vectorizer after adapt().
vocab = Vectorizer.get_vocabulary()

In [6]:
# Size of the learned vocabulary, printed as a sanity check against max_features.
num_tokens = len(vocab)
print(num_tokens)

246970


In [7]:
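# Earlier fixed-architecture baseline (single 32-unit hidden layer), kept for
# reference only; create_model() builds the configurable version.
#model = tf.keras.Sequential([
#    Vectorizer,
#    tf.keras.layers.Dense(32, activation='relu'),
#    tf.keras.layers.Dense(3, activation='softmax')
#])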

In [8]:
def create_model(neurons=10, layers=2, dropout=0.2, n_classes=3, learning_rate=1e-3):
    """Build and compile a feed-forward text classifier.

    The network is the module-level ``Vectorizer`` layer, followed by
    ``layers`` repetitions of (Dense(``neurons``, relu) -> Dropout(``dropout``)),
    followed by a softmax output layer with ``n_classes`` units.

    Args:
        neurons: Number of units in each hidden Dense layer.
        layers: Number of hidden Dense/Dropout pairs.
        dropout: Dropout rate applied after each hidden Dense layer.
        n_classes: Number of units in the softmax output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model. It is compiled with
        categorical cross-entropy, so targets must be one-hot encoded, and it
        reports accuracy as its metric.
    """
    model = tf.keras.Sequential()
    model.add(Vectorizer)

    # Hidden stack: every Dense layer is followed by its own Dropout layer.
    for _ in range(layers):
        model.add(tf.keras.layers.Dense(neurons, activation='relu'))
        model.add(tf.keras.layers.Dropout(dropout))

    model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))

    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

In [40]:
# Final model: one hidden layer of 25 units.
# Built through create_model() so the softmax output layer is added, and the
# model compiled, exactly once and outside the hidden-layer loop.
model = create_model(neurons=25, layers=1)

In [41]:
# Features are the raw sentences; targets are the author codes.
X, y = df["text"], df["author"]

In [42]:
# One-hot encoding for each author code; the position in the vector is the class index.
AUTHOR_ONE_HOT = {
    "EAP": [1, 0, 0],
    "HPL": [0, 1, 0],
    "MWS": [0, 0, 1],
}

training_labels_bools = []

for position, author in enumerate(y):
    # Non-string entries (missing values, already-encoded rows) never match a code.
    one_hot = AUTHOR_ONE_HOT.get(author) if isinstance(author, str) else None
    if one_hot is None:
        # Skipping the row would leave the labels shorter than, and misaligned
        # with, the feature column, so fail loudly instead.
        raise ValueError(
            f"Unexpected author label {author!r} at position {position}; "
            f"expected one of {sorted(AUTHOR_ONE_HOT)}"
        )
    # Append a fresh list per row so rows never share one mutable object.
    training_labels_bools.append(list(one_hot))

In [43]:
# Rebind y from the raw author codes to their one-hot encoded rows.
y = training_labels_bools

In [44]:
# Hold out 25% of the rows; the fixed seed makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [45]:
# Inspect the training sentences as a plain array.
X_train.to_numpy()

array(['But this discovery was so great and overwhelming that all the steps by which I had been progressively led to it were obliterated, and I beheld only the result.',
       'I said to myself, "This is no dream, for by what means can I prove the greater reality of that other life in the house of stone and brick south of the sinister swamp and the cemetery on the low hillock, where the Pole Star peers into my north window each night?"',
       'A robin red breast dropt from the frosty branches of the trees, upon the congealed rivulet; its panting breast and half closed eyes shewed that it was dying: a hawk appeared in the air; sudden fear seized the little creature; it exerted its last strength, throwing itself on its back, raising its talons in impotent defence against its powerful enemy.',
       ...,
       'The manner in which Wyatt received this harmless pleasantry convinced me, at once, that he was mad.',
       'She first assured him of her boundless confidence; of this he mus

In [16]:
# Wrap the model factory so scikit-learn can clone and refit it for every
# fold/candidate. It gets its own name so the standalone Keras `model`
# used by the later fit/predict cells is not overwritten.
grid_estimator = KerasClassifier(build_fn=create_model, batch_size=32, verbose=1)

# Candidate values; the grid is their full cross product.
batches = [32, 64]
neurons = [15, 20, 25]
layers = [1, 2, 4, 6]
epochs = [1]

param_grid = dict(neurons=neurons, epochs=epochs, layers=layers, batch_size=batches)
grid = GridSearchCV(
    estimator=grid_estimator,
    scoring='neg_log_loss',  # higher (closer to zero) is better
    param_grid=param_grid,
    n_jobs=1,
    cv=3,
    verbose=3,
)

grid_result = grid.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] batch_size=32, epochs=1, layers=1, neurons=15 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  batch_size=32, epochs=1, layers=1, neurons=15, score=-0.438, total=   8.1s
[CV] batch_size=32, epochs=1, layers=1, neurons=15 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s remaining:    0.0s


[CV]  batch_size=32, epochs=1, layers=1, neurons=15, score=-0.444, total=   8.1s
[CV] batch_size=32, epochs=1, layers=1, neurons=15 ...................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   16.2s remaining:    0.0s


[CV]  batch_size=32, epochs=1, layers=1, neurons=15, score=-0.447, total=   8.7s
[CV] batch_size=32, epochs=1, layers=1, neurons=20 ...................
[CV]  batch_size=32, epochs=1, layers=1, neurons=20, score=-0.432, total=   8.7s
[CV] batch_size=32, epochs=1, layers=1, neurons=20 ...................
[CV]  batch_size=32, epochs=1, layers=1, neurons=20, score=-0.433, total=   9.3s
[CV] batch_size=32, epochs=1, layers=1, neurons=20 ...................


KeyboardInterrupt: 

In [19]:
# Pull the per-candidate summary out of the finished grid search.
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

# One line per candidate: mean score, its spread across folds, and the settings.
for mean, stdev, param in zip(means, stds, params):
    print(f'{mean:f} ({stdev:f}) with: {param!r}')

-0.451206 (0.010626) with: {'epochs': 1, 'layers': 1, 'neurons': 10}
-0.444123 (0.001995) with: {'epochs': 1, 'layers': 1, 'neurons': 15}
-0.439087 (0.008581) with: {'epochs': 1, 'layers': 1, 'neurons': 20}
-0.429369 (0.005241) with: {'epochs': 1, 'layers': 1, 'neurons': 25}
-0.519594 (0.043255) with: {'epochs': 1, 'layers': 2, 'neurons': 10}
-0.469815 (0.001766) with: {'epochs': 1, 'layers': 2, 'neurons': 15}
-0.458230 (0.007228) with: {'epochs': 1, 'layers': 2, 'neurons': 20}
-0.449325 (0.004645) with: {'epochs': 1, 'layers': 2, 'neurons': 25}
-0.728966 (0.068606) with: {'epochs': 1, 'layers': 4, 'neurons': 10}
-0.637830 (0.018364) with: {'epochs': 1, 'layers': 4, 'neurons': 15}
-0.506345 (0.006745) with: {'epochs': 1, 'layers': 4, 'neurons': 20}
-0.499139 (0.009262) with: {'epochs': 1, 'layers': 4, 'neurons': 25}
-0.870501 (0.044818) with: {'epochs': 1, 'layers': 6, 'neurons': 10}
-0.807303 (0.037110) with: {'epochs': 1, 'layers': 6, 'neurons': 15}
-0.732095 (0.090398) with: {'epoch

In [20]:
# Tabulate the grid-search summary, best (highest) mean score first.
scores = pd.DataFrame(dict(mean=means, stdev=stds, params=params))
scores.sort_values(by="mean", ascending=False)

Unnamed: 0,mean,stdev,params
3,-0.429369,0.005241,"{'epochs': 1, 'layers': 1, 'neurons': 25}"
2,-0.439087,0.008581,"{'epochs': 1, 'layers': 1, 'neurons': 20}"
1,-0.444123,0.001995,"{'epochs': 1, 'layers': 1, 'neurons': 15}"
7,-0.449325,0.004645,"{'epochs': 1, 'layers': 2, 'neurons': 25}"
0,-0.451206,0.010626,"{'epochs': 1, 'layers': 1, 'neurons': 10}"
6,-0.45823,0.007228,"{'epochs': 1, 'layers': 2, 'neurons': 20}"
5,-0.469815,0.001766,"{'epochs': 1, 'layers': 2, 'neurons': 15}"
11,-0.499139,0.009262,"{'epochs': 1, 'layers': 4, 'neurons': 25}"
10,-0.506345,0.006745,"{'epochs': 1, 'layers': 4, 'neurons': 20}"
4,-0.519594,0.043255,"{'epochs': 1, 'layers': 2, 'neurons': 10}"


In [46]:
# Train for one epoch and validate on the hold-out split.
# No validation_steps cap is passed, so val_loss / val_accuracy are computed
# over every hold-out row rather than a fixed number of batches.
model.fit(
    list(X_train),
    y_train,
    epochs=1,
    batch_size=64,
    validation_data=(list(X_test), y_test),
)



<keras.callbacks.History at 0x7fbd24046f10>

In [47]:
# Load the unlabelled sentences and preview the first rows.
test_df = pd.read_csv("test_copy.csv")
test_df.head(5)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [48]:
# Add one zero-filled placeholder column per author code, in class order.
for author_code in ("EAP", "HPL", "MWS"):
    test_df[author_code] = 0

In [49]:
# Sentences from the unlabelled file, previewed for a quick sanity check.
X_sub = test_df["text"]
X_sub.head(5)

0    Still, as I urged our leaving Ireland with suc...
1    If a fire wanted fanning, it could readily be ...
2    And when they had broken down the frail door t...
3    While I was thinking how I should possibly man...
4    I am not sure to what limit his knowledge may ...
Name: text, dtype: object

In [58]:
# NOTE: despite the name, these are class probabilities for the labelled
# training frame `df` (not for `X_sub`), one row per sentence in `df`.
sub_pred = model.predict(df["text"])
sub_pred

array([[9.93461132e-01, 1.14286551e-03, 5.39606391e-03],
       [1.07838936e-01, 8.34321439e-01, 5.78396432e-02],
       [9.99998569e-01, 2.59679695e-07, 1.24176518e-06],
       ...,
       [9.99421716e-01, 3.66384367e-04, 2.12000494e-04],
       [4.34398472e-01, 3.91423911e-01, 1.74177647e-01],
       [2.63462096e-01, 6.86733842e-01, 4.98039611e-02]], dtype=float32)

In [61]:
# Pair each training sentence id with its three predicted class probabilities
# (prediction columns 0, 1, 2 map to EAP, HPL, MWS) and the true author.
ngram_ensemble = pd.DataFrame({
    "id": df["id"],
    **{
        f"{author_code}_ngram": sub_pred[:, class_index]
        for class_index, author_code in enumerate(("EAP", "HPL", "MWS"))
    },
    "actual_author": df["author"],
})
ngram_ensemble.head(5)

Unnamed: 0,id,EAP_ngram,HPL_ngram,MWS_ngram,actual_author
0,id26305,0.993461,0.001142866,0.005396,EAP
1,id17569,0.107839,0.8343214,0.05784,HPL
2,id11008,0.999999,2.596797e-07,1e-06,EAP
3,id27763,0.001523,0.0008533042,0.997624,MWS
4,id12958,0.002619,0.9973341,4.7e-05,HPL


In [62]:
# Persist the per-sentence probabilities, without the positional index column.
ngram_ensemble.to_csv(path_or_buf="ngram_ensemble.csv", index=False)