### This notebook is for kaggle challenge named "Spooky author identification". https://www.kaggle.com/c/spooky-author-identification. 

Implements word embeddings. Best multiclass-logloss is around 0.82

In [0]:
# !gdown https://drive.google.com/uc?id=1Hs6daoHoz_urLbGsRsapmI0pNjUgfLR9
# !unzip spooky-author-identification.zip
# !rm -rf spooky-author-identification.zip
# !rm -rf sample_data
# !unzip train.zip
# !unzip test.zip
# !unzip sample_submission.zip
# !rm -rf test.zip train.zip
# !rm -rf sample_submission.zip

In [0]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
import matplotlib.pyplot as plt

In [0]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [34]:
train_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [35]:
X_train = train_df['text']
Y_train = train_df['author']
X_test = test_df['text']
Y_train = pd.get_dummies(Y_train)
Y_train.head()

Unnamed: 0,EAP,HPL,MWS
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,1
4,0,1,0


In [36]:
sum(X_train.str.len())/X_train.shape[0]

149.05740844782676

In [37]:
# integer encode the documents
# vocab_size = 5000
vocab_size = 4000
encoded_docs = [one_hot(d, vocab_size) for d in X_train]

# pad documents to a max length of 300 words
# max_length = 450
max_length = 300
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 223 1148  949 ...    0    0    0]
 [3191 1964  653 ...    0    0    0]
 [2475 1848 3184 ...    0    0    0]
 ...
 [ 642  206 2453 ...    0    0    0]
 [ 225 3389 1272 ...    0    0    0]
 [1562 1764 3434 ...    0    0    0]]


In [38]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_length))
# model.add(Embedding(vocab_size, 35, input_length=max_length))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
# model.add(Dense(256, activation='relu'))
model.add(Dense(3, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
# print(model.summary())
# fit the model
model.fit(padded_docs, Y_train, epochs = 6)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, Y_train)
print(f'Accuracy: {accuracy*100}')

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Accuracy: 99.99489248684816


In [0]:
encoded_docs = [one_hot(d, vocab_size) for d in X_test]
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
preds = model.predict_proba(padded_docs)

In [40]:
print (preds.shape)

(8392, 3)


In [0]:
submission_df = pd.concat([test_df['id'], pd.DataFrame(preds[:, :], columns = ['EAP', 'HPL', 'MWS'])], axis = 1)

In [42]:
submission_df.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.000239,0.0,0.002419
1,id24541,0.882096,3.781915e-05,0.0
2,id00134,4.3e-05,0.7721812,0.0
3,id27757,0.034587,1.472235e-05,0.0
4,id04081,0.000928,2.384186e-07,0.000859


In [0]:
submission_df.to_csv('predictions.csv', index=False)

In [44]:
!zip subm.zip predictions.csv

updating: predictions.csv (deflated 64%)
