In [0]:
# # Download required dataset
# !gdown https://drive.google.com/uc?id=1Hs6daoHoz_urLbGsRsapmI0pNjUgfLR9
# !unzip spooky-author-identification.zip
# !rm -rf spooky-author-identification.zip
# !rm -rf sample_data
# !unzip train.zip
# !unzip test.zip
# !unzip sample_submission.zip
# !rm -rf test.zip train.zip
# !rm -rf sample_submission.zip

Downloading...
From: https://drive.google.com/uc?id=1Hs6daoHoz_urLbGsRsapmI0pNjUgfLR9
To: /content/spooky-author-identification.zip
  0% 0.00/1.90M [00:00<?, ?B/s]100% 1.90M/1.90M [00:00<00:00, 61.2MB/s]
Archive:  spooky-author-identification.zip
  inflating: train.zip               
  inflating: sample_submission.zip   
  inflating: test.zip                
Archive:  train.zip
  inflating: train.csv               
Archive:  test.zip
  inflating: test.csv                
Archive:  sample_submission.zip
  inflating: sample_submission.csv   


In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [0]:
# load the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [0]:
train_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [0]:
test_df.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [0]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train_df.author.values)

In [0]:
def spit_submission_zip(predictions, test_df, file_name):
    submission_df = pd.concat([test_df['id'], pd.DataFrame(predictions, columns = ['EAP', 'HPL', 'MWS'])], axis = 1)
    submission_df.to_csv('predictions.csv', index=False)
    !zip {file_name}.zip predictions.csv
    !rm -rf predictions.csv

## Logistic Regression with TF-IDF as features

In [0]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to training-set
tfv.fit(train_df['text'])
xtrain_tfv =  tfv.transform(train_df['text']) 
xtest_tfv = tfv.transform(test_df['text'])

In [0]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression(C = 3)
clf.fit(xtrain_tfv, y)
predictions = clf.predict_proba(xtest_tfv)



### This achieves 0.5174 multiclass logloss

In [0]:
spit_submission_zip(predictions, test_df, 'TFIDF-LR')

  adding: predictions.csv (deflated 54%)


## Logistic Regression with Word Counts as features

In [0]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w+',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(train_df['text'])
xtrain_ctv =  ctv.transform(train_df['text']) 
xvalid_ctv = ctv.transform(test_df['text'])

In [0]:
# Fitting a simple Logistic Regression on Counts
clf = LogisticRegression(C = 3)
clf.fit(xtrain_ctv, y)
predictions = clf.predict_proba(xvalid_ctv)



### This achieves 0.5043 multiclass logloss

In [0]:
spit_submission_zip(predictions, test_df, 'CV-LR')

  adding: predictions.csv (deflated 54%)


## Naive Bayes with TF-IDF features

In [0]:
# Fitting a simple Naive Bayes on TFIDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, y)
predictions = clf.predict_proba(xtest_tfv)

### This achieves 0.5645 multiclass logloss

In [0]:
spit_submission_zip(predictions, test_df, 'TFIDF-NB')

  adding: predictions.csv (deflated 54%)


## Naive Bayes with word cound as features

In [0]:
# Fitting a simple Naive Bayes on Counts
clf = MultinomialNB()
clf.fit(xtrain_ctv, y)
predictions = clf.predict_proba(xvalid_ctv)

### This achieves 0.4448 multiclass logloss

In [0]:
spit_submission_zip(predictions, test_df, 'CV-NB')

  adding: predictions.csv (deflated 55%)


## Support Vector Machine (SVM) with TF-IDF features

In [0]:
# First apply dimensionality reduction. Here TF-IDF is sparse matrix, so PCA won't work. Instead apply truncated SVD
svd = decomposition.TruncatedSVD(n_components=150)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xtest_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [0]:
# Fitting a simple SVM
# clf = SVC(C=1.0, probability=True) # since we need probabilities
# clf.fit(xtrain_svd_scl, y)
# predictions = clf.predict_proba(xvalid_svd_scl)

In [0]:
# spit_submission_zip(predictions, test_df, 'TFIDF-SVD-SVM')

## XGBoost with TF-IDF as features

In [0]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), y)
predictions = clf.predict_proba(xtest_tfv.tocsc())

### This achieves 0.7718 multiclass logloss

In [0]:
spit_submission_zip(predictions, test_df, 'TFIDF-XGBOOST')

  adding: predictions.csv (deflated 61%)


## XGBoost with word counts as features

In [0]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_ctv.tocsc(), y)
predictions = clf.predict_proba(xvalid_ctv.tocsc())

### This achieves 0.7660 multiclass logloss

In [0]:
spit_submission_zip(predictions, test_df, 'CV-XGBOOST')

## Grid Search

In [0]:
# Multiclass logloss function
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [0]:
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

In [0]:
# Initialize SVD
svd = TruncatedSVD()
    
# Initialize the standard scaler 
scl = preprocessing.StandardScaler()

# We will use logistic regression here..
lr_model = LogisticRegression()

# Create the pipeline 
clf = pipeline.Pipeline([('svd_k', svd),
                         ('scl_k', scl),
                         ('lr_k', lr_model)])

In [0]:
param_grid = {'svd_k__n_components' : [120, 180],
              'lr_k__C': [0.1, 1.0, 10], 
              'lr_k__penalty': ['l1', 'l2']}

In [0]:
# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, y)  # we can use the full data here but im only using xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

## Grid Search on Naive Bayes

In [0]:
nb_model = MultinomialNB()
svd = TruncatedSVD()
scl = preprocessing.StandardScaler()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, y)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

## Grid Search on XGBoost

In [0]:
xgb = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)

clf = pipeline.Pipeline([('xgb', xgb)])

param_grid = {
    'xgb__max_depth': [4, 5, 6, 7, 8],
    'xgb__n_estimators': [140, 160, 180, 200, 220, 240],
    'xgb__learning_rate': [0.001, 0.01, 0.1, 1]
}

model = GridSearchCV(estimator = clf, param_grid=param_grid, scoring=mll_scorer, verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)
# Fit Grid Search Model
model.fit(xtrain_tfv, y)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))