# Dataset: https://www.kaggle.com/gdberrio/spooky-authors-csv
Task: predict the author of a sentence (three possible authors: EAP, HPL, MWS).

In [3]:
from string import punctuation

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
from nltk.stem import PorterStemmer

In [4]:
# Read the authors dataset from its CSV file and preview the first rows.
csv_path = "../datasets/authors.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Number of sentences per author label, to check how balanced the classes are.
df['author'].value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

The classes are roughly balanced: EAP accounts for about 40% of the sentences, MWS for about 31% and HPL for about 29%.

In [4]:
# Sentences are the model input; author labels are mapped to integer class ids.
texts = df['text']
author_labels = df['author']
le = LabelEncoder()
le.fit(author_labels)
authors = le.transform(author_labels)
# Sanity check: per-class counts of the encoded labels.
np.bincount(authors)

array([7900, 5635, 6044])

In [5]:
# The nltk corpus reader is imported under an alias so that binding the name
# `stopwords` to a set does not shadow the reader itself. Without the alias the
# cell could only be executed once per kernel: a second run called `.words` on
# the set and raised AttributeError.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))  # remove not important words
stemmer = PorterStemmer()  # stem the words so the vocabulary will be smaller

In [6]:
# Normalise every sentence to reduce the vocabulary: lowercase, tokenize, drop
# punctuation and stop words, then stem the remaining tokens.
def _is_punctuation(token):
    """Return True when every character of `token` is in string.punctuation."""
    # wordpunct_tokenize emits a run of adjacent punctuation as ONE token
    # (e.g. '."' or '...'). The substring test `token in punctuation` only
    # matches runs that appear verbatim inside string.punctuation, so such
    # tokens slipped through; checking character by character removes them.
    return all(char in punctuation for char in token)


cleared_texts = [
    ' '.join(stemmer.stem(token)
             for token in wordpunct_tokenize(sentence.lower())
             if not _is_punctuation(token) and token not in stopwords)
    for sentence in texts
]

In [7]:
# Show the first sentence before and after cleaning, separated by a blank line.
print(texts[0], cleared_texts[0], sep='\n\n')

This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.

process howev afford mean ascertain dimens dungeon might make circuit return point whenc set without awar fact perfectli uniform seem wall


In [8]:
# Hold out 20% of the sentences for testing, stratified by author so that both
# parts keep the class proportions. The fixed random_state makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    cleared_texts, authors, test_size=0.2, stratify=authors, random_state=42)

In [9]:
# Bag-of-words counts over single tokens; the vocabulary is learned from the
# training split only.
vectorizer_params = {
    'decode_error': 'ignore',
    'ngram_range': (1, 1),
    'max_df': 1.0,
    'min_df': 1,
}
cv = CountVectorizer(**vectorizer_params)
cv.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
# Turn both splits into count matrices using the vocabulary fitted on x_train.
x_train_transformed, x_test_transformed = (
    cv.transform(split) for split in (x_train, x_test)
)

In [11]:
# (number of training sentences, number of vocabulary terms)
x_train_transformed.shape

(15663, 14369)

In [12]:
# Baseline classifier: multinomial naive Bayes with its default hyperparameters.
clf = MultinomialNB()  # default

In [13]:
# MultinomialNB accepts scipy sparse matrices, so the count matrices are passed
# as they are: densifying a 15663 x 14369 int64 matrix with .toarray() would
# allocate about 1.8 GB for no benefit. The test split is scored with the
# already vectorized x_test_transformed instead of transforming x_test again.
clf.fit(x_train_transformed, y_train)
clf.score(x_test_transformed, y_test)

0.8273748723186926

In [14]:
# Compare the true per-class counts of the test split with the predicted ones.
print(np.bincount(y_test))
# The sparse matrix is passed directly; a dense copy is not needed for predict.
print(np.bincount(clf.predict(x_test_transformed)))

[1580 1127 1209]
[1513 1090 1313]


In [None]:
# with different params
# 5-fold cross-validated search over the smoothing strength (alpha) and over
# whether class priors are learned from the data (fit_prior).
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'fit_prior': [True, False]}
grid_search_multinomial = GridSearchCV(MultinomialNB(), param_grid=param_grid, cv=5)
# The sparse count matrix is used as is: a dense copy would be sliced and copied
# again for every fold and candidate, multiplying the memory cost.
grid_search_multinomial.fit(x_train_transformed, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score.
grid_search_multinomial.best_params_

In [None]:
# Accuracy of the best estimator on the held-out test split (sparse input is
# accepted directly, so no dense copy is made).
grid_search_multinomial.score(x_test_transformed, y_test)

In [None]:
from sklearn.metrics import log_loss  # in kaggle competition this metric is used

In [None]:
# Class probabilities from the tuned naive Bayes model, scored with log loss.
# The sparse test matrix is passed directly instead of a dense copy.
predictions_proba = grid_search_multinomial.predict_proba(x_test_transformed)
log_loss(y_test, predictions_proba)

well, not great, not terrible

In [None]:
from sklearn.linear_model import LogisticRegression  # let's try LR as well
# Only the inverse regularisation strength C is searched. n_jobs is a
# parallelism setting, not a model hyperparameter, so it is given to
# GridSearchCV (which then runs the candidate/fold fits in parallel) instead of
# being listed in the grid, where it showed up in best_params_ as if tuned.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search_lr = GridSearchCV(LogisticRegression(solver='lbfgs'), param_grid=param_grid,
                              cv=5, n_jobs=-1)
grid_search_lr.fit(x_train_transformed, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score.
grid_search_lr.best_params_

In [None]:
# Accuracy and log loss of the tuned logistic regression on the test split.
# LogisticRegression handles sparse input, so no dense copy is created.
print(grid_search_lr.score(x_test_transformed, y_test))
predictions_proba = grid_search_lr.predict_proba(x_test_transformed)
print(log_loss(y_test, predictions_proba))

seems to be a bit better in terms of logloss

What else could be tested:
- different preprocessing: n-grams, keeping stop words, no stemming, TF-IDF instead of raw term counts;
- different algorithms.

In [None]:
# Learn the inverse-document-frequency weights from the training counts only.
idf = TfidfTransformer()
idf.fit(x_train_transformed);

In [None]:
# Default MultinomialNB trained and evaluated on TF-IDF weighted features.
x_train_transformed_idf = idf.transform(x_train_transformed)
clf = MultinomialNB()
clf.fit(x_train_transformed_idf, y_train)
# Reuse the already vectorized test counts; the result stays sparse, like the
# training matrix, because MultinomialNB does not need a dense array.
x_test_transformed_idf = idf.transform(x_test_transformed)
print(clf.score(x_test_transformed_idf, y_test))
# Probabilities must come from the same TF-IDF features the model was trained
# on. Previously the raw count matrix (x_test_transformed) was passed here, so
# the printed log loss did not measure the TF-IDF model on its own features.
predictions_proba = clf.predict_proba(x_test_transformed_idf)
print(log_loss(y_test, predictions_proba))

Seems that TF-IDF is better in terms of logloss even with default MultinomialNB