In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import jellyfish
import gensim
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
jellyfish.jaro_distance(u'jellyfish', u'smellyfish')

0.8962962962962964

In [None]:
# DEAL WITH THE STOPWORDS!!!!!!!!! stopwords.words('english')
#he competition dataset contains text from works of fiction written by spooky authors 
#of the public domain: Edgar Allan Poe, HP Lovecraft and Mary Shelley. 
#The data was prepared by chunking larger texts into sentences using CoreNLP's 
#MaxEnt sentence tokenizer, so you may notice the odd non-sentence here and there. 
#Your objective is to accurately identify the author of the sentences in the test set.

In [3]:
# From kaggle task: https://www.kaggle.com/c/spooky-author-identification/data
test = pd.read_csv("/Users/janpintera/Dropbox/Spooky datasets/spooky_test.csv")
train = pd.read_csv("/Users/janpintera/Dropbox/Spooky datasets/train.csv")

In [4]:
X = train['text']
Y = train['author']
X_train, X_val , y_train, y_val = train_test_split(X, Y, test_size=0.33, random_state=42)
X_target = test['text']

In [16]:
train.groupby('author').agg('count')

Unnamed: 0_level_0,id,text
author,Unnamed: 1_level_1,Unnamed: 2_level_1
EAP,7900,7900
HPL,5635,5635
MWS,6044,6044


In [8]:
vectorizer = CountVectorizer()
dic = vectorizer.fit(X_train)

In [9]:
dic.get_feature_names()

['aaem',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abaout',
 'abased',
 'abasement',
 'abashed',
 'abashment',
 'abate',
 'abatement',
 'abbey',
 'abbreviation',
 'abbé',
 'abdicated',
 'abdication',
 'abdul',
 'abernethy',
 'aberrant',
 'aberration',
 'aberrations',
 'abeyance',
 'abhor',
 'abhorred',
 'abhorrence',
 'abhorrent',
 'abide',
 'abigail',
 'abijah',
 'abilities',
 'ability',
 'abject',
 'abjure',
 'ablaze',
 'able',
 'ably',
 'abnormal',
 'abnormalities',
 'abnormality',
 'abnormally',
 'aboard',
 'abode',
 'abodes',
 'abolished',
 'abominable',
 'abomination',
 'abominations',
 'aboriginal',
 'abortions',
 'abortive',
 'abounded',
 'aboundingly',
 'about',
 'above',
 'abra',
 'abreast',
 'abroad',
 'abrupt',
 'abruptly',
 'abruptness',
 'absconded',
 'absence',
 'absences',
 'absense',
 'absent',
 'absolute',
 'absolutely',
 'absolved',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbingly',
 'absorption',
 'abstain',
 'abstaining',
 'abstemious'

In [7]:
'and' in dic.get_feature_names()

True

In [10]:
Tvectorizer = TfidfVectorizer(stop_words = 'english')
dicT = Tvectorizer.fit(X_train)
feat_names = dicT.get_feature_names()

In [11]:
'the' in dicT.get_feature_names()

False

In [12]:
feat_names

['aaem',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abaout',
 'abased',
 'abasement',
 'abashed',
 'abashment',
 'abate',
 'abatement',
 'abbey',
 'abbreviation',
 'abbé',
 'abdicated',
 'abdication',
 'abdul',
 'abernethy',
 'aberrant',
 'aberration',
 'aberrations',
 'abeyance',
 'abhor',
 'abhorred',
 'abhorrence',
 'abhorrent',
 'abide',
 'abigail',
 'abijah',
 'abilities',
 'ability',
 'abject',
 'abjure',
 'ablaze',
 'able',
 'ably',
 'abnormal',
 'abnormalities',
 'abnormality',
 'abnormally',
 'aboard',
 'abode',
 'abodes',
 'abolished',
 'abominable',
 'abomination',
 'abominations',
 'aboriginal',
 'abortions',
 'abortive',
 'abounded',
 'aboundingly',
 'abra',
 'abreast',
 'abroad',
 'abrupt',
 'abruptly',
 'abruptness',
 'absconded',
 'absence',
 'absences',
 'absense',
 'absent',
 'absolute',
 'absolutely',
 'absolved',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbingly',
 'absorption',
 'abstain',
 'abstaining',
 'abstemious',
 'abstract',
 'abs

In [13]:
Xm = Tvectorizer.transform(X_train)
Xt = Tvectorizer.transform(X_val)

In [14]:
Xm

<13117x21269 sparse matrix of type '<class 'numpy.float64'>'
	with 147953 stored elements in Compressed Sparse Row format>

In [11]:
#Bernoulli Naive Bayes Classifier:
clf = BernoulliNB()
clf.fit(Xm, y_train)
prediction = clf.predict(Xt)

In [12]:
prediction

array(['EAP', 'MWS', 'MWS', ..., 'MWS', 'EAP', 'EAP'],
      dtype='<U3')

In [13]:
confusion_matrix(prediction,y_val)

array([[2355,  470,  423],
       [  64, 1250,   48],
       [ 168,  132, 1552]])

In [14]:
# Multinomial Naive Bayes Classifier:
mNB = MultinomialNB()
mNB.fit(Xm, y_train)
predict_mNB = mNB.predict(Xt)

In [15]:
confusion_matrix(predict_mNB,y_val)

array([[2265,  436,  352],
       [ 102, 1289,   66],
       [ 220,  127, 1605]])

In [17]:
# Bernoulli Naive Bayes Classifier:
bNB = BernoulliNB()
bNB.fit(Xm, y_train)
predict_bNB = bNB.predict(Xt)

In [18]:
confusion_matrix(predict_bNB,y_val)

array([[2355,  470,  423],
       [  64, 1250,   48],
       [ 168,  132, 1552]])

In [19]:
# Latent dirichlet allocation
lda = LatentDirichletAllocation(n_components=3, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [20]:
lda.fit(Xm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=3, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [37]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [36]:
for topic_idx, topic in enumerate(lda.components_):
 print(topic_idx,topic)

0 [ 0.35642702  0.34153979  0.9856561  ...,  0.33477412  0.53144622
  0.33421872]
1 [ 0.47146981  0.93369331  0.37157729 ...,  0.45983054  0.34216951
  0.55384639]
2 [ 0.33548261  0.50104363  3.2791923  ...,  0.35183447  0.33566191
  0.33486895]


In [39]:
print_top_words(lda, feat_names, 10)

Topic #0: did said man old life saw time like know long
Topic #1: room head feet remained doubt hand fact having left entered
Topic #2: mr door returned arrived second chamber followed opened house simple

