# First Application of Naive Bayes to Philosophy/Non-Philosophy Classification

In [1]:
import os
import pickle
from multiprocessing import Pool

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from nltk.corpus import stopwords

import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

import pandas as pd

import NL_helpers

# English stop-word list from NLTK; passed as `stop_words` to the CountVectorizer
# steps of the pipelines defined later in this notebook.
STOPWORDS = list(stopwords.words('english'))
# Same list plus the word 'philosophy'. Not referenced by any cell visible in
# this notebook.
STOPWORDS_2 = STOPWORDS + ['philosophy']

## Load Labels and Corresponding Texts

Load labels with article codes.

In [2]:
# Hand-labelled articles; the frame is indexed by article code (see the display below).
annotated_df = pd.read_pickle('pickles/classified_df.pickle')

In [3]:
annotated_df

Unnamed: 0,Readable,Philosophy,Philosophy Type,Writing Type,NZ,Notes
LT_18971209_ARTICLE7,True,False,,,,
NZTIM_18860722_ARTICLE11,True,False,,,,
WC_18810721_ARTICLE7,True,False,,,,
DUNST_18980513_ARTICLE3,True,False,,,,
OAM_18960702_ARTICLE35,True,False,,,,
...,...,...,...,...,...,...
AG_18840116_ARTICLE5,True,False,,,,
MS_18830117_ARTICLE19,True,False,,,,
GRA_18970305_ARTICLE3,True,False,,,,
LT_18800611_ARTICLE5,True,False,,,,


Use article codes to load texts.

In [4]:
# Directory holding the corpus pickles. Machine-specific absolute path; the
# trailing slash matters because file names are appended by string concatenation.
dataset_path = '/home/joshua/hdd/Datasets/papers-past/'

The total dataset is divided into nine pickles. The following code runs through each one and collects the text for any articles in the annotated df using a left join operation.

In [5]:
# Collect the 'Text' column for every annotated article from the nine corpus
# pickles. `texts` is an empty frame carrying only the annotated index, so the
# left join keeps exactly the annotated article codes.
texts = pd.DataFrame(index=annotated_df.index)
corpus_paths = [f'{dataset_path}corpus_df_{i}.tar.gz' for i in range(9)]
for corpus_path in corpus_paths:
    corpus_text = pd.read_pickle(corpus_path)['Text']
    # combine_first only fills values annotated_df does not already have, so
    # texts found in earlier pickles are never overwritten.
    annotated_df = annotated_df.combine_first(texts.join(corpus_text))
    del corpus_text  # release the large column before loading the next pickle

Change text from list to simple string

In [6]:
# Replace every 'Text' value with the result of the project helper
# NL_helpers.blocks2string (per the note above: list of blocks -> one string).
annotated_df['Text'] = annotated_df['Text'].map(NL_helpers.blocks2string)

In [7]:
# Cache the labelled frame together with its texts (the commented-out cell
# below reloads it instead of re-reading the corpus pickles).
annotated_df.to_pickle('pickles/classified_with_text_df.pickle')

In [3]:
# annotated_df = pd.read_pickle('pickles/classified_with_text_df.pickle')

In [9]:
annotated_df['Philosophy'].value_counts()

False    620
True     299
Name: Philosophy, dtype: int64

I will go for a 75/25 training/test split on the philosophy observations. I will do the same on 100 of the non-philosophy and then add the remaining 47 non-philosophy articles to the test set. This should somewhat mimic the prevalence of non-philosophy articles in the overall dataset.

In [38]:
# 75/25 split within each class: three quarters of the philosophy rows and three
# quarters of the non-philosophy rows go to training. Class sizes are read from
# the data instead of being hard-coded, and the sampling is seeded so the split
# (and every score computed from it) is reproducible.
SPLIT_SEED = 0
phil_df = annotated_df.loc[annotated_df['Philosophy'] == True]
nonphil_df = annotated_df.loc[annotated_df['Philosophy'] == False]
training_phil = phil_df.sample(n=len(phil_df) // 4 * 3, random_state=SPLIT_SEED)
training_nonphil = nonphil_df.sample(n=len(nonphil_df) // 4 * 3, random_state=SPLIT_SEED)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
training_df = pd.concat([training_phil, training_nonphil])
del training_phil, training_nonphil, phil_df, nonphil_df

In [39]:
# Every annotated row that was not drawn for training becomes the test set.
# A boolean mask keeps annotated_df's row order and, unlike a lookup by a list
# of labels, cannot duplicate rows if an article code appears twice in the index.
# .copy() makes test_df an independent frame, so adding columns to it later
# does not trigger chained-assignment warnings.
test_df = annotated_df.loc[~annotated_df.index.isin(training_df.index)].copy()

In [40]:
test_df

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type
NZTIM_18860722_ARTICLE11,,,False,,True,By.Eleotbio Telegraph—Copyright. (BEDIEB’S TEL...,
AG_18920514_ARTICLE2,,,False,,True,At the recent annual election of members of th...,
NEM_18930614_ARTICLE34,True,First part: somewhat readable complaint about ...,True,o,True,DESIGNING CHARLATANS.\nThe following remarks a...,f
WT_18760815_ARTICLE1,,,,,False,Mr X ITU rt-pjrts that at Mr Oibson'i clearing...,
BH_18771127_ARTICLE14,,,,,False,- ■■ ;• I Fob a distance al mg the riv*r above...,
...,...,...,...,...,...,...,...
FS_18970311_ARTICLE12,,,False,,True,SHOCKING CRIME. A PHILANTHROPIST. (Per Press A...,
ODT_18760718_ARTICLE3,,,False,,True,"Daily Times Office, Monday oYeninK. Tha amount...",
CHP_18680716_ARTICLE12,,,False,,True,It appears that the Government has instructed ...,
WT_18900422_ARTICLE8,,,False,,True,"Walwouth.—On April 18fch, at Tauwhare, Jane, t...",


Gonna try some class balance adjustment in a moment. Will just run the Naive Bayes first though.

In [41]:
training_df['Philosophy'].value_counts()

False    465
True     222
Name: Philosophy, dtype: int64

In [106]:
test_df['Writing Type'].value_counts()

p              27
f              23
l              22
r               4
input error     1
Name: Writing Type, dtype: int64

## Set up Pipeline and Fit Model

Tokeniser change from default: words of length 3+ rather than 2+.

In [16]:
# Bag-of-words counts -> tf-idf weighting -> multinomial Naive Bayes.
count_vectoriser = CountVectorizer(
    stop_words=STOPWORDS,
    # Tokens are runs of three or more word characters.
    token_pattern=r'(?u)\b\w\w\w+\b',
    min_df=9,
    max_df=0.4,
)
phil_classifier = Pipeline(steps=[
    ('vect', count_vectoriser),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [17]:
# Model input: the article texts as a NumPy array (cast to pandas 'string' dtype first).
training_features = training_df['Text'].astype('string').to_numpy()

In [18]:
# Target: boolean array of the 'Philosophy' labels. astype(bool) applies Python
# truthiness per value; the training rows were sampled from rows selected with
# == True / == False, so no missing labels are expected here.
training_labels = training_df['Philosophy'].to_numpy().astype(bool)

In [20]:
phil_classifier.fit(training_features, training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.4,
                                 max_features=None, min_df=9,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",...
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itse

In [21]:
# Test-set article texts, prepared the same way as the training features.
test_features = test_df['Text'].astype('string').to_numpy()

In [22]:
# A test row is positive only when its label is literally True. test_df contains
# rows with no 'Philosophy' label (the unreadable articles shown above); comparing
# with True makes those rows negatives explicitly, instead of relying on how
# bool() coerces the missing marker (bool(None) is False but bool(NaN) is True).
test_labels = test_df['Philosophy'].eq(True).to_numpy()

In [23]:
predicted = phil_classifier.predict(test_features)

In [24]:
np.mean(predicted == test_labels)

0.8529411764705882

In [25]:
metrics.confusion_matrix(test_labels, predicted)

array([[186,   9],
       [ 31,  46]])

In [26]:
len(phil_classifier['vect'].vocabulary_)

5228

## Grid Parameter Search

ACK! - Results in this section are actually using the class balanced version of the dataset.

In [89]:
# Hyper-parameter grid for GridSearchCV. Keys follow the Pipeline convention
# '<step name>__<parameter>', where the step names are those of phil_classifier
# ('vect', 'tfidf'). 6 * 4 * 3 * 2 = 144 combinations are searched.
parameters = {
    'vect__min_df': [7, 8, 9, 10, 11, 12],
    'vect__max_df': [0.2, 0.3, 0.4, 0.5],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__use_idf': (True, False),
}

In [107]:
# Exhaustive 5-fold cross-validated search over `parameters` using the
# estimator's default score; n_jobs=-1 uses every available core.
gs_clf = GridSearchCV(
    estimator=phil_classifier,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)
gs_clf.fit(training_features, training_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.4,
                                                        max_features=None,
                                                        min_df=9,
                                                        ngram_range=(1, 1),
                                                        prep

In [108]:
gs_clf.best_score_

0.8807684729064039

In [109]:
# Report the winning value of every searched parameter, in alphabetical order.
for key in sorted(parameters):
    print(f"{key}: {gs_clf.best_params_[key]!r}")

tfidf__use_idf: True
vect__max_df: 0.2
vect__min_df: 7
vect__ngram_range: (1, 2)


In [111]:
predicted = gs_clf.best_estimator_.predict(test_features)

In [112]:
np.mean(predicted == test_labels)

0.8897058823529411

In [113]:
# 2x2 confusion matrix for the grid-search model: rows are the true class,
# columns the predicted class (False first, then True).
confusion_matrix = metrics.confusion_matrix(test_labels, predicted)
confusion_matrix

array([[179,  16],
       [ 14,  63]])

In [152]:
# Flatten the 2x2 matrix row by row: [0,0]=tn, [0,1]=fp, [1,0]=fn, [1,1]=tp.
tn, fp, fn, tp = confusion_matrix.ravel()
recall = tp / (tp + fn)        # share of true philosophy articles that were found
precision = tp / (tp + fp)     # share of predicted philosophy articles that are correct
print(f'recall: {recall}', f'precision: {precision}', sep='\n')

recall: 0.8181818181818182
precision: 0.7974683544303798


### Let's try fix the class balance.

I'm going to try to upsample the positives by randomly doubling just over half of them.

In [53]:
# Upsample the positive class: duplicate a random sample of the original
# philosophy rows under '<article code>_upsample' labels.
#
# NOTE(review): the counts recorded in the next output (407 positives, up from
# 222) do not match n=20, so the n actually used is not recoverable from this
# code. Set N_UPSAMPLE to the intended number of duplicates.
# NOTE(review): the duplicated rows can land in both the fitting and validation
# folds of the later GridSearchCV runs, which inflates cross-validated scores.
N_UPSAMPLE = 20
UPSAMPLE_SEED = 0

# Only ever sample from original rows, so re-running this cell does not create
# '<code>_upsample_upsample' copies of copies.
is_original = ~training_df.index.str.endswith('_upsample')
phil_originals = training_df[is_original & (training_df['Philosophy'] == True)]
half_phil_indices = phil_originals.sample(n=N_UPSAMPLE, random_state=UPSAMPLE_SEED).index

upsampled = training_df.loc[half_phil_indices].copy()
upsampled.index = [f'{i}_upsample' for i in half_phil_indices]

# Build the enlarged frame in one concat instead of growing it row by row.
# Any earlier duplicate with the same label is replaced, as the original
# .loc assignment did.
training_df = pd.concat([
    training_df[~training_df.index.isin(upsampled.index)],
    upsampled,
])
del upsampled, phil_originals, is_original

In [54]:
training_df['Philosophy'].value_counts()

False    465
True     407
Name: Philosophy, dtype: int64

In [56]:
# Rebuild the training arrays so they include the upsampled rows added above.
# These names overwrite the arrays built from the unbalanced training set.
training_features = training_df['Text'].astype('string').to_numpy()
training_labels = training_df['Philosophy'].to_numpy().astype(bool)

In [57]:
phil_classifier.fit(training_features, training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.4,
                                 max_features=None, min_df=9,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",...
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itse

In [59]:
# Score the refitted (class-balanced) pipeline on the untouched test set.
test_features = test_df['Text'].astype('string').to_numpy()
# Positive only when the label is literally True: test_df contains rows with no
# 'Philosophy' label, and astype(bool) would coerce those by truthiness
# (bool(NaN) is True), silently turning them into positives.
test_labels = test_df['Philosophy'].eq(True).to_numpy()
predicted = phil_classifier.predict(test_features)

In [151]:
np.mean(predicted == test_labels)

0.8897058823529411

In [150]:
confusion_matrix = metrics.confusion_matrix(test_labels, predicted)
confusion_matrix

array([[179,  16],
       [ 14,  63]])

increase recall

In [83]:
# Same grid as before, but candidates are ranked by recall on the positive class.
recall_search_options = dict(cv=5, scoring='recall', n_jobs=-1)
gs_clf_r = GridSearchCV(phil_classifier, parameters, **recall_search_options)
gs_clf_r.fit(training_features, training_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.4,
                                                        max_features=None,
                                                        min_df=9,
                                                        ngram_range=(1, 1),
                                                        prep

In [84]:
gs_clf_r.best_score_

0.9238482384823847

In [85]:
# Winning parameter values of the recall-optimised search, alphabetically.
for key in sorted(parameters):
    print(f"{key}: {gs_clf_r.best_params_[key]!r}")

tfidf__use_idf: True
vect__max_df: 0.5
vect__min_df: 7
vect__ngram_range: (1, 1)
vect__stop_words: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 

In [99]:
predicted_r = gs_clf_r.best_estimator_.predict(test_features)

In [100]:
np.mean(predicted_r == test_labels)

0.8455882352941176

In [148]:
# Confusion matrix of the recall-optimised model on the test set.
rec_confusion_matrix = metrics.confusion_matrix(test_labels, predicted_r)
rec_confusion_matrix
# Layout: rows (y) = true class, columns (x) = predicted class
# y = true, x = predicted

array([[166,  29],
       [ 13,  64]])

In [149]:
# Flatten the 2x2 matrix row by row: [0,0]=tn, [0,1]=fp, [1,0]=fn, [1,1]=tp.
tn, fp, fn, tp = rec_confusion_matrix.ravel()
rec_recall = tp / (tp + fn)
rec_precision = tp / (tp + fp)
print(f'recall: {rec_recall}', f'precision: {rec_precision}', sep='\n')

recall: 0.8311688311688312
precision: 0.6881720430107527


increase precision

In [86]:
# Same grid again, this time ranking candidates by precision on the positive class.
precision_search_options = dict(cv=5, scoring='precision', n_jobs=-1)
gs_clf_p = GridSearchCV(phil_classifier, parameters, **precision_search_options)
gs_clf_p.fit(training_features, training_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.4,
                                                        max_features=None,
                                                        min_df=9,
                                                        ngram_range=(1, 1),
                                                        prep

In [87]:
gs_clf_p.best_score_

0.8472614868032105

In [88]:
# Winning parameter values of the precision-optimised search, alphabetically.
for key in sorted(parameters):
    print(f"{key}: {gs_clf_p.best_params_[key]!r}")

tfidf__use_idf: True
vect__max_df: 0.3
vect__min_df: 7
vect__ngram_range: (1, 2)
vect__stop_words: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 

In [102]:
predicted_p = gs_clf_p.best_estimator_.predict(test_features)

In [103]:
np.mean(predicted_p == test_labels)

0.8860294117647058

In [145]:
prec_confusion_matrix = metrics.confusion_matrix(test_labels, predicted_p)
prec_confusion_matrix

array([[178,  17],
       [ 14,  63]])

In [147]:
# Flatten the 2x2 matrix row by row: [0,0]=tn, [0,1]=fp, [1,0]=fn, [1,1]=tp.
tn, fp, fn, tp = prec_confusion_matrix.ravel()
prec_recall = tp / (tp + fn)
prec_precision = tp / (tp + fp)
print(f'recall: {prec_recall}', f'precision: {prec_precision}', sep='\n')

recall: 0.8181818181818182
precision: 0.7875


Look at the false negatives, and then the false positives, for the overall-accuracy model:

In [114]:
# Attach the predictions of the accuracy-optimised grid-search model (the one
# analysed below and saved as NB_2). Predicting here, rather than reusing the
# global `predicted`, keeps this cell correct regardless of which model last
# wrote to that name. assign() returns a new frame, so no chained-assignment
# warning is raised.
test_df = test_df.assign(Predicted=gs_clf.best_estimator_.predict(test_features))

In [128]:
# Articles labelled philosophy that the model predicted as non-philosophy.
missed_mask = test_df['Philosophy'].eq(True) & test_df['Predicted'].eq(False)
false_negatives = test_df.loc[missed_mask]

In [129]:
false_negatives['Philosophy Type'].value_counts()

e    8
o    5
r    1
Name: Philosophy Type, dtype: int64

Note: mostly ethics

In [135]:
false_negatives['Writing Type'].value_counts()

f    8
p    3
l    3
Name: Writing Type, dtype: int64

Dist of writing types isn't much different from the actual.

In [139]:
false_negatives

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type,Predicted
NEM_18930614_ARTICLE34,True,First part: somewhat readable complaint about ...,True,o,True,DESIGNING CHARLATANS.\nThe following remarks a...,f,False
OW_18840830_ARTICLE81,True,Needs to be divided up.,True,o,True,When Queen Elizabeth on one occasion asked her...,f,False
ODT_18830714_ARTICLE20,True,section on curiosity as virtue or vice,True,e,True,(Prom Otago Witness.) v\nThe political uaei of...,f,False
LT_18831025_ARTICLE34,True,Only final block,True,o,True,[non ora own oobbxspohdmt.]\nThere is very lit...,f,False
LT_18960724_ARTICLE18,True,,True,e,True,"TO THE EDITOR.\nSir, —Kindly-allow me space iu...",l,False
WH_18710211_ARTICLE3,,"Just. It's mostly about protectionism, but has...",True,e,True,Protection and its effects >on na tional wealt...,f,False
AS_18880609_ARTICLE73,,First 6 of 15 blocks are 'philosophy' anyway,True,e,True,"goV write tople, Se thO d COunU Ti^h, rorincan...",f,False
ODT_18740209_ARTICLE10,,,True,e,True,The Rev. Dr Rnseby delivered a lecture\nlast e...,p,False
CHP_18721024_ARTICLE19,True,nature of university education,True,o,True,Tt> THE EDITOR OP THE PRESS. JslE > 1 have jus...,l,False
ESD_18830927_ARTICLE12,True,"By Robert Stout, so classified as NZ author de...",True,e,True,The following is Mr Stout’s letter to the ‘ Ar...,l,False


Interestingly, 6 of 14 are pieces where I have noted that only a small portion of the article is 'philosophical'. This isn't too bad. Let's have a look at the others.

In [143]:
def print_article_text(index, dataframe):
    """Print an article code, then that article's text on the following line(s).

    index: row label to look up in `dataframe`.
    dataframe: frame with a 'Text' column, indexed by article code.
    """
    article_text = dataframe.loc[index, 'Text']
    print(index, article_text, sep='\n')
# Dropdown of false-negative article codes; choosing one prints its text.
# fixed() passes the frame through unchanged instead of creating a widget for it.
indices = false_negatives.index
interact(print_article_text, index=indices, dataframe=fixed(false_negatives))

interactive(children=(Dropdown(description='index', options=('NEM_18930614_ARTICLE34', 'OW_18840830_ARTICLE81'…

<function __main__.print_article_text(index, dataframe)>

 - 'LT_18960724_ARTICLE18' concerns the 'Temple of Truth', which possibly encouraged me to include it as philosophy, but is not otherwise like the articles I've labelled as philosophy. (similarly 'LT_18930606_ARTICLE41')
 - 'WH_18710211_ARTICLE3' I've already noted is a bit dubious.
 - 'ODT_18740209_ARTICLE10' also dubious. 
 - 'CHP_18721024_ARTICLE19' I would like this one in. (Mentions of metaphysics etc... Place of intellectual life in NZ)
 - 'ESD_18890826_ARTICLE1' very similar to politics ones I've explicitly excluded in the dataset, so not surprising it's not picked up. I think I included it simply because of Royce appearing.
 - 'AS_18990419_ARTICLE4' I'd also like this. Unfortunately probably 'too political'.
 - 'OW_18880608_ARTICLE103' philosophical content buried in 'proceedings'.

In [133]:
# Articles labelled non-philosophy that the model predicted as philosophy.
spurious_mask = test_df['Philosophy'].eq(False) & test_df['Predicted'].eq(True)
false_positives = test_df.loc[spurious_mask]

In [137]:
false_positives

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type,Predicted
ODT_18980222_ARTICLE27,,,False,,True,■ In the Board Koom of «ie Agricultural Hall' ...,,True
LT_18700603_ARTICLE8,,,False,,True,About a year ago we had the ‘ privi lege of no...,,True
CHP_18970706_ARTICLE58,,,False,,True,"TO THE EDITOR OF THE PRESS. Sir, —Had Mr Nolan...",,True
DSC_18710429_ARTICLE26,,,False,,True,"TO THE EDITOR; Sir, — ""If the trumpet give an ...",,True
ESD_18891218_ARTICLE59,,,False,,True,On Sunday evening the Rev. E. D. Ceo J preache...,,True
ODT_18850204_ARTICLE30,,Education,False,,True,"TO THE EDITOB, Sib,—lt is somewhat curious tha...",,True
HBH_18840609_ARTICLE20,,,False,,True,"Sir,— ln your issue of May 30th I ob serve you...",,True
NZTIM_18961126_ARTICLE29,,,False,,True,"TO THE EDITOR. Sxe,—"" Temperance,"" in his open...",,True
BH_18880626_ARTICLE16,,,False,,True,"To the Editor,\nSir, — That a man utterly devo...",,True
DTN_18890125_ARTICLE15,,,False,,True,"AN EXPOSITION.\nMr. A. G. Daniells, ono of the...",,True


In [144]:
# Same browser as for the false negatives, now over the false positives.
# Note: rebinds the global `indices`.
indices = false_positives.index
interact(print_article_text, index=indices, dataframe=fixed(false_positives))

interactive(children=(Dropdown(description='index', options=('ODT_18980222_ARTICLE27', 'LT_18700603_ARTICLE8',…

<function __main__.print_article_text(index, dataframe)>

 - Theosophical lecture - ODT_18980222_ARTICLE27
 - NZ institute - LT_18700603_ARTICLE8 - not surprising as I've included others of these which had some more clear bit of philosophy.
 - CHP_18980319_ARTICLE7 a piece of fiction about a young Hegelian (not in the strict sense)

Some sermons and theosophy which I excluded on the grounds that they were more 'straight sermons' than the ones I had labelled as philosophy. There's nothing in here that I would be unhappy to be in a philosophy corpus though.

I'm going to save the model optimised for overall accuracy. I think I've erred in my labelling on both sides as well.

In [153]:
# Persist the refitted best pipeline from the accuracy grid search.
# The 'classifiers/' directory must already exist; open() does not create it.
with open('classifiers/NB_2.pickle', 'wb') as fout:
    pickle.dump(gs_clf.best_estimator_, fout)

## Try SVMs

I'm keen to try a slightly more 'complex' method, which is said to perform better for text classification. This is done with the proviso that the above false negatives and positives are (pretty much) all arguable cases.

In [160]:
# Same vectoriser and tf-idf settings as the Naive Bayes pipeline, with a
# default-parameter support vector classifier as the final step.
svc_vectoriser = CountVectorizer(
    stop_words=STOPWORDS,
    token_pattern=r'(?u)\b\w\w\w+\b',
    min_df=9,
    max_df=0.4,
)
SVC_Pipeline = Pipeline(steps=[
    ('vect', svc_vectoriser),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])

In [161]:
SVC_Pipeline.fit(training_features, training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.4,
                                 max_features=None, min_df=9,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",...
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=1.0, break_ties=False, cache_siz

In [162]:
svc_pred = SVC_Pipeline.predict(test_features)

In [163]:
svc_confusion_matrix = metrics.confusion_matrix(test_labels, svc_pred)
svc_confusion_matrix

array([[188,   7],
       [ 34,  43]])

In [164]:
# Flatten the 2x2 matrix row by row: [0,0]=tn, [0,1]=fp, [1,0]=fn, [1,1]=tp.
tn, fp, fn, tp = svc_confusion_matrix.ravel()
svc_recall = tp / (tp + fn)
svc_precision = tp / (tp + fp)
svc_accuracy = (tp + tn) / (tp + fp + tn + fn)
print(
    f'accuracy: {svc_accuracy}',
    f'recall: {svc_recall}',
    f'precision: {svc_precision}',
    sep='\n',
)

accuracy: 0.8492647058823529
recall: 0.5584415584415584
precision: 0.86


Well that's dreadful. Will have to think about this model a bit.

## Apply Model with Best Params to Subset of Total Dataset

Some of these include articles that will be in the test set. I don't think this is a big deal.

In [2]:
# If necessary reload model:
with open('classifiers/NB_2.pickle', 'rb') as fin:
  phil_classifier_2 = pickle.load(fin)

In [24]:
corpus_subset_df = pd.read_pickle('pickles/corpus_2000per_subset_df.pickle')

In [27]:
len(corpus_subset_df)

17949

In [28]:
# New column holding the result of NL_helpers.blocks2string for each article;
# the original 'Text' column is left as it is.
corpus_subset_df['Text as string'] = corpus_subset_df['Text'].apply(NL_helpers.blocks2string)

In [3]:
def filter_short_articles(string, min_length=800):
    """Blank out articles that are too short to classify.

    string: the article text.
    min_length: minimum number of characters an article must have to be kept.
        Defaults to 800, the threshold used throughout this notebook.

    Returns `string` unchanged when it has at least `min_length` characters,
    otherwise the empty string (callers drop empty strings afterwards).
    """
    return string if len(string) >= min_length else ''

In [30]:
# Replace texts shorter than 800 characters with '' so the next cell can drop them.
corpus_subset_df['Text as string'] = corpus_subset_df['Text as string'].map(filter_short_articles)

In [31]:
# Keep only the articles whose text survived the length filter. A boolean mask
# replaces the in-place drop-by-label; .copy() gives an independent frame, so
# the column assignments in later cells do not raise chained-assignment warnings.
corpus_subset_df = corpus_subset_df.loc[corpus_subset_df['Text as string'] != ''].copy()

In [32]:
# Model input for the subset, prepared the same way as the training features.
corpus_subset_features = corpus_subset_df['Text as string'].astype('string').to_numpy()

In [33]:
# NOTE(review): this predicts with `phil_classifier` (the pipeline fitted earlier
# in this notebook), not with `phil_classifier_2`, the saved best-parameter model
# reloaded at the top of this section. In a fresh kernel that only runs the
# reload cell, `phil_classifier` is undefined. Confirm which model is intended.
predicted_subset = phil_classifier.predict(corpus_subset_features)

In [34]:
# Label each prediction with its article code at construction time. Without the
# index the Series carries a 0..n-1 RangeIndex, which would misalign with
# corpus_subset_df if it were used before being re-indexed.
predictions = pd.Series(data=predicted_subset, index=corpus_subset_df.index)

In [35]:
predictions.value_counts()

False    10670
True       747
dtype: int64

In [36]:
# Give the predictions the article codes as index so they align with
# corpus_subset_df (both are in the same row order).
predictions.index = corpus_subset_df.index

In [37]:
# Boolean-mask selection: display the rows whose prediction is True.
corpus_subset_df.loc[predictions]

Unnamed: 0,Title,Text,Newspaper,Date,Tokenised,Text as string
BH_18740911_ARTICLE23,Invercargill,"[(Frbrriburbwri'Gbrresponderit.), The* attenti...",,,,(Frbrriburbwri'Gbrresponderit.)\nThe* attentio...
CHP_18970902_ARTICLE28,THE QUEEN AND HER PEOPLE.,[The following is the Queen's Jubilee letter.—...,,,,"The following is the Queen's Jubilee letter.—""..."
BH_18811209_ARTICLE27,HOW IT ALL CAME ABOUT.,[1; ! Bj; a prtfce-s (O- 5 -*ydlut_b_i ;; and'...,,,,1; ! Bj; a prtfce-s (O- 5 -*ydlut_b_i ;; and' ...
CHP_18970206_ARTICLE29,"""PERFIDIOUS ALBION.""",[It is seldom that any nation has an opportuni...,,,,It is seldom that any nation has an opportunit...
CHP_18990714_ARTICLE48,RELIGIOUS EDUCATION.,[A- meeting of ministers representing the wiou...,,,,A- meeting of ministers representing the wious...
...,...,...,...,...,...,...
ODT_18980810_ARTICLE58,BIBLE IN SCHOOLS.,"[•. , .... , ~r ,j TO. THE EDITOR. ~.. ;.■■■],...",ODT,18980810,"[editor, sißj, quite, corre, spondentti, pippa...","•. , .... , ~r ,j TO. THE EDITOR. ~.. ;.■■■],...."
ODT_18980531_ARTICLE21,THE SITUATION IN ITALY.,"[A LETTER FROM THE POPE,, Press Association—By...",ODT,18980531,"[letter, pope, press, association, telegraph, ...","A LETTER FROM THE POPE,\nPress Association—By ..."
ODT_18980810_ARTICLE15,SUNDAY NIGHT CONCERTS. TO THE EDITOR.,"[•Sis.^Every.nbw' and thea-atteirjpti,■•«•.» m...",ODT,18980810,"[sis, every, nbw, thea, atteirjpti, made, less...","•Sis.^Every.nbw' and thea-atteirjpti,■•«•.» ma..."
ODT_18980131_ARTICLE30,"""CIVIS"" AND DR STORDEUR'S LECTURES.","[TO THE JBBITOS. . ' Sir,—May I crave space in...",ODT,18980131,"[jbbitos, sir, may, crave, space, columns, pap...","TO THE JBBITOS. . ' Sir,—May I crave space in ..."


In [38]:
# Article codes look like '<NEWSPAPER>_<YYYYMMDD>_ARTICLE<n>'.
# str.partition splits at the first underscore and, unlike slicing with
# str.find (which returns -1 when there is no underscore), cannot produce a
# silently truncated value for a malformed code: such a code yields the whole
# string as the newspaper and '' as the date.
corpus_subset_df['Newspaper'] = corpus_subset_df.index.map(
    lambda code: code.partition('_')[0])
corpus_subset_df['Date'] = corpus_subset_df.index.map(
    lambda code: code.partition('_')[2][:8])

In [39]:
# Store the predictions as a column; assignment aligns on the shared article-code index.
corpus_subset_df['Philosophy(pred)'] = predictions

In [42]:
# Remove the working columns before saving; 'Text' is kept.
corpus_subset_df = corpus_subset_df.drop(['Tokenised', 'Text as string'], axis=1)

In [43]:
# Save the subset with its predictions (pandas infers gzip compression from the '.gz' suffix).
corpus_subset_df.to_pickle('pickles/preds_nb_1.tar.gz')

Let's check out the results:

In [240]:
# Browse a random sample of the articles predicted to be philosophy.
# The sample size is capped at the number of positives (sample(n=500) raises
# when fewer are available) and seeded so the same articles come up on re-run.
predicted_phil = predictions[predictions == True]
phil_indices = list(
    predicted_phil.sample(n=min(500, len(predicted_phil)), random_state=0).index
)
interact(NL_helpers.html_text, index=phil_indices, dataframe=fixed(corpus_subset_df), boldface=fixed(None))

interactive(children=(Dropdown(description='index', options=('LT_18990206_ARTICLE20', 'LT_18960709_ARTICLE14',…

<function NL_helpers.html_text(index, dataframe, boldface=None)>

A multicore attempt at fitting model (via https://github.com/scikit-learn/scikit-learn/issues/7448).

Final result is a list of all articles classified as true along with 'Title' and 'Text' columns.

In [4]:
# Paths of the corpus slices to classify. os.listdir returns entries in
# arbitrary order, so the names are sorted to make the order of the results
# (and of the concatenated output) the same on every run.
slices_directory = '/home/joshua/Documents/data601_small_slices/'
slices = [
    os.path.join(slices_directory, path)
    for path in sorted(os.listdir(slices_directory))
]

In [5]:
def phil_from_slice(slice_path):
    """Return the articles in one corpus slice that are classified as philosophy.

    slice_path: path to a pickled DataFrame with 'Title' and 'Text' columns.

    Articles whose text is shorter than the filter_short_articles threshold are
    not classified. Returns the 'Title' and 'Text' columns of the rows that
    phil_classifier_2 predicts as True, or an empty frame with those columns
    when the slice has no article long enough to classify.
    """
    df = pd.read_pickle(slice_path)
    texts = (
        df['Text']
        .map(NL_helpers.blocks2string)
        .map(filter_short_articles)
    )
    texts = texts[texts != '']
    if texts.empty:
        # Nothing to classify; scikit-learn estimators reject zero-sample input,
        # so return an empty result instead of calling predict.
        return df[['Title', 'Text']].iloc[0:0]
    # Index the predictions by article code so they can select rows of df.
    is_phil = pd.Series(data=phil_classifier_2.predict(texts), index=texts.index)
    return df[['Title', 'Text']].loc[is_phil[is_phil].index]
    

In [6]:
# Classify the slices in parallel; results are collected in the order of `slices`.
phil_nb2 = []
if __name__ == '__main__':
    # Use a quarter of the cores, but never fewer than one worker:
    # os.cpu_count() // 4 is 0 on machines with fewer than four cores (Pool
    # rejects that), and os.cpu_count() can return None.
    n_workers = max(1, (os.cpu_count() or 1) // 4)
    with Pool(processes=n_workers) as pool:
        # imap yields results lazily, one finished slice at a time, in input order.
        phil_nb2.extend(pool.imap(phil_from_slice, slices))

In [8]:
# Total number of articles classified as philosophy across all slices.
total_phil = sum(len(slice_result) for slice_result in phil_nb2)

In [9]:
total_phil

44730

prev: 287832, new: 44730

In [10]:
# Stack the per-slice result frames into a single DataFrame.
all_phil_nb2 = pd.concat(phil_nb2)

In [11]:
all_phil_nb2

Unnamed: 0,Title,Text
LT_18940102_ARTICLE15,ROMANTIC WOMEN.,[Most women are inclined to be romantic. This ...
LT_18940102_ARTICLE26,"The Lyttelton Times. TUESDAY, JAN. 2, 1894.",[Whkn previously it Las been our task to look ...
LT_18940105_ARTICLE16,STATE EDUCATION.,"[TO THE EDITOR. Sib, —No doubt a large majorit..."
LT_18940108_ARTICLE22,"The Lyttelton Times. MONDAY. JAN. 8, 1894.",[Among the five bishops who passed away last y...
LT_18940109_ARTICLE7,THE TASK OF THE BIOLOGIST.,[No. 11. When chemistry had finished _ shaping...
...,...,...
ODT_18790120_ARTICLE11,THE BIBLE IN SCHOOLS.,[The Rev. Dr Roseby presched en this sub j set...
ODT_18790120_ARTICLE17,THE EARLY SETTLERS AND THE BIBLE. TO THE EDITOR.,"[Sir, —I was gratified to read in your leading..."
ODT_18790121_ARTICLE3,The Otago Daily Times. WITH WHICH IS INCORPORA...,[The Committee appointed by the Athenceum meet...
ODT_18790121_ARTICLE16,GODLESS EDUCATION. TO THE EDITOR,"[Sir,—l clxervo that the reft-reuce to tbo ""jj..."


In [12]:
# Save every article classified as philosophy, with its 'Title' and 'Text' columns.
all_phil_nb2.to_pickle('pickles/nb2_philoso_df.tar.gz')