# Second Application of Naive Bayes to Philosophy/Non-Philosophy Classification

In [1]:
import os
import pickle
from multiprocessing import Pool

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from nltk.corpus import stopwords

import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

import pandas as pd

import NL_helpers

STOPWORDS = list(stopwords.words('english'))

## Load Labels and Corresponding Texts

Load labels with article codes.

In [2]:
annotated_df = pd.read_pickle('pickles/classified_df.pickle')

In [3]:
annotated_df

Unnamed: 0,Readable,Philosophy,Philosophy Type,Writing Type,NZ,Notes
LT_18971209_ARTICLE7,True,False,,,,
NZTIM_18860722_ARTICLE11,True,False,,,,
WC_18810721_ARTICLE7,True,False,,,,
DUNST_18980513_ARTICLE3,True,False,,,,
OAM_18960702_ARTICLE35,True,False,,,,
...,...,...,...,...,...,...
AG_18840116_ARTICLE5,True,False,,,,
MS_18830117_ARTICLE19,True,False,,,,
GRA_18970305_ARTICLE3,True,False,,,,
LT_18800611_ARTICLE5,True,False,,,,


Use article codes to load texts.

In [4]:
dataset_path = '/home/joshua/hdd/Datasets/papers-past/'

The total dataset is divided into nine pickles. The following code runs through each, and collects the text for any articles in the annotated df using a left join operation.

In [5]:
# Gather the 'Text' column for every annotated article. The corpus is stored
# as nine pickled chunks; each chunk is loaded in turn, left-joined onto the
# annotated article codes, and merged into annotated_df.
texts = pd.DataFrame(index=annotated_df.index)
for i in range(9):
    chunk_df = pd.read_pickle(f'{dataset_path}corpus_df_{i}.tar.gz')
    # Left join on the index: only article codes present in annotated_df are kept.
    chunk_texts = texts.join(chunk_df['Text'], how='left')
    # combine_first fills values missing from annotated_df with this chunk's
    # texts, leaving anything collected from earlier chunks untouched.
    annotated_df = annotated_df.combine_first(chunk_texts)
    # Drop the chunk before the next one is loaded; may aid python memory management.
    del chunk_texts, chunk_df

Change text from list to simple string

In [6]:
annotated_df['Text as String'] = annotated_df['Text'].map(NL_helpers.blocks2string)

In [8]:
annotated_df

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type,Text as String
LT_18971209_ARTICLE7,,,False,,True,[YOUR PALE AND SALLOW GIRLS NEED ATTENTION. TH...,,YOUR PALE AND SALLOW GIRLS NEED ATTENTION. THE...
NZTIM_18860722_ARTICLE11,,,False,,True,[By.Eleotbio Telegraph—Copyright. (BEDIEB’S TE...,,By.Eleotbio Telegraph—Copyright. (BEDIEB’S TEL...
WC_18810721_ARTICLE7,,,False,,True,"[July 18. ; The ""Very Eev. A. P. Stanley, Dean...",,"July 18. ; The ""Very Eev. A. P. Stanley, Dean ..."
DUNST_18980513_ARTICLE3,,,False,,True,"[(“ Weekly Press.”), There is something pathet...",,(“ Weekly Press.”)\nThere is something patheti...
OAM_18960702_ARTICLE35,,,False,,True,[A novel suggestion as to the division of 1 a ...,,A novel suggestion as to the division of 1 a y...
...,...,...,...,...,...,...,...,...
AG_18840116_ARTICLE5,,,False,,True,[Trains Leave Ashburton for Ohrist ; church an...,,Trains Leave Ashburton for Ohrist ; church and...
MS_18830117_ARTICLE19,,,False,,True,[At a meeting of representatives of the variou...,,At a meeting of representatives of the various...
GRA_18970305_ARTICLE3,,,False,,True,"[This day— For Dunedln, per Herald, at 11 am. ...",,"This day— For Dunedln, per Herald, at 11 am. F..."
LT_18800611_ARTICLE5,,,False,,True,"[LYTTELTON., arrived. ~ , T . in_Wallinffton. ...",,"LYTTELTON.\narrived. ~ , T . in_Wallinffton. 8..."


In [7]:
annotated_df.to_pickle('pickles/classified_with_text_df.pickle')

In [4]:
# annotated_df = pd.read_pickle('pickles/classified_with_text_df.pickle')

In [86]:
annotated_df['Philosophy'].value_counts()

False    620
True     299
Name: Philosophy, dtype: int64

I will use a 75/25 training/test split. Note the use of 'random_state' for reproducibility. This was not done for the previous classifier.

In [5]:
# Stratified 75/25 split: three quarters of each class go into the training set.
# The per-class sample size is (class size // 4) * 3, computed from the data;
# for the current labels (299 philosophy, 620 non-philosophy) this equals the
# previously hard-coded 299//4*3 and 620//4*3.
# random_state is fixed so the same rows are drawn on every run.
phil_rows = annotated_df.loc[annotated_df['Philosophy']==True]
nonphil_rows = annotated_df.loc[annotated_df['Philosophy']==False]
training_phil = phil_rows.sample(n=len(phil_rows)//4 * 3, random_state=1)
training_nonphil = nonphil_rows.sample(n=len(nonphil_rows)//4 * 3, random_state=1)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two samples
# in the same order (philosophy rows first).
training_df = pd.concat([training_phil, training_nonphil])
del phil_rows, nonphil_rows, training_phil, training_nonphil

In [6]:
# The test set is every annotated article that was not sampled for training.
# A vectorised index mask replaces the Python-level membership loop and keeps
# annotated_df's row order; .copy() gives an independent frame so later column
# assignments (e.g. adding predictions) do not act on a slice of annotated_df.
# NOTE(review): rows without a Philosophy label (the unreadable articles) are
# not in training_df, so they land in the test set as well.
test_df = annotated_df.loc[~annotated_df.index.isin(training_df.index)].copy()

In [7]:
training_df

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type,Text as String
OW_18930608_ARTICLE44,True,,True,e,True,"[Prof cßsor Harvey's Patent St If -applied, i'...",l,"Prof cßsor Harvey's Patent St If -applied, i'a..."
CROMARG_18930822_ARTICLE4,False,American attitudes to nature (quotes Emerson a...,True,o,True,"[Mr Gamaliel Bradford, juu., in liis paper on ...",f,"Mr Gamaliel Bradford, juu., in liis paper on ‘..."
OAM_18840416_ARTICLE16,,,True,r,True,[At the last meeting held in February by the V...,p,At the last meeting held in February by the Vi...
OO_18921224_ARTICLE2,,Mr Collins public debate in Oxford (existence ...,True,r,True,[The following is a brief abstract of the Deba...,p,The following is a brief abstract of the Debat...
NOT_18920816_ARTICLE22,,On secular education,True,e,True,"[vival of the fittest. Then there is the "" Psa...",f,"vival of the fittest. Then there is the "" Psal..."
...,...,...,...,...,...,...,...,...
CHP_18890103_ARTICLE6,,,False,,True,[T&*t*d4r (TMs Day) Peugato erives aboafe 8 a....,,T&*t*d4r (TMs Day) Peugato erives aboafe 8 a.m...
ODT_18980924_ARTICLE5,,,False,,True,"[It is difficult to believe, probably nobody, ...",,"It is difficult to believe, probably nobody\nd..."
LT_18990128_ARTICLE29,,,False,,True,[It will probably be- generally admitted that ...,,It will probably be- generally admitted that t...
GRA_18961216_ARTICLE12,,,False,,True,"[~ Adelaide, December 15. Towns, a cyclist, ro...",,"~ Adelaide, December 15. Towns, a cyclist, rod..."


In [420]:
test_df

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type,Text as String
ESD_18890912_ARTICLE37,,,False,,True,"[[By Onr. Special Reporters.!, OAMARU, Septemb...",,"[By Onr. Special Reporters.!\nOAMARU, Septembe..."
WT_18760815_ARTICLE1,,,,,False,[Mr X ITU rt-pjrts that at Mr Oibson'i clearin...,,Mr X ITU rt-pjrts that at Mr Oibson'i clearing...
LT_18941123_ARTICLE22,,Theosophy,True,r,True,"[THE COMIN6 RACE., revealing to the world one ...",f,THE COMIN6 RACE.\nrevealing to the world one o...
MEX_18960107_ARTICLE1,,,False,,True,[A G.zette extraordinary farther pro rogues Pa...,,A G.zette extraordinary farther pro rogues Par...
BH_18771127_ARTICLE14,,,,,False,[- ■■ ;• I Fob a distance al mg the riv*r abov...,,- ■■ ;• I Fob a distance al mg the riv*r above...
...,...,...,...,...,...,...,...,...
FS_18931026_ARTICLE21,,,False,,True,"[(?EB PBESS ASSOCIATION.) Auckland, October 25...",,"(?EB PBESS ASSOCIATION.) Auckland, October 25...."
HNS_18840917_ARTICLE30,,,,,False,"[, ' >± '_  , j ROME, September 15. Subsiden...",,", ' >± '_  , j ROME, September 15. Subsidenc..."
CHP_18831231_ARTICLE1,,,False,,True,"[Wat-ana __po_-—December 30.-;. ____.—We-iher,...",,"Wat-ana __po_-—December 30.-;. ____.—We-iher, ..."
FS_18970311_ARTICLE12,,,False,,True,[SHOCKING CRIME. A PHILANTHROPIST. (Per Press ...,,SHOCKING CRIME. A PHILANTHROPIST. (Per Press A...


Gonna try some class balance adjustment in a moment. Will just run the Naive Bayes first though.

In [386]:
training_df['Philosophy'].value_counts()

False    465
True     222
Name: Philosophy, dtype: int64

In [387]:
test_df['Writing Type'].value_counts()

f    30
l    20
p    18
r     9
Name: Writing Type, dtype: int64

In [389]:
test_df['Philosophy'].value_counts()

False    155
True      77
Name: Philosophy, dtype: int64

## Fit 'Readable' Model

An experiment: try to fit a model to detect poor OCR.

In [200]:
annotated_df['Readable'].value_counts()

True     918
False     41
Name: Readable, dtype: int64

Slightly different from the previous split. Will keep the classes roughly balanced by sampling a similar number from each class (32 readable and 24 non-readable articles).

In [353]:
# Small, roughly balanced training set for the OCR-quality model. Sizes are
# derived from the 41 unreadable articles: 41//10*8 = 32 readable rows and
# 41//10*6 = 24 unreadable rows. random_state is fixed for reproducibility.
training_readable = annotated_df.loc[annotated_df['Readable']==True].sample(n=41//10 * 8, random_state=1)
training_nonreadable = annotated_df.loc[annotated_df['Readable']==False].sample(n=41//10 * 6, random_state=1)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two samples
# in the same order (readable rows first).
readable_training_df = pd.concat([training_readable, training_nonreadable])
del training_readable, training_nonreadable

In [354]:
# Every annotated article that was not sampled for training is used for testing.
# A vectorised index mask replaces the Python-level membership loop and keeps
# annotated_df's row order; .copy() gives an independent frame.
readable_test_df = annotated_df.loc[~annotated_df.index.isin(readable_training_df.index)].copy()

On basis that non-readable are quite rare (from my exploration of random samples of the dataset), will just leave the test set as is.

In [355]:
readable_training_features = readable_training_df['Text as String'].astype('string').to_numpy()
readable_training_labels = readable_training_df['Readable'].to_numpy().astype(bool)

Will require quite different features. In this case, very rare non-word strings are likely to correspond to bad OCR.

In [356]:
# Pipeline for the readable/unreadable (OCR quality) experiment:
# token counts -> TF-IDF weighting -> multinomial Naive Bayes.
readable_classifier = Pipeline([
    ('vect', CountVectorizer(
        max_df=0.1,  # ignore tokens found in more than 10% of documents
        min_df=1,  # keep tokens even if they occur in a single document
        stop_words=STOPWORDS, 
        token_pattern=r'\S+')),  # a token is any run of non-whitespace characters
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [357]:
readable_classifier.fit(readable_training_features, readable_training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.1,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",...
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itse

In [358]:
readable_test_features = readable_test_df['Text as String'].astype('string').to_numpy()
readable_test_labels = readable_test_df['Readable'].to_numpy().astype(bool)
readable_predicted = readable_classifier.predict(readable_test_features)

In [359]:
np.mean(readable_predicted == readable_test_labels)

0.9811738648947951

In [360]:
metrics.confusion_matrix(readable_test_labels, readable_predicted)

array([[  2,  15],
       [  2, 884]])

In [361]:
# Hyper-parameter grid for the readable/unreadable pipeline. Keys follow the
# '<pipeline step>__<parameter>' convention, so 'vect', 'tfidf' and 'clf'
# refer to the steps of readable_classifier.
readable_parameters = {
    # n-gram ranges, from unigrams only up to 10-grams
    'vect__ngram_range': [(1, 1), (1,4), (1, 5), (1, 10)],
    'vect__lowercase': (True, False),
    'tfidf__use_idf': (True, False),
    # smoothing values for the Naive Bayes step
    'clf__alpha': [1.0e-10, 0.001, 0.01],
    # minimum number of documents a token must appear in
    'vect__min_df': [1, 2, 3, 5, 10],
    # maximum proportion of documents a token may appear in
    'vect__max_df': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
}

In [362]:
readable_gs_clf = GridSearchCV(readable_classifier, readable_parameters, cv=5, n_jobs=-1)
readable_gs_clf.fit(readable_training_features, readable_training_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.1,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [363]:
readable_gs_clf.best_score_

0.8560606060606061

In [364]:
readable_grid_predicted = readable_gs_clf.best_estimator_.predict(readable_test_features)

In [365]:
metrics.confusion_matrix(readable_test_labels, readable_grid_predicted)

array([[  5,  12],
       [ 77, 809]])

In [366]:
# List the best value found for each searched hyper-parameter, in alphabetical order.
for param_name in sorted(readable_parameters):
    print(f"{param_name}: {readable_gs_clf.best_params_[param_name]!r}")

clf__alpha: 1e-10
tfidf__use_idf: True
vect__lowercase: True
vect__max_df: 0.4
vect__min_df: 10
vect__ngram_range: (1, 1)


The amount of readable material we would lose by running this classifier makes it unworkable (the tuned model flags 77 readable test articles as unreadable). Instead, we'll just ignore the 'readable' tag in the following and handle bad OCR later.

## Set up Pipeline and Fit Philosophy Model

Tokeniser change from default: words of length 3+ rather than 2+.

In [401]:
# Pipeline for the philosophy/non-philosophy classifier:
# token counts -> TF-IDF weighting -> multinomial Naive Bayes.
phil_classifier = Pipeline([
    ('vect', CountVectorizer(
        max_df=0.4,  # ignore tokens found in more than 40% of documents
        min_df=9,  # ignore tokens found in fewer than 9 documents
        stop_words=STOPWORDS, 
        token_pattern=r'(?u)\b\w\w\w+\b')),  # tokens are words of three or more characters
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [11]:
training_features = training_df['Text as String'].astype('string').to_numpy()

In [12]:
training_labels = training_df['Philosophy'].to_numpy().astype(bool)

In [423]:
phil_classifier.fit(training_features, training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.4,
                                 max_features=None, min_df=9,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",...
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itse

In [424]:
test_features = test_df['Text as String'].astype('string').to_numpy()
test_labels = test_df['Philosophy'].to_numpy().astype(bool)
predicted = phil_classifier.predict(test_features)

In [425]:
np.mean(predicted == test_labels)

0.8676470588235294

In [426]:
metrics.confusion_matrix(test_labels, predicted)

array([[188,   7],
       [ 29,  48]])

In [427]:
len(phil_classifier['vect'].vocabulary_)

5232

This is classifying too many philosophy articles as non-philosophy. One way to deal with this is to fix the class balance to increase the prominence of philosophy articles to the classifier.

## Class Balance

I'm going to try to upsample the positives by randomly doubling 25 of them at a time until the number of philosophy articles in the training set is at least 375.

An earlier attempt upsampled to the point that philosophy outnumbered non-philosophy. I could not achieve a good balance of recall and precision in this case.

In [409]:
# Upsample the philosophy class in training_df. Each pass copies 25 randomly
# chosen philosophy rows under new '<article id>_upsample_<pass>' labels, and
# passes repeat until at least 375 philosophy rows are present.
# The sample is drawn from the current training_df, so rows copied in an
# earlier pass can themselves be chosen and copied again.
j = 0
balanced = False
while not balanced:
    # Seeding with the pass number makes every pass reproducible but distinct.
    indices_to_double = training_df[training_df['Philosophy']==True].sample(n=25, random_state=j).index
    for i in indices_to_double:
        training_df.loc[f'{i}_upsample_{j}'] = training_df.loc[i]
    # Stop once the philosophy count has reached the target.
    balanced = (training_df['Philosophy']==True).sum() >= 375
    j += 1

In [410]:
training_df['Philosophy'].value_counts()

False    465
True     397
Name: Philosophy, dtype: int64

In [411]:
training_df.to_pickle('pickles/nb2_training_df_upsampled_2.tar.gz')

In [412]:
training_features = training_df['Text as String'].astype('string').to_numpy()
training_labels = training_df['Philosophy'].to_numpy().astype(bool)

In [413]:
phil_classifier.fit(training_features, training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.4,
                                 max_features=None, min_df=9,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",...
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itse

In [414]:
test_features = test_df['Text as String'].astype('string').to_numpy()
test_labels = test_df['Philosophy'].to_numpy().astype(bool)
predicted = phil_classifier.predict(test_features)

In [415]:
np.mean(predicted == test_labels)

0.8663793103448276

In [416]:
confusion_matrix = metrics.confusion_matrix(test_labels, predicted)
confusion_matrix

array([[135,  20],
       [ 11,  66]])

In the array, the y axis is the true labels and the x axis is the predicted labels. This shows that 11 philosophy articles are being missed and 20 non-philosophy articles are being classified as philosophy.

## Grid Parameter Search

In [112]:
# Hyper-parameter grid for the philosophy pipeline. Keys follow the
# '<pipeline step>__<parameter>' convention, so 'vect', 'tfidf' and 'clf'
# refer to the steps of phil_classifier.
parameters = {
    # minimum number of documents a token must appear in
    'vect__min_df': [2, 5, 7, 10, 15],
    # maximum proportion of documents a token may appear in
    'vect__max_df': [0.2, 0.3, 0.4, 0.5, 0.6],
    # unigrams only, up to bigrams, up to trigrams
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__use_idf': (True, False),
    # smoothing values for the Naive Bayes step
    'clf__alpha': [0.5, 0.75, 1]
}

In [113]:
gs_clf = GridSearchCV(phil_classifier, parameters, cv=5, n_jobs=-1)
gs_clf.fit(training_features, training_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.4,
                                                        max_features=None,
                                                        min_df=9,
                                                        ngram_range=(1, 1),
                                                        prep

In [114]:
gs_clf.best_score_

0.8805820674821885

In [115]:
# List the best value found for each searched hyper-parameter, in alphabetical order.
for param_name in sorted(parameters):
    print(f"{param_name}: {gs_clf.best_params_[param_name]!r}")

clf__alpha: 0.5
tfidf__use_idf: True
vect__max_df: 0.4
vect__min_df: 2
vect__ngram_range: (1, 1)


In [433]:
predicted = gs_clf.best_estimator_.predict(test_features)

In [434]:
np.mean(predicted == test_labels)

0.8933823529411765

In [435]:
confusion_matrix = metrics.confusion_matrix(test_labels, predicted)
confusion_matrix

array([[181,  14],
       [ 15,  62]])

In [436]:
# Unpack the 2x2 confusion matrix: rows are true labels, columns are predicted
# labels, with the negative (non-philosophy) class first.
(tn, fp), (fn, tp) = confusion_matrix
# Recall: share of true philosophy articles that were found.
recall = tp / (tp + fn)
# Precision: share of predicted philosophy articles that really are philosophy.
precision = tp / (tp + fp)
print(f'recall: {recall}')
print(f'precision: {precision}')

recall: 0.8051948051948052
precision: 0.8157894736842105


## Explore Misclassifications

### In test set

Look at the false negatives for the overall accuracy model:

In [16]:
test_df['Predicted'] = predicted

In [17]:
false_negatives = test_df.loc[(test_df['Philosophy'] == True)&(test_df['Predicted'] == False)]

In [18]:
false_negatives['Philosophy Type'].value_counts()

e    13
o     2
Name: Philosophy Type, dtype: int64

Note: mostly ethics

In [440]:
false_negatives['Writing Type'].value_counts()

f    7
l    4
p    2
r    2
Name: Writing Type, dtype: int64

Seems to be mostly misclassifying first-order pieces.

In [19]:
false_negatives

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type,Text as String,Predicted
NZTIM_18780509_ARTICLE5,True,"First paras, discussion of liberalism and demo...",True,e,True,[It is not every day we find a ‘writer who has...,f,It is not every day we find a ‘writer who has ...,False
ODT_18830714_ARTICLE20,True,section on curiosity as virtue or vice,True,e,True,"[(Prom Otago Witness.) v, The political uaei o...",f,(Prom Otago Witness.) v\nThe political uaei of...,False
WI_18470303_ARTICLE5,True,Proposes philosophical reflections to be publi...,True,e,True,"[To the Editor of the *• Independent. , ', Sia...",l,"To the Editor of the *• Independent. , '\nSia....",False
DTN_18940820_ARTICLE7,True,Barely counts - asserts in context of NZ polit...,True,e,True,"[The Opposition are, like the conies, a feeble...",f,"The Opposition are, like the conies, a feeble ...",False
LT_18831025_ARTICLE34,True,Only final block,True,o,True,"[[non ora own oobbxspohdmt.], There is very li...",f,[non ora own oobbxspohdmt.]\nThere is very lit...,False
OO_18970911_ARTICLE4,False,Lady Cook,True,e,True,[j^isar^s specially for the 'Observer - JV: v ...,f,j^isar^s specially for the 'Observer - JV: v '...,False
LT_18980920_ARTICLE26,,Final 5 blocks - review of set of essays on et...,True,e,True,"[THE POWEES AND PEACE., Kaiser Wilhelm and Kin...",r,THE POWEES AND PEACE.\nKaiser Wilhelm and Kins...,False
AS_18820306_ARTICLE31,,,True,e,True,"[(To the Kditor.), Sin,—l trust you will permi...",l,"(To the Kditor.)\nSin,—l trust you will permit...",False
DSC_18600731_ARTICLE28,True,Deliberation on biculturalism.,True,o,True,"[, taeaty;.you'h»wi Ml oj>r^rtunity of telling...",p,\ntaeaty;.you'h»wi Ml oj>r^rtunity of telling ...,False
AS_18760720_ARTICLE20,True,,True,e,True,"[(To the Editor of the Star.), Sir,—l am a wom...",l,"(To the Editor of the Star.)\nSir,—l am a woma...",False


In [20]:
def print_article_text(index, dataframe):
    """Print one article: its id, its philosophy type and notes, then its text.

    Parameters
    ----------
    index : label
        Row label of the article in ``dataframe``.
    dataframe : pandas.DataFrame
        Frame with 'Text' (a sequence of text blocks), 'Philosophy Type'
        and 'Notes' columns.

    Returns
    -------
    None
        The article is written to standard output; blocks of 'Text' are
        separated by blank lines.
    """
    row = dataframe.loc[index]
    body = '\n\n'.join(row['Text'])
    header = f"{row['Philosophy Type']} -{row['Notes']} "
    # Layout: blank line, id, a line of four spaces, header, blank line, text,
    # and a closing newline.
    print('\n'.join(['', f"{index}", '    ', header, '', body, '']))
indices = false_negatives.index
interact(print_article_text, index=indices, dataframe=fixed(false_negatives))

interactive(children=(Dropdown(description='index', options=('NZTIM_18780509_ARTICLE5', 'ODT_18830714_ARTICLE2…

<function __main__.print_article_text(index, dataframe)>

What've we got:
* Composite piece: NZTIM_18780509_ARTICLE5, ODT_18830714_ARTICLE20, LT_18831025_ARTICLE34, LT_18980920_ARTICLE26, 
* Meta philosophy: WI_18470303_ARTICLE5 (proposes philosophical discussion about nature of education, doesn't do it tho). 
* Dubious: DTN_18940820_ARTICLE7 (Mill dead as door nail) (Passing reference).
* OO_18970911_ARTICLE4 Lady Cook, vices and virtues blackmail. Not similar to the others. (Possibly need to find more like this by relabelling?)
* AS_18820306_ARTICLE31: arg that atheist shouldn't be MP given current constitution. Not really satisfying definition.
* DSC_18600731_ARTICLE28: really gutting not to get this. Has a very narrative structure...
* LT_18970507_ARTICLE14: more ethics that it would be good to have. Args against capital punishment.
* NEM_18800301_ARTICLE9: politics, would be good to have.
* AS_18881121_ARTICLE77: More politics and with NZ connection. Again frustrating.
* ESD_18890826_ARTICLE1: Politics with NZ connection, less frustrating (more direct political discourse).
* AS_18760720_ARTICLE20: prob desirable.
* DSC_18601225_ARTICLE10: Also would be very good. NZ delib on sovereignty and biculturalism. 

NZTIM_18780509_ARTICLE5's philosophical bit is not particularly desirable anyway.

In [21]:
false_positives = test_df.loc[(test_df['Philosophy'] == False)&(test_df['Predicted'] == True)]

In [22]:
len(false_positives)

13

In [24]:
false_positives

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type,Text as String,Predicted
LT_18801016_ARTICLE32,,,False,,True,[1h i fallowing is the result of the Annuel Pa...,,1h i fallowing is the result of the Annuel Pas...,True
AG_18990504_ARTICLE7,,,False,,True,[The maintenance or the recovery of youth hag ...,,The maintenance or the recovery of youth hag e...,True
ODT_18981013_ARTICLE51,,,False,,True,"[THE ATTEMPT OP AN ESTIMATE, ,■'•,' (By O. E. ...",,"THE ATTEMPT OP AN ESTIMATE, ,■'•,' (By O. E. H...",True
NEM_18920606_ARTICLE29,,,False,,True,[The double anniversary of the Pope's birthday...,,The double anniversary of the Pope's birthday ...,True
ODT_18980407_ARTICLE89,,,False,,True,"[, The last number ,of the "" Christian Globs ""...",,", The last number ,of the "" Christian Globs "" ...",True
OW_18770317_ARTICLE59,,l,False,,True,"[TO THE EDITOK, Sir,— ln your report or Mr Bri...",,"TO THE EDITOK, Sir,— ln your report or Mr Brig...",True
ESD_18891218_ARTICLE59,,,False,,True,[On Sunday evening the Rev. E. D. Ceo J preach...,,On Sunday evening the Rev. E. D. Ceo J preache...,True
LWM_18950614_ARTICLE27,,Poetry,False,,True,"[LATE MOST REV. DR. MCUAN, First Bisiiop o» Dc...",,"LATE MOST REV. DR. MCUAN, First Bisiiop o» Dcs...",True
ODT_18850204_ARTICLE30,,Education,False,,True,"[TO THE EDITOB, Sib,—lt is somewhat curious th...",,"TO THE EDITOB, Sib,—lt is somewhat curious tha...",True
GRA_18960522_ARTICLE19,,,False,,True,[The possible] use of the process of photogz/f...,,The possible] use of the process of photogz/fe...,True


In [23]:
indices = false_positives.index
interact(print_article_text, index=indices, dataframe=fixed(false_positives))

interactive(children=(Dropdown(description='index', options=('LT_18801016_ARTICLE32', 'AG_18990504_ARTICLE7', …

<function __main__.print_article_text(index, dataframe)>

Having a look at these:
* List of graduations: LT_18801016_ARTICLE32,
* Quack medical material referencing the philosopher's stone: AG_18990504_ARTICLE7
* Religious material, not directly philosophical: ODT_18981013_ARTICLE51 (remembering Bismarck);  ODT_18980407_ARTICLE89 (in favour of theatre going)
* Catholic church vs. world: NEM_18920606_ARTICLE29 (dubious)
* letter about public lecture on infallibility and inspiration (should be in) OW_18770317_ARTICLE59
* Christian influence on society in general (prob counts as political philosophy) (ESD_18891218_ARTICLE59); 
* Poetry about clergy. LWM_18950614_ARTICLE27
* Material about denominational schooling (ODT_18850204_ARTICLE30)
* Totally off: material science concerning war ships ("GRA_18960522_ARTICLE19")
* Public meeting concerning a church body. Not particularly philosophical (White Cross Society) (ESD_18851028_ARTICLE1)
* Collection of sermon reports. Some on concept of 'knowledge' ('CHP_18951209_ARTICLE55')
* ESD_18960111_ARTICLE47 Good: material on liberal/conservative disputes over interpretation of Christianity (sermon).

The one false positive unaccounted for must be 'unreadable'.

In [445]:
straggler = test_df.loc[(test_df['Readable'] == False)&(test_df['Predicted'] == True)]

In [446]:
straggler

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type,Text as String,Predicted
ME_18870708_ARTICLE32,,,,,False,"[i . , .»■ ,'..'rr ..- .:rTTTT/i ••■ !■•; •.'....",,"i . , .»■ ,'..'rr ..- .:rTTTT/i ••■ !■•; •.'. ...",True


In [448]:
indices = straggler.index
interact(print_article_text, index=indices, dataframe=fixed(straggler))

interactive(children=(Dropdown(description='index', options=('ME_18870708_ARTICLE32',), value='ME_18870708_ART…

<function __main__.print_article_text(index, dataframe)>

Not pretty, but if you squint it looks like philosophy. Also good to know that all the other non-readables have been classified as non-philosophy.

### In training set.

## Save Model

I'm going to save the model optimised for overall accuracy. I think I've erred in my labelling on both sides as well.

In [144]:
with open('classifiers/NB_2_v2.pickle', 'wb') as fout:
    pickle.dump(gs_clf.best_estimator_, fout)

## Try SVMs

I'm keen to try a slightly more 'complex' method, which is said to perform better for text classification. This is done with the proviso that the above false negatives and positives are (pretty much) all arguable cases.

In [136]:
# Same vectoriser and TF-IDF settings as the Naive Bayes philosophy pipeline,
# with a support vector classifier as the final step.
SVC_Pipeline = Pipeline([
    ('vect', CountVectorizer(
        max_df=0.4,  # ignore tokens found in more than 40% of documents
        min_df=9,  # ignore tokens found in fewer than 9 documents
        stop_words=STOPWORDS, 
        token_pattern=r'(?u)\b\w\w\w+\b')),  # tokens are words of three or more characters
    ('tfidf', TfidfTransformer()),
    # polynomial kernel with regularisation strength C=1.0
    ('clf', SVC(kernel='poly',
                C=1.0
        )),
])

In [137]:
SVC_Pipeline.fit(training_features, training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.4,
                                 max_features=None, min_df=9,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",...
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=1.0, break_ties=False, cache_siz

In [138]:
svc_pred = SVC_Pipeline.predict(test_features)

In [139]:
svc_confusion_matrix = metrics.confusion_matrix(test_labels, svc_pred)
svc_confusion_matrix

array([[195,   0],
       [ 74,   3]])

In [140]:
# Unpack the 2x2 confusion matrix: rows are true labels, columns are predicted
# labels, with the negative (non-philosophy) class first.
(tn, fp), (fn, tp) = svc_confusion_matrix
# Recall: share of true philosophy articles that were found.
svc_recall = tp / (tp + fn)
# Precision: share of predicted philosophy articles that really are philosophy.
svc_precision = tp / (tp + fp)
# Accuracy: share of all test articles that were labelled correctly.
svc_accuracy = (tp + tn) / (tp + fp + tn + fn)
print(f'accuracy: {svc_accuracy}')
print(f'recall: {svc_recall}')
print(f'precision: {svc_precision}')

accuracy: 0.7279411764705882
recall: 0.03896103896103896
precision: 1.0


In [142]:
# Hyper-parameter grid for the SVC pipeline. Keys follow the
# `<pipeline step>__<parameter>` convention used by GridSearchCV.
svc_parameters = dict(
    vect__min_df=[2, 5, 7, 10, 15],
    vect__max_df=[0.2, 0.3, 0.4, 0.5, 0.6],
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    clf__C=[0.5, 0.75, 1, 2, 10],
    clf__kernel=['poly', 'linear', 'sigmoid', 'rbf'],
)

In [143]:
# Exhaustive 5-fold cross-validated search over `svc_parameters`; n_jobs=-1
# runs the fits on all available cores.
grid_svc_clf = GridSearchCV(
    estimator=SVC_Pipeline,
    param_grid=svc_parameters,
    cv=5,
    n_jobs=-1,
)
grid_svc_clf.fit(training_features, training_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.4,
                                                        max_features=None,
                                                        min_df=9,
                                                        ngram_range=(1, 1),
                                                        prep

In [145]:
# Predict the test texts with the best pipeline found by the grid search.
grid_svc_pred = grid_svc_clf.best_estimator_.predict(test_features)

In [150]:
# Rows are the true labels and columns the predicted labels, so [1][0] counts
# false negatives and [0][1] counts false positives.
grid_svc_confusion_matrix = metrics.confusion_matrix(test_labels, grid_svc_pred)
grid_svc_confusion_matrix

array([[186,   9],
       [ 21,  56]])

In [152]:
# Unpack the 2x2 confusion matrix: first row is true-negative / false-positive,
# second row is false-negative / true-positive.
(grid_tn, grid_fp), (grid_fn, grid_tp) = grid_svc_confusion_matrix

grid_svc_accuracy = (grid_tp+grid_tn)/(grid_tp+grid_fp+grid_tn+grid_fn)
grid_svc_recall = grid_tp/(grid_tp+grid_fn)
grid_svc_precision = grid_tp/(grid_tp+grid_fp)

for metric_name, metric_value in (
    ('accuracy', grid_svc_accuracy),
    ('recall', grid_svc_recall),
    ('precision', grid_svc_precision),
):
    print(f'{metric_name}: {metric_value}')

accuracy: 0.8897058823529411
recall: 0.7272727272727273
precision: 0.8615384615384616


In [157]:
# Mean cross-validated score of the best parameter combination. No `scoring`
# was passed to GridSearchCV, so this is the classifier's default (accuracy),
# measured on the training folds rather than the test set.
grid_svc_clf.best_score_

0.9269861540529641

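The cross-validated score on the training data (about 0.93) is higher than the accuracy on the test set (about 0.89). If anything, is this overfitting noisy training data?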

In [155]:
# Report the winning value for every hyper-parameter that was searched.
best_svc_params = grid_svc_clf.best_params_
for name in sorted(svc_parameters):
    print(f"{name}: {best_svc_params[name]!r}")

clf__C: 1
clf__kernel: 'linear'
vect__max_df: 0.6
vect__min_df: 2
vect__ngram_range: (1, 2)


Goes for linear kernel!

In [156]:
# Persist the best SVC pipeline found by the grid search.
svc_model_path = 'classifiers/SVC_2.pickle'
with open(svc_model_path, 'wb') as model_file:
    pickle.dump(grid_svc_clf.best_estimator_, model_file)

## Apply Model with Best Params to Dataset

Some of these include articles that will be in the test set. I don't think this is a big deal.

In [8]:
# Reload the saved naive Bayes pipeline (required when starting from a fresh kernel).
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('classifiers/NB_2_v2.pickle', 'rb') as model_file:
    phil_classifier_2 = pickle.load(model_file)

In [3]:
def filter_short_articles(string, min_length=800):
    """Blank out article texts that are too short to be worth classifying.

    Parameters
    ----------
    string : str
        The article text.
    min_length : int, default 800
        Minimum number of characters an article must have to be kept.

    Returns
    -------
    str
        ``string`` unchanged when it has at least ``min_length`` characters,
        otherwise the empty string.
    """
    return string if len(string) >= min_length else ''

A multicore attempt at applying the model (via https://github.com/scikit-learn/scikit-learn/issues/7448).

Final result is a list of all articles classified as true along with 'Title' and 'Text' columns.

In [4]:
slices_directory = '/home/joshua/Documents/data601_small_slices/'
# os.listdir returns entries in arbitrary order. Sort the paths so the slices
# (and therefore the collected results) are processed in a reproducible order.
slices = sorted(
    os.path.join(slices_directory, file_name)
    for file_name in os.listdir(slices_directory)
)

In [7]:
# This only takes one argument to enable the 'imap' later.
def phil_from_slice(slice_path):
    """Return the articles in one corpus slice that are classified as philosophy.

    Takes a single argument so that it can be handed directly to ``Pool.imap``.

    Parameters
    ----------
    slice_path : str
        Path to a pickled DataFrame with (at least) 'Title' and 'Text' columns.

    Returns
    -------
    pandas.DataFrame
        The 'Title' and 'Text' columns of every row that ``phil_classifier_2``
        predicts as positive. Empty (but with both columns) when the slice
        holds no article long enough to classify.
    """
    df = pd.read_pickle(slice_path)
    texts = (
        df['Text']
        .map(NL_helpers.blocks2string)
        .map(filter_short_articles)
    )
    # filter_short_articles turns short articles into ''; keep only the rest.
    candidates = texts[texts != '']
    if candidates.empty:
        # scikit-learn rejects zero-sample input, so skip prediction entirely.
        return df[['Title', 'Text']].iloc[0:0]
    predictions = pd.Series(
        data=phil_classifier_2.predict(candidates),
        index=candidates.index,
    )
    # Boolean-index the predictions by themselves to keep the positive rows.
    return df[['Title', 'Text']].loc[predictions[predictions].index]
    

In [8]:
phil_nb2 = []
if __name__ == '__main__':
    # Use about a quarter of the cores, but never fewer than one worker:
    # Pool(processes=0) raises ValueError, and os.cpu_count() may return None.
    n_workers = max(1, (os.cpu_count() or 1) // 4)
    with Pool(processes=n_workers) as pool:
        # imap yields one DataFrame per slice, in the same order as `slices`.
        phil_predictions = pool.imap(phil_from_slice, slices)
        phil_nb2.extend(phil_predictions)

In [9]:
# Count the philosophy-classified rows across every slice's result frame.
total_phil = sum(len(slice_result) for slice_result in phil_nb2)

In [10]:
# Total rows classified as philosophy, summed over all slices.
total_phil

36537

prev: 287832, new: 44730; prev_2: 44730, new_2: 36537.

In [11]:
# Stack the per-slice result frames into a single DataFrame.
all_phil_nb2 = pd.concat(phil_nb2)

In [14]:
# Drop rows whose index label (article code) has already been seen, keeping
# the first occurrence of each.
all_phil_nb2 = all_phil_nb2.loc[~all_phil_nb2.index.duplicated()]

In [15]:
# Number of rows left once repeated index labels have been removed.
len(all_phil_nb2)

31131

In [16]:
# Preview the articles classified as philosophy.
all_phil_nb2

Unnamed: 0,Title,Text
LT_18940102_ARTICLE15,ROMANTIC WOMEN.,[Most women are inclined to be romantic. This ...
LT_18940108_ARTICLE22,"The Lyttelton Times. MONDAY. JAN. 8, 1894.",[Among the five bishops who passed away last y...
LT_18940109_ARTICLE7,THE TASK OF THE BIOLOGIST.,[No. 11. When chemistry had finished _ shaping...
LT_18940115_ARTICLE5,THE THEORY OF EVOLUTION.,[SIGNIFICANT RUDIMENTS. [BY W.G.P.] No. lII.' ...
LT_18940124_ARTICLE6,THE THEORY OF EVOLUTION.,[THE CEADLE OP THOUGHT. [BY W.G.P.] No. IV. Iu...
...,...,...
ODT_18790118_ARTICLE23,UNTITLED,"[Sir,—Before the public can fairly under stand..."
ODT_18790120_ARTICLE11,THE BIBLE IN SCHOOLS.,[The Rev. Dr Roseby presched en this sub j set...
ODT_18790120_ARTICLE17,THE EARLY SETTLERS AND THE BIBLE. TO THE EDITOR.,"[Sir, —I was gratified to read in your leading..."
ODT_18790121_ARTICLE3,The Otago Daily Times. WITH WHICH IS INCORPORA...,[The Committee appointed by the Athenceum meet...


In [17]:
# Save the classified articles ('Title' and 'Text' columns) to disk.
# pandas infers the compression scheme from the file extension.
all_phil_nb2.to_pickle('pickles/nb2_philoso_df_v2.tar.gz')