# First Application of Naive Bayes to Philosophy/Non-Philosophy Classification

In [43]:
import os
import pickle
from multiprocessing import Pool

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from nltk.corpus import stopwords

import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

import pandas as pd

import NL_helpers

# English stop-word list from NLTK, copied into a fresh list.
STOPWORDS = [*stopwords.words('english')]
# Same list plus 'philosophy', used to check whether that one word drives the classifier.
STOPWORDS_2 = [*STOPWORDS, 'philosophy']

## Load Labels and Corresponding Texts

Load labels with article codes.

In [2]:
# Labels for the annotated articles; the index holds the article codes that are
# used below to look up each article's text.
annotated_df = pd.read_pickle('pickles/classified_df.pickle')

In [None]:
annotated_df

Use article codes to load texts.

In [4]:
# Directory holding the corpus pickles. The trailing slash matters: file paths
# are built from this value by plain string concatenation.
dataset_path = '/home/joshua/hdd/Datasets/papers-past/'

The total dataset is divided into nine pickles. The following code runs through each one and collects the text of every article in the annotated dataframe using a left join operation.

In [None]:
# Empty frame carrying only the annotated article codes; left-joining a corpus
# part onto it keeps exactly the annotated rows.
texts = pd.DataFrame(index=annotated_df.index)
for part in range(9):
    part_df = pd.read_pickle(f'{dataset_path}corpus_df_{part}.tar.gz')
    # combine_first only fills cells that are still missing in annotated_df.
    annotated_df = annotated_df.combine_first(texts.join(part_df['Text']))
    del part_df  # release this corpus part before the next one is loaded

Change text from list to simple string

In [None]:
# Convert each article's text to a single string using the project helper.
# NOTE(review): re-running this cell passes already-converted strings to
# blocks2string — confirm the helper tolerates that, or run the cell only once.
annotated_df['Text'] = annotated_df['Text'].map(NL_helpers.blocks2string)

In [None]:
# Cache the labelled frame together with its text so the corpus join above
# does not have to be repeated.
annotated_df.to_pickle('pickles/classified_with_text_df.pickle')

In [3]:
# Optional shortcut: uncomment to load the cached frame (labels plus text)
# instead of rebuilding it from the corpus pickles.
# annotated_df = pd.read_pickle('pickles/classified_with_text_df.pickle')

In [5]:
# Class balance of the labels. value_counts() leaves out missing values by
# default, so rows without a Philosophy label are not counted here.
annotated_df['Philosophy'].value_counts()

False    147
True     101
Name: Philosophy, dtype: int64

I will go for a 75/25 training/test split on the philosophy observations. I will do the same on 100 of the non-philosophy and then add the remaining 47 non-philosophy articles to the test set. This should somewhat mimic the prevalence of non-philosophy articles in the overall dataset.

In [6]:
# Fixed seed so the split, and every metric derived from it, can be reproduced
# after a kernel restart.
SPLIT_SEED = 0

# Balanced training set: 75 philosophy and 75 non-philosophy articles.
# The explicit ==True / ==False comparisons leave out rows that have no label.
# pd.concat replaces DataFrame.append, which recent pandas releases no longer provide.
training_df = pd.concat([
    annotated_df.loc[annotated_df['Philosophy']==True].sample(n=75, random_state=SPLIT_SEED),
    annotated_df.loc[annotated_df['Philosophy']==False].sample(n=75, random_state=SPLIT_SEED),
])

In [7]:
# Every annotated article that was not sampled for training becomes test data.
# A vectorised membership mask replaces the Python-level loop over the index
# and keeps the rows in their original order.
test_df = annotated_df.loc[~annotated_df.index.isin(training_df.index)]

In [8]:
test_df

Unnamed: 0,NZ,Notes,Philosophy,Philosophy Type,Readable,Text,Writing Type
WC_18810721_ARTICLE7,,,False,,True,"July 18. ; The ""Very Eev. A. P. Stanley, Dean ...",
OW_18800717_ARTICLE97,,,False,,True,"1465. Charade —By Albert E Hardy, Oamaru :—\nM...",
ESD_18890912_ARTICLE37,,,False,,True,"[By Onr. Special Reporters.!\nOAMARU, Septembe...",
HBH_18860916_ARTICLE10,,,False,,True,"(Itccoivcd Soptombor la, 2.15 p.m.)\nMklhourxk...",
WT_18760815_ARTICLE1,,,,,False,Mr X ITU rt-pjrts that at Mr Oibson'i clearing...,
...,...,...,...,...,...,...,...
NOT_18760629_ARTICLE9,True,"Good grump about Duncan MacGregor, first phil ...",True,r,True,"Sib,— The Rev. A. B. Todd, in your issue of Ju...",l
FS_18830811_ARTICLE11,True,,True,r,True,' The Rev. Mr Murray delivered bis s lecture a...,p
ODT_18990920_ARTICLE15,True,,True,e,True,"TQ THE EDITOK.\nSir, —The famous saying of the...",l
MEX_18900430_ARTICLE32,,,True,r,True,"What etands ont as the one greot, estab lished...",f


## Set up Pipeline and Fit Model

Tokeniser change from default: words of length 3+ rather than 2+.

In [5]:
# Bag-of-words counts -> tf-idf weighting -> multinomial naive Bayes.
phil_classifier = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            # Tokens must be at least three word characters long.
            token_pattern=r'(?u)\b\w\w\w+\b',
            stop_words=STOPWORDS,
            # Keep terms that occur in at least 9 documents ...
            min_df=9,
            # ... and in at most 40% of them.
            max_df=0.4,
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [10]:
# Raw training documents as an array of strings, one entry per article.
training_features = training_df['Text'].astype('string').to_numpy()

In [11]:
# Matching boolean targets: True marks an article labelled as philosophy.
training_labels = training_df['Philosophy'].to_numpy().astype(bool)

In [12]:
training_labels

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [13]:
# Fit all three pipeline steps (vectoriser, tf-idf, naive Bayes) on the training split.
phil_classifier.fit(training_features, training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.4,
                                 max_features=None, min_df=9,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",...
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itse

In [14]:
# Raw held-out documents as an array of strings, one entry per article.
test_features = test_df['Text'].astype('string').to_numpy()

In [15]:
# Boolean targets for the held-out articles.
# NOTE(review): test_df includes rows with no Philosophy label (the displayed
# frame shows one marked Readable=False). astype(bool) coerces such values
# silently (NaN -> True, None -> False) — confirm how they should be scored.
test_labels = test_df['Philosophy'].to_numpy().astype(bool)

In [16]:
# Predicted label for every held-out article.
predicted = phil_classifier.predict(test_features)

In [17]:
# Accuracy: the share of held-out articles whose prediction matches the label.
(predicted == test_labels).mean()

0.9024390243902439

In [18]:
# Rows are the true classes and columns the predicted classes, both ordered
# [False, True]: entry [0, 1] counts false positives, entry [1, 0] false negatives.
metrics.confusion_matrix(test_labels, predicted)

array([[88,  9],
       [ 3, 23]])

In [19]:
# Number of distinct terms kept by the fitted vectoriser.
len(phil_classifier['vect'].vocabulary_)

1458

## Grid Parameter Search

In [20]:
# Search space for the pipeline; each key is `<step name>__<parameter name>`.
parameters = dict(
    vect__min_df=list(range(7, 13)),
    vect__max_df=[0.2, 0.3, 0.4, 0.5],
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    # Worry: presence of the word 'philosophy' is doing all the work, so the
    # search also tries a stop-word list that removes it.
    vect__stop_words=[STOPWORDS, STOPWORDS_2],
    tfidf__use_idf=(True, False),
)

In [21]:
# Exhaustive search over `parameters` with 5-fold cross-validation on the
# training split only; n_jobs=-1 runs the fits on all available cores.
gs_clf = GridSearchCV(phil_classifier, parameters, cv=5, n_jobs=-1)
gs_clf.fit(training_features, training_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.4,
                                                        max_features=None,
                                                        min_df=9,
                                                        ngram_range=(1, 1),
                                                        prep

In [25]:
# Mean cross-validated score of the best parameter combination.
gs_clf.best_score_

0.8933333333333333

In [26]:
# Report the winning value of every searched parameter, in alphabetical order.
for param_name in sorted(parameters):
    print(f"{param_name}: {gs_clf.best_params_[param_name]!r}")

tfidf__use_idf: True
vect__max_df: 0.2
vect__min_df: 7
vect__ngram_range: (1, 1)
vect__stop_words: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 

In [46]:
# Persist the fitted pipeline so later sessions can reuse it.
# NOTE(review): this stores `phil_classifier` (built with max_df=0.4, min_df=9),
# not the grid search's best estimator — confirm which model should be applied
# to the wider corpus.
with open('classifiers/NB_1.pickle', 'wb') as fout:
  pickle.dump(phil_classifier, fout)

## Apply Model with Best Params to Subset of Total Dataset

Some of these include articles that will be in the test set. I don't think this is a big deal.

In [17]:
# If necessary reload model.
# Only unpickle files you wrote yourself: pickle.load can execute arbitrary code.
with open('classifiers/NB_1.pickle', 'rb') as fin:
  phil_classifier = pickle.load(fin)

In [24]:
# Pre-built subset of the full corpus on which the classifier is tried out.
corpus_subset_df = pd.read_pickle('pickles/corpus_2000per_subset_df.pickle')

In [27]:
len(corpus_subset_df)

17949

In [28]:
# Add a string version of each article's text in a new column; the original
# 'Text' column is left untouched.
corpus_subset_df['Text as string'] = corpus_subset_df['Text'].apply(NL_helpers.blocks2string)

In [6]:
def filter_short_articles(string, min_length=800):
    """Replace texts shorter than ``min_length`` characters with an empty string.

    Callers drop the resulting empty strings, so this acts as a length filter
    that can be applied element-wise with ``Series.map``.

    Parameters
    ----------
    string : str
        Article text.
    min_length : int, default 800
        Minimum number of characters a text needs in order to be kept.

    Returns
    -------
    str
        ``string`` unchanged when it has at least ``min_length`` characters,
        otherwise ``''``.
    """
    return string if len(string) >= min_length else ''

In [30]:
# Blank out articles under the length threshold; the empty strings mark rows to drop.
corpus_subset_df['Text as string'] = corpus_subset_df['Text as string'].map(filter_short_articles)

In [31]:
# Remove the rows whose text was blanked out by the length filter.
corpus_subset_df = corpus_subset_df.drop(
    index=corpus_subset_df.loc[corpus_subset_df['Text as string'] == ''].index
)

In [32]:
# Remaining article texts as an array of strings, in the same order as corpus_subset_df.
corpus_subset_features = corpus_subset_df['Text as string'].astype('string').to_numpy()

In [33]:
# One predicted label per remaining article in the subset.
predicted_subset = phil_classifier.predict(corpus_subset_features)

In [34]:
# Wrap the raw prediction array in a Series (default integer index for now).
predictions = pd.Series(data=predicted_subset)

In [35]:
# How many subset articles were predicted philosophy (True) versus not (False).
predictions.value_counts()

False    10670
True       747
dtype: int64

In [36]:
# The predictions are positional; give them the article codes so they line up
# with corpus_subset_df row for row.
predictions.index = corpus_subset_df.index

In [37]:
# Boolean-mask selection: display only the rows predicted to be philosophy.
corpus_subset_df.loc[predictions]

Unnamed: 0,Title,Text,Newspaper,Date,Tokenised,Text as string
BH_18740911_ARTICLE23,Invercargill,"[(Frbrriburbwri'Gbrresponderit.), The* attenti...",,,,(Frbrriburbwri'Gbrresponderit.)\nThe* attentio...
CHP_18970902_ARTICLE28,THE QUEEN AND HER PEOPLE.,[The following is the Queen's Jubilee letter.—...,,,,"The following is the Queen's Jubilee letter.—""..."
BH_18811209_ARTICLE27,HOW IT ALL CAME ABOUT.,[1; ! Bj; a prtfce-s (O- 5 -*ydlut_b_i ;; and'...,,,,1; ! Bj; a prtfce-s (O- 5 -*ydlut_b_i ;; and' ...
CHP_18970206_ARTICLE29,"""PERFIDIOUS ALBION.""",[It is seldom that any nation has an opportuni...,,,,It is seldom that any nation has an opportunit...
CHP_18990714_ARTICLE48,RELIGIOUS EDUCATION.,[A- meeting of ministers representing the wiou...,,,,A- meeting of ministers representing the wious...
...,...,...,...,...,...,...
ODT_18980810_ARTICLE58,BIBLE IN SCHOOLS.,"[•. , .... , ~r ,j TO. THE EDITOR. ~.. ;.■■■],...",ODT,18980810,"[editor, sißj, quite, corre, spondentti, pippa...","•. , .... , ~r ,j TO. THE EDITOR. ~.. ;.■■■],...."
ODT_18980531_ARTICLE21,THE SITUATION IN ITALY.,"[A LETTER FROM THE POPE,, Press Association—By...",ODT,18980531,"[letter, pope, press, association, telegraph, ...","A LETTER FROM THE POPE,\nPress Association—By ..."
ODT_18980810_ARTICLE15,SUNDAY NIGHT CONCERTS. TO THE EDITOR.,"[•Sis.^Every.nbw' and thea-atteirjpti,■•«•.» m...",ODT,18980810,"[sis, every, nbw, thea, atteirjpti, made, less...","•Sis.^Every.nbw' and thea-atteirjpti,■•«•.» ma..."
ODT_18980131_ARTICLE30,"""CIVIS"" AND DR STORDEUR'S LECTURES.","[TO THE JBBITOS. . ' Sir,—May I crave space in...",ODT,18980131,"[jbbitos, sir, may, crave, space, columns, pap...","TO THE JBBITOS. . ' Sir,—May I crave space in ..."


In [38]:
# Article codes look like '<newspaper>_<yyyymmdd>_ARTICLE<n>': the newspaper is
# everything before the first underscore, the date the 8 characters after it.
corpus_subset_df['Newspaper'] = [
    code[:code.find('_')] for code in corpus_subset_df.index
]
corpus_subset_df['Date'] = [
    code[code.find('_') + 1:code.find('_') + 9] for code in corpus_subset_df.index
]

In [39]:
# Column assignment aligns on the index, so this relies on `predictions`
# already carrying the article codes as its index.
corpus_subset_df['Philosophy(pred)'] = predictions

In [42]:
# Discard the working columns before saving the predictions.
corpus_subset_df = corpus_subset_df.drop(columns=['Tokenised', 'Text as string'])

In [43]:
# Save the subset together with its predicted labels.
# NOTE(review): compression is inferred from the file suffix, and pandas
# releases appear to differ in how they treat '.tar.gz' (plain gzip vs a tar
# archive) — confirm the file is read back with a matching pandas version.
corpus_subset_df.to_pickle('pickles/preds_nb_1.tar.gz')

Let's check out the results:

In [240]:
# Browse a random sample of up to 500 articles predicted to be philosophy.
# Capping n avoids the ValueError that sample() raises when fewer than 500
# articles were predicted True.
predicted_phil = predictions[predictions==True]
phil_indices = list(predicted_phil.sample(n=min(500, len(predicted_phil))).index)
interact(NL_helpers.html_text, index=phil_indices, dataframe=fixed(corpus_subset_df), boldface=fixed(None))

interactive(children=(Dropdown(description='index', options=('LT_18990206_ARTICLE20', 'LT_18960709_ARTICLE14',…

<function NL_helpers.html_text(index, dataframe, boldface=None)>

A multicore attempt at applying the fitted model to the full corpus (via https://github.com/scikit-learn/scikit-learn/issues/7448).

Final result is a list of all articles classified as true along with 'Title' and 'Text' columns.

In [12]:
slices_directory = '/home/joshua/Documents/data601_small_slices/'
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# slices, and therefore of the combined results, reproducible across runs.
# os.path.join also works whether or not the directory ends with a slash.
slices = [
    os.path.join(slices_directory, name)
    for name in sorted(os.listdir(slices_directory))
]

In [36]:
def phil_from_slice(slice_path):
    """Return the articles in one corpus slice that are predicted to be philosophy.

    The slice is read from ``slice_path``, each article's text is converted to
    a string, articles blanked out by ``filter_short_articles`` are discarded,
    and the remaining texts are classified with the module-level
    ``phil_classifier``.

    Parameters
    ----------
    slice_path : str
        Path to a pickled DataFrame with at least 'Title' and 'Text' columns.

    Returns
    -------
    pandas.DataFrame
        The 'Title' and 'Text' columns of the rows predicted True. Empty (with
        the same two columns) when no article in the slice is long enough.
    """
    df = pd.read_pickle(slice_path)
    texts = (
        df['Text']
        .map(NL_helpers.blocks2string)
        .map(filter_short_articles)
    )
    texts = texts[texts != '']
    if texts.empty:
        # The classifier rejects zero-sample input, so a slice with no article
        # long enough would otherwise abort the whole multiprocessing run.
        return df[['Title', 'Text']].iloc[0:0]
    is_phil = np.asarray(phil_classifier.predict(texts), dtype=bool)
    # Predictions are positional, so use them as a mask over the surviving labels.
    return df.loc[texts.index[is_phil], ['Title', 'Text']]
    

In [37]:
# One DataFrame of predicted-philosophy articles per slice, in slice order.
phil_nb1 = []
if __name__ == '__main__':
    n_workers = os.cpu_count()//4 + 4
    with Pool(processes=n_workers) as pool:
        # imap yields results lazily and in input order; consume them while
        # the pool is still open.
        phil_nb1.extend(pool.imap(phil_from_slice, slices))

In [38]:
# Total number of articles predicted to be philosophy across all slices.
total_phil = sum(map(len, phil_nb1))

In [40]:
total_phil

287832

In [39]:
# Stack the per-slice results into one frame.
all_phil_nb1 = pd.concat(phil_nb1)

In [41]:
all_phil_nb1

Unnamed: 0,Title,Text
LT_18940102_ARTICLE9,HOW THEY GET ENGAGED IN GREENLAND.,[The missionaries in Greenland seem to have a ...
LT_18940102_ARTICLE12,SHEARING WET SHEEP.,"[TO THE EDITOR. Sir, —It being a matter of gre..."
LT_18940102_ARTICLE13,WORK AND WAGES.,"[TO THE EDITOR. Sir, —It appears that no cry o..."
LT_18940102_ARTICLE15,ROMANTIC WOMEN.,[Most women are inclined to be romantic. This ...
LT_18940102_ARTICLE41,IN MEMORIAM.,"[BISHOP HARPER. Ob., I)K. SS, 1533. “Go forth ..."
...,...,...
ODT_18790121_ARTICLE9,THE PRESBYTERIAN SYNOD OF OTAGO AND SOUTHLAND.,[FIFTH DAY. The Syrorl resumed its transaction...
ODT_18790121_ARTICLE15,THE SECOLD CHAIR OF MORAL PHILOSOPHY AND POLIT...,"[TO TJIE EDITOR, Sir,—You must not suppose iro..."
ODT_18790121_ARTICLE16,GODLESS EDUCATION. TO THE EDITOR,"[Sir,—l clxervo that the reft-reuce to tbo ""jj..."
ODT_18790121_ARTICLE17,THE BIBLE SCHOOLS. TO THE EDITOR.,"[Sia,—lt may not be out of place, in the prese..."


In [42]:
# Save every article predicted to be philosophy ('Title' and 'Text' columns).
all_phil_nb1.to_pickle('pickles/nb1_philoso_df.tar.gz')