In [1]:
import spacy
import en_core_web_md
import re
import time

import pandas as pd

from pandarallel import pandarallel

nlp = en_core_web_md.load()

RE_STRING = '[.!?\\-]|[0-9]'

In [2]:
pandarallel.initialize()

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
philoso_df = pd.read_pickle('pickles/nb2_philoso_df.tar.gz')

In [4]:
philoso_df

Unnamed: 0,Title,Text
LT_18940102_ARTICLE15,ROMANTIC WOMEN.,[Most women are inclined to be romantic. This ...
LT_18940102_ARTICLE26,"The Lyttelton Times. TUESDAY, JAN. 2, 1894.",[Whkn previously it Las been our task to look ...
LT_18940105_ARTICLE16,STATE EDUCATION.,"[TO THE EDITOR. Sib, —No doubt a large majorit..."
LT_18940108_ARTICLE22,"The Lyttelton Times. MONDAY. JAN. 8, 1894.",[Among the five bishops who passed away last y...
LT_18940109_ARTICLE7,THE TASK OF THE BIOLOGIST.,[No. 11. When chemistry had finished _ shaping...
...,...,...
ODT_18790120_ARTICLE11,THE BIBLE IN SCHOOLS.,[The Rev. Dr Roseby presched en this sub j set...
ODT_18790120_ARTICLE17,THE EARLY SETTLERS AND THE BIBLE. TO THE EDITOR.,"[Sir, —I was gratified to read in your leading..."
ODT_18790121_ARTICLE3,The Otago Daily Times. WITH WHICH IS INCORPORA...,[The Committee appointed by the Athenceum meet...
ODT_18790121_ARTICLE16,GODLESS EDUCATION. TO THE EDITOR,"[Sir,—l clxervo that the reft-reuce to tbo ""jj..."


In [7]:
t0 = time.time()
philoso_df['Proper Nouns'] = philoso_df['Text'].parallel_apply(lambda x: propn_extract(x))
print(f'{t0 - time.time()}')

-615.3583090305328


In [8]:
philoso_df

Unnamed: 0,Title,Text,Proper Nouns
LT_18940102_ARTICLE15,ROMANTIC WOMEN.,[Most women are inclined to be romantic. This ...,"{ia , atmos , ba , phere }"
LT_18940102_ARTICLE26,"The Lyttelton Times. TUESDAY, JAN. 2, 1894.",[Whkn previously it Las been our task to look ...,"{o£ , January , Las , temptation , Whkn , tbe ..."
LT_18940105_ARTICLE16,STATE EDUCATION.,"[TO THE EDITOR. Sib, —No doubt a large majorit...","{”—l , aa , LEV , News , Bible , Onward , bo ,..."
LT_18940108_ARTICLE22,"The Lyttelton Times. MONDAY. JAN. 8, 1894.",[Among the five bishops who passed away last y...,"{Trinity , tho , himaelf , Chrysostom , Ho , B..."
LT_18940109_ARTICLE7,THE TASK OF THE BIOLOGIST.,[No. 11. When chemistry had finished _ shaping...,"{archmopteryx , Ho , ne , thq , ia , comJ , th..."
...,...,...,...
ODT_18790120_ARTICLE11,THE BIBLE IN SCHOOLS.,[The Rev. Dr Roseby presched en this sub j set...,"{thsre , somrbody, kr;o«letlge , thai , truu ,..."
ODT_18790120_ARTICLE17,THE EARLY SETTLERS AND THE BIBLE. TO THE EDITOR.,"[Sir, —I was gratified to read in your leading...","{London , Thomas , Sir, West , Bible , Colony ..."
ODT_18790121_ARTICLE3,The Otago Daily Times. WITH WHICH IS INCORPORA...,[The Committee appointed by the Athenceum meet...,"{loco , tho , num , State , rea , Btule , lar ..."
ODT_18790121_ARTICLE16,GODLESS EDUCATION. TO THE EDITOR,"[Sir,—l clxervo that the reft-reuce to tbo ""jj...","{wh , tho , pur , tbo , aueh , Ko , uso , deuh..."


In [9]:
philoso_df.to_pickle('pickles/nb2_philoso_propn_df.tar.gz')

Starting again for entities (should make a function to do both).

In [10]:
philoso_df = pd.read_pickle('pickles/nb2_philoso_df.tar.gz')

In [11]:
t0 = time.time()
philoso_df['Entities'] = philoso_df['Text'].parallel_apply(lambda x: entity_extract(x))
print(f'{t0 - time.time()}')

-625.5614910125732


In [13]:
philoso_df

Unnamed: 0,Title,Text,Entities
LT_18940102_ARTICLE15,ROMANTIC WOMEN.,[Most women are inclined to be romantic. This ...,{}
LT_18940102_ARTICLE26,"The Lyttelton Times. TUESDAY, JAN. 2, 1894.",[Whkn previously it Las been our task to look ...,"{Whkn, Las, one, the year, a year, year, last ..."
LT_18940105_ARTICLE16,STATE EDUCATION.,"[TO THE EDITOR. Sib, —No doubt a large majorit...","{Anglicsns, the Church News, Onward New Zealan..."
LT_18940108_ARTICLE22,"The Lyttelton Times. MONDAY. JAN. 8, 1894.",[Among the five bishops who passed away last y...,"{democratic, Ho, daily, the day, Constantinopl..."
LT_18940109_ARTICLE7,THE TASK OF THE BIOLOGIST.,[No. 11. When chemistry had finished _ shaping...,"{earth, Ho, three, first, aj reptile’s, The “ ..."
...,...,...,...
ODT_18790120_ARTICLE11,THE BIBLE IN SCHOOLS.,[The Rev. Dr Roseby presched en this sub j set...,"{tho shop, Tlio State, Proverbj, ins'ancn, Dr ..."
ODT_18790120_ARTICLE17,THE EARLY SETTLERS AND THE BIBLE. TO THE EDITOR.,"[Sir, —I was gratified to read in your leading...","{Thomas Coull, Christian, the other evening, L..."
ODT_18790121_ARTICLE3,The Otago Daily Times. WITH WHICH IS INCORPORA...,[The Committee appointed by the Athenceum meet...,"{Satur days and, Btule, five days in the week,..."
ODT_18790121_ARTICLE16,GODLESS EDUCATION. TO THE EDITOR,"[Sir,—l clxervo that the reft-reuce to tbo ""jj...","{uso, bien dis pi«cesi, mada, uttonti'jn, Ko f..."


In [12]:
philoso_df.to_pickle('pickles/nb2_philoso_entities_df.tar.gz')

In [12]:
dog = nlp(' '.join(philoso_df['Text'][400]))

In [18]:
[tok.string for tok in dog if tok.pos_ == "PROPN"]

['NELSON',
 'TUESDAY',
 'JULY ',
 'HON',
 'MR',
 '. ',
 'STAFFORD',
 'Parliament ',
 'New ',
 'Zealand ',
 'Opposition ',
 'Mr. ',
 'Stafford ',
 'Opposition ',
 'Mr. ',
 'Stafford ',
 'Constitution ',
 'Act ',
 '-',
 'Maori',
 '| ',
 'Public ',
 'Debts ',
 'Act',
 'Crown ',
 'Mr. ',
 'Stafford ',
 'Governor',
 'Minis ',
 'ter',
 'Blue ',
 'Books',
 'Mr. ',
 'Stafford',
 'Blue ',
 'Book ',
 'vol',
 '. ',
 'D. ',
 'Mr. ',
 'Stafford',
 'Mr. ',
 'Disraeli',
 'Parliament',
 'Mr. ',
 'Disraeli ',
 'Mr. ',
 'Glad',
 'Ministry',
 'British ',
 'House ',
 'Commons',
 'Mr. ',
 'Gladstone',
 'hsw ',
 'hs ',
 'biffl',
 'effec ',
 'tive ',
 'New ',
 'Zealand ',
 'Constitution',
 'Mr. ',
 'Disraeli',
 'tem ',
 'porarily ',
 'Mr. ',
 'Stafford',
 'becks',
 'arid ',
 'Grattane']

In [13]:
dog

NELSON, TUESDAY, JULY 4, 1871. A HINT FOR THE HON. MR. STAFFORD. The Parliament of New Zealand will assemble in the course of a few weeks, and, doubtless some new mode of action will be required by the leader of the Opposition that is, or is to be. Last year, in the face of a coming election, Mr. Stafford modestly withdrew from his customary prominence, preferring to put to the front a somewhat rash but unsuspecting gentleman, who, during the session, performed the Opposition talking, and, in the election which followed, lost his seat. Mr. Stafford has tried so many political movements for popularity, 1 that most of them must now be somewhat stale, and can hardly again be decently employed. It requires invention to provide novelties. The extreme radicalism be avowed when the Constitution Act wag being considered, led a large section of the public to suppose that he was the man for i the people. That served its turn for the moment, and, of course, was duly abandoned. Of his anti-Maori, 

In [5]:
def entity_extract(strings):
    entities = [i.text for i in nlp(' '.join(strings)).ents]
    entities = set(entities)
    unwanted = []
    for entity in entities:
        if re.search(RE_STRING, entity):
            unwanted.append(entity)
    entities = entities.difference(unwanted)
    return entities

In [6]:
def propn_extract(strings):
    propns = [tok.string for tok in nlp(' '.join(strings)) if tok.pos_ == "PROPN"]
    propns = set(propns)
    unwanted = []
    for prop in propns:
        if len(prop) < 3 or re.search(RE_STRING, prop):
            unwanted.append(prop)
    propns = propns.difference(unwanted)
    return propns

In [28]:
philoso_df.to_pickle('pickles/philoso_propn_df.tar.gz')

In [47]:
philoso_df['Newspaper'] = philoso_df.index.map(lambda x: x[0:x.find('_')])
philoso_df['Date']=philoso_df.index.map(lambda x: x[x.find('_')+1:x.find('_')+9])

In [26]:
philoso_df = philoso_df[['Title', 'Text', 'Proper Nouns']]

In [27]:
philoso_df

Unnamed: 0,Title,Text,Proper Nouns
NA_18870416_ARTICLE4,FOEEIGN AGGRESSION.,"[Fob some time past the colonial defences, the...","{Ac , compet , thiat , loverty , Mansion , ihg..."
NA_18870813_ARTICLE27,WHANGAREI MUTUAL IMPROVEMENT ASSOCIATION.,"[On Friday evening the debate "" Moral Suasion""...","{ho , Carruth , b|e , Brooks , Sealey , Jjecti..."
NA_18871022_ARTICLE5,The Northern Advocate.,"[OUR COAL FIELDS. y, Ont Thursday last the Hon...","{Government , Point, Whan , Island, Hector, iA..."
NA_18880204_ARTICLE21,HEART WISDON.,[Unnumbered years have sped since first The bu...,"{vaia, ua , Kly, Springs , Life, Becauseljto ,..."
NA_18880623_ARTICLE29,OUR AUCKLAND LETTER. I,"[« (From our own Correspondent.) Auckland, Tue...","{Saturday, Majesty, Victorian , House , ths , ..."
...,...,...,...
ODT_18980917_ARTICLE15,BOOK NOTICES.,"["" Corleone."" By F. Marion Crawford. London: M...","{Bliss , Library, Corona , Lizzie , Creator , ..."
ODT_18980924_ARTICLE5,PASSING- NOTES.,"[It is difficult to believe, probably nobody, ...","{Grey, Cathedral , Governor, Brown, Dance , no..."
ODT_18981008_ARTICLE15,PUBLISHED BY SPECIAL ARRANGEMENT. WILD EELLIN ...,"[COPYRIGHT., CHAPTER XXXIIL—""SEI MIR GEG-, RUE...","{calceolarias , liavo , Somerled , Lord , Arch..."
ODT_18981013_ARTICLE51,BISMARCK.,"[THE ATTEMPT OP AN ESTIMATE, ,■'•,' (By O. E. ...","{Goethe , Cliurch, Protestant, Frederick , sio..."


In [54]:
list(philoso_df.iloc[0]['Entities'])

['the Mansion House',
 'years',
 'Un',
 'auch',
 'tjie mo st',
 'vio lence',
 'Germany',
 'iorm',
 'thiat labour',
 'unclad',
 'England',
 'fed',
 'Ac',
 'Europe',
 'ACes',
 'md',
 'Prance',
 'India',
 'indus',
 'Ori thia',
 'colonists',
 'Sweden',
 'the xercise dt',
 'one']

map may be the wrong thing here (map replaces values).