In [1]:
import spacy
import en_core_web_md
import re
import time

import pandas as pd

from pandarallel import pandarallel

nlp = en_core_web_md.load()

RE_STRING = '[.!?\\-]|[0-9]'

In [5]:
def entity_extract(strings):
    entities = [i.text for i in nlp(' '.join(strings)).ents]
    entities = set(entities)
    unwanted = []
    for entity in entities:
        if re.search(RE_STRING, entity):
            unwanted.append(entity)
    entities = entities.difference(unwanted)
    return entities

In [6]:
def propn_extract(strings):
    propns = [tok.string for tok in nlp(' '.join(strings)) if tok.pos_ == "PROPN"]
    propns = set(propns)
    unwanted = []
    for prop in propns:
        if len(prop) < 3 or re.search(RE_STRING, prop):
            unwanted.append(prop)
    propns = propns.difference(unwanted)
    return propns

In [2]:
pandarallel.initialize()

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
philoso_df = pd.read_pickle('pickles/nb2_philoso_df_v2.tar.gz')

In [4]:
philoso_df

Unnamed: 0,Title,Text
LT_18940102_ARTICLE15,ROMANTIC WOMEN.,[Most women are inclined to be romantic. This ...
LT_18940108_ARTICLE22,"The Lyttelton Times. MONDAY. JAN. 8, 1894.",[Among the five bishops who passed away last y...
LT_18940109_ARTICLE7,THE TASK OF THE BIOLOGIST.,[No. 11. When chemistry had finished _ shaping...
LT_18940115_ARTICLE5,THE THEORY OF EVOLUTION.,[SIGNIFICANT RUDIMENTS. [BY W.G.P.] No. lII.' ...
LT_18940124_ARTICLE6,THE THEORY OF EVOLUTION.,[THE CEADLE OP THOUGHT. [BY W.G.P.] No. IV. Iu...
...,...,...
ODT_18790118_ARTICLE23,UNTITLED,"[Sir,—Before the public can fairly under stand..."
ODT_18790120_ARTICLE11,THE BIBLE IN SCHOOLS.,[The Rev. Dr Roseby presched en this sub j set...
ODT_18790120_ARTICLE17,THE EARLY SETTLERS AND THE BIBLE. TO THE EDITOR.,"[Sir, —I was gratified to read in your leading..."
ODT_18790121_ARTICLE3,The Otago Daily Times. WITH WHICH IS INCORPORA...,[The Committee appointed by the Athenceum meet...


In [10]:
t0 = time.time()
philoso_df['Entities'] = philoso_df['Text'].parallel_apply(lambda x: entity_extract(x))
print(f'{t0 - time.time()}')

-413.40772461891174


In [11]:
philoso_df.to_pickle('pickles/nb2_v2_entities_df.tar.gz')