In [1]:
import spacy
import en_core_web_md
import re
import time

import pandas as pd

from pandarallel import pandarallel

nlp = en_core_web_md.load()

RE_STRING = '[.!?\\-]|[0-9]'

In [8]:
def entity_extract(strings):
    entities = [i.text for i in nlp(' '.join(strings)).ents]
    entities = set(entities)
    unwanted = []
    for entity in entities:
        if re.search(RE_STRING, entity):
            unwanted.append(entity)
    entities = entities.difference(unwanted)
    return entities

In [9]:
def propn_extract(strings):
    propns = [tok.string for tok in nlp(' '.join(strings)) if tok.pos_ == "PROPN"]
    propns = set(propns)
    unwanted = []
    for prop in propns:
        if len(prop) < 3 or re.search(RE_STRING, prop):
            unwanted.append(prop)
    propns = propns.difference(unwanted)
    return propns

In [2]:
pandarallel.initialize()

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
philoso_df = pd.read_pickle('pickles/rel_v2_philoso_df.tar.gz')

In [5]:
# Fix dump non-block list error (needed for v1)
# philoso_df['Text'] = philoso_df['Text'].map(lambda x: [x])

In [6]:
philoso_df

Unnamed: 0,Title,Text,Newspaper,Date,Text as String,Religion (pred)
LT_18940109_ARTICLE7,THE TASK OF THE BIOLOGIST.,[No. 11. When chemistry had finished _ shaping...,LT,18940109,No. 11. When chemistry had finished _ shaping;...,True
LT_18940115_ARTICLE5,THE THEORY OF EVOLUTION.,[SIGNIFICANT RUDIMENTS. [BY W.G.P.] No. lII.' ...,LT,18940115,SIGNIFICANT RUDIMENTS. [BY W.G.P.] No. lII.' '...,True
LT_18940129_ARTICLE48,THE THEORY OF EVOLUTION.,[A CHANGED WORLD. [BY W.G.P.] No. V. The bitte...,LT,18940129,A CHANGED WORLD. [BY W.G.P.] No. V. The bitter...,True
LT_18940219_ARTICLE18,HERESY.,"[TO THE EDITOR. Sir,— The action of the Presby...",LT,18940219,"TO THE EDITOR. Sir,— The action of the Presbyt...",True
LT_18940219_ARTICLE24,REV JAMES GIBB ON CHRISTIAN DUTY.,"[At St Andrew’s Presbyterian Church, yesterday...",LT,18940219,"At St Andrew’s Presbyterian Church, yesterday ...",True
...,...,...,...,...,...,...
ODT_18771031_ARTICLE30,THE INDIAN FAMINE.,[TO THE TJDITOR. Sir—l think I have_ fair caus...,ODT,18771031,TO THE TJDITOR. Sir—l think I have_ fair cause...,True
ODT_18771107_ARTICLE32,POPULAR CULTURE.,"[The. discourse of Mi* J^hn M >r!ev on "" I'opu...",ODT,18771107,"The. discourse of Mi* J^hn M >r!ev on "" I'opul...",True
ODT_18790115_ARTICLE20,MR BRIGHT AND THE ELDERS. TO THE EDITOR.,"[Sib,—However objectionable Mr Bright's title ...",ODT,18790115,"Sib,—However objectionable Mr Bright's title m...",True
ODT_18790118_ARTICLE23,UNTITLED,"[Sir,—Before the public can fairly under stand...",ODT,18790118,"Sir,—Before the public can fairly under stand ...",True


In [12]:
t0 = time.time()
philoso_df['Proper Nouns'] = philoso_df['Text'].parallel_apply(lambda x: propn_extract(x))
print(f'{t0 - time.time()}')

-161.71653985977173


In [13]:
philoso_df.to_pickle('pickles/rel_v2_propn_df.tar.gz')