In [1]:
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

# Parquet shard that holds the labelled review texts.
INPUT_DATA = "./resources/train-00000-of-00001.parquet"

# Upper bound on the number of rows pulled in for the benchmark.
ROWS_COUNT = 1_000

pf = ParquetFile(INPUT_DATA)

# Only the first record batch is consumed; the rest of the file is never
# read, so the sample holds at most ROWS_COUNT rows.
batch_iterator = pf.iter_batches(batch_size=ROWS_COUNT)
parquet_rows = next(batch_iterator)

# Wrap the single batch in an Arrow table so it can be converted to pandas.
data_df = pa.Table.from_batches([parquet_rows]).to_pandas()

In [2]:
# Preview the first rows of the loaded sample to check its columns.
data_df.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [3]:
# huge_document = " ".join(data_df['text'].values)

In [4]:
import spacy

# Single-token patterns handed to the entity ruler of every pipeline.
# Each one tests the lowercased token text against a regex that is anchored at
# the start only (there is no end anchor):
#   A_WORD - "a" followed by at least two more letters a-z
#   T_WORD - "t" followed by at least three more letters a-z
PATTERNS = [
    {
        "label": "A_WORD",
        "pattern": [{"LOWER": {"REGEX": r"^[a][a-z]{2,}"}}],
    },
    {
        "label": "T_WORD",
        "pattern": [{"LOWER": {"REGEX": r"^[t][a-z]{3,}"}}],
    },
]

In [5]:
# Setup

# Texts fed to every pipeline in the benchmark cells.
TEXTS = data_df['text'].values

# Value assigned to `max_length` on every pipeline.
MAX_DOC_LENGTH = 8_000_000


def _add_ruler(nlp):
    """Configure `nlp` for the benchmark and return its new entity ruler.

    Sets `nlp.max_length` to MAX_DOC_LENGTH, appends an "entity_ruler"
    component to the pipeline and loads PATTERNS into it.

    Keeping this in one place guarantees that all pipelines are configured
    identically.
    """
    nlp.max_length = MAX_DOC_LENGTH
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(PATTERNS)
    return ruler


nlp_blank = spacy.blank("en")
nlp_blank_ruler = _add_ruler(nlp_blank)

nlp_sm = spacy.load("en_core_web_sm")
nlp_sm_ruler = _add_ruler(nlp_sm)

nlp_md = spacy.load("en_core_web_md")
# Fix: this pipeline used to assign the misspelled attribute `max_lengqth`,
# which silently created a new attribute and left `max_length` unchanged.
nlp_md_ruler = _add_ruler(nlp_md)


In [6]:
%%time

# Blank pipeline, single process.
# list() drains the pipe() iterator so every text is processed inside the timed cell.
nlp_blank_docs = list(nlp_blank.pipe(TEXTS))

CPU times: user 1.28 s, sys: 32 ms, total: 1.31 s
Wall time: 1.31 s


In [7]:
%%time

# Blank pipeline, 4 worker processes (n_process=4).
# Reassigns nlp_blank_docs, replacing the single-process result from the previous cell.
nlp_blank_docs = list(nlp_blank.pipe(TEXTS, n_process=4))

CPU times: user 1.1 s, sys: 124 ms, total: 1.23 s
Wall time: 2.94 s


In [8]:
%%time

# en_core_web_sm pipeline, single process.
# list() drains the pipe() iterator so every text is processed inside the timed cell.
nlp_sm_docs = list(nlp_sm.pipe(TEXTS))

CPU times: user 10.9 s, sys: 547 ms, total: 11.5 s
Wall time: 11.5 s


In [9]:
%%time

# en_core_web_sm pipeline, 4 worker processes (n_process=4).
# Reassigns nlp_sm_docs, replacing the single-process result from the previous cell.
nlp_sm_docs = list(nlp_sm.pipe(TEXTS, n_process=4))

CPU times: user 835 ms, sys: 132 ms, total: 967 ms
Wall time: 6.95 s


In [10]:
%%time

# en_core_web_md pipeline, single process.
# list() drains the pipe() iterator so every text is processed inside the timed cell.
nlp_md_docs = list(nlp_md.pipe(TEXTS))

CPU times: user 11.5 s, sys: 212 ms, total: 11.7 s
Wall time: 11.7 s


In [11]:
%%time

# en_core_web_md pipeline, 4 worker processes (n_process=4).
# Reassigns nlp_md_docs, replacing the single-process result from the previous cell.
nlp_md_docs = list(nlp_md.pipe(TEXTS, n_process=4))

CPU times: user 863 ms, sys: 124 ms, total: 987 ms
Wall time: 7.61 s


In [12]:
def count_ents(docs):
    """Return the total number of entities across ``docs``.

    Every element of ``docs`` must expose an ``ents`` collection; the result
    is the sum of ``len(doc.ents)`` over all elements (0 when ``docs`` is
    empty).
    """
    return sum(len(doc.ents) for doc in docs)

# Total number of entities each pipeline produced over its docs.
blank_ents_count = count_ents(nlp_blank_docs)
sm_ents_count = count_ents(nlp_sm_docs)
md_ents_count = count_ents(nlp_md_docs)

# Report in the same order as the benchmark cells: blank, sm, md.
print(f"blank_ents_count: {blank_ents_count}")
print(f"sm_ents_count: {sm_ents_count}")
print(f"md_ents_count: {md_ents_count}")

blank_ents_count: 19605
sm_ents_count: 24419
md_ents_count: 24636


**Tech specs**  
CPU: AMD Ryzen 7 5800H (8 cores / 16 virtual cores)  
GPU: absent  

**Data**  
1_000 small text documents  


**Results**

```
====================================================================
| MODEL | Single process  | Multi-process (4 processes) | Entities |
|=======|=================|=============================|==========|
| Blank | 1.3 sec         | 2.9 sec                     | 19605    |
| SM    | 11.5 sec        | 6.9 sec                     | 24419    |
| MD    | 11.7 sec        | 7.61 sec                    | 24636    |
====================================================================
```