In [1]:
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

# Upper bound on the number of rows pulled from the Parquet file.
ROWS_COUNT = 10_000

# Path to the Parquet dataset, relative to the notebook's working directory.
INPUT_DATA = "./resources/train-00000-of-00001.parquet"

# Take only the first record batch, which is capped at ROWS_COUNT rows.
pf = ParquetFile(INPUT_DATA)
parquet_rows = next(pf.iter_batches(batch_size=ROWS_COUNT))

# Wrap that single batch in a Table and convert it to a pandas DataFrame.
data_df = pa.Table.from_batches([parquet_rows]).to_pandas()

In [2]:
# Preview the first rows of the loaded frame as a quick sanity check.
data_df.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [3]:
import spacy
from spacy import displacy

# Sample review text used to eyeball the matcher output.
TEXT1 = "Got a letter as in the mail last week that said Dr. Goldberg is moving to Arizona to take a new position there in June. He will be missed very much. \n\nI think finding a new doctor in NYC that you actually like might almost be as awful as trying to find a date!"

# Entity-ruler patterns: each one matches a single token that is not a stop word
# and whose lowercase form starts with the given letter.
# NOTE(review): the regexes have no trailing `$`, so presumably any token that merely
# begins with the letter matches (e.g. "a1") — confirm this is intended.
PATTERNS = [
    {
        "label": "A_WORD",
        "pattern": [{"LOWER": {"REGEX": r"^[a][a-z]*"}, "IS_STOP": False}],
    },
    {
        "label": "T_WORD",
        "pattern": [{"LOWER": {"REGEX": r"^[t][a-z]*"}, "IS_STOP": False}],
    },
]

# Blank English pipeline with an entity ruler component that carries the patterns above.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(PATTERNS)

def demo_spacy_patterns(text):
    """Process ``text`` with the module-level ``nlp`` pipeline and render its entities.

    Args:
        text: Raw string to run through the pipeline.

    Rendering is delegated to ``displacy.render`` with ``style="ent"``; this
    function itself returns nothing.
    """
    displacy.render(nlp(text), style="ent")

# Render the entities found in the sample text.
demo_spacy_patterns(TEXT1)

In [4]:
def count_pattern_labels(texts):
    """Count how many entities of each label the ``nlp`` pipeline finds in ``texts``.

    Args:
        texts: Iterable of raw strings to process.

    Returns:
        A plain ``dict`` mapping each entity label to its number of occurrences
        across all texts, keyed in order of first appearance. The dict is empty
        when ``texts`` is empty or no entities are found.
    """
    counter = {}
    # nlp.pipe processes the texts as a batched stream, which is the recommended
    # way to handle many documents instead of calling nlp() once per text.
    for text_doc in nlp.pipe(texts):
        for ent in text_doc.ents:
            counter[ent.label_] = counter.get(ent.label_, 0) + 1
    return counter

# Count matched labels across every text in the loaded frame and print the totals.
input_texts = data_df["text"].values
counts = count_pattern_labels(input_texts)
print(counts)

{'T_WORD': 35802, 'A_WORD': 24160}
