In [1]:
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

# Location of the Parquet file and the maximum number of rows to read from it.
INPUT_DATA = "./resources/train-00000-of-00001.parquet"
ROWS_COUNT = 10_000

# Pull only the first record batch (at most ROWS_COUNT rows) instead of
# materialising the whole file.
pf = ParquetFile(INPUT_DATA)
parquet_rows = next(pf.iter_batches(batch_size=ROWS_COUNT))

# Wrap the single batch in a Table and convert it to a pandas DataFrame.
data_df = pa.Table.from_batches([parquet_rows]).to_pandas()

In [2]:
# Preview the first rows of the loaded sample (head() defaults to five rows).
data_df.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [3]:
import spacy
from spacy import displacy

# Sample review text passed to the matcher demo below.
TEXT1 = (
    "Got a letter as in the mail last week so that said Dr. Goldberg is moving "
    "to Arizona to take a new position there in June. He will be missed very "
    "much. \n\nI think finding a new doctor in NYC that you actually like might "
    "almost be as awful as trying to find a date!"
)

# Entity-ruler patterns: a single token whose lemma is one of the listed words
# is labelled EMOTION_WORD.
PATTERNS = [
    {
        "label": "EMOTION_WORD",
        "pattern": [{"LEMMA": {"IN": ["very", "much", "so", "good", "bad"]}}],
    },
]

# Load the English model without its statistical NER component, so doc.ents
# holds only the spans produced by the rule-based entity ruler. Excluding the
# component at load time avoids loading it just to remove it again.
nlp = spacy.load("en_core_web_md", exclude=["ner"])

# add_pipe appends to the end of the pipeline by default, so the ruler runs
# after the components that assign the lemmas its LEMMA patterns match on.
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(PATTERNS)

def demo_spacy_patterns(text):
    """Render the entities found in ``text`` with displaCy.

    Runs the notebook-level ``nlp`` pipeline on ``text`` and hands the
    resulting document to ``displacy.render`` using the ``"ent"`` style.
    """
    displacy.render(nlp(text), style="ent")

# Visualise the entity matches on the sample review text.
demo_spacy_patterns(TEXT1)

In [4]:
def count_patterns(dataframe, pipeline=None):
    """Count matched entity lemmas per rating.

    Args:
        dataframe: pandas DataFrame with a numeric ``label`` column and a
            ``text`` column. Each label is shifted by +1 to form the rating
            key used in the result.
        pipeline: object exposing spaCy's ``Language.pipe(texts)`` interface,
            yielding one document per text, each with an ``ents`` iterable
            whose items have a ``lemma_`` attribute. Defaults to the
            notebook-level ``nlp`` pipeline.

    Returns:
        dict mapping ``label + 1`` to a dict of ``{entity lemma: count}``.
        A rating that occurs in the data but has no matched entity maps to
        an empty dict.
    """
    if pipeline is None:
        # Resolved at call time so existing calls without the argument
        # keep using the notebook-level pipeline.
        pipeline = nlp

    # Read both columns once instead of building a Series per row via iloc.
    ratings = [label + 1 for label in dataframe["label"].tolist()]
    texts = dataframe["text"].tolist()

    counter = {}
    # pipe() streams the texts through the pipeline in batches, which is
    # cheaper than one pipeline call per document.
    for rate, text_doc in zip(ratings, pipeline.pipe(texts)):
        # Register the rating even when the document has no entities.
        rate_counts = counter.setdefault(rate, {})
        for ent in text_doc.ents:
            lemma = ent.lemma_
            rate_counts[lemma] = rate_counts.get(lemma, 0) + 1

    return counter

# Slow step: the original note here records a runtime of about 90 seconds
# (presumably for the loaded sample -- confirm before relying on it).
counts = count_patterns(data_df)

In [5]:
import pandas as pd

# counts is {rating: {lemma: count}}: the DataFrame constructor puts ratings in
# the columns, so transpose to get one row per rating, then order by rating.
counts_df = (
    pd.DataFrame(counts)
    .T
    .sort_index()
)

In [6]:
# Display the per-rating lemma count table.
counts_df

Unnamed: 0,very,good,so,bad,much
1,530,638,1173,724,287
2,891,1545,1504,552,540
3,966,2216,1493,387,515
4,977,1796,1176,166,347
5,643,1143,807,66,182
