In [1]:
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

# Location of the parquet shard and the batch size used when reading it.
INPUT_DATA = "./resources/train-00000-of-00001.parquet"
ROWS_COUNT = 10_000

# Pull only the first record batch (batch_size=ROWS_COUNT) rather than
# materialising the whole file.
pf = ParquetFile(INPUT_DATA)
parquet_rows = next(pf.iter_batches(batch_size=ROWS_COUNT))

# A single batch is wrapped in a Table so it can be converted to pandas.
data_df = pa.Table.from_batches([parquet_rows]).to_pandas()

In [2]:
# Preview the first five rows of the loaded frame.
data_df.head(n=5)

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [10]:
# Contents of the 'text' column as an array; this is what gets piped through spaCy.
TEXTS = data_df.loc[:, 'text'].values

# Score contributed by each matched lemma when a document's polarity is summed.
POLARITY_SCORE_DICT = {
    "very": 1,
    "much": 1,
    "so": 1,
    "good": 5,
    "bad": -5,
    "happy": 5,
    "awful": -5,
    "awesome": 5,  # previously misspelled "awsome", so the real lemma never scored
    "strange": -5
}

# Entity-ruler patterns: a single one-token pattern labelling any token whose
# lemma is a scored word as EMOTION_WORD.
# The lemma list is derived from POLARITY_SCORE_DICT so the two cannot drift
# apart: previously only five of the scored words were matched, which left the
# remaining dictionary entries without any effect on the polarity.
PATTERNS = [
    {
        "label": "EMOTION_WORD",
        "pattern": [
            {
                "LEMMA": {"IN": list(POLARITY_SCORE_DICT)}
            },
        ]
    }
]

# Sample review used to demo the entity ruler.
TEXT1 = "Got a letter as in the mail very last week so that said Dr. Goldberg is moving to Arizona to take a new position there in June. He will be missed very much. \n\nI think finding a new doctor in NYC that you actually like might almost be as awful as trying to find a date!"

In [4]:
from spacy.language import Language
from spacy.tokens import Doc

def classify_polarity_scores(compound):
    """Map a numeric polarity score to a sentiment label.

    Args:
        compound: Numeric polarity score.

    Returns:
        "POSITIVE" when the score is above zero, "NEGATIVE" when it is below
        zero, and "NEUTRAL" otherwise (a score of exactly zero).
    """
    if compound > 0:
        return "POSITIVE"
    # An explicit `< 0` test is required here: a plain `else` would label a
    # zero score as NEGATIVE and make the NEUTRAL label unreachable.
    if compound < 0:
        return "NEGATIVE"
    return "NEUTRAL"

class PolarityComponent:
    """Pipeline component that stores a summed polarity score on each Doc.

    The score is written to the custom ``doc._.polarity`` attribute and is
    the sum of the POLARITY_SCORE_DICT values of every entity in ``doc.ents``
    whose lemma is a key of that dictionary.
    """

    def __init__(self, nlp: Language):
        """Register the ``polarity`` Doc extension (default 0.0) if it is missing.

        Args:
            nlp: The pipeline the component is created for; not used here.
        """
        already_registered = Doc.has_extension("polarity")
        if not already_registered:
            Doc.set_extension("polarity", default=0.0)

    def __call__(self, doc: Doc) -> Doc:
        """Compute the polarity of ``doc`` and return the same Doc.

        Args:
            doc: Document whose entities are scored.

        Returns:
            The input ``doc`` with ``doc._.polarity`` set.
        """
        total = 0.0

        for span in doc.ents:
            score = POLARITY_SCORE_DICT.get(span.lemma_)
            if score is None:
                # Entity lemma is not a scored word; it contributes nothing.
                continue
            total += score

        doc._.polarity = total
        return doc

In [5]:
from spacy.language import Language
from spacy.tokens import Doc

@Language.factory("polarity")
def create_polarity_component(nlp: Language, name: str):
    """Factory registered under the name "polarity".

    Args:
        nlp: Pipeline the component is being added to.
        name: Name given to the component instance; not used here.

    Returns:
        A new PolarityComponent built from ``nlp``.
    """
    component = PolarityComponent(nlp)
    return component

In [6]:
import spacy

# Start from the "en_core_web_md" pipeline and extend it with two components.
nlp_custom = spacy.load("en_core_web_md")

# Entity ruler registered as "emotion_ruler", placed right after "ner",
# and loaded with the emotion-word patterns.
ruler = nlp_custom.add_pipe("entity_ruler", name="emotion_ruler", after="ner")
ruler.add_patterns(PATTERNS)

# The polarity component gets its own name so that `ruler` keeps referring to
# the entity ruler instead of being silently rebound to a different object.
polarity_component = nlp_custom.add_pipe("polarity", after="emotion_ruler")

# Display the resulting component order.
nlp_custom.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'emotion_ruler',
 'polarity']

In [9]:
from spacy import displacy

def demo_emotion_ruler(text):
    """Run ``text`` through ``nlp_custom`` and render its entities with displaCy.

    Args:
        text: Raw text to process.
    """
    displacy.render(nlp_custom(text), style="ent")


demo_emotion_ruler(TEXT1)

In [12]:
%%time

# Process every text with the custom pipeline across 4 worker processes and
# keep the resulting Doc objects in a list.
nlp_docs = [processed for processed in nlp_custom.pipe(TEXTS, n_process=4)]

CPU times: user 18.9 s, sys: 1.08 s, total: 20 s
Wall time: 40.8 s


In [19]:
# Print the predicted label next to the original text for the leading documents.
for doc in nlp_docs[:5]:
    polarity = classify_polarity_scores(doc._.polarity)
    text = doc.text

    print(f"Polarity: '{polarity}',\nText:\n{text}\n")

Polarity: 'POSITIVE',
Text:
dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.

Polarity: 'POSITIVE',
Text:
Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't