In [27]:
from transformers import pipeline
import pandas as pd
from nltk.tokenize import sent_tokenize

In [4]:
# Read the Yelp review CSV into the DataFrame that the classification cells consume
df = pd.read_csv(filepath_or_buffer='../data/yelp_dataset/review_1819.csv')

In [38]:
# Build the sentiment-analysis pipeline from a pre-trained checkpoint.
# API reference: https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/pipelines#transformers.pipeline
# Alternative checkpoint (disabled):
#   pipe = pipeline(task='sentiment-analysis', model="roberta-large-mnli")
pipe = pipeline(
    'sentiment-analysis',
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_415']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Classification for the first 10 reviews: print each star rating next to the
# label and score the pipeline assigns to the full review text.
# head() stops at the end of the frame, so a CSV with fewer than 10 rows is
# handled without the IndexError that positional iloc[i] lookups would raise.
first_reviews = df.head(10)

print("stars\tlabel\t\tscore")
for text, stars in zip(first_reviews["text"], first_reviews["stars"]):
    # truncation=True clips over-long reviews to the model's maximum input
    # length instead of letting the call fail on them.
    prediction = pipe(text, truncation=True)[0]
    print(f"{stars}\t{prediction['label']}\t{prediction['score']}")


stars	label		score
3.0	POSITIVE	0.9051093459129333
4.0	POSITIVE	0.9173600077629089
5.0	POSITIVE	0.9991680383682251
5.0	POSITIVE	0.9993034601211548
4.0	POSITIVE	0.9998400211334229
5.0	POSITIVE	0.9998459815979004
1.0	NEGATIVE	0.9976266026496887
4.0	POSITIVE	0.99922776222229
5.0	POSITIVE	0.999864935874939
5.0	POSITIVE	0.9997729659080505


In [43]:
# Classification for the first 20 reviews, followed by a separate
# classification of every sentence in each review.
# head() stops at the end of the frame, so a CSV with fewer than 20 rows is
# handled without the IndexError that positional iloc[i] lookups would raise.
first_reviews = df.head(20)

print("stars\tlabel\t\tscore")
for text, stars in zip(first_reviews["text"], first_reviews["stars"]):
    # Whole-review prediction; truncation=True clips over-long reviews to the
    # model's maximum input length instead of letting the call fail on them.
    review_pred = pipe(text, truncation=True)[0]
    print(f"{stars}\t{review_pred['label']}\t{review_pred['score']}")

    sentences = sent_tokenize(text)
    # Skip the batched call when the tokenizer found no sentences: there is
    # nothing to classify for that review.
    sentence_preds = pipe(sentences, truncation=True) if sentences else []
    for s_idx, (s, sent_dict) in enumerate(zip(sentences, sentence_preds)):
        print(f"(s{s_idx})\t{sent_dict['label']}\t{sent_dict['score']}\t{s}")


stars	label		score
3.0	POSITIVE	0.9051093459129333
(s0)	NEGATIVE	0.992579996585846	If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end.
(s1)	POSITIVE	0.9997454285621643	We have tried it multiple times, because I want to like it!
(s2)	POSITIVE	0.9993190765380859	I have been to it's other locations in NJ and never had a bad experience.
(s3)	NEGATIVE	0.9920401573181152	The food is good, but it takes a very long time to come out.
(s4)	POSITIVE	0.9997740387916565	The waitstaff is very young, but usually pleasant.
(s5)	NEGATIVE	0.9994149208068848	We have just had too many experiences where we spent way too long waiting.
(s6)	NEGATIVE	0.9969869256019592	We usually opt for another diner or restaurant on the weekends, in order to be done quicker.
4.0	POSITIVE	0.9173600077629089
(s0)	POSITIVE	0.9679648280143738	I was really between 3 and 4 stars for this one.
(s1)	POSITIVE	0.9994314312934875	I LOVE the 96th street Naked Tchopstix so I was very excited