In [1]:
# from transformers import TFAutoModelForSequenceClassification
# from transformers import AutoTokenizer
# import re

from transformers import pipeline
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AutoModelForSequenceClassification

import pandas as pd

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# BERT model
# LABEL_0                 -> negative
# LABEL_1                 -> positive
# confidence score < .1  -> neutral
model1 = BertForSequenceClassification.from_pretrained('./bertmodel_imdb_trained')

model2 = BertForSequenceClassification.from_pretrained('./bertmodel_tweets_trained')

model3 = BertForSequenceClassification.from_pretrained('./bertmodel_rottentomatos_trained')

In [4]:
ensembleModels = [
  pipeline(
    "sentiment-analysis",
    model = model1,
    tokenizer = tokenizer,
  ),

  pipeline(
    "sentiment-analysis",
    model = model2,
    tokenizer = tokenizer,
  ),

  pipeline(
    "sentiment-analysis",
    model = model3,
    tokenizer = tokenizer,
  ),
]

In [5]:
def classifyDoc(doc):
  # [negative, neutral, positive]
  sentimentScore = [0, 0, 0]

  def predictionSwitch(prediction):
    match prediction:
      case "negative":
        sentimentScore[0] += 1
        return
      case "neutral":
        sentimentScore[1] += 1
        return
      case "positive":
        sentimentScore[2] += 1
        return

  for index in range(len(ensembleModels)):
    modelPrediction = ensembleModels[index](doc)[0]
    # print(modelPrediction)

    labelPrediction = ""
    if modelPrediction["score"] < 0.1:
      labelPrediction = "neutral"

    else:
      labelPrediction = "negative" if modelPrediction["label"] == "LABEL_0" else "positive"

    predictionSwitch(labelPrediction)

  if sentimentScore == [1, 1, 1]:
    return "neutral"

  maxPosition = sentimentScore.index(max(sentimentScore))
  match maxPosition:
    case 0:
      return "negative"
    case 1:
      return "neutral"
    case 2:
      return "positive"

In [6]:
classifyDoc("this makes me not so happy")

'negative'

In [7]:
ground_truth_df = pd.read_csv('./ground truth.csv')
ground_truth_df = ground_truth_df[['body', 'Sentiment']]
ground_truth_df

Unnamed: 0,body,Sentiment
0,Agree with the comments would love to play wit...,2
1,This is awesome! Nicely done. Really drawing p...,2
2,"You know, I thought the same thing about this ...",2
3,"Really neat, but this is not like a daily driv...",2
4,A few weeks ago I posted a comment (which I've...,1
...,...,...
94147,It works with OBJ. files from NX. My understan...,1
94148,"You could say that's Medium by Adobe, you can ...",2
94149,Two weeks ago I'd have agreed. Then I upgraded...,2
94150,I actually didn't hear about it before. From a...,0


In [8]:
allMatchCount = 0
allUnmatchCount = 0
allUnmatchMapping = {
    "negative": {
        "neutral": 0,
        "positive": 0,
    },
    "neutral": {
        "negative": 0,
        "positive": 0,
    },
    "positive": {
        "neutral": 0,
        "negative": 0,
    },
}

for i in range(1000, len(ground_truth_df)):
    if i >= 4000:
        break

    [body, sentiment] = ground_truth_df.iloc[i].values

    try:
        modelSentiment = classifyDoc(body)
    except:
        continue

    labelledSentiment = ""
    match sentiment:
        case 0:
            labelledSentiment = "negative"
        case 1:
            labelledSentiment = "neutral"
        case 2:
            labelledSentiment = "positive"

    if modelSentiment == labelledSentiment:
        allMatchCount += 1
    else:
        allUnmatchCount += 1
        allUnmatchMapping[modelSentiment][labelledSentiment] += 1

        # allUnmatchMapping += 1
        # print()
        # print(body)
        # print("model pred: " + modelSentiment)
        # print("label pred: " + labelledSentiment)

print("Including neutral")
print('match:\t\t', allMatchCount)
print('unmatch:\t', allUnmatchCount)

Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors


Including neutral
match:		 1324
unmatch:	 1662
unmatch mapping:
 {'negative': {'neutral': 291, 'positive': 1277}, 'neutral': {'negative': 0, 'positive': 0}, 'positive': {'neutral': 53, 'negative': 41}}


In [9]:
def pretty(d, indent=0):
   for key, value in d.items():
      print('\t' * indent + str(key))
      if isinstance(value, dict):
         pretty(value, indent+1)
      else:
         print('\t' * (indent+1) + str(value))

In [10]:
pretty(allUnmatchMapping)

negative
	neutral
		291
	positive
		1277
neutral
	negative
		0
	positive
		0
positive
	neutral
		53
	negative
		41
