In [1]:
# from transformers import TFAutoModelForSequenceClassification
# from transformers import AutoTokenizer
# import re

from transformers import pipeline
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AutoModelForSequenceClassification

import pandas as pd

In [2]:
# Tokenizer for the 'bert-base-uncased' checkpoint; the same instance is
# handed to every pipeline in the ensemble below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Fine-tuned BERT sequence classifiers, loaded from three local checkpoint
# directories. How their outputs are interpreted (see classifyDoc):
#   LABEL_0            -> negative
#   any other label    -> positive
#   score below 0.05   -> neutral
model1, model2, model3 = (
  BertForSequenceClassification.from_pretrained(checkpointDir)
  for checkpointDir in (
    './bertmodel_imdb_trained',
    './bertmodel_tweets_trained',
    './bertmodel_rottentomatos_trained',
  )
)

In [4]:
# One "sentiment-analysis" pipeline per fine-tuned model, in model1..model3
# order, all sharing the same tokenizer.
ensembleModels = [
  pipeline("sentiment-analysis", model = fineTunedModel, tokenizer = tokenizer)
  for fineTunedModel in (model1, model2, model3)
]

In [5]:
def classifyDoc(doc, neutralThreshold=0.05, models=None):
  """Classify a document by majority vote over an ensemble of classifiers.

  Each model is called as ``model(doc)`` and must return a sequence whose
  first element is a mapping with a "label" and a "score" entry. A model
  votes:

    * "Neutral"  when its score is below ``neutralThreshold``;
    * "Negative" when its label is "LABEL_0";
    * "Positive" for any other label.

  Args:
    doc: The text to classify.
    neutralThreshold: Scores strictly below this value count as a
      "Neutral" vote. Defaults to 0.05, the value previously hard-coded.
    models: Iterable of callables to poll. Defaults to the module-level
      ``ensembleModels`` (looked up at call time).

  Returns:
    "Negative", "Neutral" or "Positive": the label with the most votes.
    When two or more labels share the highest vote count (e.g. one vote
    each from three models), the result is "Neutral".

  NOTE(review): if the pipelines report the softmax probability of the
  winning class of a two-class head, the score can never drop below 0.5,
  so the default threshold would never produce a "Neutral" vote -- confirm
  the intended threshold.
  """
  if models is None:
    # Resolve the default lazily so the function picks up whatever ensemble
    # is defined when it is called, not when it was defined.
    models = ensembleModels

  # Insertion order matters only for readability; ties never fall back on it.
  votes = {"Negative": 0, "Neutral": 0, "Positive": 0}

  for model in models:
    modelPrediction = model(doc)[0]

    if modelPrediction["score"] < neutralThreshold:
      vote = "Neutral"
    elif modelPrediction["label"] == "LABEL_0":
      vote = "Negative"
    else:
      vote = "Positive"

    votes[vote] += 1

  topCount = max(votes.values())
  winners = [label for label, count in votes.items() if count == topCount]

  # A unique winner is returned as-is; any tie for first place is treated
  # as no consensus, which generalises the original [1, 1, 1] rule.
  return winners[0] if len(winners) == 1 else "Neutral"

In [6]:
# Spot-check the ensemble on one short sentence; the cell displays the
# label that classifyDoc returns.
classifyDoc("this makes me not so happy")

'Negative'

In [7]:
# Load the labelled comments, keep only the text and its sentiment label,
# and drop rows where either is missing. Built as one chained expression so
# no column slice is mutated in place (avoids SettingWithCopyWarning and
# keeps the cell idempotent on re-run).
ground_truth_df = (
    pd.read_csv('./ground truth.csv', encoding='latin1')
    .loc[:, ['body', 'sentiment']]
    .dropna()
)
ground_truth_df

Unnamed: 0,body,sentiment
0,Agree with the comments would love to play wit...,Negative
1,This is awesome! Nicely done. Really drawing p...,Positive
2,"You know, I thought the same thing about this ...",Neutral
3,"Really neat, but this is not like a daily driv...",Negative
4,A few weeks ago I posted a comment (which I've...,Positive
...,...,...
1114,I didn't find any posts that meet the matching...,Neutral
1115,Maybe people are just tired of reading the sam...,Negative
1116,You actually do get the effect but it's much b...,Positive
1117,I use big screen cinema but there are a few mo...,Negative


In [36]:
# Compare the ensemble's prediction with the hand-assigned label for every
# ground-truth row. Rows labelled "Neutral" and rows the ensemble predicts as
# "Neutral" are left out, so the tallies cover Positive/Negative only.
allMatchCount = 0
allUnmatchCount = 0
skippedCount = 0  # rows for which classifyDoc raised and no prediction exists

# mapping[modelSentiment][labelledSentiment] -> number of documents
mapping = {
    "Positive": {
        "Positive": 0,
        "Negative": 0,
    },
    "Negative": {
        "Positive": 0,
        "Negative": 0,
    },
}

for body, labelledSentiment in zip(ground_truth_df['body'], ground_truth_df['sentiment']):
    if labelledSentiment == "Neutral": continue

    try:
        modelSentiment = classifyDoc(body)
    except Exception:
        # Best effort: a document the ensemble cannot process is skipped, but
        # it is counted so the loss is visible. Catching Exception (not a bare
        # except) lets KeyboardInterrupt still stop a long run.
        skippedCount += 1
        continue

    if modelSentiment == "Neutral": continue

    mapping[modelSentiment][labelledSentiment] += 1
    if modelSentiment == labelledSentiment:
        allMatchCount += 1
    else:
        allUnmatchCount += 1

print("Excluding neutral")
print('match:\t\t', allMatchCount)
print('unmatch:\t', allUnmatchCount)
print('skipped:\t', skippedCount)

Including neutral
match:		 663
unmatch:	 159


In [37]:
# mapping is indexed [model prediction][ground-truth label], so with
# "Positive" as the positive class the grid below reads:
#   row 1 (label Positive):  tp  fn
#   row 2 (label Negative):  fp  tn
# tp  fn
print(mapping["Positive"]["Positive"], "\t", mapping["Negative"]["Positive"])
# fp  tn
print(mapping["Positive"]["Negative"], "\t", mapping["Negative"]["Negative"])

198 	 99
60 	 465


In [38]:
def pretty(d, indent=0):
   """Print a (possibly nested) dict as a tab-indented tree.

   Each key is printed on its own line at ``indent`` tabs. A dict value is
   printed recursively one level deeper; any other value is printed on the
   following line, one tab deeper than its key.
   """
   keyPad = '\t' * indent
   valuePad = '\t' * (indent+1)
   for key, value in d.items():
      print(keyPad + str(key))
      if not isinstance(value, dict):
         print(valuePad + str(value))
         continue
      pretty(value, indent+1)

In [39]:
# Show the tallies as a tree: outer level is the model's prediction, inner
# level is the ground-truth label, leaf is the document count.
pretty(mapping)

Positive
	Positive
		198
	Negative
		60
Negative
	Positive
		99
	Negative
		465
