In [1]:
# from transformers import TFAutoModelForSequenceClassification
# from transformers import AutoTokenizer

from transformers import pipeline
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AutoModelForSequenceClassification

import pandas as pd
import re

In [2]:
# Load the 'bert-base-uncased' tokenizer once; this same instance is handed to
# every pipeline in ensembleModels.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Fine-tuned BERT classifiers. classifyDoc reads their pipeline output as:
#   LABEL_0                 -> negative
#   LABEL_1                 -> positive
#   confidence score < 0.2  -> neutral
model1, model2 = (
  BertForSequenceClassification.from_pretrained(checkpointDir)
  for checkpointDir in ('./bertmodel_imdb_trained', './bertmodel_tweets_trained')
)

# TIMELESS model: classifyDoc uses the label it reports directly as the vote.
model3 = AutoModelForSequenceClassification.from_pretrained('./timeless_model_tweets_trained')

In [4]:
# One sentiment-analysis pipeline per model, in the order classifyDoc relies on:
# index 0 and 1 are the BERT classifiers, index 2 is the TIMELESS model.
# NOTE(review): all three pipelines share the 'bert-base-uncased' tokenizer,
# including the one wrapping model3, which is loaded through
# AutoModelForSequenceClassification -- confirm that checkpoint really uses
# the same vocabulary.
ensembleModels = [
  pipeline("sentiment-analysis", model = member, tokenizer = tokenizer)
  for member in (model1, model2, model3)
]

In [5]:
def classifyDoc(doc, models=None, neutralThreshold=0.2):
  """Classify `doc` as "negative", "neutral" or "positive" by majority vote.

  Every model is called as `model(doc)[0]` and the resulting mapping is read
  through its "label" and "score" keys. Each model contributes one vote:

  * The model at index 2 (the TIMELESS model) votes with its own label, which
    must be "negative", "neutral" or "positive" (case and surrounding
    whitespace are ignored).
  * Every other model (the BERT models) votes "neutral" when its score is
    below `neutralThreshold`; otherwise "negative" for "LABEL_0" and
    "positive" for any other label.

  Args:
    doc: The text to classify; passed unchanged to every model.
    models: Optional sequence of callables to vote with. Defaults to the
      notebook-level `ensembleModels`.
    neutralThreshold: Score below which a BERT model's prediction is counted
      as "neutral". Defaults to 0.2, the value that used to be hard-coded.

  Returns:
    The label with strictly the most votes, or "neutral" when the top vote
    count is shared by more than one label (this covers the 1-1-1 split).

  Raises:
    ValueError: If the TIMELESS model reports a label that is not one of the
      three sentiments. Such a vote used to be dropped silently, which left
      the remaining votes to be tie-broken in favour of "negative".
  """
  if models is None:
    models = ensembleModels

  # Fixed order so the tally is always reported negative / neutral / positive.
  sentiments = ("negative", "neutral", "positive")
  votes = dict.fromkeys(sentiments, 0)

  for index, model in enumerate(models):
    modelPrediction = model(doc)[0]
    rawLabel = modelPrediction["label"]

    # TIMELESS model
    if index == 2:
      vote = str(rawLabel).strip().lower()

    # BERT model
    # NOTE(review): if these are two-class softmax classifiers the reported
    # top score would always be >= 0.5 and this branch could never fire with
    # the default threshold -- confirm the intended value.
    elif modelPrediction["score"] < neutralThreshold:
      vote = "neutral"

    else:
      vote = "negative" if rawLabel == "LABEL_0" else "positive"

    if vote not in votes:
      raise ValueError(
        f"Unrecognised sentiment label {rawLabel!r} from model at index {index}"
      )
    votes[vote] += 1

  # A label wins only with a strict plurality; any tie resolves to "neutral".
  topCount = max(votes.values())
  winners = [label for label in sentiments if votes[label] == topCount]
  return winners[0] if len(winners) == 1 else "neutral"

In [6]:
# Spot-check: run the full ensemble vote on a single hand-written sentence.
classifyDoc("Im very sad now.")

'neutral'

In [7]:
def csvToDf(path, encoding='utf-8'):
  """Parse a file of "<text><TAB><digit>" lines into a DataFrame.

  A line is accepted when it consists of at least one character of text, a
  tab, and exactly one digit at the end of the line. If the text itself
  contains tabs, the last tab on the line is the separator. Lines that do
  not fit are reported on stdout and skipped.

  Args:
    path: Path of the text file to read.
    encoding: Encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    A DataFrame with two columns: 'Text' (the text with surrounding
    whitespace stripped) and 'Sentiment' (the trailing digit as an int).
  """
  linePattern = re.compile(r'^(.+)\t(\d)$')

  data = []
  # Iterate the handle directly so the file is never held as a list of lines.
  with open(path, 'r', encoding=encoding) as file:
    for line in file:
      match = linePattern.match(line)
      if match:
        data.append((match.group(1).strip(), int(match.group(2))))
      else:
        print("Skipping invalid line:", line.strip())

  # Create DataFrame from processed data
  return pd.DataFrame(data, columns=['Text', 'Sentiment'])

In [8]:
# Load the labelled evaluation set. Despite the .csv extension, csvToDf parses
# each line as "<text><TAB><single digit>", not as comma-separated values.
ground_truth_df = csvToDf('./ground truth.csv')
# Bare last expression so the notebook renders the frame.
ground_truth_df

Unnamed: 0,Text,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
...,...,...
2995,I think food should have flavor and texture an...,0
2996,Appetite instantly gone.,0
2997,Overall I was not impressed and would not go b...,0
2998,"The whole experience was underwhelming, and I ...",0


In [11]:
# Tally agreement between the ensemble and the ground-truth labels.
#   all*       : every document
#   noNeutral* : only documents the ensemble did not label "neutral"
allMatchCount = allUnmatchCount = 0
noNeutralMatchCount = noNeutralUnmatchCount = 0

# Rows are unpacked positionally as (text, sentiment).
for text, sentiment in ground_truth_df.itertuples(index=False, name=None):
    # Ground truth is binary: 1 is positive, anything else is negative.
    labelledSentiment = "positive" if sentiment == 1 else "negative"
    modelSentiment = classifyDoc(text)

    agrees = modelSentiment == labelledSentiment
    decided = modelSentiment != "neutral"

    if agrees:
        allMatchCount += 1
    else:
        allUnmatchCount += 1

    if decided and agrees:
        noNeutralMatchCount += 1
    elif decided:
        noNeutralUnmatchCount += 1

for heading, matched, unmatched in (
    ("Including neutral", allMatchCount, allUnmatchCount),
    ("\nExcluding neutral", noNeutralMatchCount, noNeutralUnmatchCount),
):
    print(heading)
    print('match:\t\t', matched)
    print('unmatch:\t', unmatched)

Including neutral
match:		 2484
unmatch:	 516

Excluding neutral
match:		 2484
unmatch:	 187
