In [1]:
# Initialize, Stemmer, Lemmatizer, and Stopwords
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

stopwords = stopwords.words('english')
stopwords.remove('not')

In [2]:
# Helper function to clean text and split by words
import re

def clean_text_for_LR(text, stopwords = stopwords, stemmer = stemmer, lemmatizer = lemmatizer):
    text = re.sub(r'<.*?>', '', text) # Remove HTML tags
    text = re.sub(r"[^a-zA-Z]", " ", text) # Remove non-letter characters
    text = text.lower() # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra white spaces
    # Split words then lemmatize and stem words then remove stopwords
    words = re.split(r'\W+', text)
    # words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words if word not in stopwords] 
    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words if word not in stopwords] 
    
    # return list of strings
    words = " ".join(words)
    
    return words

In [3]:
import pickle

# Load BoW_vectorizer
with open('../Models/BoW_logistic_regression/BoW_vectorizer.pkl', 'rb') as f:
    BoW_vectorizer = pickle.load(f)

# Load n_gram_1_2_vectorizer
with open('../Models/1_2_gram_logistic_regression/1_2_gram_vectorizer.pkl', 'rb') as f:
    n_gram_1_2_vectorizer = pickle.load(f)

print(f"BoW Vocab: {len(BoW_vectorizer.vocabulary_)}")
print(f"1_2_ngram Vocab: {len(n_gram_1_2_vectorizer.vocabulary_)}")

BoW Vocab: 70709
1_2_ngram Vocab: 2641907


In [4]:
# Function to return the class of the review
def predict_with_LR(text, vectorizer, model):
  text = clean_text_for_LR(text)
  text = vectorizer.transform([text]).toarray()
  
  # Predict using model
  prediction = model.predict(text) # Predict
  prediction = prediction[0] # Adjust prediction format to print it
  return prediction

# Function to compare the model's answer with the manual reviews answer
def results_on_manual_reviews(index, prediction, answers, model_name):
  if prediction == 1.0: print(f"{model_name} Prediction: {prediction} => Positive Sentiment => {prediction == answers[index]}")
  else: print(f"{model_name} Prediction: {prediction} => Negative Sentiment => {prediction == answers[index]}")

In [5]:
# Create sample reviews for testing the models
phrases = ["The movie had an excellent storyline with amazing acting, but the ending was quite disappointing and left me unsatisfied.", ## Negative

          "This movie was absolutely fantastic! The story was gripping, the acting was top-notch, and the cinematography was stunning. I was on the edge of my seat the entire time and couldn't look away. It's been a long time since a film has captivated me like this. Highly recommend!", ## Positive 

          "The film had its moments, but overall, it failed to leave a lasting impression. The characters lacked depth, and the storyline felt disjointed. It's a forgettable movie that I wouldn't recommend to others.", # neutral - to - negative
          
          "I hate this movie", # Negative

          "I have a love hate relationship with this movie", # Negative

          "I wish I could watch it again now", # Positive

          "I love this movie", # Positive

          "Great" # Positive
          ] 

answers = [0, 1, 0, 0, 0, 1, 1, 1]

In [6]:
## Load SGDClassifiers (Logistic Regression)
from joblib import load

# Logistic regression trained on BoW
LR_BoW_model = load('../Models/BoW_logistic_regression/BoW_Logistic_Regression.pkl')

# Logistic regression trained on 1_2 ngrams
LR_1_2_gram_model = load('../Models/1_2_gram_logistic_regression/1_2_gram_Logistic_Regression.pkl')

# Test models on manual reviews
for i in range(len(phrases)):
  prediction = predict_with_LR(model=LR_BoW_model, text=phrases[i], vectorizer=BoW_vectorizer)
  results_on_manual_reviews(prediction=prediction, model_name="LR BoW", index=i, answers=answers)

print('\n')

for i in range(len(phrases)):
  prediction = predict_with_LR(model=LR_1_2_gram_model, text=phrases[i], vectorizer=n_gram_1_2_vectorizer)
  results_on_manual_reviews(prediction=prediction, model_name="LR 1_2_n_gram", index=i, answers=answers)

LR BoW Prediction: 0 => Negative Sentiment => True
LR BoW Prediction: 1 => Positive Sentiment => True
LR BoW Prediction: 0 => Negative Sentiment => True
LR BoW Prediction: 1 => Positive Sentiment => False
LR BoW Prediction: 1 => Positive Sentiment => False
LR BoW Prediction: 1 => Positive Sentiment => True
LR BoW Prediction: 1 => Positive Sentiment => True
LR BoW Prediction: 1 => Positive Sentiment => True


LR 1_2_n_gram Prediction: 1 => Positive Sentiment => False
LR 1_2_n_gram Prediction: 1 => Positive Sentiment => True
LR 1_2_n_gram Prediction: 0 => Negative Sentiment => True
LR 1_2_n_gram Prediction: 1 => Positive Sentiment => False
LR 1_2_n_gram Prediction: 1 => Positive Sentiment => False
LR 1_2_n_gram Prediction: 1 => Positive Sentiment => True
LR 1_2_n_gram Prediction: 1 => Positive Sentiment => True
LR 1_2_n_gram Prediction: 1 => Positive Sentiment => True


In [7]:
# Initialize the RoBERTa GRU model architecture
from transformers import RobertaTokenizer, RobertaModel
import torch

class RoBERTa_GRU(torch.nn.Module):
  def __init__(self):
    super(RoBERTa_GRU, self).__init__()

    # Initialize the layers that are needed
    self.RoBERTa = RobertaModel.from_pretrained('roberta-base')
    self.gru = torch.nn.GRU( input_size = 768, hidden_size = 256 )
    self.flatten = torch.nn.Flatten()
    self.dense_1 = torch.nn.Linear( in_features = 256, out_features=1000 )
    self.gelu = torch.nn.GELU()
    self.dense_2 = torch.nn.Linear( in_features=1000, out_features=2 )

  def forward(self, input_ids, attention_mask, token_type_ids):

    # Get the last_hidden_states from RoBERTa
    roberta_output = self.RoBERTa(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    # Select the first column from RoBERTa's last_hidden_state output. CLS is the classification of each sequence
    cls = roberta_output.last_hidden_state[:, 0]

    # GRU will learn the longterm dependencies of the sequences
    sequences, gru_hidden_states = self.gru( cls )

    # Doesn't make any difference
    flattened = self.flatten( sequences )

    # Learn relationships between the hidden states
    x = self.dense_1( flattened )
    x = self.gelu(x)

    # Return the prediction
    x = self.dense_2(x)
    output = torch.nn.functional.softmax(x, dim=1)

    return output

In [9]:
# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("../../RoBERTa_GRU Model [Not in GitHub]/RoBERTa_Pretrained_Tokenizer")

# Load pretrained RoBERTa_GRU into the model architecture
model = RoBERTa_GRU()
model = torch.load('../../RoBERTa_GRU Model [Not in GitHub]/RoBERTa_GRU_model.pth', map_location=torch.device('cpu'))
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa_GRU(
  (RoBERTa): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [10]:
# Prepare review for prediction
def preprocess_for_RoBERTa_GRU(text, tokenizer, torch):
  
  # Tokenize review
  inputs = tokenizer.encode_plus(text, None,
                                 truncation = True, padding = 'max_length', max_length = 512, 
                                 add_special_tokens = True, return_token_type_ids = True)

  # Prepare inputs for the RoBERTA_GRU model
  ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0)
  attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0)
  token_type_ids = torch.tensor(inputs['token_type_ids'], dtype=torch.long).unsqueeze(0)

  return ids, attention_mask, token_type_ids


# Return classification/sentiment of a single review
def predict_using_RoBERTa_GRU(model, text, tokenizer, torch):
  ids, attention_mask, token_type_ids = preprocess_for_RoBERTa_GRU(text, tokenizer, torch)

  with torch.no_grad():
    y_pred = model.forward(ids, attention_mask, token_type_ids)

    return torch.argmax(y_pred).item()

In [11]:
# Text RoBERTa on manual reviews
for i in range(len(phrases)):
  prediction = predict_using_RoBERTa_GRU(model, phrases[i], tokenizer, torch)

  if prediction == 1: print(f"RoBERTa_GRU Prediction: {prediction} => Positive Sentiment => {prediction == answers[i]}")
  else: print(f"RoBERTa_GRU Prediction: {prediction} => Negative Sentiment => {prediction == answers[i]}")

RoBERTa_GRU Prediction: 0 => Negative Sentiment => True
RoBERTa_GRU Prediction: 1 => Positive Sentiment => True
RoBERTa_GRU Prediction: 0 => Negative Sentiment => True
RoBERTa_GRU Prediction: 0 => Negative Sentiment => True
RoBERTa_GRU Prediction: 0 => Negative Sentiment => True
RoBERTa_GRU Prediction: 1 => Positive Sentiment => True
RoBERTa_GRU Prediction: 1 => Positive Sentiment => True
RoBERTa_GRU Prediction: 1 => Positive Sentiment => True
