In [8]:
import html, json, re
import pandas as pd
import emoji
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## preprocess

In [9]:
with open("./slang_dict.json",'r') as f:
    slang_dict = json.load(f)
    
with open("./emoji_dic.json",'r') as f:
    emoji_dict = json.load(f)

In [10]:
def preprocess(text):
    # converting html codes 
    decoded_text = html.unescape(text)
    decoded_text = decoded_text.lower()
    # match strings starting with 'http'
    text = re.sub(r'http\S+', '', decoded_text)
    # match strings start with '<' and end with '>'
    text = re.sub(r'<.*?>', '', text)

    # remove emoji
    for em, meaning in emoji_dict.items():
        text = text.replace(em, meaning)
    text = emoji.demojize(text)

    #standard preprocessing technique
    tokens = word_tokenize(text)
    # remove punctuation
    punctuation_removed = [word for word in tokens if word not in list(string.punctuation)]
    # lemmatization
    lemmatized_text = [WordNetLemmatizer().lemmatize(word) for word in punctuation_removed]
    text =  ' '.join(lemmatized_text)

    ## removing slangs
    words = text.split()
    corrected_words = [slang_dict.get(word, word) for word in words]
    text = ' '.join(corrected_words)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    return text 



## featurization

In [11]:
from transformers import BertTokenizer, BertModel
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True).to(device)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [12]:
def get_bert_embedding(comment):
    inputs = bert_tokenizer.encode_plus(
        comment,
        add_special_tokens=True,
        return_tensors='pt',
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True
    )
    
    # Move input tensors to the GPU
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

    # Extract the [CLS] token's embedding and move it back to the CPU
    cls_embedding = outputs['last_hidden_state'][:, 0, :].squeeze().cpu().numpy()
    return cls_embedding

In [17]:
from joblib import load

# Load the model from the file
loaded_model = load('svm_classifier_bert.joblib')
def infer(X):
   X = X.reshape(1, -1)
   return loaded_model.predict(X) 


In [27]:
aa = """Where do I even begin with the Oppenheimer movie? It's a perplexing mess of a film that fails to capture the essence of its subject matter and leaves the audience scratching their heads in confusion. With high expectations due to its talented cast and promising premise, this movie ultimately disappoints on every level.

First and foremost, the pacing is an absolute nightmare. The movie meanders aimlessly, dragging out scenes that add little to the plot and leaving essential elements underdeveloped. It's almost as if the filmmakers had no idea how to structure the narrative or maintain a cohesive flow. As a result, the movie feels like a jumbled collection of disconnected events that leave viewers struggling to make sense of the story.

The characters in Oppenheimer are equally underwhelming. Despite the exceptional actors involved, their performances are hampered by a lack of depth and poorly written dialogues. The titular character, J. Robert Oppenheimer, comes across as one-dimensional and devoid of real personality or emotional resonance. Supporting characters receive even less attention, leaving us indifferent to their fates and unable to invest in their arcs.

The film's attempts at historical accuracy are laughable at best. While some creative liberties are expected in any biographical movie, Oppenheimer takes it to an extreme. The inaccuracies and distortions of actual events not only disrespect the legacy of those involved but also undermine the film's credibility. The filmmakers were more interested in sensationalism than telling a compelling and fact-based story.

Perhaps the most egregious aspect of the Oppenheimer movie is its lack of a coherent message or thematic depth. It raises significant moral and ethical questions about the development of nuclear weapons and their consequences, but it never delves into these issues with any real substance. Instead, the movie superficially glazes over these crucial aspects, leaving viewers with a sense of emptiness and missed opportunities.

The cinematography and direction do little to salvage the film's shortcomings. The visual style lacks creativity, and the director seems to rely on tired and overused cinematic clichés. The lack of a distinct visual identity only adds to the overall mediocrity of the movie.

In conclusion, the Oppenheimer movie is a colossal disappointment. Its weak storytelling, poorly developed characters, historical inaccuracies, and lack of a compelling message all contribute to a film that is an absolute failure. Save your time and money and skip this cinematic disaster. There are far better biographical dramas out there that do justice to their subjects and deliver a more engaging and coherent experience. """

In [28]:
def get_sentiment(review):
    processed = preprocess(review)
    features = get_bert_embedding(processed)
    output = infer(features)
    return output

get_sentiment(aa)



array(['negative'], dtype=object)

In [14]:
import tkinter as tk
from tkinter import messagebox

# Define the function that will determine the sentiment.
def get_sentiment(review):
    processed = preprocess(review)
    features = get_bert_embedding(processed)
    output = infer(features)
    return output

# Function to be called when the "Get Sentiment" button is pressed.
def on_button_press():
    review = entry.get("1.0", "end-1c")  # Get text from the entry widget.
    sentiment = get_sentiment(review)
    messagebox.showinfo("Sentiment Result", f"The sentiment is: {sentiment}")

# Create the main window.
root = tk.Tk()
root.title("Sentiment Analysis")

# Create and pack widgets.
label = tk.Label(root, text="Enter your review:")
label.pack(pady=20)

entry = tk.Text(root, height=10, width=50)
entry.pack(pady=20)

button = tk.Button(root, text="Get Sentiment", command=on_button_press)
button.pack(pady=20)

# Run the application.
root.mainloop()


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\ekyus\AppData\Local\Programs\Python\Python311\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\ekyus\AppData\Local\Temp\ipykernel_22796\2782166149.py", line 14, in on_button_press
    sentiment = get_sentiment(review)
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ekyus\AppData\Local\Temp\ipykernel_22796\2782166149.py", line 8, in get_sentiment
    output = infer(features)
             ^^^^^^^^^^^^^^^
  File "C:\Users\ekyus\AppData\Local\Temp\ipykernel_22796\1

In [15]:
import pandas as pd
import emoji
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## Read dataset

## Data Cleaning

In [19]:
# use python emoji library to deal with emoji
# reference: https://carpedm20.github.io/emoji/docs/index.html
# the converted text is sth like :smiley_face:
# need to remove ':' and replace '_' with ' '

def convert_emojis(text):
    return emoji.demojize(text)

df['Comment'] = df['Comment'].apply(convert_emojis)
df


Unnamed: 0,Author,Comment,sentiment_label,labels
0,Edward Spence,justin trudeau is really desperate pierre poli...,2 stars,negative
1,Edward Spence,just trudeau will loose the next election pier...,1 star,negative
2,Cedric Farrell,thanks you for your service mr trudeau. god bl...,5 stars,positive
3,Wapn Perfo,every person here should be ashamed of themsel...,1 star,negative
4,Marcel Dagenais,so why did justin not complete his tenure as a...,2 stars,negative
...,...,...,...,...
3876,Jean-Guy Rubberboot,inconceivable that people still voted for this...,1 star,negative
3877,Cookie Cute as a puppy,:clapping_hands_medium_skin_tone::clapping_han...,1 star,negative
3878,Oscar Vandermeer,"our little dictator, his father gave us the ch...",5 stars,positive
3879,Steve-o Moreno,"if you wanna leave canada, you guys are welcom...",5 stars,positive


## Data preprocessing

In [29]:
df.to_csv('df_cleaned_new.csv', index=False)


In [30]:
def basic_preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = text.split()
    return " ".join(tokens)

# Apply basic preprocessing to the Comment column
df['processed_comment'] = df['Comment'].apply(basic_preprocess_text)

In [31]:
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets (80-20 split)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=11, stratify=df['labels'])


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['processed_comment'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['processed_comment'])


In [33]:
import gensim.downloader as api
import numpy as np

glove_model = api.load("glove-wiki-gigaword-100")


In [34]:
def get_glove_embedding(comment):
    words = comment.split()
    embeddings = [glove_model[word] for word in words if word in glove_model.key_to_index]
    
    if not embeddings:
        return np.zeros(glove_model.vector_size)
    
    return np.mean(embeddings, axis=0)

X_train_glove = np.array([get_glove_embedding(comment) for comment in train_data['processed_comment']])
X_test_glove = np.array([get_glove_embedding(comment) for comment in test_data['processed_comment']])


In [35]:
# !pip install transformers

In [36]:
from transformers import BertTokenizer, BertModel
import torch



# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")


# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True).to(device)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [37]:
def get_bert_embedding(comment):
    inputs = bert_tokenizer.encode_plus(
        comment,
        add_special_tokens=True,
        return_tensors='pt',
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True
    )
    
    # Move input tensors to the GPU
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

    # Extract the [CLS] token's embedding and move it back to the CPU
    cls_embedding = outputs['last_hidden_state'][:, 0, :].squeeze().cpu().numpy()
    return cls_embedding


X_train_bert = np.array([get_bert_embedding(comment) for comment in train_data['processed_comment']])
X_test_bert = np.array([get_bert_embedding(comment) for comment in test_data['processed_comment']])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [38]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', probability=True)

# Train the classifier using the training data (TF-IDF representations)
svm_classifier.fit(X_train_tfidf, train_data['labels'])


In [39]:
# Predict sentiments for training and testing data
train_predictions = svm_classifier.predict(X_train_tfidf)
test_predictions = svm_classifier.predict(X_test_tfidf)

# Evaluate the classifier's performance
train_accuracy = accuracy_score(train_data['labels'], train_predictions)
test_accuracy = accuracy_score(test_data['labels'], test_predictions)
train_report = classification_report(train_data['labels'], train_predictions)
test_report = classification_report(test_data['labels'], test_predictions)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)
print("\nTraining Classification Report:\n", train_report)
print("\nTesting Classification Report:\n", test_report)


Training Accuracy: 0.8810096153846154
Testing Accuracy: 0.7451923076923077

Training Classification Report:
               precision    recall  f1-score   support

    negative       0.86      0.99      0.92      1745
     neutral       1.00      0.18      0.30       151
    positive       0.96      0.73      0.83       600

    accuracy                           0.88      2496
   macro avg       0.94      0.63      0.69      2496
weighted avg       0.89      0.88      0.86      2496


Testing Classification Report:
               precision    recall  f1-score   support

    negative       0.75      0.96      0.84       436
     neutral       0.00      0.00      0.00        38
    positive       0.70      0.32      0.44       150

    accuracy                           0.75       624
   macro avg       0.48      0.43      0.43       624
weighted avg       0.69      0.75      0.69       624



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SVM classifier
svm_classifier_glove = SVC(kernel='linear', probability=True)

# Train the classifier using the training data (GloVe representations)
svm_classifier_glove.fit(X_train_glove, train_data['labels'])


In [41]:
# Predict sentiments for training and testing data
train_predictions_glove = svm_classifier_glove.predict(X_train_glove)
test_predictions_glove = svm_classifier_glove.predict(X_test_glove)

# Evaluate the classifier's performance
train_accuracy_glove = accuracy_score(train_data['labels'], train_predictions_glove)
test_accuracy_glove = accuracy_score(test_data['labels'], test_predictions_glove)
train_report_glove = classification_report(train_data['labels'], train_predictions_glove)
test_report_glove = classification_report(test_data['labels'], test_predictions_glove)

print("Training Accuracy (GloVe):", train_accuracy_glove)
print("Testing Accuracy (GloVe):", test_accuracy_glove)
print("\nTraining Classification Report (GloVe):\n", train_report_glove)
print("\nTesting Classification Report (GloVe):\n", test_report_glove)


Training Accuracy (GloVe): 0.7808493589743589
Testing Accuracy (GloVe): 0.7548076923076923

Training Classification Report (GloVe):
               precision    recall  f1-score   support

    negative       0.78      0.97      0.86      1745
     neutral       0.00      0.00      0.00       151
    positive       0.79      0.43      0.56       600

    accuracy                           0.78      2496
   macro avg       0.52      0.47      0.47      2496
weighted avg       0.73      0.78      0.74      2496


Testing Classification Report (GloVe):
               precision    recall  f1-score   support

    negative       0.76      0.95      0.85       436
     neutral       0.00      0.00      0.00        38
    positive       0.69      0.39      0.50       150

    accuracy                           0.75       624
   macro avg       0.49      0.44      0.45       624
weighted avg       0.70      0.75      0.71       624



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
# Initialize another SVM classifier for BERT
svm_classifier_bert = SVC(kernel='linear', probability=True)

# Train the classifier using the training data (BERT representations)
svm_classifier_bert.fit(X_train_bert, train_data['labels'])  # Note: Using train_data since we took a subset for BERT


In [43]:
# Predict sentiments for training and testing data
train_predictions_bert = svm_classifier_bert.predict(X_train_bert)
test_predictions_bert = svm_classifier_bert.predict(X_test_bert)

# Evaluate the classifier's performance
train_accuracy_bert = accuracy_score(train_data['labels'], train_predictions_bert)
test_accuracy_bert = accuracy_score(test_data['labels'], test_predictions_bert)
train_report_bert = classification_report(train_data['labels'], train_predictions_bert)
test_report_bert = classification_report(test_data['labels'], test_predictions_bert)

print("Training Accuracy (BERT):", train_accuracy_bert)
print("Testing Accuracy (BERT):", test_accuracy_bert)
print("\nTraining Classification Report (BERT):\n", train_report_bert)
print("\nTesting Classification Report (BERT):\n", test_report_bert)


Training Accuracy (BERT): 0.9198717948717948
Testing Accuracy (BERT): 0.6794871794871795

Training Classification Report (BERT):
               precision    recall  f1-score   support

    negative       0.91      0.98      0.95      1745
     neutral       0.99      0.80      0.89       151
    positive       0.93      0.77      0.84       600

    accuracy                           0.92      2496
   macro avg       0.95      0.85      0.89      2496
weighted avg       0.92      0.92      0.92      2496


Testing Classification Report (BERT):
               precision    recall  f1-score   support

    negative       0.78      0.84      0.81       436
     neutral       0.13      0.11      0.12        38
    positive       0.44      0.36      0.40       150

    accuracy                           0.68       624
   macro avg       0.45      0.43      0.44       624
weighted avg       0.66      0.68      0.67       624

