<b><b>Import necessary dependencies</b></b>

In [2]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import warnings

# Suppress every Python warning raised after this point so that library
# warnings do not clutter the notebook's cell output.
warnings.filterwarnings("ignore")

2022-12-23 14:06:41.005010: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-23 14:06:41.130655: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-23 14:06:41.133470: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-23 14:06:41.133483: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

<b><b>Load and normalize data</b></b>

In [5]:
# Load the labelled movie reviews (columns used below: 'review', 'sentiment').
dataset = pd.read_csv(r'movie_reviews.csv')

# Take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# Build train and test datasets.
# One split point drives all four slices so reviews and labels can never be
# cut at different positions.
TRAIN_SIZE = 35000
assert len(reviews) > TRAIN_SIZE, (
    "dataset has %d rows; need more than %d to leave a non-empty test set"
    % (len(reviews), TRAIN_SIZE)
)
train_reviews, test_reviews = reviews[:TRAIN_SIZE], reviews[TRAIN_SIZE:]
train_sentiments, test_sentiments = sentiments[:TRAIN_SIZE], sentiments[TRAIN_SIZE:]

# Normalize datasets (text cleaning is delegated to the project's
# text_normalizer module).
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


<b><b>Extract features from positive and negative reviews</b></b>

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Consolidate all normalized reviews, train first and test second, so the
# order lines up with `sentiments`. list() forces concatenation even if the
# normalizer returns arrays, for which `+` would mean element-wise addition.
norm_reviews = list(norm_train_reviews) + list(norm_test_reviews)

# zip() below silently truncates on a length mismatch, which would pair
# reviews with the wrong labels, so fail loudly instead.
assert len(norm_reviews) == len(sentiments), (
    "normalized reviews (%d) and sentiments (%d) are misaligned"
    % (len(norm_reviews), len(sentiments))
)

# Both vectorizers share one configuration: unigrams only, sublinear tf
# scaling, and terms kept only when they occur in 5%-95% of the documents.
tfidf_params = dict(use_idf=True, min_df=0.05, max_df=0.95,
                    ngram_range=(1, 1), sublinear_tf=True)

# get tf-idf features for only positive reviews
positive_reviews = [review for review, sentiment in zip(norm_reviews, sentiments)
                    if sentiment == 'positive']
ptvf = TfidfVectorizer(**tfidf_params)
ptvf_features = ptvf.fit_transform(positive_reviews)

# get tf-idf features for only negative reviews
negative_reviews = [review for review, sentiment in zip(norm_reviews, sentiments)
                    if sentiment == 'negative']
ntvf = TfidfVectorizer(**tfidf_params)
ntvf_features = ntvf.fit_transform(negative_reviews)

# view feature set dimensions: (documents, vocabulary size) per sentiment
print(ptvf_features.shape, ntvf_features.shape)

(25000, 330) (25000, 331)


<b><b>Topic Modeling on Reviews</b></b>

In [15]:
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import topic_model_utils as tmu

# Turn on pyLDAvis' notebook integration before any visualization is prepared.
pyLDAvis.enable_notebook()
# Topic count shared by the topic-modeling cells that follow.
total_topics = 10

<b><b>Display and visualize topics for positive reviews</b></b>

In [17]:
# Fit an NMF topic model with `total_topics` components on the TF-IDF matrix
# of the positive reviews. fit() returns the estimator itself, so building
# and fitting are chained into one expression.
# NOTE(review): the `alpha` argument and `get_feature_names()` used below were
# deprecated in scikit-learn 1.0 and removed in 1.2 -- confirm the pinned
# scikit-learn version before re-running this cell.
pos_nmf = NMF(
    n_components=total_topics,
    random_state=42,
    alpha=0.1,
    l1_ratio=0.2,
).fit(ptvf_features)

# Component weights of the fitted model and the vectorizer's vocabulary
pos_weights = pos_nmf.components_
pos_feature_names = ptvf.get_feature_names()

# Build the per-topic term listing and print the first 15 terms of each topic
# (terms only, weights hidden)
pos_topics = tmu.get_topics_terms_weights(pos_weights, pos_feature_names)
tmu.print_topics_udf(
    topics=pos_topics,
    total_topics=total_topics,
    num_terms=15,
    display_weights=False,
)

Topic #1 without weights
['like', 'not', 'think', 'really', 'say', 'would', 'know', 'thing', 'get', 'much', 'go', 'well', 'bad', 'could', 'lot']

Topic #2 without weights
['movie', 'see', 'watch', 'great', 'good', 'one', 'not', 'time', 'ever', 'enjoy', 'recommend', 'make', 'acting', 'first', 'like']

Topic #3 without weights
['show', 'episode', 'series', 'tv', 'watch', 'dvd', 'first', 'see', 'time', 'one', 'good', 'ever', 'remember', 'year', 'new']

Topic #4 without weights
['performance', 'role', 'play', 'actor', 'cast', 'good', 'well', 'great', 'excellent', 'character', 'give', 'support', 'also', 'star', 'job']

Topic #5 without weights
['man', 'old', 'two', 'take', 'young', 'year', 'get', 'woman', 'come', 'go', 'one', 'back', 'find', 'time', 'girl']

Topic #6 without weights
['film', 'see', 'one', 'scene', 'make', 'not', 'time', 'many', 'cinema', 'director', 'horror', 'music', 'release', 'work', 'ever']

Topic #7 without weights
['story', 'character', 'tell', 'book', 'true', 'line',

In [18]:
# Prepare the pyLDAvis visualization for the positive-review model from the
# fitted NMF, its TF-IDF matrix, and the vectorizer. R=15 is the same count
# as num_terms in the topic printout. Being the cell's last expression, the
# prepared object is what the cell displays.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.lda_model -- confirm against the installed version.
pyLDAvis.sklearn.prepare(pos_nmf, ptvf_features, ptvf, R=15)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


<b><b>Display and visualize topics for negative reviews</b></b>

In [19]:
# Build topic model on negative sentiment review features.
# The component count comes from `total_topics`, the same value passed to
# print_topics_udf below, so the fitted model and the printout always agree
# on how many topics exist.
# NOTE(review): the `alpha` argument and `get_feature_names()` were deprecated
# in scikit-learn 1.0 and removed in 1.2 -- confirm the pinned scikit-learn
# version before re-running this cell.
neg_nmf = NMF(n_components=total_topics, random_state=42, alpha=0.1, l1_ratio=0.2)
neg_nmf.fit(ntvf_features)

# Extract features and component weights
neg_feature_names = ntvf.get_feature_names()
neg_weights = neg_nmf.components_

# Extract and display topics and their components (15 terms per topic,
# weights hidden)
neg_topics = tmu.get_topics_terms_weights(neg_weights, neg_feature_names)
tmu.print_topics_udf(topics=neg_topics, total_topics=total_topics, num_terms=15, display_weights=False)

Topic #1 without weights
['get', 'scene', 'go', 'kill', 'guy', 'look', 'take', 'end', 'start', 'one', 'around', 'back', 'first', 'thing', 'run']

Topic #2 without weights
['bad', 'movie', 'ever', 'acting', 'see', 'terrible', 'one', 'not', 'plot', 'awful', 'make', 'effect', 'even', 'horrible', 'watch']

Topic #3 without weights
['film', 'make', 'not', 'see', 'director', 'would', 'one', 'bad', 'many', 'however', 'no', 'horror', 'feature', 'avoid', 'feel']

Topic #4 without weights
['actor', 'play', 'good', 'cast', 'role', 'well', 'performance', 'script', 'great', 'much', 'act', 'star', 'give', 'director', 'lead']

Topic #5 without weights
['would', 'think', 'movie', 'not', 'say', 'people', 'like', 'could', 'know', 'watch', 'see', 'really', 'go', 'well', 'want']

Topic #6 without weights
['funny', 'comedy', 'laugh', 'joke', 'try', 'not', 'stupid', 'suppose', 'fun', 'moment', 'like', 'black', 'really', 'even', 'annoying']

Topic #7 without weights
['man', 'woman', 'old', 'year', 'young', '