In [1]:
import os
import string

import jupyter
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Fetch the NLTK data packages used by the tokenizer, the lemmatizer and the
# stop-word list further down in this notebook.
import nltk

for package_name in ('punkt', 'wordnet'):
    nltk.download(package_name)

# Kept as the cell's final expression so its boolean result is still shown.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hmuen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hmuen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hmuen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Stop-word set = NLTK's English list plus a few corpus-specific tokens.
# "br" is what remains of the <br /> tags embedded in the reviews;
# "positive"/"negative" are presumably excluded because they mirror the
# sentiment labels -- confirm with the notebook author.
specific_words = {"br", "positive", "negative"}
stop_words = set(stopwords.words('english')) | specific_words

In [4]:
# Define function to preprocess text
def preprocess_text(text):
    """Turn one raw review into a list of cleaned, lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphanumeric (punctuation and the like) or that appear in the
    notebook-level ``stop_words`` set are discarded, and every remaining
    token is passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The review text. ``None`` and float NaN (the usual pandas
            placeholder for an empty CSV cell) are treated as an empty
            document instead of failing on ``.lower()``.

    Returns:
        list: The surviving tokens, in their original order. Empty when the
        input is missing or nothing survives the filtering.
    """
    # Missing value guard: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    lemmatize = WordNetLemmatizer().lemmatize
    return [
        lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

In [5]:
# Read data from CSV file.
# The location can be overridden through the IMDB_DATASET_CSV environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local path is used unchanged.
csv_file = os.environ.get(
    'IMDB_DATASET_CSV',
    r'C:\Users\hmuen\PycharmProjects\imdb_analysis\data\IMDB Dataset.csv',
)
df = pd.read_csv(csv_file)

In [6]:
 df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Pull the review column out of the frame as a plain Python list.
review_column = df["review"]
texts = review_column.to_list()

In [8]:
# Run every review through preprocess_text, giving one token list per review
# in the same order as `texts`.
preprocessed_documents = list(map(preprocess_text, texts))

In [9]:
# Build the gensim vocabulary from the token lists, then convert each
# document to its bag-of-words representation using that vocabulary.
dictionary = corpora.Dictionary(preprocessed_documents)
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

In [10]:
# Train LDA model
num_topics = 10  # You can adjust the number of topics as per your requirement
# LDA training is randomised; fixing the seed makes the learned topics (and
# every output derived from them) reproducible across kernel restarts.
random_seed = 42
lda_model = models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    random_state=random_seed,
)

In [11]:
# Show each topic returned by print_topics() on its own line.
topic_summaries = lda_model.print_topics()
for topic_summary in topic_summaries:
    print(topic_summary)

(0, '0.022*"show" + 0.013*"like" + 0.010*"one" + 0.009*"funny" + 0.008*"episode" + 0.008*"series" + 0.007*"get" + 0.007*"kid" + 0.006*"time" + 0.006*"comedy"')
(1, '0.029*"film" + 0.009*"one" + 0.008*"scene" + 0.007*"horror" + 0.006*"plot" + 0.006*"character" + 0.006*"even" + 0.004*"much" + 0.004*"like" + 0.004*"make"')
(2, '0.013*"song" + 0.011*"comedy" + 0.010*"great" + 0.009*"musical" + 0.008*"play" + 0.008*"best" + 0.007*"star" + 0.007*"role" + 0.007*"dance" + 0.007*"music"')
(3, '0.014*"character" + 0.014*"film" + 0.011*"story" + 0.010*"life" + 0.009*"one" + 0.008*"love" + 0.007*"movie" + 0.007*"performance" + 0.005*"role" + 0.005*"well"')
(4, '0.016*"people" + 0.010*"american" + 0.007*"u" + 0.007*"world" + 0.006*"show" + 0.006*"war" + 0.006*"country" + 0.006*"documentary" + 0.005*"political" + 0.005*"fact"')
(5, '0.059*"film" + 0.012*"one" + 0.009*"time" + 0.008*"story" + 0.007*"great" + 0.007*"see" + 0.007*"year" + 0.006*"many" + 0.006*"like" + 0.006*"would"')
(6, '0.014*"get" +

In [12]:
# Prepare the pyLDAvis data for the trained model and render it inline as the
# cell's output.
vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(vis_data)