In [25]:
import pandas as pd
from gensim import corpora, models
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import jupyter
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [26]:
# Download NLTK resources if not already downloaded
import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A1D5688\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\A1D5688\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A1D5688\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# Preprocess stopwords
stop_words = set(stopwords.words('english'))
specific_words = {"br", "positive", "negative","stupid", "horrible", "ever", "even", "waste", "movie", "one","story", "movies", "book", "film", "show", "good", "bad", "worst", "episode", "tv", "watch", "series", "really", "great", "like", "would", "see", "well", "people", "much", "get", "think"}
stop_words.update(specific_words)

In [28]:
# Define function to preprocess text
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())

    # Remove punctuation and stopwords
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [29]:
# Read data from CSV file
csv_file = r'IMDB Dataset.csv'
df = pd.read_csv(csv_file)

In [31]:
# convert reviews to list
texts = df["review"].tolist()

In [32]:
# Preprocess documents
preprocessed_documents = [preprocess_text(text) for text in texts]

In [33]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [37]:
# Train LDA model
num_topics = 5
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

In [38]:
# Print topics
for topic in lda_model.print_topics():
    print(topic)

(0, '0.009*"life" + 0.008*"love" + 0.006*"family" + 0.006*"woman" + 0.006*"man" + 0.005*"young" + 0.005*"girl" + 0.005*"father" + 0.004*"play" + 0.004*"wife"')
(1, '0.008*"time" + 0.007*"could" + 0.007*"make" + 0.006*"thing" + 0.005*"scene" + 0.005*"know" + 0.005*"say" + 0.005*"go" + 0.005*"acting" + 0.005*"made"')
(2, '0.009*"time" + 0.008*"best" + 0.007*"character" + 0.007*"also" + 0.006*"actor" + 0.006*"comedy" + 0.006*"first" + 0.005*"role" + 0.005*"funny" + 0.005*"year"')
(3, '0.011*"character" + 0.005*"scene" + 0.005*"time" + 0.005*"make" + 0.005*"film" + 0.005*"way" + 0.004*"life" + 0.004*"many" + 0.004*"director" + 0.004*"also"')
(4, '0.007*"war" + 0.004*"man" + 0.004*"american" + 0.003*"year" + 0.003*"world" + 0.003*"soldier" + 0.003*"also" + 0.003*"western" + 0.002*"two" + 0.002*"country"')


In [39]:
# Visualize the LDA model results
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)