In [5]:
#Step 1 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Adjusted function to preprocess text with a check for non-string inputs
def preprocess_nltk(text):
    if not isinstance(text, str):
        return ''  # Return an empty string for non-string inputs
    tokens = nltk.word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmatized)

# Load the reviews dataset
df = pd.read_csv('IA3-2.csv')

# Apply preprocessing to each review
df['processed_reviews'] = df['review'].apply(preprocess_nltk)

# Create a term-document matrix
vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2))
dtm = vectorizer.fit_transform(df['processed_reviews'])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# #Step 2 - Initialize and fit the LDA model
lda = LatentDirichletAllocation(n_components=6, random_state=0)
lda.fit(dtm)

In [10]:
#Step 3 - Get top N terms for each topic
def get_top_terms_for_topic(model, feature_names, topic_idx, n_top_terms=5):
    topic = model.components_[topic_idx]
    top_terms = [feature_names[i] for i in topic.argsort()[:-n_top_terms - 1:-1]]
    return " ".join(top_terms)

# print_top_topics function
def print_top_topics_with_words(doc_indices):
    for doc_index in doc_indices:
        doc_topic = topic_distribution[doc_index]
        top_topics = doc_topic.argsort()[-2:][::-1]  # Get indices of top 2 topics
        top_topics_terms = [get_top_terms_for_topic(lda, feature_names, topic_idx) for topic_idx in top_topics]
        print(f"Document ID={doc_index + 1}: Top-2 Topics: {top_topics} ({top_topics_terms[0]} | {top_topics_terms[1]})")

# print the top 2 topics with words for the first 10 restaurant and movie reviews
print("First 10 Restaurant Reviews:")
print_top_topics_with_words(range(0, 10))

print("\nFirst 10 Movie Reviews:")
print_top_topics_with_words(range(500, 510))



First 10 Restaurant Reviews:
Document ID=1: Top-2 Topics: [2 3] (good eat delicious like food | gt real real estate estate investment)
Document ID=2: Top-2 Topics: [2 4] (good eat delicious like food | film quot war also movie)
Document ID=3: Top-2 Topics: [2 1] (good eat delicious like food | love people like life time)
Document ID=4: Top-2 Topics: [2 1] (good eat delicious like food | love people like life time)
Document ID=5: Top-2 Topics: [2 0] (good eat delicious like food | book quot people also life)
Document ID=6: Top-2 Topics: [2 0] (good eat delicious like food | book quot people also life)
Document ID=7: Top-2 Topics: [2 1] (good eat delicious like food | love people like life time)
Document ID=8: Top-2 Topics: [2 5] (good eat delicious like food | quot film also quot quot people)
Document ID=9: Top-2 Topics: [2 1] (good eat delicious like food | love people like life time)
Document ID=10: Top-2 Topics: [1 2] (love people like life time | good eat delicious like food)

First

In [11]:
# Function to print top 5 terms for each topic
def print_top_terms(model, feature_names, n_top_terms=5):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}: ", end='')
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_terms - 1:-1]]))

# Print top 5 terms for each topic
feature_names = vectorizer.get_feature_names_out()
print_top_terms(lda, feature_names)

Topic 0: book quot people also life
Topic 1: love people like life time
Topic 2: good eat delicious like food
Topic 3: gt real real estate estate investment
Topic 4: film quot war also movie
Topic 5: quot film also quot quot people


**Topic 0: book quot people also life**
I think this topic is most likely about literature and quoting books. Particularly ones related to people and life.
**Topic 1: love people like life time**
This topic is about people, life, and love
**Topic 2: good eat delicious like food**
This topic is obviously about eating delicous and good food
**Topic 3: gt real real estate estate investment**
This one is about good real estate investments
**Topic 4: film quot war also movie**
This topic is about films, especially about war. Also quotes from the film or movie.
**Topic 5: quot film also quot quot people**
This topic is about quoting people from films

**Review 1[ID=1]**
I think this review is about a restaurant that has great food and also has a great location as far as real estate is concerned.

**Review 501[ID=501]**
This review is about a movie about life and people. There is also probably a book that this movie is based on.