In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

# Load the dataset
data = pd.read_csv('tripadvisor_hotel_reviews.csv')

# Define a function for data cleaning
def clean_data(text):
    # Convert the text to lowercase
    text = text.lower()
    # Tokenize the text into words
    words = word_tokenize(text)
    # Define a set of stop words
    stop_words = set(stopwords.words('english'))
    # Remove stop words and punctuation
    words = [word for word in words if word not in stop_words and word.isalpha()]
    # Join the words back into a single string
    cleaned_text = ' '.join(words)
    return cleaned_text

# Clean the dataset
data['cleaned_text'] = data['Review'].apply(clean_data)

# Rule-based method
def rule_based_topic_modeling(data):
    # Initialize a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    # Vectorize the text data
    vectorized_data = tfidf_vectorizer.fit_transform(data['cleaned_text'])
    # Cluster the data using KMeans
    kmeans = KMeans(n_clusters=10)
    kmeans.fit(vectorized_data)
    # Print the top words in each cluster
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = tfidf_vectorizer.get_feature_names_out()
    for i in range(10):
        print("Cluster %d:" % i)
        for j in order_centroids[i, :10]:
            print(' %s' % terms[j])
        print()

# LDA method
def lda_topic_modeling(data):
    # Initialize a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    # Vectorize the text data
    vectorized_data = tfidf_vectorizer.fit_transform(data['cleaned_text'])
    # Fit an LDA model to the vectorized data
    lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
    lda_model.fit(vectorized_data)
    # Print the top words in each topic
    terms = tfidf_vectorizer.get_feature_names_out()
    for i, topic in enumerate(lda_model.components_):
        print("Topic %d:" % i)
        print(' '.join([terms[j] for j in topic.argsort()[:-11:-1]]))
        print()

# Perform rule-based topic modeling on the dataset





In [15]:
# Perform rule-based topic modeling on the dataset
rule_based_topic_modeling(data)

Cluster 0:
 great
 hotel
 location
 staff
 stay
 room
 rooms
 clean
 friendly
 nice

Cluster 1:
 resort
 beach
 food
 great
 beautiful
 time
 people
 good
 pool
 punta

Cluster 2:
 san
 juan
 hotel
 francisco
 great
 old
 room
 stay
 nice
 beach

Cluster 3:
 beach
 food
 pool
 good
 resort
 great
 people
 time
 day
 water

Cluster 4:
 new
 york
 hotel
 orleans
 room
 great
 stay
 location
 quarter
 stayed

Cluster 5:
 paris
 hotel
 metro
 room
 location
 staff
 great
 stay
 eiffel
 small

Cluster 6:
 hotel
 stay
 staff
 room
 service
 stayed
 rooms
 great
 place
 best

Cluster 7:
 barcelona
 hotel
 ramblas
 metro
 room
 location
 great
 good
 las
 city

Cluster 8:
 hotel
 good
 room
 location
 station
 breakfast
 walk
 excellent
 clean
 staff

Cluster 9:
 room
 hotel
 nice
 bed
 good
 night
 stay
 floor
 bathroom
 small



In [13]:
# Perform LDA topic modeling on the dataset
lda_topic_modeling(data)


Topic 0:
resort beach food pool great time good people day beautiful

Topic 1:
hotel great location staff stay room clean excellent helpful stayed

Topic 2:
andra alila cow hollow na abroad wan gon valley wing

Topic 3:
muse thomson trail charlesmark raffles freedom columbus motor zero gross

Topic 4:
room hotel stay desk told night service rooms staff got

Topic 5:
venice hotel florence wonderful staff stay fantastic casci great ponte

Topic 6:
hong kong rex milano juan gras mardi kowloon san condado

Topic 7:
hotel room great location good stay breakfast staff nice rooms

Topic 8:
aqua palms majestic colonial denny specially carlos roberto animation team

Topic 9:
clarendon ic vitale remain paintings slide tray rid behavior excellence

