In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Set the seed value
seed = 123

In [2]:
# Import the CSV file
df = pd.read_csv('/kaggle/input/515k-hotel-reviews-data-in-europe/Hotel_Reviews.csv')

In [3]:
# Create a new dataframe with only the Positive_Review and Negative_Review columns
df_train = df[["Positive_Review", "Negative_Review"]]
# Use the melt function to stack the Positive_Review and Negative_Review columns on top of each other
df_train = df_train.melt(var_name="Sentiment", value_name="Text")

# Map the Sentiment column to 1 for Positive_Review and 0 for Negative_Review
df_train["Sentiment"] = df_train["Sentiment"].map({"Positive_Review": 1, "Negative_Review": 0})

# Drop any rows with empty or missing Text
df_train = df_train.dropna(subset=["Text"])

# Reset the index of the dataframe
df_train = df_train.reset_index(drop=True)

df_train["ID"] = df_train.index
df_train

Unnamed: 0,Sentiment,Text,ID
0,1,Only the park outside of the hotel was beauti...,0
1,1,No real complaints the hotel was great great ...,1
2,1,Location was good and staff were ok It is cut...,2
3,1,Great location in nice surroundings the bar a...,3
4,1,Amazing location and building Romantic setting,4
...,...,...,...
1031471,0,no trolly or staff to help you take the lugga...,1031471
1031472,0,The hotel looks like 3 but surely not 4,1031472
1031473,0,The ac was useless It was a hot week in vienn...,1031473
1031474,0,No Negative,1031474


In [5]:
# Define a function to count the number of words in a string
def count_words(text):
    return len(text.split())

df_train = df_train[df_train["Text"].apply(lambda x: count_words(x) > 2)]

# Randomly select 2% of data
df_train = df_train.sample(frac=0.02, random_state=seed)
df_train

Unnamed: 0,Sentiment,Text,ID
614439,0,the bed was huge but the mattress was not the...,614439
270955,1,This was a nice older hotel in a residential ...,270955
485273,1,large and quiet rooms king size beds smoking ...,485273
567131,0,The water pressure was not good in the shower...,567131
150214,1,Clean friendly and easy access to the tube,150214
...,...,...,...
82181,1,Staff were fantastic Friendly and very helpful,82181
486507,1,Breakfast selection and quality was excellent,486507
407708,1,The staff were very helpful The roof terrace ...,407708
406197,1,Beautiful hotel in great location close to ce...,406197


In [16]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# load the data

# preprocess the text data
stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.PorterStemmer()

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stopwords]
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

df_train['processed_text'] = df_train['Text'].apply(preprocess_text)

# create a bag of words matrix
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=2000) #max_features=1000
bow_matrix = vectorizer.fit_transform(df_train['processed_text'])

# train an LDA model
lda_model = LatentDirichletAllocation(n_components=10, max_iter=10, learning_method='online')
lda_model.fit(bow_matrix)

df_topics = pd.DataFrame()

# print the top words for each topic
for i, topic in enumerate(lda_model.components_):
    print(f'Topic {i}:')
    top_words = [vectorizer.get_feature_names_out()[index] for index in topic.argsort()[:-11:-1]]
    print(top_words)
    df_topics[f'Topic {i}'] = top_words


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Topic 0:
['bar', 'room', 'view', 'coffe', 'breakfast', 'tea', 'drink', 'like', 'love', 'nice']
Topic 1:
['hotel', 'stay', 'night', 'would', 'time', 'day', 'like', 'need', 'noth', 'poor']
Topic 2:
['room', 'small', 'bed', 'bathroom', 'door', 'shower', 'comfort', 'work', 'clean', 'light']
Topic 3:
['room', 'one', 'book', 'check', 'even', 'better', 'arriv', 'got', 'floor', 'facil']
Topic 4:
['staff', 'locat', 'great', 'good', 'friendli', 'help', 'room', 'clean', 'excel', 'breakfast']
Topic 5:
['breakfast', 'bit', 'good', 'expens', 'park', 'price', 'littl', 'noisi', 'far', 'money']
Topic 6:
['hotel', 'use', 'walk', 'close', 'locat', 'charg', 'pay', 'get', 'station', 'around']
Topic 7:
['room', 'water', 'wifi', 'air', 'hot', 'bad', 'shower', 'bed', 'free', 'condit']
Topic 8:
['staff', 'us', 'help', 'front', 'way', 'desk', 'extrem', 'request', 'manag', 'went']
Topic 9:
['room', 'could', 'recept', 'ask', 'everyth', 'nois', 'expect', 'size', 'chang', 'lift']


In [12]:
#pd.DataFrame(top_words).to_csv("top_words.csv")

In [17]:
from textblob import TextBlob

# extract sentiment polarity for each aspect
for i, topic in enumerate(lda_model.components_):
    top_words = [vectorizer.get_feature_names_out()[index] for index in topic.argsort()[:-11:-1]]
    for word in top_words:
        aspect_text = df_train[df_train['processed_text'].str.contains(word)]['Text'].values
        aspect_blob = TextBlob(' '.join(aspect_text))
        aspect_sentiment = aspect_blob.sentiment.polarity
        print(f"Aspect '{word}': Sentiment polarity: {aspect_sentiment}")


Aspect 'bar': Sentiment polarity: 0.24583329998143702
Aspect 'room': Sentiment polarity: 0.219668113870551
Aspect 'view': Sentiment polarity: 0.3000628166994255
Aspect 'coffe': Sentiment polarity: 0.21716453050200893
Aspect 'breakfast': Sentiment polarity: 0.28950895291811024
Aspect 'tea': Sentiment polarity: 0.18285712814452335
Aspect 'drink': Sentiment polarity: 0.20527927169316787
Aspect 'like': Sentiment polarity: 0.18134766820620626
Aspect 'love': Sentiment polarity: 0.39812933620235164
Aspect 'nice': Sentiment polarity: 0.3775542934866381
Aspect 'hotel': Sentiment polarity: 0.2530442366219782
Aspect 'stay': Sentiment polarity: 0.2437913250481021
Aspect 'night': Sentiment polarity: 0.12735528454436038
Aspect 'would': Sentiment polarity: 0.18868673599504143
Aspect 'time': Sentiment polarity: 0.13957111883847095
Aspect 'day': Sentiment polarity: 0.16251014667167774
Aspect 'like': Sentiment polarity: 0.18134766820620626
Aspect 'need': Sentiment polarity: 0.1599675312979063
Aspect 'no

In [18]:
'''Appendix

Parameter Tuning

from sklearn.feature_extraction.text import CountVectorizer

# define a list of max features to try
max_features_list = [500, 1000, 2000, 3000]

# define a list of max_df values to try
max_df_list = [0.9, 0.95, 0.99]

# define a list of min_df values to try
min_df_list = [1, 2, 3]

# define a function to create a bag of words matrix and train an LDA model
def train_lda_model(max_features, max_df, min_df):
    # create a bag of words matrix
    vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=max_features)
    bow_matrix = vectorizer.fit_transform(df_train['processed_text'])
    
    # train an LDA model
    lda_model = LatentDirichletAllocation(n_components=10, max_iter=10, learning_method='online')
    lda_model.fit(bow_matrix)
    
    return lda_model

# loop over all combinations of parameters and train an LDA model for each one
for max_features in max_features_list:
    for max_df in max_df_list:
        for min_df in min_df_list:
            lda_model = train_lda_model(max_features, max_df, min_df)
            # evaluate the performance of the model
'''

"from sklearn.feature_extraction.text import CountVectorizer\n\n# define a list of max features to try\nmax_features_list = [500, 1000, 2000, 3000]\n\n# define a list of max_df values to try\nmax_df_list = [0.9, 0.95, 0.99]\n\n# define a list of min_df values to try\nmin_df_list = [1, 2, 3]\n\n# define a function to create a bag of words matrix and train an LDA model\ndef train_lda_model(max_features, max_df, min_df):\n    # create a bag of words matrix\n    vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=max_features)\n    bow_matrix = vectorizer.fit_transform(df_train['processed_text'])\n    \n    # train an LDA model\n    lda_model = LatentDirichletAllocation(n_components=10, max_iter=10, learning_method='online')\n    lda_model.fit(bow_matrix)\n    \n    return lda_model\n\n# loop over all combinations of parameters and train an LDA model for each one\nfor max_features in max_features_list:\n    for max_df in max_df_list:\n        for min_df in min_df_list:\