In [2]:
# Tokenization for Ontology Population
# This notebook focuses on the tokenization of hotel reviews to aid in the identification 
# of key phrases and terms that are relevant for ontology population.

In [3]:
import nltk
from nltk.tokenize import word_tokenize

In [4]:
import pandas as pd

In [5]:
# Load the merged dataset CSV into a DataFrame. The path is relative, so it
# resolves against the directory the notebook kernel is running in.
data = pd.read_csv('../data/merged_dataset.csv')

In [6]:
# Preview the top five rows as a quick check of the loaded columns
print(data.head(n=5))

        Area              Hotel Name  \
0    unknown  108 palms beach resort   
1       ella      3 arch resort ella   
2  hikkaduwa         33 lake terrace   
3      kandy     360 lake view hotel   
4    unknown               3r resort   

                                       Hotel Address  \
0  ward no1/25, salli - sampalthivu, 31010 trinco...   
1            no 4, maduragama, 90090 ella, sri lanka   
2  no: 33, godaudawatta rd, hettigoda, hikkaduwa,...   
3  09, keerthi s rajasinghe ave, 20000 kandy, sri...   
4  90, stjoseph mawatha,ettukala, 11500 negombo, ...   

                                  Popular Facilities  \
0  outdoor swimming pool, airport shuttle, non-sm...   
1  airport shuttle, non-smoking rooms, room servi...   
2  outdoor swimming pool, airport shuttle, non-sm...   
3  free wifi, family rooms, free parking, restaur...   
4  outdoor swimming pool, airport shuttle, non-sm...   

                                         Description  \
0  youre eligible for a geniu

In [7]:
# Sanity check on one row: take the first 'Reviews' entry and split it into
# tokens with NLTK before processing the whole column.
example_review = data['Reviews'].iat[0]
tokens = word_tokenize(example_review)

# Show the raw text followed by the resulting token list
for label, value in (("Original Review:", example_review), ("Tokens:", tokens)):
    print(label, value)


Original Review: all in all good, but expensive like most of the resorts here. garden area friendly staff good service very cute dogs from the hotel and im not a pet lover - unpleasant smell from the bathroom - wifi soso be aware that you are dependent on the hotel offers because the resort is too far away from trincomallethe road, so there is no uber or other restaurants available. thats why i would recommend a hotel in trincomalle center if your not especially shooting for a resort. also consider a resort in nilaveli, i found the beach and sea even better in there. || one of the best places i have been in my entire tour of asia. both the service, the attention to the public and the kindness, hospitality and warmth of the owner to the great facilities, everything completely clean, perfect and with the best environment, and 50 m from the beach, one of the best in sri lanka, the exquisite breakfast and varied, without a doubt, i will repeat again || magical, quiet place, positive energy

In [8]:
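# Tokenizing all reviews
# NOTE: left disabled -- presumably because applying word_tokenize directly
# fails on rows whose review is missing (NaN is a float, not a string).
# The column is cleaned before tokenizing in the following cell instead.
# data['tokens'] = data['Reviews'].apply(word_tokenize)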


In [9]:
# Normalise the 'Reviews' column so every entry is a string before tokenizing.
# Missing values must be replaced *before* the string cast: astype(str) turns
# NaN into the literal text 'nan', after which fillna('') has nothing left to
# fill and each missing review would be tokenized as ['nan'].
data['Reviews'] = data['Reviews'].fillna('').astype(str)

# Tokenize every review; rows with no review text yield an empty token list.
data['tokens'] = data['Reviews'].apply(word_tokenize)


In [10]:
# Example: Counting word frequency
from collections import Counter

all_tokens = [token for sublist in data['tokens'] for token in sublist]  # Flatten list of tokens
token_counts = Counter(all_tokens)

# Display the most common words
print(token_counts.most_common(100))


[('.', 193627), ('the', 152606), ('and', 104373), (',', 86786), ('||', 85284), ('a', 63720), ('to', 59295), ('is', 49498), ('was', 39834), ('in', 36274), ('hotel', 35940), ('very', 35467), ('of', 35460), ('for', 32640), ('good', 30749), ('with', 26615), ('place', 26264), ('we', 25974), ('staff', 23660), ('food', 22339), ('i', 22266), ('it', 21207), ('are', 19823), ('nice', 19537), ('great', 18747), ('service', 18563), ('you', 18080), ('not', 15738), ('this', 15735), ('rooms', 15619), ('at', 15340), ('room', 15102), ('but', 14017), ('stay', 13963), ('were', 13557), ('friendly', 13156), ('they', 12955), ('there', 11768), ('on', 11389), ('from', 11376), ('had', 11266), ('clean', 11213), ('that', 10814), ('have', 10763), ('view', 10238), ('all', 9979), ('our', 9853), ('pool', 9610), ('best', 9256), ('as', 8885), ('beautiful', 8819), ('excellent', 8357), ('one', 8262), ('so', 8142), ('location', 7881), ('us', 7478), ('amazing', 7476), ('my', 7410), ('also', 7338), ('breakfast', 7333), ('be'

In [11]:
# Drop the flat token list and the DataFrame once they are no longer needed
# (presumably to release kernel memory). Either name may be absent if the
# earlier cells were not run, so a missing name is simply ignored.
try:
    del all_tokens
except NameError:
    pass

try:
    del data
except NameError:
    pass