In [9]:
# Tokenization for Ontology Population
# This notebook tokenizes the text columns of a hotel dataset (descriptions, facilities,
# reviews, etc.) to help identify key phrases and terms that are relevant for ontology population.

In [10]:
import os
import pandas as pd
import spacy

In [11]:
# Start from a clean slate: drop any objects a previous run of this notebook
# may have left behind, so stale frames are never picked up by accident.
for _leftover in ('tokenized_columns', 'data', 'df', 'new_data'):
    # pop() with a default silently skips names that are not currently defined.
    locals().pop(_leftover, None)
del _leftover

In [12]:
# Read the merged hotel dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/merged_dataset.csv')
# Progress marker: reaching this line means the CSV was read without raising.
print("code is running")

code is running


In [13]:
# Show a heading followed by the dataset's column index, one per line.
print("Column Names in the Dataset New:", df.columns, sep="\n")

Column Names in the Dataset New:
Index(['Area', 'Hotel Name', 'Hotel Address', 'Popular Facilities',
       'Description', 'Facilities', 'Surroundings', 'Bathroom Features',
       'Bedroom Features', 'Outdoors', 'Room Amenities', 'Activities',
       'Living Area', 'Media & Technology', 'Food & Drink', 'Parking',
       'Reception services', 'Entertainment and family services',
       'Cleaning services', 'Safety & security', 'General', 'Accessibility',
       'Wellness', 'Languages spoken', 'Restaurants & cafes',
       'Top attractions', 'Natural beauty', 'Beaches in the neighbourhood',
       'Public transport', 'Closest airports', 'Check-in', 'Check-out',
       'Cancellation/Prepayment', 'Children and Bed Policies',
       'Age Restriction', 'Pets', 'Accepted Payment Methods', 'Rating Value',
       'Reviews'],
      dtype='object')


In [14]:
# Overview of the tokenization pipeline defined in the next cell. The code:
#
# - Excludes certain columns from tokenization by default.
# - Tokenizes only the columns that automatic detection flags as text and that are not on the exclusion list.
# - Saves the tokenized data and prints a preview.
#
# This setup ensures that:
#
# - Columns that shouldn't be split, such as 'Hotel Name' and 'Hotel Address', are never tokenized.
# - Tokenization is applied only to automatically detected text columns that pass the exclusion list.
# - The results can be used immediately in further data processing or analysis steps.

In [15]:
# Load the spaCy English language model.
# `nlp` is a module-level pipeline object: the tokenization helpers defined
# below call it directly instead of receiving it as a parameter.
# NOTE(review): the "en_core_web_sm" model package must be installed in the
# kernel's environment for this call to succeed.
nlp = spacy.load("en_core_web_sm")

def is_text_column(column, exclude_columns=None, sample_size=100,
                   text_threshold=0.8, random_state=None):
    """
    Heuristically decide whether a column holds free text worth tokenizing.

    A random sample of the non-null entries is inspected. An entry counts as
    text when it is a non-empty string in which more than half of the
    characters are letters. The column is considered a text column when the
    share of such entries in the sample is strictly greater than
    ``text_threshold``.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.
    exclude_columns : iterable of str, optional
        Column names that must never be reported as text. If ``column.name``
        is listed here the function returns False without sampling.
    sample_size : int, default 100
        Maximum number of non-null entries to inspect.
    text_threshold : float, default 0.8
        Fraction of sampled entries that must look like text.
    random_state : int or numpy Generator/RandomState, optional
        Forwarded to ``Series.sample`` to make the sampling reproducible.

    Returns
    -------
    bool
        True if the column looks like a text column, False otherwise
        (including when the column has no non-null entries).
    """
    # Honour the documented exclusion list (it used to be accepted but ignored).
    if exclude_columns and getattr(column, "name", None) in exclude_columns:
        return False

    non_null = column.dropna()
    if non_null.empty:
        return False

    # Cap the sample by the number of NON-NULL rows: capping by len(column)
    # asks for more rows than exist once NaNs are dropped and makes
    # Series.sample raise ValueError.
    sample = non_null.sample(min(sample_size, len(non_null)), random_state=random_state)

    text_count = 0
    for entry in sample:
        # `entry` must be non-empty before dividing by its length.
        if isinstance(entry, str) and entry:
            if sum(ch.isalpha() for ch in entry) / len(entry) > 0.5:
                text_count += 1

    return text_count > text_threshold * len(sample)

def apply_tokenization(data, exclude_columns=None):
    """
    Tokenize every automatically detected text column of a DataFrame.

    Each column that ``is_text_column`` accepts and that is not listed in
    ``exclude_columns`` gets a companion ``'<column>_tokens'`` column holding
    the list of spaCy token texts for each row (an empty list for null cells).

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified; the token columns are added to a copy.
    exclude_columns : iterable of str, optional
        Column names that must not be tokenized even if they look like text.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The copy of ``data`` with the added token columns, and the names of
        the columns that were tokenized.
    """
    excluded = [] if exclude_columns is None else exclude_columns

    # Check the exclusion list first so excluded columns are never sampled.
    text_columns = [
        col for col in data.columns
        if col not in excluded and is_text_column(data[col])
    ]

    def _token_texts(value):
        if pd.isnull(value):
            return []
        # Only token boundaries are needed here, so run just the tokenizer
        # (make_doc) rather than the full tagging/parsing/NER pipeline.
        return [token.text for token in nlp.make_doc(str(value))]

    # Work on a copy so the caller's frame is left untouched and re-running
    # the notebook cell stays idempotent.
    result = data.copy()
    for column in text_columns:
        result[column + '_tokens'] = data[column].apply(_token_texts)
    return result, text_columns

def custom_tokenizer(text):
    """
    Tokenize text with spaCy while keeping named entities as single tokens.

    The text is processed by the module-level ``nlp`` pipeline. Each named
    entity found in ``doc.ents`` is emitted as one token (its full text), and
    the tokens between entities are emitted individually, skipping punctuation
    and whitespace tokens. Tokens are returned in document order, so an
    entity appears at the position where it occurs in the text.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        Entity phrases and plain token texts, in the order they occur.
    """
    doc = nlp(text)
    tokens = []
    cursor = 0  # index of the first token not yet emitted

    for ent in doc.ents:
        # Plain tokens between the previous entity (or the start) and this one.
        tokens.extend(
            token.text for token in doc[cursor:ent.start]
            if not (token.is_punct or token.is_space)
        )
        # Keep the entire entity intact as a single token.
        tokens.append(ent.text)
        cursor = ent.end

    # Plain tokens after the last entity (or the whole doc if there were none).
    tokens.extend(
        token.text for token in doc[cursor:]
        if not (token.is_punct or token.is_space)
    )
    return tokens

def apply_custom_tokenization(data, columns):
    """
    Build a new DataFrame of custom-tokenized columns.

    For every name in ``columns`` that exists in ``data``, the result gets a
    ``'<name>_tokens'`` column whose cells are the output of
    ``custom_tokenizer`` on the stringified source cell, or an empty list
    when the source cell is null. Names missing from ``data`` are skipped.
    """
    def _tokens_for(cell):
        # Null cells produce no tokens at all.
        if pd.notnull(cell):
            return custom_tokenizer(str(cell))
        return []

    tokenized = pd.DataFrame()
    for name in columns:
        if name not in data.columns:
            continue
        tokenized[name + '_tokens'] = data[name].apply(_tokens_for)
    return tokenized


# Automatically detect text columns and apply tokenization, excluding 'Hotel Name' and 'Hotel Address'
data, tokenized_columns = apply_tokenization(df, exclude_columns=['Hotel Name', 'Hotel Address'])

# Apply custom tokenization using the detected columns and create a new dataset
new_data = apply_custom_tokenization(data, tokenized_columns)

# Make sure the output directory exists, then save the new dataset.
# exist_ok=True avoids the check-then-create race of a separate os.path.exists()
# test; the `if directory` guard skips makedirs('') when save_path has no
# directory component.
save_path = '../data/new_tokenized_data.csv'
directory = os.path.dirname(save_path)
if directory:
    os.makedirs(directory, exist_ok=True)

# Saving is best-effort: a failure is reported but does not stop the notebook.
# NOTE(review): the token lists are written to CSV as text; confirm that
# downstream readers parse them back into lists.
try:
    new_data.to_csv(save_path, index=False)
    print(f"Data successfully saved to {save_path}")
except Exception as e:
    print(f"An error occurred while saving the data: {e}")

# Preview the tokenized data
print("Columns tokenized:", tokenized_columns)
print(new_data.head())


Data successfully saved to ../data/new_tokenized_data.csv
Columns tokenized: ['Area', 'Popular Facilities', 'Description', 'Facilities', 'Surroundings', 'Bathroom Features', 'Bedroom Features', 'Outdoors', 'Room Amenities', 'Activities', 'Living Area', 'Media & Technology', 'Food & Drink', 'Parking', 'Reception services', 'Entertainment and family services', 'Cleaning services', 'Safety & security', 'General', 'Accessibility', 'Wellness', 'Languages spoken', 'Restaurants & cafes', 'Top attractions', 'Natural beauty', 'Beaches in the neighbourhood', 'Public transport', 'Closest airports', 'Cancellation/Prepayment', 'Children and Bed Policies', 'Age Restriction', 'Pets', 'Accepted Payment Methods', 'Reviews']
   Area_tokens                          Popular Facilities_tokens  \
0    [unknown]  [private beach, outdoor, swimming, pool, airpo...   
1       [ella]  [airport, shuttle, non, smoking, rooms, room, ...   
2  [hikkaduwa]  [outdoor, swimming, pool, airport, shuttle, no...   
3      

In [16]:
# Final clean-up: release the frames and column list created above so they
# neither hold memory nor leak into a later run of the notebook.
for _leftover in ('tokenized_columns', 'data', 'df', 'new_data'):
    # Names that were never defined are skipped thanks to the pop() default.
    locals().pop(_leftover, None)
del _leftover
