In [10]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import logging


In [11]:
# Configure root logging once: INFO and above, rendered as
# "<timestamp> - <LEVEL>: <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

In [12]:
def create_hotel_id(hotel_name):
    """Build a reproducible hotel identifier from a hotel's name.

    The identifier is the upper-cased first character of every
    whitespace-separated word in the name, followed by the number of
    characters left once space characters are removed.

    Args:
        hotel_name: The hotel name, or a missing value.

    Returns:
        The identifier string, or ``np.nan`` when ``pd.isna(hotel_name)``
        is true.
    """
    if pd.isna(hotel_name):
        return np.nan

    first_letters = []
    for token in hotel_name.split():
        # str.split() with no arguments never yields empty tokens.
        first_letters.append(token[0])
    prefix = ''.join(first_letters).upper()

    # Only literal space characters are excluded from the length; tabs and
    # other whitespace still count.
    compact_length = len(hotel_name) - hotel_name.count(' ')
    return prefix + str(compact_length)

In [13]:
# Strip HTML markup from a single text value
def remove_html_tags(text):
    """Return ``text`` with HTML markup removed.

    Missing values (as judged by ``pd.isna``) are handed back unchanged.
    Anything else is parsed with BeautifulSoup's ``html.parser`` and reduced
    to its text content via ``get_text()``.
    """
    if not pd.isna(text):
        parsed = BeautifulSoup(text, "html.parser")
        return parsed.get_text()
    return text

In [14]:
# Function to remove URLs and emails from text
def remove_urls_and_emails(text):
    """Remove URLs and email addresses from text.

    Args:
        text: A string to clean, or a missing value.

    Returns:
        ``text`` with every ``http://`` / ``https://`` / ``www.`` URL and every
        email address replaced by the empty string. Missing values (as judged
        by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(text):
        return text
    # URLs go first so that an address embedded in a URL disappears with it.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # local-part @ domain . TLD -- exactly one "@". The previous pattern
    # demanded two "@" signs (so it never matched a real address) and its TLD
    # class also accepted a literal "|".
    return re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

In [16]:
# Text normalization: lowercase every string held in an object-dtype column.
def normalize_text(dataframe):
    """Convert all text columns to lowercase, in place.

    Every column selected by ``select_dtypes(include=['object'])`` is
    rewritten so that each string value is lowercased. Values that are not
    strings (missing values, numbers, or any other object stored in an
    object column) are left untouched instead of raising ``AttributeError``.

    Args:
        dataframe: The DataFrame to modify in place.

    Returns:
        None. A confirmation line is printed when the work is done.
    """
    text_columns = dataframe.select_dtypes(include=['object']).columns
    for column in text_columns:
        # Object columns can hold mixed types, so only call .lower() on
        # actual strings.
        dataframe[column] = dataframe[column].apply(
            lambda value: value.lower() if isinstance(value, str) else value
        )
    print("Capitalization has been normalized across all text columns.")


In [17]:
def is_non_english(text):
    """Check if the majority of the text characters are non-ASCII.

    Args:
        text: The value to inspect.

    Returns:
        True when strictly more than half of the characters in ``text`` have
        a code point above 127 (the last ASCII code point). False for the
        empty string and for anything that is not a string, including
        missing values.
    """
    # Covers NaN/None as well as stray non-string cell values, which cannot
    # be iterated character by character.
    if not isinstance(text, str):
        return False
    # ASCII spans code points 0..127, so the first non-ASCII code point is
    # 128 and must be counted.
    non_ascii_count = sum(ord(char) > 127 for char in text)
    return non_ascii_count > 0.5 * len(text)

In [18]:

def replace_non_english_reviews(dataframe):
    """Swap in the translation for rows whose review text looks non-English.

    A row qualifies when ``is_non_english`` flags its ``text`` value and its
    ``textTranslated`` value is not missing. For those rows ``text`` is
    overwritten with ``textTranslated`` and ``textTranslated`` is then set to
    NaN. The frame is modified in place; nothing is returned.
    """
    def _should_swap(row):
        # `and` short-circuits, so the translation is only inspected for rows
        # already flagged as non-English.
        return is_non_english(row['text']) and pd.notna(row['textTranslated'])

    swap_mask = dataframe.apply(_should_swap, axis=1)
    translations = dataframe.loc[swap_mask, 'textTranslated']
    dataframe.loc[swap_mask, 'text'] = translations
    dataframe.loc[swap_mask, 'textTranslated'] = np.nan

In [19]:
# Read the raw review export into a DataFrame.
hotel_review_dataset = pd.read_csv('../Master Datasets/Hashed and etc/new_reviews_to_process.csv')

# Columns that are removed before any further processing.
columns_to_drop = [
    'reviewerId',
    'name',
    'likesCount',
    'reviewerNumberOfReviews',
    'publishedAtDate',
    'reviewsDistribution/fiveStar',
    'reviewsDistribution/threeStar',
    'reviewsDistribution/oneStar',
    'totalScore',
    'reviewsDistribution/twoStar',
    'reviewsDistribution/fourStar',
]
# `columns=` already targets the column axis, so no separate axis argument
# is needed.
hotel_review_dataset.drop(columns=columns_to_drop, inplace=True)

# Lowercase every text column in place.
normalize_text(hotel_review_dataset)

Capitalization has been normalized across all text columns.


In [21]:
# Strip HTML markup, then URLs and email addresses, from both review columns
hotel_review_dataset['text'] = hotel_review_dataset['text'].apply(remove_html_tags).apply(remove_urls_and_emails)
hotel_review_dataset['textTranslated'] = hotel_review_dataset['textTranslated'].apply(remove_html_tags).apply(remove_urls_and_emails)

# Replace non-English reviews with translations
replace_non_english_reviews(hotel_review_dataset)

# Create HotelID (this should be called after combining the reviews into cells)
#hotel_review_dataset['HotelID'] = hotel_review_dataset['HotelName'].apply(create_hotel_id)

# Save the cleaned and processed data
processed_reviews_path = '../Master Datasets/Reviews_data_processed.csv'
hotel_review_dataset.to_csv(processed_reviews_path, index=False)

# Report the file that was actually written; the message previously named a
# placeholder path that did not match the output location.
logging.info("Preprocessing completed and data saved to %s", processed_reviews_path)

  return BeautifulSoup(text, "html.parser").get_text()
2024-06-02 21:21:34,934 - INFO: Preprocessing completed and data saved to path_to_cleaned_data.csv
