<a href="https://colab.research.google.com/github/KBE25/hotel_sentiment/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview

# Business Understanding

# Data Understanding

## Data Preparation

In [None]:
!pip install emoji

In [None]:
!pip install symspellpy

In [131]:
#importing relevant libraries
import os
import re
import sys
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import kagglehub

import spacy
import emoji

from symspellpy import SymSpell, Verbosity
import pkg_resources

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import spacy
import re
import emoji
from symspellpy import SymSpell, Verbosity
import pkg_resources

from sklearn.feature_extraction.text import TfidfVectorizer
import multiprocessing as mp
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from tensorflow import keras

# Suppress warnings
warnings.simplefilter(action="ignore", category=FutureWarning)




Data Preparation will contain the following parts:

### Data Acquisition

In [None]:
# Download latest dataset version
path = kagglehub.dataset_download("thedevastator/booking-com-hotel-reviews")


In [None]:
# Identify the path for the dataset
print("Path to dataset files:", path)

In [None]:
# The path where the data is in the environment
base_dataset_directory = "/kaggle/input/booking-com-hotel-reviews"
csv_file_name = "booking_reviews copy.csv"
full_csv_path = os.path.join(base_dataset_directory, csv_file_name)
df = pd.read_csv(full_csv_path)

## Exploratory Data Analysis

### Evaluating the dataframe

This part of the project will contain an evaluation of the dataframa and the columns available in order to decide which ones to drop based on their relevance for the analysis.


In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.head(-5)

In [None]:
# Handle potential NaN values for comparison between raw_review_text and review_text
# For this I fill NaN with a placeholder string to avoid issues, as NaN != NaN
df_temp = df.fillna({'review_text': '', 'raw_review_text': ''}).copy()

# Count how many rows have identical content
identical_reviews = (df_temp['review_text'] == df_temp['raw_review_text']).sum()
# Only count rows where both are non-null
total_reviews_comparable = len(df_temp) - df_temp['review_text'].isnull().sum() - df_temp['raw_review_text'].isnull().sum()

print(f"Number of reviews where 'review_text' and 'raw_review_text' are identical (after filling NaN): {identical_reviews}")
print(f"Total comparable reviews (where both are non-null): {total_reviews_comparable}")
print(f"Percentage identical: {((identical_reviews / total_reviews_comparable) * 100):.2f}%")

Given that only 1.08% of review_text and raw_review_text are identical,this indicates significant differences between the two. Because of this it's crucial to understand the nature of these disparities before deciding which column to retain for further analysis.

In [None]:
# Calculate the length of 'review_text' and 'raw_review_text'
df_temp['review_text_len'] = df_temp['review_text'].apply(len)
df_temp['raw_review_text_len'] = df_temp['raw_review_text'].apply(len)

# Look at descriptive statistics of lengths
print("\nDescriptive statistics for 'review_text' length:")
print(df_temp['review_text_len'].describe())

print("\nDescriptive statistics for 'raw_review_text' length:")
print(df_temp['raw_review_text_len'].describe())

# Check for rows where one is significantly longer than the other (e.g., >20 characters difference)
print("\n--- Examples where review_text is MUCH longer (diff > 20 chars) ---")
longer_review_text_diff = df_temp[df_temp['review_text_len'] - df_temp['raw_review_text_len'] > 20]
print(f"Number of such rows: {len(longer_review_text_diff)}")
if not longer_review_text_diff.empty:
    print(longer_review_text_diff[['review_text', 'raw_review_text']].head(3).to_string())

print("\n--- Examples where raw_review_text is MUCH longer (diff > 20 chars) ---")
longer_raw_review_text_diff = df_temp[df_temp['raw_review_text_len'] - df_temp['review_text_len'] > 20]
print(f"Number of such rows: {len(longer_raw_review_text_diff)}")
if not longer_raw_review_text_diff.empty:
    print(longer_raw_review_text_diff[['review_text', 'raw_review_text']].head(3).to_string())

In [None]:
# Find a few rows where they are different (and both non-empty) for manual inspection
diff_reviews_sample = df_temp[(df_temp['review_text'] != df_temp['raw_review_text']) &
                              (df_temp['review_text'] != '') & (df_temp['raw_review_text'] != '')].sample(min(5, len(df_temp))).copy()

print("\n--- Detailed Examples of Differences (5 random samples) ---")
for index, row in diff_reviews_sample.iterrows():
    print(f"\nRow Index: {index}")
    print(f"review_text:     '{row['review_text']}'")
    print(f"raw_review_text: '{row['raw_review_text']}'")
    print("-" * 50)

In [None]:
# Displaying the first few non-null entries
print("First 10 non-null entries in 'meta' column:")
print(df['meta'].dropna().head(10).tolist())

# Displaying a random sample
print("\n10 random non-null entries in 'meta' column:")
print(df['meta'].dropna().sample(10).tolist())

Based on the above checks, I will be dropping the following columns that are not relevant for this analysis:

1. index: Redundant DataFrame index.

2. images: Too sparse, and image analysis is out of scope for this analysis as we are using text for hotel rating prediction.

3. crawled_at: Irrelevant timestamp for this analysis as this is when the review was crawled from the site.

4. url: Unique identifier for the review page, no predictive power.

5. hotel_url: Unique identifier for the hotel page, but this becomes redundant if hotel_name or hotel_id is used.

6. meta: content is consistently redundant, specifying 'en-gb' language and 'booking.com' source, and therefore provides no unique or discriminative information for this analysis.

7. review_text: as it seems to be a simplified version of raw_review_text and is better to keep the columns with full information to better extract richer features for the analysis.

In [None]:
# Dropping unnecesary columns
# Create a list of columns to drop
columns_to_drop = ['index', 'images', 'crawled_at', 'url', 'hotel_url', 'meta', 'review_text']

# Perform the drop operation
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

# Inspecting new dataframe
df_cleaned.info()

In [None]:
#Initial Missing Value Overview
df_cleaned.isnull().sum()

Before analyzing the distribution of our target variable 'rating', it's crucial to address its missing values. As observed, the rating column contains 289 missing entries. For supervised learning, every data point used in the target distribution analysis and model training must have a defined target value.

Therefore, I made the decision to drop all rows where the rating value was NaN. This ensures the integrity of our target variable. Given this drop accounts for only about 1.08% of our total dataset, the data loss is minimal and justified to maintain data quality for supervised learning.

In [None]:
# Dropping NAN from target variable 'rating'
df_cleaned = df_cleaned.dropna(subset=['rating']).copy()

In [None]:
# Additional Missing Value Overview
df_cleaned.isnull().sum()

The additional missing values will be managed later in the Data processing part.

### Target Variable Distribution Analysis

For this project, the target variable is rating, which mean that I have two different ways to approach the solution:

1.   A regression approach by predicting the exact rating
2.   A classification approach by converting rating to sentiment categories

So in this part, I will conduct a distribution analysis for both, since I have a numerical rating from the Booking.com data I will first approach a solution with a regression problem to predict a numerical rating and then explore the sentiment classification as an alternative.

#### Rating Prediction (Regression Problem)

In [None]:
#Creating an histogram for the target variable 'rating'
rating_column_name = 'rating'

plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned[rating_column_name], bins=20, kde=True) # Adjust bins if rating scale is very small/large
plt.title(f'Distribution of Hotel Ratings (Target Variable)')
plt.xlabel('Rating Score')
plt.ylabel('Number of Reviews')
plt.grid(axis='y', alpha=0.75)
plt.show()

# Get descriptive statistics for the varible 'rating'
print(f"\nDescriptive Statistics for '{rating_column_name}':")
print(df_cleaned[rating_column_name].describe())

# Check for specific unique values (e.g., 1-10)
if df_cleaned[rating_column_name].dtype in ['int64', 'float64']:
    print(f"\nUnique values in '{rating_column_name}':")
    print(df_cleaned[rating_column_name].unique())
    print(f"\nValue counts for '{rating_column_name}':")
    print(df_cleaned[rating_column_name].value_counts().sort_index())

Based on the histogram, the key findings are that the 'rating' column is a numerical target with ample data (26386 non-null entries). However, it exhibits a strong positive skew, with a mean of 8.56, and over 75% of ratings are 7.9 or higher, making lower ratings significantly sparse.

This skew will cause regression models to be biased towards predicting higher ratings, potentially performing poorly on and underrepresenting the crucial, but rare, lower (negative) ratings.

Moving forward with the regression to predict the rating will mean that:

*   I will need to prioritize MAE/RMSE for evaluation and analyze errors across rating ranges.
*  Employ robust regression models like Gradient Boosting Machines (XGBoost, LightGBM).
*   Focus feature engineering on attributes indicative of negative sentiment to aid prediction of sparse low ratings.
*   Consider converting to a classification problem by binning ratings if direct regression proves too challenging.

#### Sentiment Classification (Classification Problem derived from rating)

In [None]:
# Define a function to categorize ratings into sentiments
# The split for the binary classification was done to try to keep a 50/50 split
def derive_binary_sentiment(rating):
    if rating >= 9.0:
        return 'Positive'
    else:
        return 'Negative'

# Create the new binary sentiment column
df_cleaned['sentiment_label'] = df_cleaned[rating_column_name].apply(derive_binary_sentiment)

#df_classification['sentiment_label'] = df_classification['rating'].apply(derive_binary_sentiment)

# Display the distribution of sentiment labels
plt.figure(figsize=(8, 5))
sns.countplot(x='sentiment_label', data=df_cleaned, order=['Negative', 'Positive'])
plt.title('Distribution of Derived Binary Sentiment Labels (Threshold: 9.0)')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.show()

# Print counts and percentages to quantify imbalance
sentiment_counts = df_cleaned['sentiment_label'].value_counts()
print("\nDerived Binary Sentiment Label Counts:")
print(sentiment_counts)
print("\nDerived Binary Sentiment Label Proportions:")
print((df_cleaned['sentiment_label'].value_counts(normalize=True) * 100).round(2))


Your rating data was effectively transformed into a well-balanced binary classification target (Positive: 53.72%, Negative: 46.28%) using the rating 9.0 as a threshold. This near-even split mitigates class imbalance issues, aiding robust classifier training and evaluation.

The limitations of this split is that, the 9.0 threshold is arbitrary, leading to a loss of original rating granularity where diverse dissatisfaction levels are grouped. Reviews just below 9.0 are categorized with truly negative ones, potentially misrepresenting nuanced sentiment. "Neutral" reviews might also fall into "Negative" based purely on score.

To address the limitations, I will be performing a boundary analysis during model evaluation to understand misclassifications near the 9.0 mark as well a conducting detailed error analysis on miscategorized reviews.

## Data Processing

### Handling missing values

In [None]:
# Current missing values
df_cleaned.isnull().sum()

We will manage the the missing values for 'review_title', 'nationality', 'raw_review_text' and 'tags' in the following way:

1.  review_title: I will add a placeholder "No Title" to preserve data. This approach is ideal because only one value is missing, and mean/median/mode imputation isn't suitable for text.
2.  nationality: I will fill in the NaNs with the mode so the most frequent nationality. Using the mode is a common and  reasonable approach for categorical features as it preserves the distribution of the existing data as much as possible by assuming the missing values are most likely to belong to the most common category. While it might introduce a slight bias towards the mode, for a small number of missing values (16 in this case), the impact will be minimal.

3.  raw_review_text: I will add a placeholder in this case an empty string. In this case an empty String '' is the best option as in an NLP pipeline this will naturally result in no tokens or zero vectors, which is the correct representation for an absent review.

4.  tags: I will add a placeholder in this case an empty string.


In [None]:
# Review title
df_cleaned['review_title'].fillna('No Title', inplace=True)

# Nationality
most_frequent_nationality = df_cleaned['nationality'].mode()[0]
df_cleaned['nationality'].fillna(most_frequent_nationality, inplace=True)

# Raw_review_text
df_cleaned['raw_review_text'].fillna('', inplace=True)

# Tags
df_cleaned['tags'].fillna('', inplace=True)

In [None]:
# Validating that the missing values were handled
df_cleaned.isnull().sum()

### Feature Engineering from Structured Data

The goal of this part is to extract more information from existing numerical and categorical features, especially time-related ones, to potentially improve model performance or provide deeper insights.

#### Step 1: Convert reviewed_at to Datetime object

Convert reviewed_at in df_cleaned to a pandas datetime object as this allows easy extraction of year, month, day, day of week and facilitates date arithmetic.

In [None]:
# Convert to datetime
df_cleaned['reviewed_at'] = pd.to_datetime(df_cleaned['reviewed_at'])


In [None]:
# Now that reviewed_at is a datetime object, I can easily extract various time-based features.
# Seasonality: Month, quarter, and day of week can capture seasonal trends in reviews.
# Temporal Trends: Year can capture long-term trends or changes.
# Recency: review_age_days

# Basic features to create
df_cleaned['review_year'] = df_cleaned['reviewed_at'].dt.year
df_cleaned['review_month'] = df_cleaned['reviewed_at'].dt.month
df_cleaned['review_day'] = df_cleaned['reviewed_at'].dt.day
df_cleaned['review_day_of_week'] = df_cleaned['reviewed_at'].dt.dayofweek # Monday=0, Sunday=6
df_cleaned['review_quarter'] = df_cleaned['reviewed_at'].dt.quarter

# Age relative to the latest review date in the dataset
latest_review_date = df_cleaned['reviewed_at'].max()
df_cleaned['review_age_days'] = (latest_review_date - df_cleaned['reviewed_at']).dt.days

print("\nDataFrame after adding time-based features:")
print(df_cleaned[['reviewed_at', 'review_year', 'review_month', 'review_day', 'review_day_of_week', 'review_quarter', 'review_age_days']].head())

#### Step 2: Aggregation Features

Reviewer centric metrics?

In [None]:
#reviewer_stats = df_cleaned.groupby('reviewed_by').agg(reviewer_total_reviews=('reviewed_by', 'count'),reviewer_avg_rating=('rating', 'mean')).reset_index()
#df_cleaned = pd.merge(df_cleaned, reviewer_stats, on='reviewed_by', how='left')

#print("\nDataFrame after adding aggregation features:")
#print(df_cleaned[['hotel_name', 'rating', 'hotel_avg_rating', 'hotel_total_reviews',
#          'reviewed_by', 'reviewer_total_reviews', 'reviewer_avg_rating']].head())

### Feature Selection

In [None]:
# Define target variable
y = df_cleaned['rating']

X = df_cleaned.drop(columns=[
    'rating', 'reviewed_at', 'review_title', 'raw_review_text', 'tags',
    'nationality', 'reviewed_by', 'hotel_name', 'sentiment_label'

])


In [None]:
# Select only numerical columns for correlation calculation
numerical_cols = X.select_dtypes(include=np.number).columns

# Calculate correlation with the target variable
correlations = X[numerical_cols].corrwith(y).sort_values(ascending=False)
print("\nCorrelation of numerical features with 'rating':\n", correlations)

# Visualize correlation matrix among numerical features if you have many
plt.figure(figsize=(12, 10))
sns.heatmap(X[numerical_cols].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Numerical Features')
plt.show()


Based on the correlation matrix, we will be dropping the following variables: review_quarter, review_day and review_year. These show extremely low correlation with rating and review_quarter is redundant with review_month. review_year is also redundant with review_age_days. Removing them reduces noise and multicollinearity.

I'm keeping review_age_days because even if correlation is low, temporal features can have non-linear impacts or interactions not captured by simple correlation, warranting further evaluation by other methods.

In [None]:
# Columns to drop based on low correlation and redundancy
columns_to_drop_now = [
    'review_quarter',
    'review_day',
    'review_year'
]

# Ensure these columns actually exist in X before dropping
existing_columns_to_drop = [col for col in columns_to_drop_now if col in X.columns]

if existing_columns_to_drop:
    X_filtered = X.drop(columns=existing_columns_to_drop)
    print(f"\nDropped columns: {existing_columns_to_drop}")
    print("\nFeatures (X_filtered) columns after preliminary drops:\n", X_filtered.columns.tolist())
    print("\nShape of X_filtered:", X_filtered.shape)
else:
    X_filtered = X.copy()
    print("\nNo additional columns to drop based on this correlation analysis.")

In [None]:
# F-regression
print("\n--- Feature Selection using F-regression (SelectKBest) on X_filtered ---")
selector_f_reg_filtered = SelectKBest(score_func=f_regression, k='all')
selector_f_reg_filtered.fit(X_filtered, y)
f_scores_filtered = pd.DataFrame({'Feature': X_filtered.columns, 'F-Score': selector_f_reg_filtered.scores_, 'P-value': selector_f_reg_filtered.pvalues_})
f_scores_filtered = f_scores_filtered.sort_values(by='F-Score', ascending=False)
print(f_scores_filtered)

# Mutual Information Regression
print("\n--- Feature Selection using Mutual Information Regression (SelectKBest) on X_filtered ---")
selector_mi_filtered = SelectKBest(score_func=mutual_info_regression, k='all')
selector_mi_filtered.fit(X_filtered, y)
mi_scores_filtered = pd.DataFrame({'Feature': X_filtered.columns, 'Mutual_Info_Score': selector_mi_filtered.scores_})
mi_scores_filtered = mi_scores_filtered.sort_values(by='Mutual_Info_Score', ascending=False)
print(mi_scores_filtered)

Based on the F-regression and Mutual Information scores we will drop and keep the following variables:

Drop:

1. review_day_of_week: Its F-score is low, and the P-value (0.109) is above the 0.05 significance threshold. The Mutual Information score is also extremely low (0.006). This indicates it has little to no linear or non-linear relationship with rating.

Keep (for further evaluation by model-based methods):

1. review_age_days: Exhibits high statistical significance (low P-value) and is the strongest feature by Mutual Information, suggesting a potentially strong non-linear relationship.

2. review_month: While its Mutual Information score is low, its F-score's P-value (0.0348) is still below 0.05. It might have a subtle linear impact or interact with other features, so it's worth assessing with tree-based models next.

As only 'review_month' and 'review_age_days' are left, I'm going to evaluate if it will be ok keeping both or not using a Variance Inflation Factor (VIF) calculation:

In [None]:
# Adding a constant as this is a good practice for VIF calculation
X_vif = X_filtered[['review_month', 'review_age_days']].copy() # Just these two for now
X_vif['intercept'] = 1

vif_data = pd.DataFrame()
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]

print("\nVariance Inflation Factor (VIF) for review_month and review_age_days:")
print(vif_data)

The Variance Inflation Factor (VIF) for both review_month and review_age_days is approximately 1.03. This score, being very close to 1, indicates extremely low multicollinearity between these two features.

Crucially, these variables capture distinct temporal aspects. Review_month accounts for seasonality, reflecting potential cyclical patterns in hotel ratings throughout the year (e.g., holiday seasons, peak travel months). In contrast, review_age_days measures recency, indicating how old a review is, which can reflect current hotel conditions or reviewer sentiment trends over time.

Since they provide independent information about different facets of time and do not exhibit problematic redundancy, retaining both enriches the model's understanding of review dynamics. This allows the model to leverage both seasonal influences and the impact of review recency, leading to a more comprehensive and potentially more accurate predictive capability without introducing noise or instability.

In [None]:
# Final cleaned df with feature engineering variables define through feature selection
df_filtered = df_cleaned.drop(columns=[ 'review_quarter', 'review_day', 'review_year',  'review_day_of_week'])

df_filtered.info()

### Text Preprocessing

This text preprocessing methodology is meticulously designed to transform raw, unstructured review data into a clean, normalized, and semantically rich format suitable for advanced natural language processing tasks.

This multi-stage process begins with fundamental cleanups, including lowercasing to standardize text representation and the removal of URLs and HTML tags to eliminate extraneous noise. Subsequently, emojis and emoticons are converted into descriptive text to capture their inherent sentiment, while the text is simultaneously tokenized into individual words. I then apply lemmatization to reduce words to their base forms, ensuring consistent representation across variations, and systematically remove punctuation and numbers to focus on meaningful lexical content. A crucial step involves comprehensive stop word removal, which not only leverages standard lists but also incorporates custom, domain-specific terms (e.g., "hotel," "room") that lack discriminative power in review contexts.

Furthermore, I implement sophisticated negation handling by tagging words that follow negations (e.g., "good_NEG" from "not good"), thus preserving accurate sentiment. Finally, optional spell correction is employed to address typographical errors, preventing data sparsity and ensuring that misspelled words contribute correctly to feature representation. This rigorous preprocessing pipeline is essential for generating high-quality features for subsequent TF-IDF modeling and enhancing the overall performance of our machine learning models.


In [None]:
# Initialize spaCy
# Load a pre-trained English model. 'en_core_web_sm' is a good starting point.
try:
    nlp = spacy.load("en_core_web_sm")
    print("spaCy 'en_core_web_sm' model loaded successfully.")
except OSError:
    print("spaCy 'en_core_web_sm' model not found. Please run: !python -m spacy download en_core_web_sm")
    import sys
    sys.exit("SpaCy model not found. Please download it first.")

In [None]:
# 1. Clone my github to bring my helper function: processing_text.py file into my Colab environment.
repo_url = "https://github.com/KBE25/hotel_sentiment.git"
repo_name = "hotel_sentiment"

# Check if the repo is already cloned to avoid re-cloning on successive runs
if not os.path.exists(repo_name):
    print(f"Cloning {repo_name} from GitHub...")
    !git clone {repo_url}
    print(f"Repository '{repo_name}' cloned successfully.")
else:
    print(f"Repository '{repo_name}' already exists. Skipping clone.")

# Navigate into the cloned repository's directory
# This is important so Python can find the helper file directly
%cd {repo_name}/
print(f"Current working directory changed to: {os.getcwd()}")

In [None]:
# The current working directory should now be the root of your cloned repo
# 'processing_text.py' is directly in the root of my repository.
path_to_add = os.getcwd()

if path_to_add not in sys.path:
    sys.path.append(path_to_add)
    print(f"Added '{path_to_add}' to sys.path for module import.")

In [None]:
# 2. Import the functions from my processing_text.py helper file
try:
    from processing_text import (preprocess_text_enhanced_spacy, parallelize_series_with_tqdm, worker_initializer)
    print("Successfully imported preprocessing functions from 'processing_text.py'.")
except ImportError as e:
    print(f"Error importing functions from processing_text.py: {e}")
    print("Please check: 1. File name 'processing_text.py'. 2. Its location in your repo. 3. Python path.")
    # Exit the script if the helper cannot be loaded
    sys.exit("Failed to import preprocessing helper. Exiting.")

In [None]:
# 3. Run the core text preprocessing
# This step takes a really long time so after running it the next step will be to save it so is easier just to load the data and run the rest of the analysis
if __name__ == '__main__':
    text_cols_to_process = ['raw_review_text', 'review_title', 'tags']
    processed_output_cols = [f'{col}_processed' for col in text_cols_to_process]

    for col in text_cols_to_process:
        print(f"Starting parallel processing for column: '{col}' (with full spaCy pipeline)...")
        df_filtered[f'{col}_processed'] = parallelize_series_with_tqdm(
            df_filtered[col],
            preprocess_text_enhanced_spacy,
            n_cores=None, # Use all available CPU cores
            apply_spell_correction=True # Pass keyword arguments
        )
        print(f"Finished processing column: '{col}'")

    print("\n--- Processed Text Columns Sample (after all preprocessing) ---")
    print(df_filtered[processed_output_cols].head(2).to_string())
    print("-" * 50)

In [126]:
 # 4. Saving the processed dataframe
# This step saves the df_filtered (including original and new processed columns) to a file for later use, avoiding reprocessing.
output_filepath_parquet = 'processed_reviews.parquet'

To be able to load the data saved in step (4) we are going to have to complete a couple of additional steps using Kaggle. After I completed step (4) and saved the df_filtered data I went ahead and downloaded the data and uploaded into Kaggle for easier management. To be able to recover this information the next steps are needed:

1. Create an API connection with Kaggle
2. Download the processed data from Kaggle
3. Load the data


#### 1. Creating a connection with Kaggle API

The code below is used to create a connection with Kaggle Public API. In this case this connection is done to be able to load the models that we saved in Kaggle.

The only change that is needed below is to edit the following part by finding your API token information in your Kaggle account settings part:

*   "username":"ADD_KEY HERE"
*   "key":"ADD_KEY HERE"

IMPORTANT - After you ran the below code, make sure to delete your 'username' and 'key' as that information is personal and should not be made available in any public site like Github. More information on Kaggle's API can be found here.

In [127]:
import json
import os
from pathlib import Path

# Go to Kaggle and get your API public https://www.kaggle.com/docs/api
api_key = {
    "username":"", #add your Kaggle API username
    "key":""       #add your Kaggle API key
    }

# uses pathlib Path
kaggle_path = Path('/root/.kaggle')
os.makedirs(kaggle_path, exist_ok=True)

# opens file and dumps python dict to json object
with open (kaggle_path/'kaggle.json', 'w') as handl:
    json.dump(api_key,handl)

os.chmod(kaggle_path/'kaggle.json', 600)

#IMPORTANT: after running this code, delete your username and key

 #### 2. Downloading the dataset from Kaggle to Colab

In [129]:
! kaggle datasets download -d karinabe25us/df-filtered
! unzip df-filtered.zip

Dataset URL: https://www.kaggle.com/datasets/karinabe25us/df-filtered
License(s): apache-2.0
df-filtered.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  df-filtered.zip
replace processed_reviews.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: processed_reviews.parquet  


 #### 3. Loading the dataset using pandas

In [136]:
#Loading the model

# Define the full path to the processed Parquet file
file_path = '/content/hotel_sentiment/processed_reviews.parquet'

# Load the DataFrame directly from the Parquet file
df_filtered = pd.read_parquet(file_path)

#Confirming the dataset has the correct information
print(df_filtered.info())
print(df_filtered.head().to_string())
print("-" * 50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26386 entries, 0 to 26385
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   review_title               26386 non-null  object        
 1   reviewed_at                26386 non-null  datetime64[ns]
 2   reviewed_by                26386 non-null  object        
 3   hotel_name                 26386 non-null  object        
 4   avg_rating                 26386 non-null  float64       
 5   nationality                26386 non-null  object        
 6   rating                     26386 non-null  float64       
 7   raw_review_text            26386 non-null  object        
 8   tags                       26386 non-null  object        
 9   sentiment_label            26386 non-null  object        
 10  review_month               26386 non-null  int32         
 11  review_age_days            26386 non-null  int64         
 12  raw_