# Download the dataset from Kaggle

In [None]:
import kagglehub

# Fetch the latest published version of the dataset and report its location.
path = kagglehub.dataset_download("raj713335/tbo-hotels-dataset")
print(f"Path to dataset files: {path}")

# Installing Dependencies

In [None]:
# !pip install cupy-cuda12x
# !pip install cuml-cu12==24.12.0
!pip install --upgrade pip
!pip install lightgbm
!pip install nltk
# needed only if using colab GPU version
# !mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd


# Import all the dependencies

In [None]:
import pandas as pd
import numpy as np
import joblib
import pickle

from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report,r2_score,mean_squared_error,accuracy_score,confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from scipy.sparse import issparse,hstack,csr_matrix

# from cuml.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from joblib import Memory
from joblib import Parallel,delayed
# Disk-backed memoisation store (./cache) used by the cached preprocessing step.
memory = Memory(verbose=0, location='./cache')

# NLTK resources used by the tokenizer, the lemmatizer and the stop-word filter.
for _resource in ('punkt', 'wordnet', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Shared helpers for the text-cleaning functions defined further down.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Cleaning Data

In [None]:
from pathlib import Path

# Locate the CSV. A copy in the working directory takes precedence (the
# original behaviour); otherwise fall back to the directory returned by the
# kagglehub download cell, which stores it in the global `path`.
csv_path = Path("hotels.csv")
if not csv_path.exists() and "path" in globals():
    csv_path = Path(path) / "hotels.csv"

df = pd.read_csv(csv_path, encoding='ISO-8859-1')
# Strip surrounding whitespace from the header names before referring to them.
df.columns = df.columns.str.strip()
# These columns are dropped up front and therefore never used as features.
df = df.drop(columns=['Map','countyCode','cityCode','HotelCode','PhoneNumber','FaxNumber','PinCode'])
# Alternative: additionally drop the name / address / website columns.
# df = df.drop(columns=['Map','HotelName','countyCode','cityCode','HotelCode','Address','HotelWebsiteUrl','PhoneNumber','FaxNumber','PinCode'])


# df[(df.Attractions.isnull())&(df.Description.isnull())&(df.HotelFacilities.isnull())].all(axis=1).sum()

cols_to_check = ['Attractions','Description','HotelFacilities','HotelName','Address','HotelWebsiteUrl']
# cols_to_check = ['Attractions','Description','HotelFacilities']
# Discard rows in which every one of the checked columns is missing.
df = df[~df[cols_to_check].isnull().all(axis=1)]

# Fill the remaining gaps in each checked column with that column's most
# frequent value (one loop instead of six copy-pasted assignments).
for col in cols_to_check:
    df[col] = df[col].fillna(df[col].mode()[0])

text_cols = ['Attractions','Description','HotelFacilities','HotelName','Address','HotelWebsiteUrl']
# text_cols = ['Attractions','Description','HotelFacilities']
categorical_cols = ['countyName','cityName']

# Convert the textual star rating into an integer class label.
# NOTE(review): 'All' is mapped to 5 - confirm this is intended. Any rating
# value that is not a key of this dict becomes NaN after `.map`.
target_map = {'FourStar':4,'FiveStar':5,'ThreeStar':3,'TwoStar':2,'OneStar':1,'All':5}
df['HotelRating'] = df['HotelRating'].map(target_map)

# Declare text operation functions

In [None]:
def lemmatize_tokenizer(text):
    """Tokenize *text* with NLTK and return the lemma of each token as a list."""
    words = word_tokenize(text)
    return list(map(lemmatizer.lemmatize, words))

def combine_row(row):
    """Return every value of *row*, cast to ``str``, joined by single spaces."""
    as_text = row.astype(str)
    return " ".join(as_text)

def join_text_columns(X):
    """
    Collapse several text columns into one.

    A DataFrame input yields a one-column DataFrame; any other iterable of
    rows yields an ``(n_rows, 1)`` NumPy array. Each row is merged with
    ``combine_row``.
    """
    if not isinstance(X, pd.DataFrame):
        merged_rows = [combine_row(record) for record in X]
        return np.array(merged_rows).reshape(-1, 1)

    merged = X.apply(combine_row, axis=1)
    # Convert the resulting Series back to a 2D, single-column frame.
    return merged.to_frame()

def flatten_array(x):
    """Return ``x.ravel()``, i.e. the 1D flattened form of the array *x*."""
    flat = x.ravel()
    return flat


def remove_nan(X):
    """Drop missing entries from pandas-like inputs; pass anything else through.

    Objects that expose a ``dropna`` method (e.g. ``DataFrame`` / ``Series``)
    are returned with ``dropna()`` applied. Every other input - sparse
    matrices, NumPy arrays, lists - is returned unchanged.
    """
    if hasattr(X, 'dropna'):
        return X.dropna()
    # No separate sparse check: the sparse and the fallback branch both
    # returned X untouched, and not referencing `issparse` keeps this function
    # usable after that name has been deleted from the namespace.
    return X

from sklearn.preprocessing import FunctionTransformer
# Transformer wrapper around remove_nan; input validation is switched off.
nan_remover = FunctionTransformer(func=remove_nan, validate=False)


# Vectorize the data and save it to .pkl files

In [None]:
# Target column first, then the feature table with that column removed.
y = df['HotelRating']
x = df.drop(columns=['HotelRating'])
# The name `df` is released; only `x` and `y` are used from here on.
del df
def preprocess_text(text):
    """Lower-case *text*, tokenize it, drop stop words, lemmatize the rest and re-join with spaces."""
    kept = []
    for word in word_tokenize(text.lower()):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)

# `n_jobs` only controls how the work is scheduled, not what is returned, so
# it is excluded from the cache key; changing it no longer forces a recompute.
@memory.cache(ignore=['n_jobs'])
def preprocess_texts_parallel(text_list,n_jobs=-1):
    """Run ``preprocess_text`` over every element of *text_list* in parallel.

    Args:
        text_list: Iterable of raw text strings.
        n_jobs: Worker count passed to ``joblib.Parallel``.

    Returns:
        A list with one cleaned string per input element.

    The result is memoised on disk through ``memory``, keyed on *text_list*.
    NOTE(review): edits to ``preprocess_text`` itself presumably do not
    invalidate existing cache entries - clear ./cache after changing it.
    """
    tasks = (delayed(preprocess_text)(text) for text in text_list)
    return Parallel(n_jobs=n_jobs, verbose=10)(tasks)

# Merge all text columns into a single string per row.
text_data = join_text_columns(x[text_cols])
text_data.fillna("", inplace=True)
text_data_flat = text_data.values.ravel()

# Clean every merged string (cached on disk), then build TF-IDF features.
# NOTE(review): the TF-IDF vectorizer and the one-hot encoder below are both
# fitted on the full dataset, i.e. before the train/test split performed
# later in the notebook.
processed_texts = preprocess_texts_parallel(text_data_flat,n_jobs=-1)
process_texts_series = pd.Series(processed_texts)
tfidf_vectorizer = TfidfVectorizer()
x_text = tfidf_vectorizer.fit_transform(process_texts_series)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
x_cat = x[categorical_cols].copy()
for col in categorical_cols:
    x_cat[col] = x_cat[col].fillna(x_cat[col].mode()[0])

onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
x_cat = onehot_encoder.fit_transform(x_cat)
x_cat_sparse = csr_matrix(x_cat)

# Ask for CSR explicitly instead of relying on hstack's default output format,
# which is not guaranteed to be row-sliceable; the later row-wise train/test
# split needs a row-indexable matrix.
x_final = hstack([x_text, x_cat_sparse], format='csr')
print(type(x_final))
print(x_final.shape)
import joblib

# Save the processed feature matrix and labels
joblib.dump(x_final, 'x_final.pkl')
joblib.dump(y, 'y.pkl')
# Save the fitted transformers as well.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(onehot_encoder, 'onehot_encoder.pkl')

In [None]:
# Write the fitted text vectorizer and categorical encoder to disk.
for _artifact, _filename in (
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
    (onehot_encoder, 'onehot_encoder.pkl'),
):
    joblib.dump(_artifact, _filename)

# Load the feature matrix and labels if they are no longer in memory

In [None]:
# Read the saved feature matrix and label vector back from disk, in that order.
x_final, y = (joblib.load(_filename) for _filename in ('x_final.pkl', 'y.pkl'))

# Train Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y,
    random_state=12,
    test_size=0.2,
)
# The unsplit copies are not referenced again.
del x_final
del y

# Train the model

In [None]:
# LightGBM classifier, CPU configuration, fitted on the training split.
# Disabled options kept for reference:
#   device='gpu', gpu_platform_id=0, gpu_device_id=0
#   boosting_type="rf"
model = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    num_leaves=175,
    max_bin=512,
    force_col_wise=True,
    n_jobs=-1,
    random_state=42,
).fit(x_train, y_train)

In [None]:
# Restore the saved model only when no model is currently in memory.
# 'model.pkl' is written by the save cell at the end of the notebook, so an
# unconditional load fails on a first run and, on later runs, would replace
# the model that was just trained with the previously saved one.
# NOTE: loading a pickle executes code from the file - only load model files
# you created yourself.
if 'model' not in globals():
    model = joblib.load('model.pkl')

## Note: if you run into memory issues, the functions and variables below are no longer needed at this point, so you can remove them from memory

In [None]:
# Remove names that are not needed for evaluation. Popping from globals()
# instead of using `del` makes this cell safe to run more than once: a name
# that is already gone is skipped rather than raising NameError.
for _name in (
    'RandomizedSearchCV', 'SimpleImputer', 'TfidfVectorizer', 'WordNetLemmatizer',
    'hstack', 'issparse', 'lemmatizer', 'memory', 'nltk', 'stop_words',
    'stopwords', 'word_tokenize',
):
    globals().pop(_name, None)
del _name

# Test the model

In [None]:
# Predict a rating class for every row of the held-out test split.
y_pred = model.predict(x_test)

# Check Accuracy of model

In [None]:
# Print, in order: per-class report, R^2 score, accuracy, confusion matrix.
for _metric in (classification_report, r2_score, accuracy_score, confusion_matrix):
    print(_metric(y_test, y_pred))

In [None]:
# Pin the class order so the matrix is always 5x5 and lines up with the tick
# labels, even when a rating is absent from both y_test and y_pred.
rating_labels = [1, 2, 3, 4, 5]
cm = confusion_matrix(y_test, y_pred, labels=rating_labels)

plt.figure(figsize=(6,5))
sns.heatmap(cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=rating_labels,
            yticklabels=rating_labels
)
# confusion_matrix puts true classes on rows and predicted classes on columns.
plt.xlabel("Predicted rating")
plt.ylabel("Actual rating")
plt.show()

# Save the model

In [None]:
# Save the model using joblib
joblib.dump(model, 'hotel_rating_pipeline_cpu.joblib')
# Save the model using pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)