**The dataset used in this notebook comes from KAggle:**
    
**https://www.kaggle.com/datasets/jiashenliu/515k-hotel-reviews-data-in-europe (see the CSV file "Hotel_Reviews.csv")**

In [2]:
import pandas as pd
import pickle 
import time 

In [70]:
data = pd.read_csv('C:/Users/n_cic/PycharmProjects/march_24_streamlit/Hotel_Reviews.csv')
data = data[['Negative_Review', 'Positive_Review', 'Reviewer_Score']]
data.shape
# print(data.head(5))

(515738, 3)

# create new column Reviewer_Score_Binary

In [71]:
data['Reviewer_Score_Binary'] = data['Reviewer_Score'].map(lambda x: 0 if x<6 else 1)

In [72]:
data['Reviewer_Score_Binary'].value_counts()

1    463231
0     52507
Name: Reviewer_Score_Binary, dtype: int64

In [73]:
data.sample(4)

Unnamed: 0,Negative_Review,Positive_Review,Reviewer_Score,Reviewer_Score_Binary
101586,LACK OF IN FORMAL EATING AREAS,WELL SET OUT ROOM,7.1,1
165818,Rooms are very small It would not qualify thi...,Excellent location Walking distance from Conv...,8.3,1
437308,No Negative,The hole atmosphere was friendly and professi...,10.0,1
356479,Teas at breakfast on offer in an impressive J...,Friendly helpful and professional staff Most ...,9.6,1


# keep only 50000 rows for negative ratings and 50000 rows for positive  ratings 

In [74]:
grouped = data.groupby('Reviewer_Score_Binary')
data_sampled = grouped.apply(lambda x: x.sample(n=10000, random_state=42)).reset_index(drop=True)


In [75]:
data_sampled['Reviewer_Score_Binary'].value_counts()

0    10000
1    10000
Name: Reviewer_Score_Binary, dtype: int64

In [76]:
del data 

In [77]:
data = data_sampled.copy()

In [78]:
del data_sampled 

# join the negative and positive reviews in a single column: 

In [79]:
def join_reviews(neg, pos):
    return neg + ' ' + pos 

data['joined_review'] = data.apply(lambda row: join_reviews(row['Negative_Review'], row['Positive_Review']), axis=1)

In [80]:
data

Unnamed: 0,Negative_Review,Positive_Review,Reviewer_Score,Reviewer_Score_Binary,joined_review
0,I left a shirt in the room and they never ret...,hotel was nice,5.8,0,I left a shirt in the room and they never ret...
1,It was not stated when booking on the Non Ref...,The Hotel reception were extreamely helpful f...,5.0,0,It was not stated when booking on the Non Ref...
2,Our junior suite was exceptionally tired A ch...,Location fair althiugh changes in local traff...,2.9,0,Our junior suite was exceptionally tired A ch...
3,you have to pay 8pounds to use that gym swimm...,Location,5.8,0,you have to pay 8pounds to use that gym swimm...
4,Our room was not available when we arrived to...,Good location for our evening at the Albert H...,5.4,0,Our room was not available when we arrived to...
...,...,...,...,...,...
19995,Valet parking quite expensive but central Lon...,Amazing hotel sets the standard,10.0,1,Valet parking quite expensive but central Lon...
19996,rooms very small,No Positive,7.1,1,rooms very small No Positive
19997,Have to pay 16 per day for the car park,I will definitely stay here again the next ti...,9.6,1,Have to pay 16 per day for the car park I w...
19998,Shower head was too small by today standards ...,Discounted 50 Shower was nice except the show...,8.3,1,Shower head was too small by today standards ...


# training of the model based onTfidfVectorizer: 

In [81]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import time

start_time = time.time()

# Create a TF-IDF vectorizer for the positive reviews
vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r'\b\w+\b')

# Fit the vectorizer on the positive review column
X = vectorizer.fit_transform(data['joined_review'])

# Add prefix to the feature names of the positive reviews
feature_names = ['pos_' + f for f in vectorizer.get_feature_names_out()]

# Convert the sparse matrix to a dataframe with unique feature names
X = pd.DataFrame(X.toarray(), columns=feature_names)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, data['Reviewer_Score_Binary'], test_size=0.2, random_state=42)

# Fit an XGBoost classifier on the training data
model = XGBClassifier()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

end_time = time.time()
print(end_time-start_time)

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

72.71767687797546
Accuracy: 0.814


# store the xgboost model:

In [82]:
with open('xgboost_model.pkl' , 'wb') as f:
    pickle.dump(model, f)

# store the vectorizer: 

In [87]:
with open('vectorizer.pkl' , 'wb') as f:
    pickle.dump(vectorizer, f)

# store the feature_names: 

In [88]:
with open('feature_names.pkl' , 'wb') as f:
    pickle.dump(feature_names, f)

# predict a new review: 

In [3]:
with open('xgboost_model.pkl' , 'rb') as f_new:
    reloaded_model = pickle.load(f_new)

In [4]:
with open('vectorizer.pkl' , 'rb') as f_new:
    vectorizer = pickle.load(f_new)

In [5]:
with open('feature_names.pkl' , 'rb') as f_new:
    feature_names = pickle.load(f_new)

In [6]:
# Preprocess the new review and tokenize it
new_review = "This hotel is great! The staff is friendly and the room is spacious."
# preprocessed_review = preprocess_text(new_review)

# Transform the preprocessed review into a feature vector
new_review_vector = vectorizer.transform([new_review])

# Create a new dataframe with the feature vector
new_review_df = pd.DataFrame(new_review_vector.toarray(), columns=feature_names)

# Predict the binary score of the new review
new_review_score = reloaded_model.predict(new_review_df)[0]

print("Binary score of the new review:", new_review_score)

Binary score of the new review: 1


negative review:

In [7]:
# Preprocess the new review and tokenize it
new_review = "This hotel is aweful. The staff is unfriendly and the room is tiny."
# preprocessed_review = preprocess_text(new_review)

# Transform the preprocessed review into a feature vector
new_review_vector = vectorizer.transform([new_review])

# Create a new dataframe with the feature vector
new_review_df = pd.DataFrame(new_review_vector.toarray(), columns=feature_names)

# Predict the binary score of the new review
new_review_score = reloaded_model.predict(new_review_df)[0]

print("Binary score of the new review:", new_review_score)

Binary score of the new review: 0
