In [2]:
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

In [3]:
print("Version of Tensorflow: ", tf.__version__)

Version of Tensorflow:  2.11.0


In [4]:
print("Cuda Availability: ", tf.test.is_built_with_cuda())

Cuda Availability:  True


In [5]:
print("GPU  Availability: ", tf.test.is_gpu_available())

GPU  Availability:  True


In [6]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [7]:
# Check nos of GPUS

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [8]:
import pandas as pd
import numpy as np

# Set the seed value
seed = 123

In [10]:
df = pd.read_csv('/kaggle/input/515k-hotel-reviews-data-in-europe/Hotel_Reviews.csv')

In [11]:
# Create a new dataframe with only the Positive_Review and Negative_Review columns
df_train = df[["Positive_Review", "Negative_Review"]]

# Use the melt function to stack the Positive_Review and Negative_Review columns on top of each other
df_train = df_train.melt(var_name="Sentiment", value_name="Text")

# Map the Sentiment column to 1 for Positive_Review and 0 for Negative_Review
df_train["Sentiment"] = df_train["Sentiment"].map({"Positive_Review": 1, "Negative_Review": 0})

# Drop any rows with empty or missing Text
df_train = df_train.dropna(subset=["Text"])

# Reset the index of the dataframe
df_train = df_train.reset_index(drop=True)

df_train["ID"] = df_train.index
df_train

Unnamed: 0,Sentiment,Text,ID
0,1,Only the park outside of the hotel was beauti...,0
1,1,No real complaints the hotel was great great ...,1
2,1,Location was good and staff were ok It is cut...,2
3,1,Great location in nice surroundings the bar a...,3
4,1,Amazing location and building Romantic setting,4
...,...,...,...
1031471,0,no trolly or staff to help you take the lugga...,1031471
1031472,0,The hotel looks like 3 but surely not 4,1031472
1031473,0,The ac was useless It was a hot week in vienn...,1031473
1031474,0,No Negative,1031474


In [12]:
# Define a function to count the number of words in a string
def count_words(text):
    return len(text.split())

df_train = df_train[df_train["Text"].apply(lambda x: count_words(x) > 2)]

# Randomly select one third of the rows with a seed
df_train = df_train.sample(frac=0.02, random_state=seed)
df_train

Unnamed: 0,Sentiment,Text,ID
614439,0,the bed was huge but the mattress was not the...,614439
270955,1,This was a nice older hotel in a residential ...,270955
485273,1,large and quiet rooms king size beds smoking ...,485273
567131,0,The water pressure was not good in the shower...,567131
150214,1,Clean friendly and easy access to the tube,150214
...,...,...,...
82181,1,Staff were fantastic Friendly and very helpful,82181
486507,1,Breakfast selection and quality was excellent,486507
407708,1,The staff were very helpful The roof terrace ...,407708
406197,1,Beautiful hotel in great location close to ce...,406197


In [13]:
df_train.Text.apply(lambda x: len(x)).describe()

count    15460.000000
mean       121.938486
std        142.599095
min          8.000000
25%         42.000000
50%         78.000000
75%        148.000000
max       1923.000000
Name: Text, dtype: float64

# Multinomial Naive Bayes classifier

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Defining input texts and labels
texts = df_train.Text
labels = df_train.Sentiment

# Splitting the data into training and testing sets
text_train, text_test, label_train, label_test = train_test_split(texts, labels, test_size=0.2)

# Creating bag of words features using CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(text_train)

# Converting text data into numerical features
train_features = vectorizer.transform(text_train)
test_features = vectorizer.transform(text_test)

In [17]:
# Hyperparameter tuning
param_grid = {'alpha': [0.4, 0.6, 0.8, 1]}

classifier = MultinomialNB()

classifier_grid = GridSearchCV(estimator = classifier, param_grid = param_grid, cv = 5, scoring = 'accuracy')
classifier_grid.fit(train_features, label_train)

classifier_grid_results = pd.DataFrame(classifier_grid.cv_results_)
classifier_grid_results.sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.004021,0.000127,0.000883,0.0001,1.0,{'alpha': 1},0.926435,0.919968,0.92118,0.912657,0.9167,0.919388,0.004599,1
0,0.004363,0.000564,0.000901,7e-05,0.4,{'alpha': 0.4},0.925627,0.92118,0.919159,0.91387,0.9167,0.919307,0.003996,2
1,0.003919,4.5e-05,0.000849,5.1e-05,0.6,{'alpha': 0.6},0.926031,0.920372,0.918351,0.913465,0.9167,0.918984,0.004187,3
2,0.0039,3.3e-05,0.000904,6.8e-05,0.8,{'alpha': 0.8},0.925627,0.919563,0.919159,0.913061,0.917105,0.918903,0.004076,4


In [18]:
# Training a Multinomial Naive Bayes classifier
classifier = MultinomialNB(alpha = 1)
classifier.fit(train_features, label_train)

# Predicting the sentiment of test data
predictions = classifier.predict(test_features)

# Evaluating the accuracy of the model
accuracy = classifier.score(test_features, label_test)

# Printing model metrics
print(classification_report(label_test, predictions))
print(f"Precision: {precision_score(label_test, predictions):.4f}")
print(f"Recall: {recall_score(label_test, predictions):.4f}")
print(f"F1: {f1_score(label_test, predictions):.4f}")
print(f"Accuracy: {accuracy:.4f}")


              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1323
           1       0.93      0.93      0.93      1769

    accuracy                           0.92      3092
   macro avg       0.92      0.92      0.92      3092
weighted avg       0.92      0.92      0.92      3092

Precision: 0.9304
Recall: 0.9293
F1: 0.9299
Accuracy: 0.9198


# CatBoost

In [14]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
data = df_train
# Load data
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data["Text"], data["Sentiment"], test_size=0.2, random_state=seed)

# Convert text into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [15]:
# Hyperparameter tuning
param_grid = {'depth': [4, 10],
              'learning_rate': [0.05, 0.1],
              'iterations': [500, 1000]}

classifier = CatBoostClassifier()

classifier_grid = GridSearchCV(estimator = classifier, param_grid = param_grid, cv = 5, scoring = 'accuracy')
classifier_grid.fit(X_train_counts, y_train, verbose=False)

classifier_grid_results = pd.DataFrame(classifier_grid.cv_results_)
classifier_grid_results.sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_iterations,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,229.558789,0.946895,0.046354,0.000706,10,1000,0.1,"{'depth': 10, 'iterations': 1000, 'learning_ra...",0.924414,0.936944,0.92401,0.928427,0.934897,0.929738,0.005318,1
6,230.0762,1.175251,0.045428,0.000625,10,1000,0.05,"{'depth': 10, 'iterations': 1000, 'learning_ra...",0.927243,0.933711,0.930477,0.923979,0.932471,0.929576,0.003551,2
5,115.006873,0.621157,0.045715,0.017247,10,500,0.1,"{'depth': 10, 'iterations': 500, 'learning_rat...",0.923201,0.933711,0.927243,0.927618,0.932875,0.92893,0.003894,3
3,20.055054,0.361829,0.040708,0.020005,4,1000,0.1,"{'depth': 4, 'iterations': 1000, 'learning_rat...",0.924414,0.932094,0.929669,0.921957,0.932875,0.928202,0.004301,4
4,115.091453,0.731781,0.03635,0.00109,10,500,0.05,"{'depth': 10, 'iterations': 500, 'learning_rat...",0.924818,0.930073,0.926839,0.919531,0.930449,0.926342,0.003994,5
2,20.186451,0.311159,0.025459,0.000261,4,1000,0.05,"{'depth': 4, 'iterations': 1000, 'learning_rat...",0.920372,0.930073,0.926839,0.919127,0.928831,0.925048,0.004465,6
1,10.157464,0.162407,0.030504,0.013754,4,500,0.1,"{'depth': 4, 'iterations': 500, 'learning_rate...",0.917947,0.927243,0.926839,0.918722,0.929236,0.923997,0.004701,7
0,10.582406,0.283809,0.030831,0.011924,4,500,0.05,"{'depth': 4, 'iterations': 500, 'learning_rate...",0.911479,0.926031,0.919159,0.91387,0.921148,0.918338,0.005191,8


In [16]:
# Train CatBoost model
model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=10, loss_function='Logloss')
model.fit(X_train_counts, y_train, verbose=False)

# Predict sentiment on testing set
y_pred = model.predict(X_test_counts)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)

# Printing model metrics
print(classification_report(y_test, y_pred))
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1: {f1_score(y_test, y_pred):.4f}")
print(f"Accuracy: {accuracy:.4f}")

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      1341
           1       0.95      0.91      0.93      1751

    accuracy                           0.92      3092
   macro avg       0.92      0.92      0.92      3092
weighted avg       0.92      0.92      0.92      3092

Precision: 0.9467
Recall: 0.9126
F1: 0.9293
Accuracy: 0.9214


# LSTM

In [14]:
pip install keras_preprocessing

Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2
[0mNote: you may need to restart the kernel to use updated packages.


In [15]:
from keras_preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
# The dataset should have two columns, one for the text and one for the label
dataset = df_train

# Preprocessing the text data
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(dataset['Text'].values)
X = tokenizer.texts_to_sequences(dataset['Text'].values)
X = pad_sequences(X)

# Splitting the dataset into training and testing sets
Y = dataset['Sentiment'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Defining the model architecture
embedding_dim = 32
lstm_out = 49
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compiling the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
epochs = 10
batch_size = 32
early_stop = EarlyStopping(monitor='val_loss', patience=2)
model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=2, validation_data=(X_test, Y_test), callbacks=[early_stop])

# Predict sentiment on testing set
Y_pred = model.predict(X_test)

# Evaluating the model on the test set
score, accuracy = model.evaluate(X_test, Y_test, verbose=0)

Epoch 1/10
387/387 - 653s - loss: 0.5242 - accuracy: 0.7983 - val_loss: 0.2166 - val_accuracy: 0.9269 - 653s/epoch - 2s/step
Epoch 2/10
387/387 - 621s - loss: 0.2424 - accuracy: 0.9169 - val_loss: 0.2834 - val_accuracy: 0.8984 - 621s/epoch - 2s/step
Epoch 3/10
387/387 - 611s - loss: 0.1844 - accuracy: 0.9375 - val_loss: 0.2095 - val_accuracy: 0.9266 - 611s/epoch - 2s/step
Epoch 4/10
387/387 - 611s - loss: 0.1487 - accuracy: 0.9510 - val_loss: 0.1621 - val_accuracy: 0.9447 - 611s/epoch - 2s/step
Epoch 5/10
387/387 - 605s - loss: 0.1314 - accuracy: 0.9567 - val_loss: 0.1615 - val_accuracy: 0.9424 - 605s/epoch - 2s/step
Epoch 6/10
387/387 - 616s - loss: 0.1139 - accuracy: 0.9641 - val_loss: 0.1640 - val_accuracy: 0.9424 - 616s/epoch - 2s/step
Epoch 7/10
387/387 - 620s - loss: 0.1086 - accuracy: 0.9639 - val_loss: 0.1707 - val_accuracy: 0.9392 - 620s/epoch - 2s/step


In [16]:
# Printing model metrics
print(classification_report(Y_test, Y_pred.round()))
print(f"Precision: {precision_score(Y_test, Y_pred.round()):.4f}")
print(f"Recall: {recall_score(Y_test, Y_pred.round()):.4f}")
print(f"F1: {f1_score(Y_test, Y_pred.round()):.4f}")
print(f"Accuracy: {accuracy:.4f}")

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1354
           1       0.95      0.95      0.95      1738

    accuracy                           0.94      3092
   macro avg       0.94      0.94      0.94      3092
weighted avg       0.94      0.94      0.94      3092

Precision: 0.9454
Recall: 0.9465
F1: 0.9459
Accuracy: 0.9392
