In [None]:

# nltk is one of the most useful libraries when it comes to nlp
!pip install nltk

In [None]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from wordcloud import WordCloud

# Preprocessing and evaluation
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1, l2

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Read the review CSV into a DataFrame and preview its first rows.
reviews_csv = 'tripadvisor_hotel_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Bar chart of how many rows carry each 'Rating' value.
rating_ax = sns.countplot(data=df, x='Rating', palette='flare')
rating_ax.set_title('Rating Distribution Across Dataset')

In [None]:
# 'Length' is len() of each 'Review' value, i.e. a character count
# (not a word count).
df['Length'] = df['Review'].map(len)
df.head()

In [None]:
# Filled kernel-density curves of review length, one per rating value.
sns.displot(
    data=df,
    x='Length',
    hue='Rating',
    kind='kde',
    fill=True,
    palette='flare',
    aspect=4,
)

In [None]:
# One histogram of 'Length' per rating value, one facet column each.
hist_style = {'color': '#973aa8'}
g = sns.FacetGrid(col='Rating', data=df)
g.map(plt.hist, 'Length', **hist_style)

In [None]:
# Every review as a semi-transparent point: rating on x, length on y.
sns.stripplot(
    data=df,
    x='Rating',
    y='Length',
    palette='flare',
    alpha=0.3,
)

In [None]:
# Collapse the numeric rating into three coarse sentiment labels.
def rating(score):
    """Map a numeric rating to a sentiment label.

    Returns 'Good' when ``score`` is greater than 3, 'Netral' when it is
    exactly 3, and 'Bad' for every other value.
    """
    if score > 3:
        return 'Good'
    return 'Netral' if score == 3 else 'Bad'

In [None]:
# Replace every numeric rating with the label returned by rating().
df['Rating'] = df['Rating'].map(rating)

In [None]:
# Preview the first rows now that 'Rating' holds the text labels.
df.head()

In [None]:
# Sum of the 'Length' column (per-review character counts) before cleaning;
# compared against the post-cleaning total later on.
length = df['Length'].sum()

In [None]:
import nltk
nltk.download('wordnet')  # corpus used by WordNetLemmatizer

# Compare stemming and lemmatization on the first review.
sample_words = df['Review'][0].split()

print('Original:')
print(df['Review'][0])
print()

# Each normaliser is built once and reused for every word (previously a new
# instance was constructed on every loop iteration).
stemmer = SnowballStemmer('english')
sentence = [stemmer.stem(word) for word in sample_words]
print('Stemming:')
print(' '.join(sentence))
print()

lemmatizer = WordNetLemmatizer()
# 'v' is passed as the part of speech, so each token is lemmatized as a verb.
sentence = [lemmatizer.lemmatize(word, 'v') for word in sample_words]
print('Lemmatization:')
print(' '.join(sentence))

In [None]:
def cleaning(text):
    """Normalise a review string before vectorisation.

    Steps, in order:
      1. remove every character in ``string.punctuation`` and lowercase;
      2. split on whitespace and drop NLTK English stopwords;
      3. lemmatize each remaining token with part of speech ``'v'``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces (an empty string when
        no token survives).
    """
    # Remove punctuation and uppercase.
    clean_text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    # Load the stopword list ONCE per call and keep it in a set. The previous
    # comprehension called stopwords.words('english') for every single token,
    # re-reading the list and scanning it linearly each time.
    # NOTE(review): punctuation is stripped before this filter, so any
    # stopword entry that itself contains punctuation can no longer match.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in clean_text.split() if word not in stop_words]

    # One lemmatizer for the whole text instead of one per token.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word, 'v') for word in tokens)

In [None]:
import nltk

# cleaning() filters with the NLTK stopword corpus, so fetch it first.
nltk.download('stopwords')

# Overwrite each review with its cleaned form.
df['Review'] = df['Review'].map(cleaning)

In [None]:
# Recompute the per-review character counts on the cleaned text and compare
# the total with the one recorded before cleaning.
df['Length'] = df['Review'].map(len)
new_length = df['Length'].sum()

before_msg = 'Total text length before cleaning: {}'.format(length)
after_msg = 'Total text length after cleaning: {}'.format(new_length)
print(before_msg)
print(after_msg)

In [None]:
# Save the cleaned DataFrame, leaving the row index out of the file.
cleaned_csv = 'cleaned_df.csv'
df.to_csv(cleaned_csv, index=False)

In [None]:
# Word cloud of the cleaned reviews, all joined into one string.
all_reviews = ' '.join(df['Review'])

plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=1000,
    min_font_size=10,
    height=800,
    width=1600,
    background_color="white",
    colormap='flare',
)
wc.generate(all_reviews)

plt.imshow(wc)

In [None]:
# Hold out 20% of the reviews for testing.
# random_state pins the shuffle so Restart & Run All reproduces the same
# partitions; stratify keeps the label proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df['Review'],
    df['Rating'],
    test_size=0.2,
    random_state=42,
    stratify=df['Rating'],
)

In [None]:
# Learn the TF-IDF vocabulary and weights from the training reviews only and
# express both partitions in that feature space.
tfid = TfidfVectorizer()
# The tuple is evaluated left to right, so the vectorizer is fitted before
# the test reviews are transformed.
train_tfid_matrix, test_tfid_matrix = (
    tfid.fit_transform(X_train),
    tfid.transform(X_test),
)

In [None]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed once the dump finishes.
with open('tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfid, tfidf_file)

In [None]:
# Candidate classifiers: library defaults everywhere except the raised
# iteration cap on logistic regression.
models = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    LogisticRegression(max_iter=1000),
    KNeighborsClassifier(),
    BernoulliNB(),
]

In [None]:
# 5-fold stratified cross-validation of a random forest.
# `train_tfid_matrix` (TF-IDF features of X_train, computed in an earlier
# cell) is reused as-is; fitting another vectorizer on the same X_train here
# would only repeat that work. All names used are imported at the top of the
# notebook.
# NOTE(review): the vectorizer was fitted on all of X_train before the folds
# are cut, so each validation fold has contributed to the vocabulary and IDF
# weights it is scored with.
rf_classifier = RandomForestClassifier(n_jobs=-1, random_state=42)  # seeded for repeatable scores

cross_val = cross_val_score(
    rf_classifier,
    train_tfid_matrix,
    y_train,
    scoring='accuracy',
    cv=StratifiedKFold(5),
).mean()

print(f"Random Forest Classifier Accuracy: {cross_val:.2f}")


In [None]:
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image

# 5-fold stratified cross-validation of a decision tree.
# `train_tfid_matrix` (TF-IDF features of X_train, computed in an earlier
# cell) is reused instead of fitting another vectorizer on the same data.
dt_classifier = DecisionTreeClassifier(random_state=42)  # seeded for repeatable scores

cross_val = cross_val_score(
    dt_classifier,
    train_tfid_matrix,
    y_train,
    scoring='accuracy',
    cv=StratifiedKFold(5),
).mean()

print(f"Decision Tree Classifier Accuracy: {cross_val:.2f}")

# Fit the tree on the whole training partition (not the full dataset) so it
# can be drawn.
dt_classifier.fit(train_tfid_matrix, y_train)

# Export the fitted tree as Graphviz DOT source. Feature names come from
# `tfid`, the vectorizer fitted on X_train.
dot_data = export_graphviz(
    dt_classifier,
    out_file=None,
    feature_names=tfid.get_feature_names_out(),
    class_names=dt_classifier.classes_,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = pydotplus.graph_from_dot_data(dot_data)

# Render the PNG once and reuse the bytes for both the file on disk and the
# inline display (requires Graphviz to be installed).
png_bytes = graph.create_png()
with open('decision_tree.png', 'wb') as png_file:
    png_file.write(png_bytes)

Image(png_bytes)


In [None]:
# 5-fold stratified cross-validation of a support vector classifier.
# `train_tfid_matrix` (TF-IDF features of X_train, computed in an earlier
# cell) is reused instead of fitting another vectorizer on the same data.
# All names used are imported at the top of the notebook.
svc_classifier = SVC()

cross_val = cross_val_score(
    svc_classifier,
    train_tfid_matrix,
    y_train,
    scoring='accuracy',
    cv=StratifiedKFold(5),
).mean()

print(f"SVC Classifier Accuracy: {cross_val:.2f}")


In [None]:
# 5-fold stratified cross-validation of logistic regression.
# `train_tfid_matrix` (TF-IDF features of X_train, computed in an earlier
# cell) is reused instead of fitting another vectorizer on the same data.
# All names used are imported at the top of the notebook.
logreg_classifier = LogisticRegression(max_iter=1000)

cross_val = cross_val_score(
    logreg_classifier,
    train_tfid_matrix,
    y_train,
    scoring='accuracy',
    cv=StratifiedKFold(5),
).mean()

print(f"Logistic Regression Classifier Accuracy: {cross_val:.2f}")


In [None]:
# 5-fold stratified cross-validation of a k-nearest-neighbours classifier.
# `train_tfid_matrix` (TF-IDF features of X_train, computed in an earlier
# cell) is reused instead of fitting another vectorizer on the same data.
# All names used are imported at the top of the notebook.
knn_classifier = KNeighborsClassifier()

cross_val = cross_val_score(
    knn_classifier,
    train_tfid_matrix,
    y_train,
    scoring='accuracy',
    cv=StratifiedKFold(5),
).mean()

print(f"KNN Classifier Accuracy: {cross_val:.2f}")


In [None]:
# 5-fold stratified cross-validation of Bernoulli naive Bayes.
# `train_tfid_matrix` (TF-IDF features of X_train, computed in an earlier
# cell) is reused instead of fitting another vectorizer on the same data.
# All names used are imported at the top of the notebook.
bernoulli_nb_classifier = BernoulliNB()

cross_val = cross_val_score(
    bernoulli_nb_classifier,
    train_tfid_matrix,
    y_train,
    scoring='accuracy',
    cv=StratifiedKFold(5),
).mean()

print(f"Bernoulli Naive Bayes Classifier Accuracy: {cross_val:.2f}")


In [None]:
# Final model: logistic regression trained on the whole training partition.
# The training features are derived from `tfid` right here, rather than from
# whichever cell last assigned `train_tfid_matrix`, so they are guaranteed to
# share a vocabulary with `test_tfid_matrix` and with the vectorizer that is
# pickled to 'tfidf.pkl'.
log = LogisticRegression(max_iter=1000)
log.fit(tfid.transform(X_train), y_train)

# Labels predicted for the held-out reviews.
pred = log.predict(test_tfid_matrix)

In [None]:
# Persist the trained logistic-regression model. The context manager
# guarantees the file is flushed and closed once the dump finishes.
with open('ml_model.pkl', 'wb') as model_file:
    pickle.dump(log, model_file)

In [None]:
# Reload the persisted model and vectorizer from disk.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook itself wrote.
with open('ml_model.pkl', 'rb') as model_file:
    ml = pickle.load(model_file)
with open('tfidf.pkl', 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)


def ml_predict(text):
    """Classify one raw review with the reloaded model.

    Parameters
    ----------
    text : str
        Raw review text; it is passed through ``cleaning`` first.

    Returns
    -------
    tuple
        ``(label, probability)``: the class with the highest predicted
        probability and that probability.

    NOTE(review): a later cell defines another ``ml_predict`` that returns
    only the label; once that cell runs, this version is shadowed.
    """
    clean_text = cleaning(text)
    tfid_matrix = tfidf.transform([clean_text])
    # predict_proba returns one row per input; take the single row explicitly
    # instead of running argmax over the flattened 2-D array.
    pred_proba = ml.predict_proba(tfid_matrix)[0]
    idx = np.argmax(pred_proba)

    return ml.classes_[idx], pred_proba[idx]


ml_predict('poor room service')

In [None]:
# Score the logistic-regression predictions against the held-out labels.
lr_confusion = confusion_matrix(y_test, pred)
lr_report = classification_report(y_test, pred)
print(lr_confusion)
print(lr_report)

In [None]:
# Integer-encode the reviews for the neural network. The vocabulary is learnt
# from the training reviews only; '<OOV>' is the configured out-of-vocabulary
# token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=50000)
tokenizer.fit_on_texts(X_train)

total_word = len(tokenizer.word_index)
print('Total distinct words: {}'.format(total_word))

# No maxlen is given, so each partition is padded to its own longest sequence.
train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)
train_padded = pad_sequences(train_seq)
test_padded = pad_sequences(test_seq)

# Binarise the labels: the encoder is fitted on the training labels and then
# applied to both partitions.
lb = LabelBinarizer()
lb.fit(y_train)
train_labels = lb.transform(y_train)
test_labels = lb.transform(y_test)

In [None]:
# Persist the fitted tokenizer and label binarizer. Each context manager
# guarantees its file is flushed and closed once the dump finishes.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
with open('label.pkl', 'wb') as label_file:
    pickle.dump(lb, label_file)

In [None]:
# Tokenizer word indices start at 1, and 0 is the value pad_sequences fills
# with, so the largest id fed to the network can equal `total_word`. The
# embedding table therefore needs total_word + 1 rows; with only `total_word`
# rows that top id would be out of range.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(total_word + 1, 8),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(8, kernel_regularizer=l2(0.001),
                          bias_regularizer=l2(0.001), activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Three softmax outputs, one per label column.
    tf.keras.layers.Dense(3, activation='softmax'),
])

model.summary()

In [None]:
# Train for three epochs with Adam; the test partition is passed as
# validation data and scored after every epoch.
adam = tf.optimizers.Adam(learning_rate=0.0001)
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    train_padded,
    train_labels,
    epochs=3,
    validation_data=(test_padded, test_labels),
)

In [None]:
# Per-epoch training history as a DataFrame, plotted as two line charts:
# accuracy curves first, then loss curves.
metrics = pd.DataFrame(model.history.history)
accuracy_curves = metrics[['accuracy', 'val_accuracy']]
loss_curves = metrics[['loss', 'val_loss']]
accuracy_curves.plot()
loss_curves.plot()


In [None]:
# Network outputs for the padded test sequences (one softmax row per review).
pred2 = model.predict(test_padded)

In [None]:
# Reduce each binarised label row and each prediction row to the index of
# its largest entry.
true_labels = test_labels.argmax(axis=-1)
pred_labels = pred2.argmax(axis=-1)

In [None]:
# Evaluate the network on the test partition.
# true_labels / pred_labels are column indices into the binarised labels, and
# column i corresponds to lb.classes_[i]; passing the class names makes the
# report readable and consistent with the logistic-regression report.
# `labels` is given explicitly so the row order is fixed even if a class
# never appears in the predictions.
class_ids = np.arange(len(lb.classes_))
print(confusion_matrix(true_labels, pred_labels, labels=class_ids))
print(classification_report(true_labels, pred_labels,
                            labels=class_ids, target_names=lb.classes_))

In [None]:
# Logistic Regression
def ml_predict(text):
    """Return the label the in-memory model ``log`` predicts for a raw review.

    The text is passed through ``cleaning`` and vectorised with ``tfid``
    before prediction.
    """
    features = tfid.transform([cleaning(text)])
    return log.predict(features)[0]

# Deep Neural Network
def dl_predict(text):
    """Return the label the neural network predicts for a raw review.

    Parameters
    ----------
    text : str
        Raw review text; it is passed through ``cleaning`` first.

    Returns
    -------
    The class label recovered from the network output via
    ``lb.inverse_transform``.
    """
    clean_text = cleaning(text)
    seq = tokenizer.texts_to_sequences([clean_text])
    # Pad to the same length as the training sequences. The network was
    # trained on inputs padded to train_padded.shape[1] and has no masking,
    # so a short, unpadded input would not look like its training data.
    # This also prevents a zero-length input when cleaning removes every
    # token. Sequences longer than that length are truncated by pad_sequences.
    padded = pad_sequences(seq, maxlen=train_padded.shape[1])

    pred = model.predict(padded)
    # Get the label name back
    result = lb.inverse_transform(pred)[0]

    return result

In [None]:
# Run both models on a sample review and print each prediction.
text = 'Such a comfy place to stay with the loved one'

for template, predictor in (
    ('Prediction using Logistic Regression: {}', ml_predict),
    ('Prediction using DNN: {}', dl_predict),
):
    print(template.format(predictor(text)))

In [None]:
# Run both models on a second sample review and print each prediction.
text2 = 'Awful room services and slow wifi connection'

for template, predictor in (
    ('Prediction using Logistic Regression: {}', ml_predict),
    ('Prediction using DNN: {}', dl_predict),
):
    print(template.format(predictor(text2)))

In [None]:
# Run both models on a third sample review and print each prediction.
text3 = 'Hard to get here but the scenery is wonderful'

for template, predictor in (
    ('Prediction using Logistic Regression: {}', ml_predict),
    ('Prediction using DNN: {}', dl_predict),
):
    print(template.format(predictor(text3)))

In [None]:
# Run both models on a very short sample review.
text4 = 'waste service'

# Predict on text4 itself; the cell previously passed text2, so this sample
# was never actually classified.
print('Prediction using Logistic Regression: {}'.format(ml_predict(text4)))
print('Prediction using DNN: {}'.format(dl_predict(text4)))