# Imports

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore

from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout # type: ignore

In [None]:
# File locations used by the cells below, listed in pipeline order:
# input dataset first, then the two artifacts written after training.
DATASET_PATH = "tripadvisor_hotel_reviews.csv"  # CSV loaded with pd.read_csv
TOKENIZER_PATH = "tokenizer.pickle"  # pickled, fitted Tokenizer
MODEL_PATH = "sentiment_analysis_model--doubled.keras"  # saved Keras model

# Preprocessing

In [None]:
# Load the raw reviews; only the 'Review' and 'Rating' columns are used.
df = pd.read_csv(DATASET_PATH)

# Take an explicit copy of the column subset so that adding the 'sentiment'
# column below writes to an independent frame (assigning onto a selection
# can otherwise trigger pandas' SettingWithCopyWarning).
df = df[['Review', 'Rating']].copy()

# Bucket the rating into three classes:
# above 3 -> 'positive', below 3 -> 'negative', anything else -> 'neutral'.
df['sentiment'] = df['Rating'].apply(lambda x: 'positive' if x > 3
                                     else 'negative' if x < 3
                                     else 'neutral')
df = df[['Review', 'sentiment']]

# DataFrame.sample returns a new frame rather than shuffling in place, so the
# result has to be assigned back; without the assignment the shuffle is lost.
df = df.sample(frac=1).reset_index(drop=True)

# Notebook display: first three and last two rows as a quick sanity check.
df.iloc[[0, 1, 2, -2, -1]]

In [None]:
# Fit the vocabulary on the review text. num_words=5000 caps the vocabulary
# used when converting text to ids; other words map to the '<OOV>' token.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['Review'])

# Reviews -> integer id sequences, padded or cut to a fixed length of 100.
# truncating='post' drops ids from the end of reviews longer than 100.
sequences = tokenizer.texts_to_sequences(df['Review'])
padded_sequences = pad_sequences(sequences, maxlen=100, truncating='post')

# One-hot encode the sentiment labels. The dtype is requested explicitly
# because the default returned by get_dummies differs between pandas versions
# (bool on pandas >= 2.0); float32 gives the loss numeric targets either way.
# Columns come out in sorted label order (negative, neutral, positive), which
# is the index order predict_sentiment maps back to strings.
sentiment_labels = pd.get_dummies(df['sentiment'], dtype='float32').values

# Data Split, Neural Network

In [None]:
# Hold out 20% of the samples; the remaining 80% are used for fitting.
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences,
    sentiment_labels,
    test_size=0.2,
)

# 1-D convolutional text classifier: embedding -> conv -> global max pool ->
# small dense head with dropout -> 3-way softmax (one unit per sentiment).
model = Sequential([
    Embedding(5000, 100),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Training

In [None]:
# Train for 10 epochs in mini-batches of 32. The held-out split is passed as
# validation data, so per-epoch validation metrics come from x_test / y_test.
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

# Evaluation

In [None]:
# Collapse the per-class scores and the one-hot targets to class indices,
# then report plain accuracy on the held-out split.
predicted_scores = model.predict(x_test)
y_pred = np.argmax(predicted_scores, axis=1)
y_true = np.argmax(y_test, axis=-1)
print("Accuracy: ", accuracy_score(y_true, y_pred))

# Saving the Model

In [None]:
import pickle

# Persist the two artifacts needed for inference: the trained network and
# the fitted tokenizer (serialized with the highest pickle protocol).
model.save(MODEL_PATH)
tokenizer_bytes = pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL)
with open(TOKENIZER_PATH, 'wb') as tokenizer_file:
    tokenizer_file.write(tokenizer_bytes)

# Prediction

In [None]:
# Imported here as well so this cell can be run in a fresh session without
# first executing the saving cell, which is the only other place that
# imports pickle.
import pickle

import keras

# Reload both inference artifacts from disk, replacing any in-memory objects
# left over from training.
model = keras.models.load_model(MODEL_PATH)

# NOTE: unpickling can execute arbitrary code. Only load a tokenizer file
# that was written by this notebook or comes from a trusted source.
with open(TOKENIZER_PATH, 'rb') as handle:
    tokenizer = pickle.load(handle)

def predict_sentiment(review):
    """Classify a single review string as 'negative', 'neutral' or 'positive'.

    The review is tokenized with the module-level ``tokenizer``, padded or
    truncated to 100 ids (same settings as the training cell), and scored
    with the module-level ``model``. The label of the highest-scoring class
    is returned.

    Args:
        review: Raw review text.

    Returns:
        'negative' for class index 0, 'neutral' for class index 1, and
        'positive' for any other index.
    """
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([review]),
        maxlen=100,
        truncating='post',
    )

    # predict() returns one row of class scores per input; only one was sent.
    class_scores = model.predict(encoded)[0]
    best_class = int(np.argmax(class_scores))

    # Index order is assumed to match the one-hot columns used for training.
    return {0: 'negative', 1: 'neutral'}.get(best_class, 'positive')

In [None]:
# Smoke-test the reloaded model on a few hand-written reviews, printing one
# predicted label per line in the order listed.
sample_reviews = [
    "I love this hotel!",
    "I hate this hotel!",
    "I don't know how I feel about this hotel!",
    "I have no strong feelings about this hotel!",
    "Idrk man, it's mid ig",
    "meh i just don't care",
]
for sample_review in sample_reviews:
    print(predict_sentiment(sample_review))