# NLP MINI-PROJECT SENTIMENT ANALYSIS USING A CUSTOM CNN MODEL

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import tensorflow as tf

OVERVIEW OF DATA

In [3]:
# Load the dataset
df = pd.read_csv('tripadvisor_hotel_reviews.csv')
print(df.head())

                                              Review  Rating
0  nice hotel expensive parking got good deal sta...       4
1  ok nothing special charge diamond member hilto...       2
2  nice rooms not 4* experience hotel monaco seat...       3
3  unique, great stay, wonderful time hotel monac...       5
4  great stay great stay, went seahawk game aweso...       5


DATA PREPROCESSING

In [4]:
# Preprocess the dataset
df = df[['Review', 'Rating']]
df['sentiment'] = df['Rating'].apply(lambda x: 'positive' if x > 3 else 'negative' if x < 3 else 'neutral')

In [27]:
df.shape

(20491, 2)

In [28]:
positive_count = (df['sentiment'] == 'positive').sum()
print("Total count of positive sentiment values:", positive_count)


Total count of positive sentiment values: 15093


In [29]:
negative_count  = (df['sentiment'] == 'negative').sum()
print("Total count of positive sentiment values:", negative_count)


Total count of positive sentiment values: 3214


In [30]:
neutral_count  = (df['sentiment'] == 'neutral').sum()
print("Total count of positive sentiment values:", neutral_count)


Total count of positive sentiment values: 2184


In [5]:
df = df[['Review', 'sentiment']]

In [6]:
df.head()

Unnamed: 0,Review,sentiment
0,nice hotel expensive parking got good deal sta...,positive
1,ok nothing special charge diamond member hilto...,negative
2,nice rooms not 4* experience hotel monaco seat...,neutral
3,"unique, great stay, wonderful time hotel monac...",positive
4,"great stay great stay, went seahawk game aweso...",positive


In [7]:
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

WE PERFORM TOKENIZATION ON TEXT AND CREATE A VOCABULARY BASED ON TOP 5000 MOST OCCURING WORDS. AND THEN CONVERT TEXT TO SEQUENCE OF INTEGERS WITH MAX SIZE 100.

In [9]:
# Tokenize and pad the review sequences
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['Review'])
word_index = tokenizer.word_index

In [10]:
sequences = tokenizer.texts_to_sequences(df['Review'])
padded_sequences = pad_sequences(sequences, maxlen=100, truncating='post')

In [11]:
sequences

[[256,
  1,
  719,
  32,
  113,
  351,
  86,
  254,
  19,
  136,
  256,
  1,
  55,
  1064,
  170,
  16,
  42,
  83,
  170,
  279,
  615,
  2382,
  543,
  3472,
  8,
  170,
  96,
  41,
  1,
  1,
  1,
  326,
  2591,
  152,
  1513,
  786,
  1,
  1,
  1,
  1,
  719,
  3245,
  1,
  27,
  610,
  22,
  244,
  73,
  2088,
  2660,
  680,
  824,
  1,
  170,
  2661,
  1547,
  31,
  453,
  824,
  1561,
  1676,
  38,
  295,
  1146,
  4789,
  1,
  335,
  4365,
  2592,
  89,
  125,
  1627,
  195,
  490,
  293,
  180,
  44,
  360,
  824,
  43,
  129,
  5,
  1548,
  1,
  90,
  3198,
  335,
  1538,
  2344,
  1146,
  36,
  2011,
  2311,
  4944,
  4561,
  235,
  78,
  1,
  860,
  4,
  3172,
  171,
  1,
  817,
  553,
  81,
  34,
  868,
  1,
  35,
  3881,
  1527,
  1,
  230,
  70,
  1,
  3644,
  943,
  854,
  1275,
  181,
  156,
  247,
  1811,
  860,
  373,
  860,
  114,
  16,
  31,
  13,
  190,
  31,
  33,
  3716,
  67,
  33,
  23,
  56,
  177,
  1711,
  1,
  4,
  1826,
  171,
  207,
  1,
  1843,
  428,
  

WE PERFORM ONE-HOT ENCODING ON TARGET DATASET WHICH BASICALLY ASSIGNS INTEGER VALUE TO +VE,-VE AND NEUTRAL

In [12]:
# Convert the sentiment labels to one-hot encoding
sentiment_labels = pd.get_dummies(df['sentiment']).values

SPLIT OUR DATASET INTO TRAINING AND TESTING

In [13]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [14]:
# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, sentiment_labels, test_size=0.2)

In [15]:
# Build the model
model = Sequential()
model.add(Embedding(5000, 100, input_length=100))
model.add(Conv1D(64, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          500000    
                                                                 
 conv1d (Conv1D)             (None, 96, 64)            32064     
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 99        
                                                        

In [17]:
# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2909cdf9480>

CHECKING THE ACCURACY SCORE

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
# Evaluate the model
y_pred = np.argmax(model.predict(x_test), axis=-1)
print("Accuracy:", accuracy_score(np.argmax(y_test, axis=-1), y_pred))

Accuracy: 0.8365454989021712


In [20]:
import pickle

In [21]:
# Save the trained model
model.save('sentiment_analysis_model.h5')
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# Load the saved model and tokenizer
import keras

model = keras.models.load_model('sentiment_analysis_model.h5')
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [23]:
def predict_sentiment(text):
    # Tokenize and pad the input text
    text_sequence = tokenizer.texts_to_sequences([text])
    text_sequence = pad_sequences(text_sequence, maxlen=100)
    
    predicted_rating = model.predict(text_sequence)
    if np.argmax(predicted_rating) == 0:
        return 'Negative'
    elif np.argmax(predicted_rating) == 1:
        return 'Neutral'
    else:
        return 'Positive'


In [24]:
# Example usage
text_input = "I absolutely loved my stay at that hotel. The staff was amazing and the room was fantastic!"
predicted_sentiment = predict_sentiment(text_input)
print(predicted_sentiment)

Positive


In [25]:
# Example usage
text_input = "I hate that product. Will not buy it again"
predicted_sentiment = predict_sentiment(text_input)
print(predicted_sentiment)

Negative


In [26]:
# Example usage
text_input = "Overall, it was an average experience. "
predicted_sentiment = predict_sentiment(text_input)
print(predicted_sentiment)

Neutral
