In [1]:
import tensorflow as tf

In [2]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
from tensorflow import keras
from keras import Sequential
from keras import Input
from keras.layers import TextVectorization, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from keras.models import load_model
from sklearn.model_selection import train_test_split

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcvar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jcvar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jcvar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

In [6]:
df = pd.read_csv('IMDB_Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

x = df['Processed_Reviews']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

  soup = BeautifulSoup(text, "html.parser")


In [7]:
vector_layer = TextVectorization(standardize='lower_and_strip_punctuation',
                                 max_tokens=10000,
                                 output_mode='int',
                                 output_sequence_length=30)
vector_layer.adapt(np.array(X_train))

# Save the TextVectorization layer's vocabulary
vocab = vector_layer.get_vocabulary()
with open('vocab.txt', 'w') as f:
    for item in vocab:
        f.write("%s\n" % item)

In [8]:
model = Sequential()
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vector_layer)
model.add(Embedding(10001, 16))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [9]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(np.array(X_train), np.array(y_train), validation_data=(np.array(X_test), np.array(y_test)), epochs=20)

model.save('sentiment')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




INFO:tensorflow:Assets written to: sentiment\assets


INFO:tensorflow:Assets written to: sentiment\assets


In [10]:
_, accuracy = model.evaluate(np.array(X_test), np.array(y_test))
print(accuracy)

0.7441999912261963


In [11]:
predictions =model.predict(["The movie was generally bad, the plot was boring and the characters badly interpreted"])
prediction = float(predictions[0][0])
class_name = 'negative' if prediction < 0.5 else 'positive'
if class_name == 'positive':
    confidence = (prediction - 0.5) / 0.5
else:
    confidence = (0.5 - prediction) / 0.5
print({'sentiment': class_name, 'confidence': confidence})

{'sentiment': 'negative', 'confidence': 0.9367949888110161}


In [13]:
from kerastuner.tuners import RandomSearch
from keras.layers import Bidirectional, LSTM

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df = pd.read_csv('IMDB_Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

x = df['Processed_Reviews']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcvar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jcvar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jcvar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


In [14]:
vector_layer = TextVectorization(standardize='lower_and_strip_punctuation',
                                 max_tokens=10000,
                                 output_mode='int',
                                 output_sequence_length=100)
vector_layer.adapt(np.array(X_train))

vocab = vector_layer.get_vocabulary()
with open('vocab.txt', 'w') as f:
    for item in vocab:
        f.write("%s\n" % item)

In [15]:
def build_model(hp):
    model = Sequential()
    model.add(Input(shape=(1,), dtype=tf.string))
    model.add(vector_layer)
    model.add(Embedding(10000, hp.Int('embedding_dim', min_value=32, max_value=512, step=32)))
    model.add(Bidirectional(LSTM(hp.Int('LSTM_units', min_value=32, max_value=512, step=32))))
    model.add(Dropout(hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(25, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [16]:
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=3,
    directory='model_dir',
    project_name='sentiment_analysis'
)

tuner.search(np.array(X_train), np.array(y_train), epochs=3, validation_data=(np.array(X_test), np.array(y_test)))

best_model = tuner.get_best_models(num_models=1)[0]

Trial 3 Complete [00h 05m 47s]
val_accuracy: 0.8637333313624064

Best val_accuracy So Far: 0.8673333326975504
Total elapsed time: 00h 18m 09s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


In [17]:
best_model.fit(np.array(X_train), np.array(y_train), epochs=20, validation_data=(np.array(X_test), np.array(y_test)))

# Save the model
best_model.save('sentiment')

_, accuracy = best_model.evaluate(np.array(X_test), np.array(y_test))
print(accuracy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




INFO:tensorflow:Assets written to: sentiment\assets


INFO:tensorflow:Assets written to: sentiment\assets


0.8395000100135803


In [18]:
predictions =best_model.predict(["Quite horrible"])
prediction = float(predictions[0][0])
class_name = 'negative' if prediction < 0.5 else 'positive'
if class_name == 'positive':
    confidence = (prediction - 0.5) / 0.5
else:
    confidence = (0.5 - prediction) / 0.5
print({'sentiment': class_name, 'confidence': confidence})

{'sentiment': 'negative', 'confidence': 0.9999999815164298}


In [19]:
predictions =best_model.predict(["The movie was generally bad, the plot was boring and the characters badly interpreted"])
prediction = float(predictions[0][0])
class_name = 'negative' if prediction < 0.5 else 'positive'
if class_name == 'positive':
    confidence = (prediction - 0.5) / 0.5
else:
    confidence = (0.5 - prediction) / 0.5
print({'sentiment': class_name, 'confidence': confidence})

{'sentiment': 'negative', 'confidence': 0.9999993320166709}


In [None]:
import pickle

# save the model to disk
filename = 'LSTM.pkl'
pickle.dump(best_model, open(filename, 'wb'))


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7a5b1e8f-5fb8-49ad-8f36-77068147d699' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>