In [1]:
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.keras import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import TextVectorization, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df = pd.read_csv('IMDB Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

x = df['Processed_Reviews']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

from_scratch = True 

vector_layer = TextVectorization(standardize='lower_and_strip_punctuation',
                                 max_tokens=10000,
                                 output_mode='int',
                                 output_sequence_length=30)
vector_layer.adapt(np.array(X_train))

# Save the TextVectorization layer's vocabulary
vocab = vector_layer.get_vocabulary()
with open('vocab.txt', 'w') as f:
    for item in vocab:
        f.write("%s\n" % item)

if from_scratch:
    model = Sequential()
    model.add(Input(shape=(1,), dtype=tensorflow.string))
    model.add(vector_layer)
    model.add(Embedding(10001, 16))
    model.add(Dropout(0.2))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
else:
    model = load_model('sentiment')

    # Load the vocabulary
    with open('vocab.txt') as f:
        vocab = [line.rstrip() for line in f]

    # Create a new TextVectorization layer
    new_vector_layer = TextVectorization(standardize='lower_and_strip_punctuation',
                                         max_tokens=10000,
                                         output_mode='int',
                                         output_sequence_length=30)
    new_vector_layer.set_vocabulary(vocab)

    # Replace the old vectorization layer with the new one
    model.layers[1] = new_vector_layer

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(np.array(X_train), np.array(y_train), validation_data=(np.array(X_test), np.array(y_test)), epochs=20)

model.save('sentiment')

_, accuracy = model.evaluate(np.array(X_test), np.array(y_test))
print(accuracy)


2023-06-05 01:39:43.213599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-05 01:39:43.368640: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-06-05 01:39:43.368671: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-06-05 01:39:43.404767: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-05 01:39:44.407599: W tensorflow/stream_executor/pla

In [2]:
predictions =model.predict(["The movie was generally bad, the plot was boring and the characters badly interpreted"])
prediction = float(predictions[0][0])
class_name = 'negative' if prediction < 0.5 else 'positive'
if class_name == 'positive':
    confidence = (prediction - 0.5) / 0.5
else:
    confidence = (0.5 - prediction) / 0.5
print({'sentiment': class_name, 'confidence': confidence})

{'sentiment': 'negative', 'confidence': 0.8420833647251129}


In [3]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 KB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [None]:
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
import tensorflow
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras import Input
from sklearn.model_selection import train_test_split
from kerastuner.tuners import RandomSearch

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df = pd.read_csv('IMDB Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

x = df['Processed_Reviews']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

vector_layer = TextVectorization(standardize='lower_and_strip_punctuation',
                                 max_tokens=10000,
                                 output_mode='int',
                                 output_sequence_length=100)
vector_layer.adapt(np.array(X_train))

vocab = vector_layer.get_vocabulary()
with open('vocab.txt', 'w') as f:
    for item in vocab:
        f.write("%s\n" % item)

from_scratch = True 

if from_scratch:
    def build_model(hp):
        model = Sequential()
        model.add(Input(shape=(1,), dtype=tensorflow.string))
        model.add(vector_layer)
        model.add(Embedding(10000, hp.Int('embedding_dim', min_value=32, max_value=512, step=32)))
        model.add(Bidirectional(LSTM(hp.Int('LSTM_units', min_value=32, max_value=512, step=32))))
        model.add(Dropout(hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)))
        model.add(Dense(1, activation='sigmoid'))
        
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        return model

    tuner = RandomSearch(
        build_model,
        objective='val_accuracy',
        max_trials=3,
        executions_per_trial=3,
        directory='model_dir',
        project_name='sentiment_analysis'
    )

    tuner.search(np.array(X_train), np.array(y_train), epochs=3, validation_data=(np.array(X_test), np.array(y_test)))

    best_model = tuner.get_best_models(num_models=1)[0]
else:
    best_model = load_model('sentiment')

    with open('vocab.txt') as f:
        vocab = [line.rstrip() for line in f]

    new_vector_layer = TextVectorization(standardize='lower_and_strip_punctuation',
                                         max_tokens=10000,
                                         output_mode='int',
                                         output_sequence_length=100)
    new_vector_layer.set_vocabulary(vocab)

    best_model.layers[1] = new_vector_layer

best_model.fit(np.array(X_train), np.array(y_train), epochs=10, validation_data=(np.array(X_test), np.array(y_test)))

# Save the model
best_model.save('sentiment')

_, accuracy = best_model.evaluate(np.array(X_test), np.array(y_test))
print(accuracy)


Trial 3 Complete [02h 24m 30s]
val_accuracy: 0.8592666784922282

Best val_accuracy So Far: 0.8646666606267294
Total elapsed time: 06h 22m 46s
INFO:tensorflow:Oracle triggered exit
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
predictions =best_model.predict(["Quite horrible"])
prediction = float(predictions[0][0])
class_name = 'negative' if prediction < 0.5 else 'positive'
if class_name == 'positive':
    confidence = (prediction - 0.5) / 0.5
else:
    confidence = (0.5 - prediction) / 0.5
print({'sentiment': class_name, 'confidence': confidence})

In [None]:
predictions =best_model.predict(["The movie was generally bad, the plot was boring and the characters badly interpreted"])
prediction = float(predictions[0][0])
class_name = 'negative' if prediction < 0.5 else 'positive'
if class_name == 'positive':
    confidence = (prediction - 0.5) / 0.5
else:
    confidence = (0.5 - prediction) / 0.5
print({'sentiment': class_name, 'confidence': confidence})

In [None]:
model.save()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7a5b1e8f-5fb8-49ad-8f36-77068147d699' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>