# Preparation

In [76]:
# Import necessary libraries
import pandas as pd
import kagglehub
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import load_model
from keras.optimizers import Adam, RMSprop
import numpy as np
import keras_tuner as kt

In [35]:
# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print("Path to dataset files:", path)
data = pd.read_csv("/Users/suzukikenta/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1/IMDB Dataset.csv")
data

Path to dataset files: /Users/suzukikenta/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [36]:
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})
max_vocab_size = 10000  # Top 10,000 most common words
max_sequence_length = 200  # Cut or pad all reviews to 200 words
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(data['review'])  # Fit on all reviews (train + test)
# Convert training and test sets into sequences

# RNN_model

In [74]:
def build_model(hp):
    model = Sequential()

    # Tune embedding dimension
    embedding_dim = hp.Choice('embedding_dim', values=[50, 100, 150])
    model.add(Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))

    # Tune number of LSTM units
    lstm_units = hp.Int('lstm_units', min_value=32, max_value=128, step=32)
    model.add(LSTM(lstm_units))

    # Tune dropout rate
    dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
    model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation='sigmoid'))

    # Tune optimizer choice
    optimizer = hp.Choice('optimizer', values=['adam', 'rmsprop'])

    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model


In [77]:
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,             # Number of models to try
    executions_per_trial=1,    # How many times to train each model (average performance)
    directory='tuner_dir',
    project_name='imdb_sentiment'
)

# Run the search
tuner.search(
    x_train, train_labels,
    epochs=5,
    validation_data=(x_test, test_labels)
)


Trial 10 Complete [00h 02m 06s]
val_accuracy: 0.8593000173568726

Best val_accuracy So Far: 0.8989999890327454
Total elapsed time: 00h 43m 33s


In [78]:
model = tuner.get_best_models(num_models=1)[0]
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best hyperparameters:")
print(f"Embedding dim: {best_hp.get('embedding_dim')}")
print(f"LSTM units: {best_hp.get('lstm_units')}")
print(f"Dropout rate: {best_hp.get('dropout_rate')}")
print(f"Optimizer: {best_hp.get('optimizer')}")


Best hyperparameters:
Embedding dim: 150
LSTM units: 96
Dropout rate: 0.1
Optimizer: adam


  saveable.load_own_variables(weights_store.get(inner_path))


# Try the model 

In [92]:
# Save the model
model.save("rnn_model.h5")
model = load_model("rnn_model.h5")



In [90]:
new_review = ["It was the worst movie I have ever seen and I really hated it more than any movies"]

# Convert text to sequence of integers
sequence = tokenizer.texts_to_sequences(new_review)

# Pad the sequence to match training input length (e.g., 200)
padded = pad_sequences(sequence, maxlen=200)

# Predict
prediction = model.predict(padded)

# Interpret result
sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
print(f"Predicted sentiment: {sentiment} ({prediction[0][0]:.4f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predicted sentiment: negative (0.0466)
