In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SimpleRNN
import matplotlib.pyplot as plt
import keras_tuner as kt
from keras_tuner import RandomSearch
from tensorflow import keras

In [2]:
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

def load_data(data_folder):
    reviews = []
    labels = []
    
    for label in ['pos', 'neg']:
        folder_path = os.path.join(data_folder, label)
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                review = file.read()
                reviews.append(review)
                labels.append(1 if label == 'pos' else 0)
    
    return reviews, labels



In [3]:
# Load and preprocess data
data_folder_train = 'D:\\jaysh\\FALL2023\\DLRL\\HW3\\dataset\\train'  # Replace with the actual path to your dataset
data_folder_test = 'D:\\jaysh\\FALL2023\\DLRL\\HW3\\dataset\\test'  # Replace with the actual path to your dataset


In [4]:
X_train, y_train = load_data(data_folder_train)
X_test, y_test = load_data(data_folder_test)




In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Convert to numpy arrays



In [6]:
y_train, y_test = np.array(y_train), np.array(y_test)

In [7]:

X_train = X_train_tfidf.toarray()
X_test = X_test_tfidf.toarray()

In [10]:
type(X_test)

numpy.ndarray

In [8]:
type(X_train)

numpy.ndarray

In [9]:
type(y_test)

numpy.ndarray

In [13]:
def build_model20(hp):
    vocab_size = 5000
    embedding_size = 128
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=20))
    model.add(SimpleRNN(units=128, input_shape=(None, 1), dropout=hp.Choice("dropout", values=[0.0, 0.4, 0.5, 0.6])))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])),
                  metrics=['accuracy'])
    return model
def build_model50(hp):
    vocab_size = 5000
    embedding_size = 128
    model=Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=50))
    model.add(SimpleRNN(units=128, input_shape=(None, 1), dropout = hp.Choice("dropout", values=[0.0, 0.4, 0.5, 0.6])))
    model.add(Dense(1, activation='sigmoid'))
    #print(model.summary())

    model.compile(loss='binary_crossentropy', 
                 optimizer=keras.optimizers.Adam(hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])), 
                 metrics=['accuracy'])
    return model

def build_model100(hp):
    vocab_size = 5000
    embedding_size = 128
    model=Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=100))
    model.add(SimpleRNN(units=128, input_shape=(None, 1), dropout = hp.Choice("dropout", values=[0.0, 0.4, 0.5, 0.6])))
    model.add(Dense(1, activation='sigmoid'))
    #print(model.summary())

    model.compile(loss='binary_crossentropy', 
                 optimizer=keras.optimizers.Adam(hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])), 
                 metrics=['accuracy'])
    return model

def build_model200(hp):
    vocab_size = 5000
    embedding_size = 128
    model=Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=200))
    model.add(SimpleRNN(units=128, input_shape=(None, 1), dropout = hp.Choice("dropout", values=[0.0, 0.4, 0.5, 0.6])))
    model.add(Dense(1, activation='sigmoid'))
    #print(model.summary())

    model.compile(loss='binary_crossentropy', 
                 optimizer=keras.optimizers.Adam(hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])), 
                 metrics=['accuracy'])
    return model

def build_model500(hp):
    vocab_size = 5000
    embedding_size = 128
    model=Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=500))
    model.add(SimpleRNN(units=128, input_shape=(None, 1), dropout = hp.Choice("dropout", values=[0.0, 0.4, 0.5, 0.6])))
    model.add(Dense(1, activation='sigmoid'))
    #print(model.summary())

    model.compile(loss='binary_crossentropy', 
                 optimizer=keras.optimizers.Adam(hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])), 
                 metrics=['accuracy'])
    return model

In [14]:
#20 state dimensions batch_size 32
X20_train = sequence.pad_sequences(X_train, maxlen=20)
X20_test = sequence.pad_sequences(X_test, maxlen=20)
bsize = 32
X_valid, y_valid = X20_train[:bsize], y_train[:bsize]
X_train2, y_train2 = X20_train[bsize:], y_train[bsize:]
tuner = RandomSearch(
    build_model20,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=2,
    overwrite=True,
    directory="my_dir",
    project_name="helloworld",
)
tuner.search(X_train2, y_train2, epochs=3, validation_data=(X_valid, y_valid), batch_size = bsize)
tuner.results_summary()
print("\n********************************************************************************\n")



Trial 3 Complete [00h 00m 59s]
val_accuracy: 1.0

Best val_accuracy So Far: 1.0
Total elapsed time: 00h 02m 44s
Results summary
Results in my_dir\helloworld
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 1 summary
Hyperparameters:
dropout: 0.4
learning_rate: 0.001
Score: 1.0

Trial 2 summary
Hyperparameters:
dropout: 0.6
learning_rate: 0.001
Score: 1.0

Trial 0 summary
Hyperparameters:
dropout: 0.6
learning_rate: 0.01
Score: 0.5

********************************************************************************



In [10]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Assuming you have X_train, X_test, y_train, y_test

# # Set hyperparameters
# # state_dimensions = [20, 50, 100, 200, 500]
# state_dimensions = [20]
# embedding_dim = 100
# max_len = 5000

# # Tokenize the text
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(X_train)

# X_train_seq = tokenizer.texts_to_sequences(X_train)
# X_test_seq = tokenizer.texts_to_sequences(X_test)

# vocab_size = len(tokenizer.word_index) + 1

# # Pad sequences to a fixed length
# X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
# X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# # Model building function
# def build_rnn_model(state_dim):
#     model = Sequential()
#     model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, trainable=True))
#     model.add(SimpleRNN(units=state_dim))
#     model.add(Dense(1, activation='sigmoid'))
    
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# def build_lstm_model(state_dim):
#     model = Sequential()
#     model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, trainable=True))
#     model.add(LSTM(units=state_dim))
#     model.add(Dense(1, activation='sigmoid'))
    
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# # Train and evaluate models
# results_rnn = []
# results_lstm = []

# for state_dim in state_dimensions:
#     # RNN
#     rnn_model = build_rnn_model(state_dim)
#     rnn_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, verbose=0)
#     rnn_result = rnn_model.evaluate(X_test_pad, y_test, verbose=0)
#     results_rnn.append((state_dim, rnn_result[1]))  # Accuracy is at index 1
    
#     # LSTM
#     lstm_model = build_lstm_model(state_dim)
#     lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64,  verbose=0)
#     lstm_result = lstm_model.evaluate(X_test_pad, y_test, verbose=0)
#     results_lstm.append((state_dim, lstm_result[1]))

# # Display results
# print("RNN Results:")
# print("State Dimension\tAccuracy")
# for result in results_rnn:
#     print(f"{result[0]}\t\t\t{result[1]}")

# print("\nLSTM Results:")
# print("State Dimension\tAccuracy")
# for result in results_lstm:
#     print(f"{result[0]}\t\t\t{result[1]}")
