In [3]:
import os
import gensim
import numpy as np
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from collections import Counter
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions
# Define function to load dataset
def load_dataset(directory, label):
    data = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r') as file:
            review = file.read()
            data.append((review, label))
    return data

# Define preprocessing function
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Expand contractions
    text = contractions.fix(text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove non-alphabetic tokens and convert to lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# Define function to find maximum sequence length
def find_max_sequence_length(tokenized_texts):
    max_length = 0
    for tokens in tokenized_texts:
        sequence_length = len(tokens)
        if sequence_length > max_length:
            max_length = sequence_length
    return max_length

# Define paths to dataset directories
directory_path = "/Users/mrbinit/Downloads/aclImdb"
train_pos_dir = os.path.join(directory_path, 'train', 'pos')
train_neg_dir = os.path.join(directory_path, 'train', 'neg')
test_pos_dir = os.path.join(directory_path, 'test', 'pos')
test_neg_dir = os.path.join(directory_path, 'test', 'neg')
val_pos_dir = os.path.join(directory_path, 'val', 'pos')
val_neg_dir = os.path.join(directory_path, 'val', 'neg')

# Load dataset
train_data = load_dataset(train_pos_dir, 1) + load_dataset(train_neg_dir, 0)
test_data = load_dataset(test_pos_dir, 1) + load_dataset(test_neg_dir, 0)
val_data = test_data[:15000]
test_data = test_data[15000:25000]

# Separate reviews and labels
train_reviews, train_labels = zip(*train_data)
test_reviews, test_labels = zip(*test_data)
val_reviews, val_labels = zip(*val_data)

# Preprocess all the data
train_reviews_processed = [preprocess_text(review) for review in train_reviews]
test_reviews_processed = [preprocess_text(review) for review in test_reviews]
val_reviews_processed = [preprocess_text(review) for review in val_reviews]

# Find maximum sequence length
max_sequence_length = find_max_sequence_length(train_reviews_processed)
print("Maximum sequence length:", max_sequence_length)

# Combine tokenized words from the training set into single list
all_tokenized_words_train = [word for review in train_reviews_processed for word in review]

# Count the occurrences of each unique word in the training set
word_counts_train = Counter(all_tokenized_words_train)

# Determine the vocabulary size of the training set
vocab_size_train = len(word_counts_train)
print("Vocabulary Size of Training Set:", vocab_size_train)

# Train Word2Vec model
word2vec_model = gensim.models.Word2Vec(sentences=train_reviews_processed + test_reviews_processed + val_reviews_processed,
                                        vector_size=100, window=5, min_count=5, workers=4)

# Function to get vector representation of a sentence
def get_sentence_vector(tokens):
    vector = np.zeros((100,))
    count = 0
    for word in tokens:
        if word in word2vec_model.wv:
            vector += word2vec_model.wv[word]
            count += 1
    if count != 0:
        vector /= count
    return vector

# Get sentence vector for all the datasets
train_vectors = [get_sentence_vector(tokens) for tokens in train_reviews_processed]
test_vectors = [get_sentence_vector(tokens) for tokens in test_reviews_processed]
val_vectors = [get_sentence_vector(tokens) for tokens in val_reviews_processed]

# Convert lists to numpy arrays
X_train = np.array(train_vectors)
X_test = np.array(test_vectors)
X_val = np.array(val_vectors)

y_train = np.array(train_labels)
y_test = np.array(test_labels)
y_val = np.array(val_labels)




  text = BeautifulSoup(text, "html.parser").get_text()


Maximum sequence length: 1394
Vocabulary Size of Training Set: 64602


In [4]:
import tensorflow as tf
from tensorflow.keras import layers
batch_size = 32

# Define the model
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(100,)),  # Input layer with 100 features
    layers.Dense(32, activation='relu'),  # Hidden layer with 32 neurons
    layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, 
                    batch_size=batch_size, 
                    epochs=20, 
                    validation_data=(X_val, y_val))

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 579us/step - accuracy: 0.7643 - loss: 0.4890 - val_accuracy: 0.8387 - val_loss: 0.3693
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 571us/step - accuracy: 0.8292 - loss: 0.3840 - val_accuracy: 0.7493 - val_loss: 0.5179
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 512us/step - accuracy: 0.8330 - loss: 0.3738 - val_accuracy: 0.8577 - val_loss: 0.3401
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471us/step - accuracy: 0.8353 - loss: 0.3713 - val_accuracy: 0.8677 - val_loss: 0.3167
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 458us/step - accuracy: 0.8435 - loss: 0.3591 - val_accuracy: 0.8222 - val_loss: 0.3997
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 462us/step - accuracy: 0.8445 - loss: 0.3559 - val_accuracy: 0.8709 - val_loss: 0.3121
Epoch 7/20
[1m7

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameter

# Define a model-building function
def build_model(hp):
    model = keras.Sequential()

    # Tune the number of units in the first dense layer
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(layers.Dense(units=hp_units, activation='relu'))

    # Tune the dropout rate
    hp_dropout = hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)
    model.add(layers.Dropout(rate=hp_dropout))

    # Tune the kernel regularization
    hp_reg = hp.Choice('regularization', values=['l1', 'l2'])
    if hp_reg == 'l1':
        reg = keras.regularizers.l1(hp.Float('lambda', 1e-5, 1e-1, sampling='log'))
    else:
        reg = keras.regularizers.l2(hp.Float('lambda', 1e-5, 1e-1, sampling='log'))

    # Tune the kernel initializer
    hp_init = hp.Choice('initializer', values=['glorot_uniform', 'he_normal'])
    if hp_init == 'glorot_uniform':
        init = 'glorot_uniform'
    else:
        init = 'he_normal'

    # Add output layer
    model.add(layers.Dense(1, activation='sigmoid', kernel_regularizer=reg, kernel_initializer=init))

    # Compile the model
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Define the hyperparameter search space
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    directory='keras_tuner_results',
    project_name='hyperparameter_tuning')

# Perform hyperparameter search
tuner.search(X_train, y_train, validation_data=(X_val, y_val), epochs=10)

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the best model
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')


Trial 10 Complete [00h 00m 11s]
val_accuracy: 0.838866651058197

Best val_accuracy So Far: 0.8778666853904724
Total elapsed time: 00h 03m 21s
[1m188/313[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 268us/step - accuracy: 0.7559 - loss: 0.5059

  trackable.load_own_variables(weights_store.get(inner_path))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 266us/step - accuracy: 0.7542 - loss: 0.5082
Test accuracy: 0.7512999773025513
