In [4]:
import os
import gensim
import numpy as np
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from collections import Counter
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions



In [6]:
import os

def load_dataset(directory, label, seed=1337):
    data = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r') as file:
            review = file.read()
            data.append((review, label))
    return data

directory_path = "/Users/mrbinit/Downloads/aclImdb" 

train_pos_dir = os.path.join(directory_path, 'train', 'pos')
train_neg_dir = os.path.join(directory_path, 'train', 'neg')
test_pos_dir = os.path.join(directory_path, 'test', 'pos')
test_neg_dir = os.path.join(directory_path, 'test', 'neg')
val_pos_dir = os.path.join(directory_path, 'val', 'pos')
val_neg_dir = os.path.join(directory_path, 'val', 'neg')

train_data = load_dataset(train_pos_dir, 1, seed=1337) + load_dataset(train_neg_dir, 0, seed=1337) # 1 represents positive and 0 represents negative sentiments
test_data = load_dataset(test_pos_dir, 1, seed=1337) + load_dataset(test_neg_dir, 0, seed=1337)

# Split the test set into a validation set (15,000 samples) and a test set (10,000 samples)
val_data = test_data[:15000]
test_data = test_data[15000:25000]

# Separate the reviews and labels from the train, test, and validation data
train_reviews, train_labels = zip(*train_data)
test_reviews, test_labels = zip(*test_data)
val_reviews, val_labels = zip(*val_data)

# Check the lengths of train, test, and validation datasets
train_length = len(train_reviews)
test_length = len(test_reviews)
val_length = len(val_reviews)

print("Train dataset length:", train_length)
print("Test dataset length:", test_length)
print("Validation dataset length:", val_length)



Train dataset length: 25000
Test dataset length: 10000
Validation dataset length: 15000


In [7]:
print("Train Data:", train_reviews[:5])  
print("Test Data:", test_reviews[:5])   
print("Validation Data:", val_reviews[:5])

Train Data: ('For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.', 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV\'s "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina\'s pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D\'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily p

In [8]:
#Regular expressions (regex) are sequences of characters that define a search pattern. They are used for string manipulation, searching, and pattern matching within text. 
import re
def has_html_tags(text):
    pattern = re.compile(r'<[^>]+>')  # Regular expression to match HTML tags
    return bool(pattern.search(text))

# Check for HTML tags in each dataset
def check_html_tags(dataset):
    for review, _ in dataset:
        if has_html_tags(review):
            return True
    return False

#check for HTML tags in each dataset
train_has_html = check_html_tags(train_data)
test_has_html = check_html_tags(test_data)
val_has_html = check_html_tags(val_data)
#print output
print("Train dataset contains HTML tags:", train_has_html)
print("Test dataset contains HTML tags:", test_has_html)
print("Validation dataset contains HTML tags:", val_has_html)


Train dataset contains HTML tags: True
Test dataset contains HTML tags: True
Validation dataset contains HTML tags: True


In [9]:
def has_url(text):
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return bool(pattern.search(text))

#condition to check for url
def check_for_urls(dataset):
    for review, _ in dataset:
        if has_url(review):
            return True
    return False
#check whether there is URL or not
train_has_url = check_for_urls(train_data)
test_has_url = check_for_urls(test_data)
val_has_url = check_for_urls(val_data)

print("Train dataset contains URLs:", train_has_url)
print("Test dataset contains URLs:", test_has_url)
print("Validation dataset contains URLs:", val_has_url)

Train dataset contains URLs: True
Test dataset contains URLs: True
Validation dataset contains URLs: True


In [10]:
def has_special_characters(text):
    pattern = re.compile(r'[^a-zA-Z0-9\s]')
    return bool(pattern.search(text))

def check_for_special_characters(dataset):
    for review, _ in dataset:
        if has_special_characters(review):
            return True
    return False

train_has_special_chars = check_for_special_characters(train_data)
test_has_special_chars = check_for_special_characters(test_data)
val_has_special_chars = check_for_special_characters(val_data)
print("Train dataset contains special characters:", train_has_special_chars)
print("Test dataset contains special characters:", test_has_special_chars)
print("Validation dataset contains special characters:", val_has_special_chars)


Train dataset contains special characters: True
Test dataset contains special characters: True
Validation dataset contains special characters: True


In [12]:
#preprocess function
def preprocess_text(text):
    #remove  HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    #remove urls
    text = re.sub(r'http\S+', '', text)

    #expand contractions 
    text = contractions.fix(text)

    #tokenize 
    tokens = word_tokenize(text)

    #remove non-alphabetic tokens and convert to lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]

    #remove stopwords 
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]

    #lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

In [13]:
# Preprocess all the data
train_reviews_processed = [preprocess_text(review) for review in train_reviews]
test_reviews_processed = [preprocess_text(review) for review in test_reviews]
val_reviews_processed = [preprocess_text(review) for review in val_reviews]

  text = BeautifulSoup(text, "html.parser").get_text()


In [14]:
print("Cleaned Train Data:", train_reviews_processed[:5])  
print("Cleaned Test Data:", test_reviews_processed[:5])   
print("Cleaned Validation Data:", val_reviews_processed[:5])

Cleaned Train Data: [['movie', 'get', 'respect', 'sure', 'lot', 'memorable', 'quote', 'listed', 'gem', 'imagine', 'movie', 'joe', 'piscopo', 'actually', 'funny', 'maureen', 'stapleton', 'scene', 'stealer', 'moroni', 'character', 'absolute', 'scream', 'watch', 'alan', 'skipper', 'hale', 'police', 'sgt'], ['bizarre', 'horror', 'movie', 'filled', 'famous', 'face', 'stolen', 'cristina', 'raines', 'later', 'tv', 'flamingo', 'road', 'pretty', 'somewhat', 'unstable', 'model', 'gummy', 'smile', 'slated', 'pay', 'attempted', 'suicide', 'guarding', 'gateway', 'hell', 'scene', 'raines', 'modeling', 'well', 'captured', 'mood', 'music', 'perfect', 'deborah', 'raffin', 'charming', 'cristina', 'pal', 'raines', 'move', 'creepy', 'brooklyn', 'height', 'brownstone', 'inhabited', 'blind', 'priest', 'top', 'floor', 'thing', 'really', 'start', 'cooking', 'neighbor', 'including', 'fantastically', 'wicked', 'burgess', 'meredith', 'kinky', 'couple', 'sylvia', 'mile', 'beverly', 'diabolical', 'lot', 'eli', 'wa

In [15]:
def find_max_sequence_length(train_reviews_processed):
    max_length = 0
    for tokens in train_reviews_processed:
        sequence_length = len(tokens)
        if sequence_length > max_length:
            max_length = sequence_length
    return max_length

#assuming tokenized_texts is a list of tokenized texts after preprocessing
max_sequence_length = find_max_sequence_length(train_reviews_processed)
print("Maximum sequence length:", max_sequence_length)

Maximum sequence length: 1394


In [16]:
# Train Word2Vec model
word2vec_model = gensim.models.Word2Vec(sentences=train_reviews_processed + test_reviews_processed + val_reviews_processed,
                                        vector_size=100, window=5, min_count=5, workers=4)

# Function to get vector representation of a sentence
def get_sentence_vector(tokens):
    vector = np.zeros((100,))
    count = 0
    for word in tokens:
        if word in word2vec_model.wv:
            vector += word2vec_model.wv[word]
            count += 1
    if count != 0:
        vector /= count
    return vector
    
#get sentence vector for all the dataset 
train_vectors = [get_sentence_vector(tokens) for tokens in train_reviews_processed]
test_vectors = [get_sentence_vector(tokens) for tokens in test_reviews_processed]
val_vectors = [get_sentence_vector(tokens) for tokens in val_reviews_processed]

In [17]:
#convert lists to numpy arrays 
X_train = np.array(train_vectors)
X_test = np.array(test_vectors)
X_val = np.array(val_vectors)


y_train = np.array(train_labels)
y_test = np.array(test_labels)
y_val = np.array(val_labels)

In [23]:
import numpy as np

def load_glove_embeddings(filename):
  """
  Loads pre-trained GloVe word embeddings from a text file.

  Args:
      filename (str): Path to the GloVe embeddings file.

  Returns:
      dict: A dictionary mapping words to their corresponding word vectors.
  """
  embeddings = {}
  with open(filename, 'r', encoding='utf-8') as file:
    for line in file:
      values = line.split()
      word = values[0]
      vector = np.asarray([float(x) for x in values[1:]])
      embeddings[word] = vector
  return embeddings

# Example usage
glove_embeddings = load_glove_embeddings('/Users/mrbinit/Downloads/glove.6B/glove.6B.100d.txt')  # Replace with your filename


In [29]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Load GloVe word embeddings
glove_dir = '/Users/mrbinit/Downloads/glove.6B'
glove_file = 'glove.6B.100d.txt'  # Example file with 100-dimensional word vectors
embeddings_index = {}
with open(os.path.join(glove_dir, glove_file), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Step 2: Tokenize the preprocessed text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_reviews_processed + test_reviews_processed + val_reviews_processed)

# Step 3: Map tokens to GloVe word vectors
word_index = tokenizer.word_index
embedding_dim = 100  # Assuming you're using GloVe vectors with 100-dimensional embeddings
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
# Step 4: Use GloVe word embeddings in a simple text classification model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_words, embedding_dim, 
                              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                              trainable=False),  # Freeze the embedding layer
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Dummy labels for example
labels = np.array([1, 0])

# Dummy input data for example
input_data = np.random.randint(0, num_words, (2, 10))

# Train the model
model.fit(input_data, labels, epochs=10, batch_size=32)



Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.5000 - loss: 0.7218
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5000 - loss: 0.7029
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5000 - loss: 0.6843
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5000 - loss: 0.6660
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5000 - loss: 0.6482
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5000 - loss: 0.6309
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5000 - loss: 0.6149
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.5000 - loss: 0.5992
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x30e68e490>

In [32]:

# Assuming you have already preprocessed your text data and obtained the vocabulary
vocab_size = len(word_index)  # Assuming word_index is the vocabulary dictionary
max_features = vocab_size + 1 

In [34]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Define the model architecture
embedding_dim = 100  # Assuming you're using GloVe vectors with 100-dimensional embeddings
max_sequence_length = 500  # Assuming your sequences have a maximum length of 500
num_classes = 2  # Binary classification (positive or negative sentiment)

model = Sequential([
    Embedding(input_dim=num_words,  # Number of unique words in your vocabulary
              output_dim=embedding_dim,
              weights=[embedding_matrix],  # Use pre-trained GloVe embeddings
              input_length=max_sequence_length,
              trainable=False),  # Freeze the embedding layer weights
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy',  # Since the labels are integers
              optimizer='adam',
              metrics=['accuracy'])

# Print model summary
model.summary()


ValueError: Unrecognized keyword arguments passed to Embedding: {'weights': [array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.38251001,  0.14821   ,  0.60601002, ...,  0.058921  ,
         0.091112  ,  0.47283   ],
       [ 0.19915999, -0.049702  ,  0.24579   , ..., -0.068109  ,
         0.017651  ,  0.06455   ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.93970001, -0.11675   ,  0.1559    , ...,  0.093543  ,
        -0.26108   , -0.46669999],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])], 'input_length': 500}