In [128]:
import tensorflow as tf
import os
import shutil

In [129]:
def load_dataset(directory, label):
    data = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r') as file:
            review = file.read()
            data.append((review, label))
    return data

directory_path = "/Users/mrbinit/Downloads/aclImdb" 

train_pos_dir = os.path.join(directory_path, 'train/pos')
train_neg_dir = os.path.join(directory_path, 'train/neg')
test_pos_dir = os.path.join(directory_path, 'test/pos')
test_neg_dir = os.path.join(directory_path, 'test/neg')
val_pos_dir = os.path.join(directory_path, 'val/pos')
val_neg_dir = os.path.join(directory_path, 'val/neg')

train_data = load_dataset(train_pos_dir, 1) + load_dataset(train_neg_dir, 0) #1 represents positive and 0 represents neghative sentiments
test_data = load_dataset(test_pos_dir, 1) + load_dataset(test_neg_dir, 0)
val_data = load_dataset(val_pos_dir, 1) + load_dataset(val_neg_dir, 0)

# separate the reviews and labels from the train, test and validation data
train_reviews, train_labels = zip(*train_data)
test_reviews, test_labels = zip(*test_data)
val_reviews, val_labels = zip(*val_data)

# # convert lists to tensors
# train_reviews = tf.convert_to_tensor(train_reviews) # tensor are used to represent multi dimensional arrays which are important for GPU computation
# train_labels = tf.convert_to_tensor(train_labels)
# test_reviews = tf.convert_to_tensor(test_reviews)
# test_labels = tf.convert_to_tensor(test_labels)
# val_reviews = tf.convert_to_tensor(val_reviews)
# val_labels = tf.convert_to_tensor(val_labels)

# # Create datasets from tensor
# train_dataset = tf.data.Dataset.from_tensor_slices((train_reviews, train_labels)).batch(32)
# test_dataset = tf.data.Dataset.from_tensor_slices((test_reviews, test_labels)).batch(32)
# val_dataset = tf.data.Dataset.from_tensor_slices((val_reviews, val_labels)).batch(32)

# # Shuffle the training dataset
# train_dataset = train_dataset.shuffle(len(train_data))
# test_dataset = test_dataset.shuffle(len(test_data))
# val_dataset = val_dataset.shuffle(len(val_data))

#91% accuracy wiht over fitting

In [130]:
#Regular expressions (regex) are sequences of characters that define a search pattern. They are used for string manipulation, searching, and pattern matching within text. 
import re
def has_html_tags(text):
    pattern = re.compile(r'<[^>]+>')  # Regular expression to match HTML tags
    return bool(pattern.search(text))

# Check for HTML tags in each dataset
def check_html_tags(dataset):
    for review, _ in dataset:
        if has_html_tags(review):
            return True
    return False

# Check for HTML tags in each dataset
train_has_html = check_html_tags(train_data)
test_has_html = check_html_tags(test_data)
val_has_html = check_html_tags(val_data)
#ptinr output
print("Train dataset contains HTML tags:", train_has_html)
print("Test dataset contains HTML tags:", test_has_html)
print("Validation dataset contains HTML tags:", val_has_html)


Train dataset contains HTML tags: True
Test dataset contains HTML tags: True
Validation dataset contains HTML tags: True


In [131]:
def has_url(text):
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return bool(pattern.search(text))

#condition to check for url
def check_for_urls(dataset):
    for review, _ in dataset:
        if has_url(review):
            return True
    return False
#check whether there is URL or not
train_has_url = check_for_urls(train_data)
test_has_url = check_for_urls(test_data)
val_has_url = check_for_urls(val_data)

print("Train dataset contains URLs:", train_has_url)
print("Test dataset contains URLs:", test_has_url)
print("Validation dataset contains URLs:", val_has_url)

Train dataset contains URLs: True
Test dataset contains URLs: True
Validation dataset contains URLs: True


In [132]:
def has_special_characters(text):
    pattern = re.compile(r'[^a-zA-Z0-9\s]')
    return bool(pattern.search(text))

def check_for_special_characters(dataset):
    for review, _ in dataset:
        if has_special_characters(review):
            return True
    return False

train_has_special_chars = check_for_special_characters(train_data)
test_has_special_chars = check_for_special_characters(test_data)
val_has_special_chars = check_for_special_characters(val_data)
print("Train dataset contains special characters:", train_has_special_chars)
print("Test dataset contains special characters:", test_has_special_chars)
print("Validation dataset contains special characters:", val_has_special_chars)


Train dataset contains special characters: True
Test dataset contains special characters: True
Validation dataset contains special characters: True


In [144]:
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download stopwords if not already downloaded
import nltk
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mrbinit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mrbinit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [147]:
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# Load the English model
nlp = spacy.load("en_core_web_sm")

#identifies stop words and removes
def remove_Stop_words(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def data_cleaning(text):
    #convert text to lower case
    text = text.lower()
    #remove HTML tags
    clean_text = re.sub(r'<.*?>', '', text)
    #remove URLs
    clean_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', clean_text)
    #remove special characters
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', clean_text)
    #handles stop words
    clean_text = remove_Stop_words(clean_text)
    return clean_text

# Function to extend words using spaCy
def extend_words_with_spacy(text):
    doc = nlp(text)
    extended_text = ' '.join([token.lemma_ for token in doc])
    return extended_text

#apply data cleaning and word extension to train data
cleaned_train_data = [(extend_words_with_spacy(data_cleaning(review)), label) for review, label in train_data]

#apply data cleaning and word extension to test data
cleaned_test_data = [(extend_words_with_spacy(data_cleaning(review)), label) for review, label in test_data]

#apply data cleaning and word extension to validation data
cleaned_val_data = [(extend_words_with_spacy(data_cleaning(review)), label) for review, label in val_data]


print("Cleaned Train Data:", cleaned_train_data[:5])  
print("Cleaned Test Data:", cleaned_test_data[:5])   
print("Cleaned Validation Data:", cleaned_val_data[:5])


Cleaned Train Data: [('movie get respect sure lot memorable quote list gem imagine movie joe piscopo actually funny maureen stapleton scene stealer moroni character absolute scream watch alan skipper hale jr police sgt', 1), ('bizarre horror movie fill famous face steal cristina rain later tvs flamingo road pretty somewhat unstable model gummy smile slate pay attempt suicide guard gateway hell scene rain model well capture mood music perfect deborah raffin charming cristinas pal raine move creepy brooklyn height brownstone inhabit blind priest top floor thing really start cook neighbor include fantastically wicked burgess meredith kinky couple sylvia miles beverly dangelo diabolical lot eli wallach great fun wily police detective movie nearly crosspollination rosemarys baby exorcistbut combination base bestseller jeffrey konvitz sentinel entertainingly spooky full shock bring well director michael winner mount thoughtfully downbeat end skill 12', 1), ('solid unremarkable film matthau e