# Data Preparation - Step 1
__________________________

Preparing the overall data for later classification.

## Loading Dataset

Import core libraries

In [1]:
import tensorflow as tf
import pandas as pd 
import os 
import shutil
# Fix TensorFlow's global random seed (42) so repeated runs start from the same state
tf.random.set_seed(42)

Retrieve dataset from source, establish required directory structure, delete directory related to unsupervised data

In [3]:
# Fetch the IMDB review archive into the working directory and unpack it there
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_subdir="./",
    cache_dir="./",
)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m697s[0m 8us/step


In [4]:
# The extracted "aclImdb" folder is expected next to the path returned by get_file
download_parent = os.path.dirname(dataset)
dataset_dir = os.path.join(download_parent, "aclImdb")
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [5]:
# Build the training folder from dataset_dir rather than re-spelling "aclImdb/"
# by hand, so it keeps pointing at the extracted dataset if the cache location changes
train_dir = os.path.join(dataset_dir, "train")
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [6]:
# Drop the folder with the unlabeled ("unsup") reviews.
# The existence check keeps this cell re-runnable: a bare rmtree raises
# FileNotFoundError once the folder has already been deleted.
remove_dir = os.path.join(train_dir, "unsup")
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

## Preprocess the Text Files

Initialize spaCy, prepare the stop word list, and convert it to a set (faster membership checks):

In [None]:
import re

import nltk
import spacy
from nltk.corpus import stopwords  # required by stopwords.words(...) below

# Load the small English pipeline; download it only if it is not installed yet
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

nltk.download('stopwords')

# A set makes the per-token membership check cheap
stop_words = set(stopwords.words("english"))

# Keep negations (and "again") in the text: they are taken out of the stop word list
stop_words -= {"not", "nor", "no", "again"}

# Additional words to filter out on top of the NLTK list
add_stopwords = {"movie", "film", "one", "the", "scene", "this", "story", "would", "really", "and", "also"}

stop_words = stop_words.union(add_stopwords)


Remove HTML tags and expand "n't" contractions to " not":

In [10]:
def remove_html(text):
    """Strip HTML tags and expand negated contractions.

    Args:
        text: Raw review text.

    Returns:
        The text with every HTML tag replaced by a single space and
        "...n't" contractions rewritten as "... not".
    """
    # Match opening, closing and self-closing tags alike (<br />, <br>, </i>,
    # <a href="...">). The tag must start with a letter right after "<", so
    # comparisons such as "3 < 5 > 2" are left untouched.
    text = re.sub(r"</?[A-Za-z][^<>]*>", " ", text)

    # Irregular contractions first: the generic rule below would otherwise
    # leave the fragments "wo", "ca" and "sha" behind.
    text = re.sub(r"\b(w)on't\b", r"\1ill not", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(can)'t\b", r"\1 not", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(sha)n't\b", r"\1ll not", text, flags=re.IGNORECASE)

    # Generic case: "don't" -> "do not", "isn't" -> "is not"
    text = re.sub(r"n't", " not", text, flags=re.IGNORECASE)
    return text

Clean text in general: lowercase, remove single characters and non-letters, collapse whitespace:

In [11]:
def clean_text(text):
    """Normalize text to lowercase words made of the letters a-z.

    Args:
        text: Text to normalize.

    Returns:
        The lowercased text with every non-letter removed, single-letter
        words dropped, and words separated by exactly one space, without
        leading or trailing whitespace.
    """
    text = text.lower()
    # Strip non-letters first: doing it after the single-character pass would
    # leave fragments behind (e.g. the "d" of "3d").
    text = re.sub(r"[^a-z]", " ", text)                                                          # remove everything which isn't a letter
    text = re.sub(r"\b[a-z]\b", " ", text)                                                       # remove single characters
    text = re.sub(r"\s+", " ", text)                                                             # collapse runs of whitespace
    return text.strip()

Lemmatize the text and remove PERSON entity tokens and stop words:


In [12]:
def process_and_filter_non_entities(text):
    """Lemmatize `text`, dropping PERSON entity tokens and stop words.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    tagged as part of a PERSON entity are skipped, as are tokens whose
    lowercased lemma is contained in `stop_words`. The remaining lemmas are
    joined with single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.ent_type_ == "PERSON":
            continue
        lemma = tok.lemma_
        if lemma.lower() in stop_words:
            continue
        kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

Apply preprocessing steps on text files and save the files in new directory:

In [17]:
import os 

def process_files_in_directory(input_directory, output_directory):
    """Preprocess every file below `input_directory` into `output_directory`.

    The directory tree is walked recursively. Each file is read as UTF-8,
    passed through `remove_html`, `process_and_filter_non_entities` and
    `clean_text` (in that order), and written as UTF-8 under the same
    relative path and file name inside `output_directory`.

    Args:
        input_directory: Root folder containing the raw text files.
        output_directory: Root folder receiving the processed files; missing
            folders are created.
    """
    for root, _dirs, files in os.walk(input_directory):
        if not files:
            # Nothing to write here, so no output folder is created either
            continue

        # The target folder is the same for all files of this `root`, so it is
        # resolved and created once instead of once per file.
        output_subdirectory = os.path.join(output_directory, os.path.relpath(root, input_directory))
        os.makedirs(output_subdirectory, exist_ok=True)

        for file in files:
            input_file_path = os.path.join(root, file)
            with open(input_file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # HTML/contraction handling must precede spaCy; the final cleanup
            # runs on the lemmatized, filtered text.
            text_removed_html = remove_html(content)
            text_removed_ents = process_and_filter_non_entities(text_removed_html)
            processed_content = clean_text(text_removed_ents)

            # Save the processed content under the same file name
            output_file_path = os.path.join(output_subdirectory, file)
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(processed_content)

# Raw training reviews and where their processed versions go
input_pos_directory = './aclImdb/train/pos'
input_neg_directory = './aclImdb/train/neg'
output_pos_directory = './dataset/train/pre_final/pos'
output_neg_directory = './dataset/train/pre_final/neg'

# Raw test reviews and where their processed versions go
input_pos_directory_test = './aclImdb/test/pos'
input_neg_directory_test = './aclImdb/test/neg'
output_pos_directory_test = './dataset/test/pre_final/pos'
output_neg_directory_test = './dataset/test/pre_final/neg'

# Processing order: train/pos, train/neg, test/pos, test/neg
for source_dir, target_dir in (
    (input_pos_directory, output_pos_directory),
    (input_neg_directory, output_neg_directory),
    (input_pos_directory_test, output_pos_directory_test),
    (input_neg_directory_test, output_neg_directory_test),
):
    process_files_in_directory(source_dir, target_dir)



The same preprocessing is applied to both the training and the testing set (see the calls in the cell above).

# Data Preparation - Step 2
__________________________

To make the data easier to use with scikit-learn algorithms, the text files are converted into .csv files.

In [21]:
import os
import pandas as pd

# Preprocessed test reviews live in one sub-folder per sentiment
_test_reviews_root = "./dataset/test/pre_final"
input_directory_pos = _test_reviews_root + "/pos"
input_directory_neg = _test_reviews_root + "/neg"

# Destination of the combined test-set CSV
output_csv_file = "./dataset/testing_set_preprocessed.csv"

# Function to process each text file
def process_text_file(file_path, label):
    """Read one review file and pair its content with a label.

    Args:
        file_path: Path of the text file to read.
        label: Value to return alongside the file content.

    Returns:
        A `(content, label)` tuple.
    """
    # The preprocessed files are written as UTF-8, so read them back with the
    # same encoding instead of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content, label

def _collect_labeled_reviews(directory, label):
    """Return a `[content, label]` row for every .txt file in `directory`."""
    return [
        list(process_text_file(os.path.join(directory, name), label))
        for name in os.listdir(directory)
        if name.endswith(".txt")
    ]

# CSV headers
csv_headers = ["review", "sentiment"]

# Test set: positive reviews first, negative reviews after
data_list = (
    _collect_labeled_reviews(input_directory_pos, "pos")
    + _collect_labeled_reviews(input_directory_neg, "neg")
)

df = pd.DataFrame(data_list, columns=csv_headers)
df.to_csv(output_csv_file, index=False)

# Directories containing training text files
input_directory_pos_train = "./dataset/train/pre_final/pos"
input_directory_neg_train = "./dataset/train/pre_final/neg"

# Output CSV file for training set
output_csv_file_train = "./dataset/training_set_preprocessed.csv"

# Training set: same layout, positive reviews first
data_list_train = (
    _collect_labeled_reviews(input_directory_pos_train, "pos")
    + _collect_labeled_reviews(input_directory_neg_train, "neg")
)

# Save training CSV
df_train = pd.DataFrame(data_list_train, columns=csv_headers)
df_train.to_csv(output_csv_file_train, index=False)


In [22]:
# Read the test-set CSV back and display it as a sanity check
data = pd.read_csv(filepath_or_buffer=output_csv_file)
data

Unnamed: 0,review,sentiment
0,go see last night coax friend mine admit reluc...,pos
1,actor turn director follow promising debut got...,pos
2,recreational golfer knowledge sport history pl...,pos
3,see sneak preview delightful cinematography un...,pos
4,take true us golf open make much extra ordinar...,pos
...,...,...
24995,occasionally let kid watch garbage understand ...,neg
24996,anymore pretty much reality tv show people mak...,neg
24997,basic genre thriller intercut uncomfortable me...,neg
24998,four thing intrigue firstly star carly pope po...,neg


In [24]:
# Read the training-set CSV back and display it as a sanity check
data_train = pd.read_csv(filepath_or_buffer=output_csv_file_train)
data_train

Unnamed: 0,review,sentiment
0,bromwell high cartoon comedy run time program ...,pos
1,homelessness houselessness state issue year ne...,pos
2,brilliant act well dramatic hobo lady ever see...,pos
3,easily underrated inn brooks cannon sure flawe...,pos
4,not typical much less slapstick actually plot ...,pos
...,...,...
24995,towards end feel technical feel like classroom...,neg
24996,kind enemy content watch time not bloody true ...,neg
24997,see descent last night stockholm festival huge...,neg
24998,pick pound turn rather good rd century release...,neg


# Data Preparation - Step 3
__________________________

This is a leftover from developing the preprocessing steps: visualizing the named entities.

In [20]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

def process_and_filter_entities(text):
    """Render the named entities spaCy finds in `text` with displaCy.

    PERSON entities are drawn in cyan. Nothing is returned.
    """
    displacy.render(
        nlp(text),
        style="ent",
        options={'colors': {'PERSON': 'cyan'}},
        page=False,
    )

# Use the first review of the test-set frame as the example
text_to_visualize = data["review"].iloc[0]
process_and_filter_entities(text_to_visualize)
