# Notebook 1

**NOTE:** The cell below loads the required packages and reads the location of the **Large Movie Review Dataset** from a configuration file. After downloading the data, open *template_config_file.json*, enter the path of the folder on your computer that contains the downloaded data (the parent folder of *aclImdb*), and save the file as *config_file.json*. All notebooks read the path from this file, so it only has to be provided once.

In [None]:
# LOAD PACKAGES
# Standard library
import json
import os
import re

# Third party
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch every NLTK resource this notebook relies on, so a fresh environment works:
# - 'punkt' / 'punkt_tab': tokenizer models used by nltk.word_tokenize
#   (recent NLTK releases look for 'punkt_tab', older ones for 'punkt')
# - 'stopwords': the corpus behind nltk.corpus.stopwords.words('english')
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Stemmer instance shared by the cells below
porter_stemmer = PorterStemmer()

# JSON is UTF-8 by specification; do not depend on the platform default encoding
with open('../config/config_file.json', encoding='utf-8') as f:
    config_file = json.load(f)

# Read review data folder from configuration file
movie_reviews_folder = config_file['movie_review_location']
print(f"Movie reviews will be loaded from: {movie_reviews_folder}")

### Movie review examples

In [None]:
# Specify files to display
filename_positive_review = "3247_10.txt"
filename_negative_review = "2331_1.txt"

# Both example reviews are taken from the training split of the dataset
train_folder = os.path.join(movie_reviews_folder, "aclImdb", "train")

# Read reviews and print.
# The encoding is given explicitly so that decoding does not depend on the
# platform default (e.g. cp1252 on Windows), which can fail on non-ASCII text.
with open(os.path.join(train_folder, "pos", filename_positive_review), 'r', encoding='utf-8') as file:
    selected_pos_review = file.read()
with open(os.path.join(train_folder, "neg", filename_negative_review), 'r', encoding='utf-8') as file:
    selected_neg_review = file.read()
print("Selected positive review:\n", selected_pos_review, "\n")
print("Selected negative review:\n", selected_neg_review)


### Print list of stopwords

In [None]:
# Load NLTK's English stop-word list, report its size and list every entry
english_stopwords = stopwords.words('english')
print(f"Number of stop words: {len(english_stopwords)}")
for stop_w in english_stopwords:
    print("- " + stop_w)

In [None]:
# Walk through the pre-processing pipeline one step at a time on the positive
# example review, printing the intermediate result after every step.
print(f"Original review: {selected_pos_review}")

# Step 1: Remove punctuation
# The matched characters are deleted (not replaced by a space), so a hyphenated
# word such as "well-known" becomes "wellknown".
review_no_punctuation = re.sub(r'[".,!?;-]+', '', selected_pos_review)
print(f"Review without punctuation: {review_no_punctuation}")

# Step 2: Tokenize string
review_tokens = nltk.word_tokenize(review_no_punctuation)
print(f"Output of nltk.word_tokenize: {review_tokens}")

# Step 3: Enforce lower case and omit non-text
# Only purely alphabetic tokens survive; numbers and tokens with an apostrophe are dropped.
lower_case_characters_only = [token.lower() for token in review_tokens if token.isalpha()]
print(f"Omit strings containing non-text and set to lower case: {lower_case_characters_only}")

# Step 4: Remove stop words
# A set gives constant-time membership tests instead of scanning the list per token.
stop_word_set = set(english_stopwords)
output_word_list = [word for word in lower_case_characters_only if word not in stop_word_set]
print(f"Final result having removed stop words: {output_word_list}")

def process_imdb_review_for_Bayes(imdb_review, stopwords, print_details = False):
    """Turn a raw review string into a list of cleaned, lower-case word tokens.

    The steps are: delete punctuation, tokenize with ``nltk.word_tokenize``,
    keep only purely alphabetic tokens (lower-cased), and drop stop words.

    Args:
        imdb_review: The review text to process.
        stopwords: Iterable of lower-case words to remove from the result.
        print_details: If True, print the original review and the processed
            token list.

    Returns:
        List of the remaining lower-case tokens, in their original order.
    """
    # Use the stop words supplied by the caller (not a notebook-level global);
    # a set makes each membership test constant-time.
    stop_word_set = set(stopwords)

    # Step 1: Remove all punctuation (characters are deleted, not replaced by a space)
    no_punctuation = re.sub(r'[".,!?;-]+', '', imdb_review)
    # Step 2: Tokenize
    tokens = nltk.word_tokenize(no_punctuation)
    # Step 3: All lower case and omit non-text
    # NOTE(review): if the raw reviews contain HTML line breaks such as "<br />",
    # the token "br" would pass this filter - confirm against the data.
    alphabetic_tokens = [token.lower() for token in tokens if token.isalpha()]
    # Step 4: Remove stop words
    output_for_Bayes = [word for word in alphabetic_tokens if word not in stop_word_set]

    if print_details:
        print("\nORIGINAL: ", imdb_review)
        print("RESULT AFTER PRE-PROCESSING:", output_for_Bayes, "\n")

    return output_for_Bayes


# Run the full pipeline on the positive example review; print_details=True makes
# the function print the original text and the processed token list.
process_imdb_review_for_Bayes(selected_pos_review, english_stopwords, print_details=True)

### Stemming the negative review with the Porter stemmer

In [None]:
# Split the negative example review into tokens ...
tokenized_review = nltk.word_tokenize(selected_neg_review)

# ... and map every token to its Porter stem
output_for_Bayes = list(map(porter_stemmer.stem, tokenized_review))

print(output_for_Bayes)
