In [32]:
import os
import pandas as pd

# import all the training and testing data into a dataframe where 1 is positive, 0 is negative

def read_review_files(folder_path, encoding='utf-8'):
    """Load the labelled reviews stored under ``folder_path`` into a DataFrame.

    The directory is expected to contain a ``neg`` and a ``pos`` sub-folder,
    each holding one text file per review.

    Parameters
    ----------
    folder_path : str
        Directory that contains the ``neg`` and ``pos`` sub-folders.
    encoding : str, optional
        Text encoding used to decode each review file. Defaults to UTF-8 so
        the result does not depend on the platform's locale; pass a different
        value if the files are stored in another encoding.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``Review`` (the file's full text)
        and ``Category`` (0 for files from ``neg``, 1 for files from ``pos``).
        All ``neg`` rows come first, and rows within a sub-folder are ordered
        by file name.

    Raises
    ------
    OSError
        If a sub-folder is missing or a file cannot be opened.
    """
    # (sub-folder, label) pairs; negatives are read first, as before.
    labelled_folders = (('neg', 0), ('pos', 1))

    # Collect plain records and build the frame once at the end: appending
    # with ``df.loc[len(df)]`` re-allocates on every row, which is quadratic.
    records = []
    for folder, category in labelled_folders:
        path = os.path.join(folder_path, folder)
        # os.listdir returns names in arbitrary order; sort them so the row
        # order is reproducible between runs and machines.
        for file_name in sorted(os.listdir(path)):
            # The context manager closes the handle; a bare open().read()
            # leaks one descriptor per review.
            with open(os.path.join(path, file_name), 'r', encoding=encoding) as handle:
                records.append({'Review': handle.read(), 'Category': category})

    return pd.DataFrame(records, columns=['Review', 'Category'])

# Read the labelled reviews of each split into its own DataFrame
# (columns: 'Review', 'Category'); the raw frames are kept untouched so the
# preprocessing step can always be re-run from them.
train_data_raw = read_review_files('./aclImdb/train')
test_data_raw = read_review_files('./aclImdb/test')

In [33]:
# Sanity check on the labels: 'neg' files are read first, so the head of the
# column should show 0 and the tail should show 1.
test_data_raw['Category']

0        0
1        0
2        0
3        0
4        0
        ..
24995    1
24996    1
24997    1
24998    1
24999    1
Name: Category, Length: 25000, dtype: int64

In [34]:
def preprocess_input(df):
    """Return a cleaned copy of ``df`` with a normalised ``Review`` column.

    The input frame is not modified. On the copy, every ``Review`` string is:

    1. stripped of all characters other than ASCII letters, digits and
       whitespace,
    2. trimmed of leading and trailing whitespace,
    3. converted to lower case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Review``; other columns are
        carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the cleaned ``Review`` column.

    Notes
    -----
    Markup is not handled specially: an HTML ``<br />`` loses only its
    punctuation and survives as the literal token ``br``.
    NOTE(review): presumably the saved model was trained on text cleaned the
    same way, so this behaviour is kept as is -- confirm before changing it.
    """
    new_df = df.copy()
    new_df['Review'] = (
        new_df['Review']
        # Raw string literal: '\s' inside a plain literal is an invalid escape
        # sequence (a SyntaxWarning on recent Python versions). The pattern
        # seen by the regex engine is unchanged.
        .str.replace(r'[^0-9A-Za-z\s]+', '', regex=True)
        .str.strip()
        .str.lower()
    )
    return new_df

train_data = preprocess_input(train_data_raw)
test_data = preprocess_input(test_data_raw)

In [35]:
# Spot-check one cleaned test review (row label 8) to see the effect of the
# preprocessing; a single .loc lookup addresses row and column together.
test_data.loc[8, 'Review']

'when hollywood is trying to grasp what an intelligent person is like they fail so miserably finding it hard putting words in the mouth of the purported geniusbr br right any genius walks around trying to rub in his superiority at every instance sure they hang out in bars and pick fights  its not like they are generalizing wildly autistic nerds who never have a tanbr br plus if you are a genius you know all about math and history and politics and of course youre constantly up to date with current events and a thorough analysis of them coz these things like all go together n stuff yknowbr br plus you walk around with a smirk all the time you are just a smug son of a youknowwhat thats how it is yall br br and of course you smoke like someone who never smoked before but you smoke coz its like cool n stuff yknow and youre different that is understoodbr br and of course you can fight  youre a bully a bully who finds time to study 10000 books whenever he doesnt lift weights and whenever he d

In [36]:
import tensorflow as tf
from tensorflow import keras

# Build the word -> integer vocabulary from the training reviews only.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_data['Review'])

# Encode every test review as a list of vocabulary indices.
test_sequences = tokenizer.texts_to_sequences(test_data['Review'])

# Pad the encoded reviews into one rectangular array so that every input has
# the same size. No maxlen is given, so the width comes from the longest
# test review.
# NOTE(review): if the saved model expects a fixed input length, this width
# must match the one used at training time -- confirm.
input_test = keras.preprocessing.sequence.pad_sequences(test_sequences)

In [37]:
# Restore the previously trained network from its saved .h5 file.
model_path = "./models/20591615_NLP_model.h5"
network = tf.keras.models.load_model(model_path)

# Score the restored network on the padded test reviews against their 0/1
# labels; the value returned by evaluate() is kept in test_result.
test_result = network.evaluate(x=input_test, y=test_data['Category'])

