In [None]:
# Mount Google Drive (Colab only); the dataset paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.utils import plot_model
from keras.layers import Dense
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# load doc into memory
def load_doc(filename, encoding='utf-8'):
  """Read a whole text file and return its contents as a single string.

  Args:
    filename: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      the result does not depend on the platform locale.

  Returns:
    The complete file contents as a ``str``.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'r', encoding=encoding) as file:
    return file.read()

In [None]:
# Load one negative review as a running example and display its raw text.
text = load_doc('/content/drive/MyDrive/review_polarity/txt_sentoken/neg/cv000_29416.txt')
text

In [None]:
# Load one positive review and display its raw text.
pos = load_doc('/content/drive/MyDrive/review_polarity/txt_sentoken/pos/cv000_29590.txt')
pos

In [None]:
# turn a doc into clean tokens
def clean_doc(doc):
  """Split a document into tokens and drop everything that is not a content word.

  Each whitespace-separated token has its punctuation stripped and is then
  kept only if it is purely alphabetic, is not an English stop word, and is
  longer than one character.

  Args:
    doc: The raw document text.

  Returns:
    A list of the surviving tokens, in document order, with their original
    casing.
  """
  # prepare regex for char filtering
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  stop_words = set(stopwords.words('english'))

  tokens = []
  for raw in doc.split():
    # remove punctuation from the word
    word = re_punc.sub('', raw)

    # drop tokens that are not alphabetic (also drops now-empty strings)
    if not word.isalpha():
      continue

    # The stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words such as "It" or "The" slip through.
    if word.lower() in stop_words:
      continue

    # drop single-character tokens
    if len(word) <= 1:
      continue

    tokens.append(word)

  return tokens

In [None]:
# Show the raw negative review again, for comparison with its cleaned tokens in the next cell.
text

In [None]:
# Tokens that remain after cleaning the sample negative review.
clean_doc(text)

In [None]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
  """Load a document, clean it, and return its in-vocabulary tokens as one line.

  Args:
    filename: Path of the document to load.
    vocab: Collection of allowed tokens. Any container supporting ``in``
      works; lists and tuples are converted to a set internally.

  Returns:
    The tokens that are present in ``vocab``, joined by single spaces.
  """
  # A list/tuple vocabulary makes every `w in vocab` a linear scan; hashing it
  # once keeps the filter linear in the number of tokens.
  if isinstance(vocab, (list, tuple)):
    vocab = set(vocab)

  # load and clean the doc
  tokens = clean_doc(load_doc(filename))

  # filter by vocab
  return ' '.join(w for w in tokens if w in vocab)

In [None]:
# Read the whitespace-separated vocabulary file into a list of words.
# The context manager closes the file, and `vocab` only ever names the list.
with open('/content/drive/MyDrive/vocab.txt') as vocab_file:
  vocab = vocab_file.read().split()

In [None]:
# Preview the vocabulary (size and first entries) instead of dumping every word.
len(vocab), vocab[:20]

In [None]:
# The sample negative review reduced to its in-vocabulary tokens, joined into one line.
doc_to_line('/content/drive/MyDrive/review_polarity/txt_sentoken/neg/cv000_29416.txt', vocab)

In [None]:
def process_train(directory):
  """Load and clean every training review found in ``directory``.

  Reviews whose file name starts with ``cv9`` are reserved for testing and
  are skipped here.

  Args:
    directory: Folder containing the review ``.txt`` files.

  Returns:
    A list with one token list per training review, ordered by file name.
  """
  documents = []
  # listdir() returns entries in arbitrary order; sort for reproducible runs.
  for filename in sorted(listdir(directory)):
    # Ignore anything that is not a review file (e.g. a .ipynb_checkpoints
    # folder), which would otherwise be handed to load_doc.
    if not filename.endswith('.txt'):
      continue
    if filename.startswith('cv9'):
      continue
    path = directory + '/' + filename
    documents.append(clean_doc(load_doc(path)))

  return documents

In [None]:
def process_test(directory):
  """Load and clean every held-out test review found in ``directory``.

  Only reviews whose file name starts with ``cv9`` belong to the test split.

  Args:
    directory: Folder containing the review ``.txt`` files.

  Returns:
    A list with one token list per test review, ordered by file name.
  """
  documents = []
  # listdir() returns entries in arbitrary order; sort for reproducible runs.
  for filename in sorted(listdir(directory)):
    # Only review text files are valid inputs for load_doc.
    if not filename.endswith('.txt'):
      continue
    if not filename.startswith('cv9'):
      continue
    path = directory + '/' + filename
    documents.append(clean_doc(load_doc(path)))

  return documents

In [None]:
# Sanity-check the held-out negative reviews: show the count and a short
# preview rather than dumping every token of every document.
neg_test_docs = process_test('/content/drive/MyDrive/review_polarity/txt_sentoken/neg')
len(neg_test_docs), [doc[:15] for doc in neg_test_docs[:2]]

In [None]:
# Load all docs in a directory
def process_docs(directory, is_train):
  """Load and clean the reviews of one data split found in ``directory``.

  Reviews whose file name starts with ``cv9`` form the test split; all other
  reviews form the training split.

  Args:
    directory: Folder containing the review ``.txt`` files.
    is_train: Truthy to return the training split, falsy for the test split.

  Returns:
    A list with one token list per selected review, ordered by file name.
  """
  want_train = bool(is_train)
  documents = []
  # listdir() returns entries in arbitrary order; sort for reproducible runs.
  for filename in sorted(listdir(directory)):
    # Ignore anything that is not a review file (e.g. a .ipynb_checkpoints
    # folder), which would otherwise be handed to load_doc.
    if not filename.endswith('.txt'):
      continue
    # Skip reviews that belong to the other split.
    is_test_file = filename.startswith('cv9')
    if want_train == is_test_file:
      continue
    path = directory + '/' + filename
    documents.append(clean_doc(load_doc(path)))
  return documents

In [None]:
# Sanity-check the negative training reviews: show the count and a short
# preview rather than dumping every token of every document.
neg_train_docs = process_docs('/content/drive/MyDrive/review_polarity/txt_sentoken/neg', True)
len(neg_train_docs), [doc[:15] for doc in neg_train_docs[:2]]

In [None]:
def load_clean_dataset(is_train, data_dir='/content/drive/MyDrive/review_polarity/txt_sentoken'):
  """Load the cleaned reviews of one split together with their labels.

  Args:
    is_train: Truthy to load the training split, falsy for the test split.
    data_dir: Folder that contains the ``neg`` and ``pos`` sub-folders.
      Defaults to the dataset location on the mounted Drive.

  Returns:
    A ``(docs, labels)`` tuple. ``docs`` lists the negative reviews followed
    by the positive ones; ``labels`` holds 0 for each negative review and 1
    for each positive review, in the same order.
  """
  # load documents
  neg = process_docs(data_dir + '/neg', is_train)
  pos = process_docs(data_dir + '/pos', is_train)
  docs = neg + pos
  # prepare labels
  labels = [0] * len(neg) + [1] * len(pos)
  return docs, labels

In [None]:
# Build both splits: reviews named cv9* are the test set, all others are training data.
train, train_labels = load_clean_dataset(True)
test, test_labels = load_clean_dataset(False)

In [None]:
# Documents and labels must have the same length.
len(train),len(train_labels)

In [None]:
# Same consistency check for the test split.
len(test), len(test_labels)

In [None]:

# Class balance of the training labels (0 = negative, 1 = positive).
from collections import Counter
Counter(train_labels)


In [None]:
# Class balance of the test labels (0 = negative, 1 = positive).
Counter(test_labels)

Preprocess the text


In [None]:
# fit a tokenizer
def create_tokenizer(lines):
  """Create a Keras ``Tokenizer`` and fit it on ``lines``.

  Args:
    lines: The texts passed to ``Tokenizer.fit_on_texts``.

  Returns:
    The fitted tokenizer.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted


In [None]:
# Create the tokenizer
# It is fitted on the training documents only; the test split is not passed in.
tokenizer = create_tokenizer(train)

In [None]:
# Encode the data
# mode='binary' is spelled out so the training matrix is built with the same
# encoding that predict_sentiment requests for new reviews.
X_train = tokenizer.texts_to_matrix(train, mode='binary')
X_train.shape

In [None]:
# Inspect the encoded vector of a single training document.
X_train[100]

In [None]:
# Preview the learned word -> index mapping instead of dumping the whole dict.
list(tokenizer.word_index.items())[:20]

In [None]:
# Encode the data
# Use the tokenizer fitted on the training split, with the same explicit
# mode='binary' encoding that predict_sentiment requests.
X_test = tokenizer.texts_to_matrix(test, mode='binary')
X_test.shape

Build the Neural Network

In [None]:
# define the model
def define_model(n_words):
  """Build and compile the feed-forward sentiment classifier.

  The network has one hidden ``Dense`` layer of 50 ReLU units followed by a
  single sigmoid output unit, and is compiled with binary cross-entropy
  loss, the Adam optimizer and an accuracy metric.

  Args:
    n_words: Length of each input vector.

  Returns:
    The compiled ``Sequential`` model.
  """
  hidden = Dense(50, input_shape=(n_words,), activation='relu')
  output = Dense(1, activation='sigmoid')

  network = Sequential()
  for layer in (hidden, output):
    network.add(layer)

  # Compile network
  network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return network

In [None]:
# summarize defined model
# The input width is the number of columns in the encoded training matrix.
model = define_model(X_train.shape[1])
plot_model(model,show_layer_activations=True, show_shapes=True, show_layer_names=True)

Train the model

In [None]:
#fit network
# train_labels is a plain Python list, so it is converted to a NumPy array here.
# NOTE(review): no random seed is set anywhere above, so results will vary between runs.
model.fit(X_train, np.array(train_labels), epochs=10, batch_size=10)

In [None]:
# Evaluate on the held-out cv9* reviews; the model was compiled with an accuracy metric.
model.evaluate(X_test, np.array(test_labels),batch_size=1)


Predict the unseen data

In [None]:
# Sample reviews for prediction: text1 is positive, text2 is negative.
text1 = 'Best movie ever! It was great, I will definitely recommend it.'
text2 = 'This is a bad movie. Please dont watch it.'

In [None]:
def predict_sentiment(review):
  """Classify a raw review with the notebook's fitted ``tokenizer`` and ``model``.

  Args:
    review: The review text to classify.

  Returns:
    A ``(confidence, label)`` tuple where ``label`` is ``'Negative'`` or
    ``'Positive'`` and ``confidence`` is the model's score for that label.
  """
  # Clean the review and re-join it so the tokenizer receives one text.
  cleaned = ' '.join(clean_doc(review))
  features = tokenizer.texts_to_matrix([cleaned],mode='binary')

  # The model emits one sigmoid score: its estimate that the review is positive.
  prob_positive = model.predict(features)[0][0]

  if round(prob_positive) == 0:
    label, confidence = 'Negative', (1-prob_positive)
  else:
    label, confidence = 'Positive', prob_positive
  return confidence, label


In [None]:
# Predict the sentiment of text1, the sample review written as positive.
predict_sentiment(text1)

In [None]:
# Predict the sentiment of text2, the sample review written as negative.
predict_sentiment(text2)