In [1]:
# Import the pandas library to read our dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

import nltk
import random
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
file_link = 'https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv'
data_set = pd.read_csv(file_link, sep = '\t')

In [3]:
data_set.head(10)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [0]:
def load_documents(documents, combined_phrases = False):
  # extract full sentences only from the dataset
  if combined_phrases:
    fullSentences = []
    curSentence = 0
    for i in range(data_set.shape[0]):
      if data_set['SentenceId'][i]> curSentence:
        fullSentences.append((data_set['Phrase'][i], data_set['Sentiment'][i]))
        curSentence = curSentence +1
    return fullSentences
  else:
    raw_documents = []
    for index, row in data_set.iterrows():
      raw_documents.append([row['Phrase'], row['Sentiment']])
    return raw_documents

In [0]:
documents = load_documents(data_set)
documents = np.array(documents)

In [0]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
import re

porter = PorterStemmer()
lancaster = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

stopwords_en = stopwords.words('english')
punctuations = "?:!.,;\"-()_'" 

def preprocess_text(sentences, remove_stopwords = True, useStemming = False, useLemma = False, removePuncs = True):
  new_sentences = list()
  for sentence in sentences:
    sentence = sentence.lower()
    # Remove punctuations
    if removePuncs:
      sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = sentence.split(' ')
    tempSentence = []
    for w in sentence:
      newWord = w
      if remove_stopwords and (w in stopwords_en):
        continue
      # if removePuncs and (w in punctuations):
      #   continue
      if useStemming:
        # newWord = lancaster.stem(newWord)
        newWord = porter.stem(newWord)
      if useLemma:
        newWord = wordnet_lemmatizer.lemmatize(newWord)
      tempSentence.append(newWord)
    new_sentences.append(' '.join(tempSentence))
  return new_sentences

In [0]:
def convert_to_arrays(x_train, x_test):
  x_train = x_train.toarray()
  x_test = x_test.toarray()
  return x_train, x_test

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_bag_of_words(x_train, x_test, ngram_range = (1,2), max_features = 1000):
  vectorizer = CountVectorizer(stop_words = "english", ngram_range = ngram_range, max_features = max_features)
  x_train_vector = vectorizer.fit_transform(x_train)
  x_test_vector = vectorizer.transform(x_test)

  return convert_to_arrays(x_train_vector, x_test_vector)

def get_tfidf(x_train, x_test, ngram_range = (1,2), max_features = 1000):
  vectorizer = TfidfVectorizer(stop_words = "english", ngram_range = ngram_range, max_features = max_features)
  x_train_vector = vectorizer.fit_transform(x_train)
  x_test_vector = vectorizer.transform(x_test)
  
  return convert_to_arrays(x_train_vector, x_test_vector)


In [0]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

def get_data(bag_of_words = True, tfidf = False, max_features = 1000):
  if bag_of_words and tfidf:
    raise(ValueError('Select only one method'))
  elif not bag_of_words and not tfidf:
    raise(ValueError('No method is selected'))
  x_train, x_test, y_train, y_test = train_test_split(documents[:, 0], documents[:,1], random_state = 2003, test_size = 0.3)

  y_train = to_categorical(y_train)
  y_test = to_categorical(y_test)

  x_train = preprocess_text(x_train)
  x_test = preprocess_text(x_test)

  if bag_of_words:
    x_train, x_test = get_bag_of_words(x_train, x_test, max_features = max_features)
  else:
    x_train, x_test = get_tfidf(x_train, x_test, max_features = max_features)

  return x_train, x_test, y_train, y_test

In [0]:
import keras.backend as K

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def f1_score(y_true, y_pred):
  p = precision(y_true, y_pred)
  r = recall(y_true, y_pred)
  return (2 * p * r)/ (p + r + K.epsilon())


### Single Channel Model

In [0]:
# Model 1 with individual phrases
from keras.models import Sequential
from keras.layers import Conv1D,Dropout, MaxPooling1D, Flatten, Dense, Embedding, SpatialDropout1D

class CnnModel():
  def __init__(self, n_features, n_classes):
    self.model = Sequential()
    self.model.add(Embedding(input_dim=n_features,output_dim=128,input_length=n_features))
    self.model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
    self.model.add(MaxPooling1D(pool_size=3))
    self.model.add(Conv1D(64, kernel_size=4, padding='same', activation='relu'))
    self.model.add(MaxPooling1D(pool_size=4))
    self.model.add(Flatten())
    self.model.add(Dense(1000, activation = 'relu'))
    self.model.add(Dense(100, activation = 'relu'))
    self.model.add(Dense(n_classes, activation='softmax'))
    if n_classes == 2:
      self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', precision, recall, f1_score])
    else:
      self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', precision, recall, f1_score])

  def fit(self, x_train, y_train, e, bs, v):
    return self.model.fit(x_train, y_train, epochs = e, batch_size = bs, verbose = v)

  def evaluate(self, x_test, y_test):
    return self.model.evaluate(x_test, y_test)
    
  def save(self, file_path):
    self.model.save(file_path)
  def load_weights(self, file_path):
    self.model.load_weights(file_path)

In [0]:
# Training of single channel CNN model
x_train,x_test,y_train,y_test = get_data(bag_of_words = False, tfidf = True, max_features=3000)

n_features = x_train.shape[1]
n_classes = y_train.shape[1]
model = CnnModel(n_features, n_classes)
history = model.fit(x_train, y_train, e = 10, bs = 512,v = 1)
# model.load_weights('1093805_1dconv_reg_tfidf')

In [0]:
model.save('1093805_1dconv_reg_tfidf')

In [34]:
training_accuracy = history.history['acc']
training_precision = history.history['precision']
training_recall = history.history['recall']
training_f1_score = history.history['f1_score']

avg_training_accuracy = sum(training_accuracy)/ len(training_accuracy)
avg_training_precision = sum(training_precision) / len(training_precision)
avg_training_recall = sum(training_recall) / len(training_recall)
avg_training_f1_score = sum(training_f1_score)/ len(training_f1_score)

print('Average Training Accuracy : {0:.2f}'.format(avg_training_accuracy * 100))
print('Average Training Precision : {0:.2f}'.format(avg_training_precision ))
print('Average Training Recall : {0:.2f}'.format(avg_training_recall ))
print('Average Training F1 score : {0:.2f}'.format(avg_training_f1_score ))

Average Training Accuracy : 51.85
Average Training Precision : 0.69
Average Training Recall : 0.17
Average Training F1 score : 0.26


In [38]:
# Testing
evaluation_history = model.evaluate(x_test, y_test)

testing_accuracy = evaluation_history[1]
testing_precision = evaluation_history[2]
testing_recall = evaluation_history[3]
testing_f1_score = evaluation_history[4]
print('Testing Accuracy : {0:.2f}'.format(testing_accuracy * 100))
print('Testing Precision : {0:.2f}'.format(testing_precision ))
print('Testing Recall : {0:.2f}'.format(testing_recall ))
print('Testing F1 score : {0:.2f}'.format(testing_f1_score ))

Testing Accuracy : 51.97
Testing Precision : 0.70
Testing Recall : 0.15
Testing F1 score : 0.24


### Multi channel Model

In [0]:
from keras.models import Sequential,Model
from keras.layers import Conv1D,Dropout, MaxPooling1D, Flatten, Dense, Embedding, SpatialDropout1D, Input
from keras.layers.merge import concatenate

class MultiCnn():
  def __init__(self, n_features, vocab_size, n_classes):
    # channel 1
    inputs1 = Input(shape=(n_features,))
    embedding1 = Embedding(vocab_size, 128)(inputs1)
    conv1_1 = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding1)
    pool1_1 = MaxPooling1D(pool_size=2)(conv1_1)
    conv1_2 = Conv1D(filters=32, kernel_size=3, activation='relu')(pool1_1)
    pool1_2 = MaxPooling1D(pool_size=2)(conv1_2)
    flat1 = Flatten()(pool1_2)
  
    # channel 2
    inputs2 = Input(shape=(n_features,))
    embedding2 = Embedding(vocab_size, 128)(inputs2)
    conv2_1 = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding2)
    pool2_1 = MaxPooling1D(pool_size=2)(conv2_1)
    conv2_2 = Conv1D(filters=32, kernel_size=3, activation='relu')(pool2_1)
    pool2_2 = MaxPooling1D(pool_size=2)(conv2_2)
    flat2 = Flatten()(pool2_2)

    # merge
    merged = concatenate([flat1, flat2])

    # interpretation
    dense1 = Dense(1000, activation='relu')(merged)
    outputs = Dense(n_classes, activation='sigmoid')(dense1)
    self.model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # compile model
    if n_classes == 2:
      self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', precision, recall, f1_score])
    else:
      self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', precision, recall, f1_score])
    
  def fit(self, x_train_1,x_train_2, y_train, e, bs, v):
    return self.model.fit([x_train_1,x_train_2], y_train, epochs=e, batch_size=bs, verbose = v)

  def evaluate(self, x_test_1, x_test_2, y_test):
    return self.model.evaluate([x_test_1,x_test_2], y_test)
  def save(self, file_path):
    self.model.save(file_path)
  def load_weights(self, file_path):
    self.model.load_weights(file_path)

In [0]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
x_train, x_test, y_train, y_test = train_test_split(documents[:, 0], documents[:,1], random_state = 2003, test_size = 0.3)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

x_train = preprocess_text(x_train)
x_test = preprocess_text(x_test)

max_features = 3000
bag_of_words_x_train, bag_of_words_x_test = get_bag_of_words(x_train, x_test, max_features=max_features)
tfidf_x_train, tfidf_x_test = get_tfidf(x_train, x_test, max_features=max_features)

In [0]:
vocab_size = max_features
n_classes = y_train.shape[1]
multi_cnn_model = MultiCnn(max_features, max_features, n_classes)

In [42]:
# Training of Multi Channel CNN model
fit_history = multi_cnn_model.fit(bag_of_words_x_train, tfidf_x_train, y_train, e = 10, bs = 512, v = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
training_accuracy = fit_history.history['acc']
training_precision = fit_history.history['precision']
training_recall = fit_history.history['recall']
training_f1_score = fit_history.history['f1_score']

avg_training_accuracy = sum(training_accuracy)/ len(training_accuracy)
avg_training_precision = sum(training_precision) / len(training_precision)
avg_training_recall = sum(training_recall) / len(training_recall)
avg_training_f1_score = sum(training_f1_score)/ len(training_f1_score)

print('Average Training Accuracy : {0:.2f}'.format(avg_training_accuracy * 100))
print('Average Training Precision : {0:.2f}'.format(avg_training_precision))
print('Average Training Recall : {0:.2f}'.format(avg_training_recall))
print('Average Training F1 score : {0:.2f}'.format(avg_training_f1_score))

Average Training Accuracy : 66.99
Average Training Precision : 0.61
Average Training Recall : 0.72
Average Training F1 score : 0.66


In [0]:
model.save('1093805_1dconv_reg_multi_channel')

In [0]:
testing_history = multi_cnn_model.evaluate(bag_of_words_x_test, tfidf_x_test, y_test)

In [0]:
testing_accuracy = testing_history[1]
testing_precision = testing_history[2]
testing_recall = testing_history[3]
testing_f1_score = testing_history[4]
print('Testing Accuracy : {0:.2f}'.format(testing_accuracy * 100))
print('Testing Precision : {0:.2f}'.format(testing_precision ))
print('Testing Recall : {0:.2f}'.format(testing_recall))
print('Testing F1 score : {0:.2f}'.format(testing_f1_score))