# Comp 551 Project 4 

In [0]:
#Block 1
#Imports
import nltk
import string
import re
import io
import random
import tensorflow as tf
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import random_uniform

from keras import backend as K
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense
from keras.layers import Input
from keras.layers import Flatten
from keras.layers.core import Lambda
from keras.layers import Reshape
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers import GlobalMaxPooling1D
from keras.layers import GlobalMaxPooling2D
from keras.backend import expand_dims
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Concatenate, concatenate
from keras.layers import merge
from keras.initializers import RandomUniform
from keras.constraints import max_norm
from keras.constraints import MaxNorm
from keras.optimizers import Adadelta
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [0]:
#2
#Downloads
#Fetch the NLTK 'punkt' package (this had to be run in this environment as well)
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
#3
#input = list of raw sentences
#output = list of formatted sentences
def processData(data):
  """Normalize raw sentences into lower-case, space-separated tokens.

  Every character other than an ASCII letter, a digit, '!' or '?' becomes a
  space. '!' and '?' are kept (they can express sentiment) and are split off
  into tokens of their own. The text is lower-cased and every run of
  whitespace collapses to one space, with none left at either end.

  data: iterable of raw sentence strings (e.g. a list or an open text file)
  returns: list with one cleaned string per input line, in the same order
  """
  def clean(raw):
    #blank out everything except letters, digits, '!' and '?'
    text = re.sub(r"[^A-Za-z0-9!?]", " ", raw)
    #surround '!' and '?' with spaces so they become separate tokens
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\?", " ? ", text)
    #lower-case, then split/join to trim both ends and squeeze inner whitespace
    return ' '.join(text.lower().split())

  return [clean(raw) for raw in data]

In [0]:
#4
#for MPQA dataset
#sort according to label at the start of the sentence
def sortResponses(data):
  """Split label-prefixed lines into a positive and a negative group.

  The first character of each line is parsed as an integer label: 1 sends
  the rest of the line (everything after that character) to the positive
  list, any other integer sends it to the negative list. Empty or
  whitespace-only lines (e.g. a trailing blank line in the data file) carry
  no label, so they are skipped instead of crashing the int() conversion.

  data: iterable of strings, each starting with a single-digit label
  returns: tuple (pos, neg) of lists holding the label-stripped lines
  raises: ValueError if a non-blank line does not start with a digit
  """
  pos = []
  neg = []

  for line in data:
    #nothing to classify on a blank line
    if not line.strip():
      continue

    label = int(line[0:1])

    if label == 1:
      pos.append(line[1:])
    else:
      neg.append(line[1:])

  return (pos,neg)

In [0]:
#5
#calculate size of dataset vocabulary - needed to intialize one hot vectors
def vocabSize(data):
  """Count the distinct tokens in the dataset, print the count and return it.

  Each line is split on single spaces. The token set is seeded with ' ', so
  that entry is included in the count along with every token the split
  produces (including the empty string, if a line yields one).

  data: iterable of sentence strings
  returns: number of entries in the token set
  """
  tokens = {' '}

  for sentence in data:
    #split at spaces and merge the words into the set in one step
    tokens.update(sentence.split(' '))

  size = len(tokens)
  print(size)
  return size

## Note: Run only one of Blocks 6, 7, or 8, depending on which dataset you're testing


In [0]:
#6
#MR data

#Read and clean the data; the with-block closes both files once they have been read
with io.open('rt-polarity.pos', encoding = "ISO-8859-1") as positive, \
     io.open('rt-polarity.neg', encoding = "ISO-8859-1") as negative:
  pos = processData(positive)
  neg = processData(negative)

#combine to create full sentence dataset
X = pos + neg

#responses positive = 1, negative = 0
y = [1]*len(pos) + [0]*len(neg)

#vocabSize prints the size as well as returning it
vocab_size = vocabSize(X)

In [0]:
#7
#Subj Data

#Read and clean the data; the with-block closes both files once they have been read
with io.open('plot.tok.gt9.5000', encoding = "ISO-8859-1") as objective, \
     io.open('quote.tok.gt9.5000', encoding = "ISO-8859-1") as subjective:
  sub = processData(subjective)
  obj = processData(objective)

#combine to create full sentence dataset
X = sub + obj

#responses subjective = 1, objective = 0
y = [1]*len(sub) + [0]*len(obj)

#vocabSize prints the size as well as returning it
vocab_size = vocabSize(X)

In [0]:
#8
#MPQA Data

#Each line starts with its label; sort the lines while the file is open,
#then let the with-block close it
with io.open('mpqa.all', encoding = "ISO-8859-1") as mpqa:
  pos,neg = sortResponses(mpqa)

pos = processData(pos)
neg = processData(neg)


#combine to create full sentence dataset
X = pos + neg

#responses positive = 1, negative = 0
y = [1]*len(pos) + [0]*len(neg)

#sanity checks: group sizes, totals and a sample of each group
print(len(pos))
print(len(neg))
print(len(X))
print(len(y))
print(pos[0:10])
print(neg[0:10])

#vocabSize prints the size as well as returning it
vocab_size = vocabSize(X)

In [0]:
#9
#convert words to integers
#i.e. sentences go from sequences of words to sequences of integers representing each word
#filters='' switches off one_hot's default punctuation filter, which would otherwise
#drop the '!' and '?' tokens that processData deliberately keeps
#NOTE(review): one_hot hashes each word into the index range, so two different words
#may end up sharing an index - confirm this is acceptable for the replication
encoded_docs = [one_hot(d, vocab_size, filters='') for d in X]

In [0]:
#10
#The longest encoded sentence sets the common length that every sentence is padded to
#(the extra 0 keeps the result at 0 when there are no sentences)
maxLength = max([len(doc) for doc in encoded_docs] + [0])


#Add 8 extra positions: without them the first and last words of the longest
#sentence would only be considered once by the convolutions
maxLength = maxLength + 8

maxLength
  

In [0]:
#11

#centre every sentence inside a row of zeros
#not explicitly mentioned in the paper but is in Collobert et al. 2011

padded_docs = np.zeros((len(y), maxLength))

print(len(encoded_docs))

for row, doc in enumerate(encoded_docs):

  spare = maxLength - len(doc)

  #the back gets the rounded-down half of the spare room, the front gets the rest
  pad_end = int(spare/2)
  pad_front = spare - pad_end

  #write the word indices into the middle; the untouched zeros on both sides are the padding
  padded_docs[row, pad_front:pad_front + len(doc)] = doc


print(padded_docs.shape)

In [0]:
#12
#convert labels to categorical (one-hot) data so that they work with the final softmax layer
y = np.asarray(y)
#only encode while y is still a flat vector of 0/1 labels, so that running this
#cell a second time does not one-hot encode the already encoded array again
if y.ndim == 1:
  #num_classes=2 keeps two columns even if only one label value is present
  y = to_categorical(y, num_classes=2)
print(len(y))
y

In [0]:
#13
#model input: the padded index matrix as a numpy array
X_array = np.asarray(padded_docs)
#sanity check: show the overall shape, then the first padded sentence
for item in (X_array.shape, X_array[0]):
  print(item)

In [0]:
#14

#80% of the data is used for training. The remaining 20% is split 50/50 between the test and validation sets

train_prop = 0.8
valid_prop = 0.5


#first cut: training set vs. the held-out 20%
X_train, X_test, y_train, y_test = train_test_split(
    X_array,
    y,
    train_size=train_prop,
    random_state=7,
)

#second cut: halve the held-out data into the final test set and the validation set
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test,
    y_test,
    test_size=valid_prop,
    random_state=9,
)

In [0]:
#15
#number of samples in the training, validation and test sets
for subset in (X_train, X_valid, X_test):
  print(len(subset))

#array shapes of the training inputs and the training labels
for arr in (X_train, y_train):
  print(arr.shape)

# CNN Architecture

In [0]:
#16
#Parameters

#from paper explicitly
filter_Sizes = [3, 4, 5]     #heights of the convolution kernels
feature_Maps = 100           #feature maps per kernel size
p = 0.5                      #dropout rate on the penultimate layer
l2 = 3                       #max-norm limit for the softmax layer weights
mini_Batch_Size = 50         #batch size used for training

#inferred from context
embedding_Length = 300       #size of each word embedding vector
a = 0.245                    #embeddings are initialized from U[-a, a]

In [0]:
#17
#one integer word index per (padded) sentence position
inputs = Input(shape=(maxLength,))

#symmetric min and max are based on the notes on page 5
#they don't specify what alpha is though, only that the variance is the same as the word2vec variance
#https://aclweb.org/anthology/D17-1127 says variance of word2vec is ~0.02, working backwards you get that we should sample from U[-0.245,0.245]
em = Embedding(vocab_size, embedding_Length, embeddings_initializer = RandomUniform(minval= -a, maxval=a, seed=None), input_length=maxLength)(inputs)

#add last dimension so the embedded sentence can be passed to Conv2D
em = Reshape(target_shape=(maxLength,embedding_Length,1))(em)

#One submodel per kernel size listed in filter_Sizes
#Each submodel performs a convolution (feature_Maps feature maps) whose kernel spans
#the full embedding width, and then max-pools over every position of the result
pooled = []
for size in filter_Sizes:
  sm = Conv2D(feature_Maps, kernel_size=(size,embedding_Length), padding='valid', activation='relu', strides=1)(em)
  sm = MaxPooling2D(pool_size=(maxLength-size+1, 1), strides=(1,1))(sm)
  pooled.append(sm)

#combine the "outputs" of the "sub-models" to form the penultimate layer
#(feature_Maps values per kernel size, i.e. 300 with the settings from the paper)
#concatenate needs at least two inputs, so a single submodel is used as-is
m = concatenate(pooled,axis=3) if len(pooled) > 1 else pooled[0]
m = Flatten()(m)

#regularization methods on penultimate layer
#Dropout
#l2-norm constraint is in Dense layer below
m = Dropout(p)(m)
#predict using softmax
predictions = Dense(2, activation='softmax',kernel_constraint=max_norm(l2))(m)


model = Model(inputs=inputs, outputs=predictions)

#NOTE(review): a 2-unit softmax is more conventionally paired with
#'categorical_crossentropy' - confirm that 'binary_crossentropy' is intended
model.compile(optimizer='adaDelta',
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [0]:
#18

#checkpoint the whole network after every epoch; the epoch number is formatted into the file name
filepath="weights.{epoch:02d}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=0,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1,
)
callbacks_list = [checkpoint]

In [0]:
#19
#train for 30 epochs with the validation set passed in and the callbacks attached
history = model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=mini_Batch_Size,
    validation_data=(X_valid, y_valid),
    callbacks=callbacks_list,
)

In [0]:
#20
#Accuracy Plot

import matplotlib.pyplot as plt
# Draw the per-epoch accuracy curves: training first, then validation
for curve in ('acc', 'val_acc'):
  plt.plot(history.history[curve])
# NOTE(review): the title always names MPQA, whichever dataset block was run
plt.title('CNN-rand Replication Accuracy on MPQA dataset')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['training set', 'validation set'], loc='lower right')
plt.show()

In [0]:
#21
#Loss Plot

# Draw the per-epoch loss curves: training first, then validation
for curve in ('loss', 'val_loss'):
  plt.plot(history.history[curve])
# NOTE(review): the title always names MPQA, whichever dataset block was run
plt.title('CNN-rand Replication Loss on MPQA dataset')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['training set', 'validation set'], loc='upper left')
plt.show()

In [0]:
#22
#raw per-epoch values: training/validation accuracy, then training/validation loss
for key in ('acc', 'val_acc', 'loss', 'val_loss'):
  print(history.history[key])

In [0]:
#23
#Model evaluation using test set

#restore the checkpoint of the first epoch after which the validation accuracy did not increase for 5 epochs
#NOTE(review): the epoch number in the file name is hard-coded - re-check it against the validation curve after each new training run
model.load_weights("weights.11.hdf5")

model.compile(
    optimizer='adaDelta',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

scores = model.evaluate(X_test, y_test,verbose=1)
#index 1 of both the metric names and the scores is reported, as a percentage
metric_name = model.metrics_names[1]
metric_percent = scores[1]*100
print("%s: %.2f%%" % (metric_name, metric_percent))

In [0]:
#24
#summary() prints the layer table itself and returns None, so it is called
#directly - wrapping it in print() would add a stray "None" line to the output
model.summary()