<a href="https://colab.research.google.com/github/MarvinAmbutu/CNN-Sentiment-Classification/blob/master/CNN_Sentiment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install (or upgrade to the latest) gensim; it provides the word2vec module imported in the imports cell.
!pip install -U gensim

Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.8.1)


In [0]:
from google.colab import drive

# Mount Google Drive at /content/gdrive. load_data_and_labels reads the rt-polarity
# files through relative 'gdrive/My Drive/...' paths, so those only resolve when the
# working directory is the parent of this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import numpy as np
import re, sys
import itertools
from collections import Counter
from gensim.models import word2vec
import os
import pickle
from os.path import join, exists, split

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPool1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.datasets import imdb
from keras.preprocessing import sequence
"""
    Original taken from https://github.com/dennybritz/cnn-text-classification-tf
"""

Using TensorFlow backend.


'\n    Original taken from https://github.com/dennybritz/cnn-text-classification-tf\n'

In [0]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character outside letters, digits, parentheses, comma, '!', '?',
    apostrophe and backtick is replaced by a space; English clitics ('s, 've,
    n't, 're, 'd, 'll) and the punctuation marks , ! ( ) ? are separated from
    neighbouring words by spaces; runs of whitespace are collapsed.

    Args:
        string: raw sentence text.

    Returns:
        The cleaned sentence, stripped and lower-cased, with tokens separated
        by single spaces.
    """
    # Drop everything that is not in the allowed character set.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Split clitics off the word they are attached to.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    # Surround punctuation with spaces so each mark becomes its own token.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacement text is plain: a backslash here would be copied verbatim
    # into the output and produce tokens such as "\(" instead of "(".
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # Collapse the extra whitespace introduced above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [0]:
def _read_stripped_lines(path):
    """Reads a text file and returns its lines with surrounding whitespace removed."""
    # Python 2's built-in open() has no `encoding` argument, so it is only passed on Python 3.
    open_kwargs = {"encoding": "ISO-8859-1"} if sys.version_info.major == 3 else {}
    # The context manager closes the file handle even if reading fails.
    with open(path, **open_kwargs) as handle:
        return [line.strip() for line in handle.readlines()]


def load_data_and_labels(
        pos_data='gdrive/My Drive/Applied ML/Fall 2019/Assignments/Miniproject4/data/rt-polarity.pos',
        neg_data='gdrive/My Drive/Applied ML/Fall 2019/Assignments/Miniproject4/data/rt-polarity.neg'):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.

    Args:
        pos_data: path of the file holding one positive example per line.
        neg_data: path of the file holding one negative example per line.
            Both default to the Drive locations this notebook was written for;
            pass other paths if the files are stored elsewhere.

    Returns:
        [x_text, y] where x_text is a list of token lists (positive examples
        first, then negative ones) and y is an integer array with one row per
        example: [0, 1] for positive, [1, 0] for negative.
    """
    # Load data from files
    positive_examples = _read_stripped_lines(pos_data)
    negative_examples = _read_stripped_lines(neg_data)
    # Split by words
    x_text = [clean_str(sent).split(" ") for sent in positive_examples + negative_examples]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
    

In [0]:
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    Args:
        sentences: sequence of token lists.
        padding_word: token appended to short sentences.

    Returns:
        A new list of new token lists, all of equal length; the input lists
        are not modified. An empty input yields an empty list.
    """
    # max() raises ValueError on an empty sequence; with no sentences there is nothing to pad.
    if len(sentences) == 0:
        return []
    sequence_length = max(len(x) for x in sentences)
    return [sentence + [padding_word] * (sequence_length - len(sentence))
            for sentence in sentences]

In [0]:
def build_vocab(sentences):
    """
    Derives a frequency-ranked vocabulary from tokenized sentences.

    Returns [vocabulary, vocabulary_inv]: vocabulary_inv is the list of distinct
    words ordered from most to least frequent, and vocabulary maps each word to
    its position in that list.
    """
    # Tally every token across all sentences
    token_counts = Counter()
    for sentence in sentences:
        token_counts.update(sentence)
    # Index -> word, most frequent first
    vocabulary_inv = [token for token, _ in token_counts.most_common()]
    # Word -> index
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]

In [0]:
def build_input_data(sentences, labels, vocabulary):
    """
    Converts tokenized sentences into rows of vocabulary indices.

    Each word is looked up in `vocabulary` (a KeyError is raised for a word
    that is missing). Returns [x, y], where x is the array of index rows and
    y is `labels` converted to an array.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[token] for token in sentence])
    return [np.array(index_rows), np.array(labels)]

In [0]:
def load_data_data_helper():
    """
    Runs the whole MR preprocessing pipeline: load the sentences and labels,
    pad the sentences to a common length, build the vocabulary from the padded
    sentences and map every word to its vocabulary index.

    Returns [x, y, vocabulary, vocabulary_inv].
    """
    raw_sentences, raw_labels = load_data_and_labels()
    padded = pad_sentences(raw_sentences)
    # The vocabulary is built after padding, so the padding token gets an index too.
    word_to_index, index_to_word = build_vocab(padded)
    features, targets = build_input_data(padded, raw_labels, word_to_index)
    return [features, targets, word_to_index, index_to_word]

In [0]:
 
def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.

    Args:
        data: sequence of examples; it is converted to a numpy array.
        batch_size: maximum number of examples per batch.
        num_epochs: number of passes over the data.

    Yields:
        Slices of the shuffled data array holding at most `batch_size`
        examples. Every batch is non-empty; only the last batch of an epoch
        may be shorter than `batch_size`. The data is reshuffled with
        np.random at the start of each epoch.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division. The previous `int(len / batch_size) + 1` produced an
    # extra, empty batch whenever data_size was a multiple of batch_size.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [0]:
def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=300, min_word_count=1, context=10):
  """
  Trains, saves, loads Word2Vec model
  Returns initial weights for embedding layer

  inputs:
  sentence_matrix # int matrix : num_sentences x max_sentence_len
  vocabulary_inv  # dict {int: str}
  num_features    # Word vector dimensionality
  min_word_count  # Minimum word count
  context         # Context window size

  returns:
  dict {int: vector} with one entry per key of vocabulary_inv, in the same
  iteration order. Words the model has no vector for get a random vector
  drawn uniformly from [-0.25, 0.25).

  note:
  The cached model file name only encodes the three hyper-parameters, not the
  corpus. A model saved for a different dataset is therefore loaded as-is;
  delete the 'models' directory to force retraining.
  """

  model_dir = 'models'
  model_name = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context)
  model_name = join(model_dir, model_name)

  if exists(model_name):
    embedding_model = word2vec.Word2Vec.load(model_name)
    print('Load existing Word2Vec model \'%s\'' % split(model_name)[-1])
  else:
    # Set values for various parameters
    num_workers = 2 # Number of threads to run in parallel
    downsampling = 1e-3 # Downsample setting for frequent words

    # Inititalize and train the model
    print('Training Word2Vec model...')
    sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
    training_kwargs = dict(workers=num_workers, min_count=min_word_count,
                           window=context, sample=downsampling)
    try:
      # gensim >= 4.0 calls the dimensionality argument `vector_size`.
      embedding_model = word2vec.Word2Vec(sentences, vector_size=num_features,
                                          **training_kwargs)
    except TypeError:
      # gensim < 4.0 rejects `vector_size` and expects `size` instead.
      embedding_model = word2vec.Word2Vec(sentences, size=num_features,
                                          **training_kwargs)

    # If we dont plan to train the model any further, calling
    # init_sims will make the model much more memory efficient.
    embedding_model.init_sims(replace=True)

    # Saving the model for later use. You can load it later using Word2Vec.load()
    if not exists(model_dir):
      os.mkdir(model_dir)
    print('Saving Word2Vec model \'%s\'' % split(model_name)[-1])
    embedding_model.save(model_name)

  # Look words up through the KeyedVectors object: indexing and membership tests
  # on the model itself are deprecated in gensim 3.x and removed in gensim 4.
  word_vectors = embedding_model.wv

  # add unknown words
  embedding_weights = {key: word_vectors[word] if word in word_vectors else
                            np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                       for key, word in vocabulary_inv.items()}

  return embedding_weights



**Parameter type**

In [0]:
# Which CNN variant to build: CNN-rand|CNN-non-static|CNN-static
model_type = "CNN-non-static"

# Where the word vectors come from: pretrained_googlenews|gensim_on_the_fly
embedding_mode = "gensim_on_the_fly"

# Which corpus to load: keras_data_set|local_dir
data_source = "local_dir"

# Model hyperparameters: the keras_data_set source uses a smaller network than local_dir
embedding_dim = 50 if data_source == "keras_data_set" else 300    # word vector dimensionality
filter_sizes = (3, 4, 5)                                          # one convolution branch per kernel size
num_filters = 10 if data_source == "keras_data_set" else 100      # filters per convolution branch
dropout_prob = (0.5, 0.8) if data_source == "keras_data_set" else (0.5, 0.5)  # (after embedding, after concatenation)
hidden_dims = 50 if data_source == "keras_data_set" else 100      # units of the hidden Dense layer

# Training parameters
batch_size, num_epochs = 50, 10

# Preprocessing parameters (used when loading keras_data_set: pad/truncate length and num_words)
sequence_length, max_words = 400, 5000

# Word2Vec parameters (forwarded to train_word2vec)
min_word_count, context = 1, 10

In [0]:
def load_data_imdb(data_source):
  """
  Loads the train/test split for the selected data source.

  data_source # "keras_data_set": Keras IMDB data, limited to the global
              #   max_words and padded/truncated to the global sequence_length
              # "local_dir": the files read by load_data_data_helper, shuffled
              #   with np.random and split 90% / 10%

  Returns (x_train, y_train, x_test, y_test, vocabulary_inv), where
  vocabulary_inv is a dict {int: str} from word index to word.
  """
  assert data_source in ["keras_data_set", "local_dir"], "Uknown data source"

  if data_source != "keras_data_set":
    x, y, vocabulary, vocabulary_inv_list = load_data_data_helper()
    vocabulary_inv = dict(enumerate(vocabulary_inv_list))
    # Collapse the two-column label rows into a single class index per example
    y = y.argmax(axis=1)

    # Apply one random permutation to inputs and labels, then cut at 90%
    order = np.random.permutation(np.arange(len(y)))
    x, y = x[order], y[order]
    train_len = int(len(x) * 0.9)
    return x[:train_len], y[:train_len], x[train_len:], y[train_len:], vocabulary_inv

  (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                        oov_char=None, index_from=None)
  pad_options = dict(maxlen=sequence_length, padding="post", truncating="post")
  x_train = sequence.pad_sequences(x_train, **pad_options)
  x_test = sequence.pad_sequences(x_test, **pad_options)

  vocabulary = imdb.get_word_index()
  # Invert word -> index, then give index 0 to the padding token
  vocabulary_inv = {index: word for word, index in vocabulary.items()}
  vocabulary_inv[0] = "<PAD/>"
  return x_train, y_train, x_test, y_test, vocabulary_inv

In [0]:
# Data Preparation
# Loads the split selected by data_source and reports its dimensions.
print("Load data...")
x_train, y_train, x_test, y_test, vocabulary_inv = load_data_imdb(data_source)
# The configured sequence_length is not applied to the local_dir source (those sentences
# are padded to the longest sentence instead), so the global is overwritten with the real
# input width. Later cells read sequence_length to size the model input.
if sequence_length != x_test.shape[1]:
  print("Adjusting seuence length for actual size")
  sequence_length = x_test.shape[1]
print("Datasource: ", data_source)
print("x_train shape:", x_train.shape)
print("x_test:", x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

Load data...
Adjusting seuence length for actual size
Datasource:  local_dir
x_train shape: (9595, 56)
x_test: (1067, 56)
Vocabulary Size: 18765


In [0]:
# Prepare embedding layer weights and convert inputs for static model
# NOTE: for CNN-static this cell rebinds x_train/x_test from index matrices to
# embedded tensors, so it must not be run twice without reloading the data.
print("Model type is", model_type)
if model_type in ["CNN-non-static", "CNN-static"]:
  if embedding_mode == "pretrained_googlenews":
    pretrained_fpath_saved = os.path.expanduser("models/googlenews_extracted-python{}")
    # Nothing in this notebook loads vectors from that path, so embedding_weights
    # would stay undefined and a later cell would fail with a NameError.
    # Fail here instead, with a message that names the cause.
    raise NotImplementedError(
        "embedding_mode 'pretrained_googlenews' is not implemented in this notebook; "
        "use 'gensim_on_the_fly'")
  else:
    # Train (or load cached) word2vec vectors on all sentences, train and test.
    embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, num_features=embedding_dim,
                                       min_word_count=min_word_count, context=context)

  if model_type == "CNN-static":
    # The static model has no embedding layer: replace every word index by its
    # vector up front, giving inputs of shape (samples, sequence, embedding).
    x_train = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_train])
    x_test = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_test])
    print("x_train static shape", x_train.shape)
    print("x_test static shape", x_test.shape)

elif model_type == "CNN-rand":
  # No pre-computed vectors: the embedding layer keeps its default initialization.
  embedding_weights = None
else:
  raise ValueError("Unknown model type")

Model type is CNN-non-static
Load existing Word2Vec model '50features_1minwords_10context'




In [0]:
# Build model
# CNN-static is fed already-embedded sentences (sequence_length x embedding_dim);
# the other variants are fed rows of word indices.
input_shape = (sequence_length, embedding_dim) if model_type == "CNN-static" else (sequence_length,)
model_input = Input(shape=input_shape)

if model_type != "CNN-static":
  # Map each word index to an embedding_dim vector inside the network
  z = Embedding(len(vocabulary_inv), embedding_dim, input_length=sequence_length, name="embedding")(model_input)
else:
  # Static model does not have embedding layer
  z = model_input

# First dropout stage, applied to the embedded sequence
z = Dropout(dropout_prob[0])(z)

In [0]:
# Convolution block
# One branch per kernel size: 1-D convolution -> max pooling over pairs -> flatten.
conv_blocks = []
for sz in filter_sizes:
  conv = Convolution1D(filters=num_filters,
                       kernel_size= sz,
                       padding="valid",
                       activation="relu",
                       strides=1)(z)
  conv = MaxPool1D(pool_size=2)(conv)
  conv = Flatten()(conv)
  conv_blocks.append(conv)

# Concatenate needs at least two inputs, so a single branch is used directly.
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

# Second dropout stage, hidden layer, then a single sigmoid unit for the binary label.
z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Initialize weights with word2vec
if model_type == "CNN-non-static":
  # Row i of the embedding matrix must hold the vector of word index i, so the
  # rows are ordered by index rather than by dict iteration order. For the
  # keras_data_set vocabulary the two differ (index 0 is inserted last).
  weights = np.array([embedding_weights[index] for index in sorted(embedding_weights)])
  print("Initializing embedding layer with word2vec weights, shape", weights.shape)

  embedding_layer = model.get_layer("embedding")
  embedding_layer.set_weights([weights])

Initializing embedding layer with word2vec weights, shape (18765, 50)


In [0]:
# Train the model
# x_test/y_test are passed as validation data, so the val_loss/val_acc values reported
# per epoch are measured on the held-out split returned by load_data_imdb.
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Train on 9595 samples, validate on 1067 samples
Epoch 1/10
 - 4s - loss: 0.6979 - acc: 0.4981 - val_loss: 0.6928 - val_acc: 0.5370
Epoch 2/10
 - 2s - loss: 0.6920 - acc: 0.5235 - val_loss: 0.6882 - val_acc: 0.5408
Epoch 3/10
 - 2s - loss: 0.6832 - acc: 0.5631 - val_loss: 0.6749 - val_acc: 0.5698
Epoch 4/10
 - 2s - loss: 0.6715 - acc: 0.5868 - val_loss: 0.6600 - val_acc: 0.6345
Epoch 5/10
 - 2s - loss: 0.6551 - acc: 0.6164 - val_loss: 0.6460 - val_acc: 0.6420
Epoch 6/10
 - 2s - loss: 0.6299 - acc: 0.6446 - val_loss: 0.6158 - val_acc: 0.6692
Epoch 7/10
 - 2s - loss: 0.5987 - acc: 0.6757 - val_loss: 0.6536 - val_acc: 0.6261
Epoch 8/10
 - 2s - loss: 0.5598 - acc: 0.7144 - val_loss: 0.6260 - val_acc: 0.6448
Epoch 9/10
 - 2s - loss: 0.5297 - acc: 0.7330 - val_loss: 0.5530 - val_acc: 0.7216
Epoch 10/10
 - 2s - loss: 0.4997 - acc: 0.7562 - val_loss: 0.5373 - val_acc: 0.7301


<keras.callbacks.History at 0x7fc873cb7390>