# Sentiment analysis using IMDB dataset

In [25]:
import numpy as np
from glob import glob
import os
import matplotlib.pyplot as plt
from sklearn import svm
import zipfile
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from scipy import sparse
import nltk

# Download any necessary nltk files for nlp
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/raymondyuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Get data

In [2]:
zip_file_path = './imdb_dataset.zip'
extract_dir = './'
data_dir = 'imdb_dataset'

In [5]:
# Extract all the files 
zip_ref = zipfile.ZipFile(zip_file_path, 'r')
zip_ref.extractall(extract_dir)
zip_ref.close()

Let's begin by reading in all of our text files. We'll create their label according to their sentiment, either positive or negative. In addition we'll preprocess all the nexts by removing all non-alpha numeric characters. 

In [12]:
# Regex to remove all Non-Alpha Numeric 
SPECIAL_CHARS = re.compile(r'([^a-z\d!?.\s])', re.IGNORECASE)

def read_texts(glob_to_texts):
    texts = []
    labels = []
    label = int("pos" in glob_to_texts)
    for text_name in tqdm(glob(glob_to_texts)):
        with open(text_name, 'r') as text:
            # Removing all non-alphanumeric
            filter_text = SPECIAL_CHARS.sub('',  text.read())
            texts.append(filter_text)
            labels.append(label)
    return texts, labels

# Get all training data
train_pos_data = read_texts(os.path.join(data_dir, "train/pos/*.txt"))
train_neg_data = read_texts(os.path.join(data_dir, "train/neg/*.txt"))

# Get all test data
test_pos_data = read_texts(os.path.join(data_dir, "test/pos/*.txt"))
test_neg_data = read_texts(os.path.join(data_dir, "test/neg/*.txt"))

train_texts = train_pos_data[0] + train_neg_data[0]
train_labels = train_pos_data[1] + train_neg_data[1]

test_texts = test_pos_data[0] + test_neg_data[0]
test_labels = test_pos_data[1] + test_neg_data[1]

100%|██████████| 12500/12500 [00:01<00:00, 9482.37it/s] 
100%|██████████| 12500/12500 [00:02<00:00, 4411.21it/s]
100%|██████████| 12500/12500 [00:02<00:00, 4378.72it/s]
100%|██████████| 12500/12500 [00:02<00:00, 4344.35it/s]


Split the data into training and validation sets. We'll create a validation test set with 10% of the data. 

In [20]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1,
                                                                    random_state=42)

## Vectorization
In order to extract information from text, we'll vectorize our word sequences. In other words, we'll transform our sentences into numerical features. There are many vectorization or embedding techniques such as Bag of Words, Pre-Trained word embeddings, but in our case we'll be using **TF-IDF**.

TF-IDF stands for "Term Frequency, Inverse Document Frequency". It's a technique that converts words into an importance score of each word in the document based on how they appear accros multiple documents. Intuitively, the TF-IDF score of a word is high when it is frequently found in a document. However, if the word appears in many documents, this word is not a unique identifier, and as such, will have a lower score. For example, common words such as "the" and "and" will have low score since they appear in many documents. 

In [21]:
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize,
                      min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                      smooth_idf=1, sublinear_tf=1)

We fit our vectorizer to our entire corpus of words, which includes the training, validation, and test sets. Once fitted, we'll transform each subset of the data.

In [23]:
print("Created Vectorizer %s" % vec)
print("Fitting to all docs...")
vec.fit(train_texts + val_texts + test_texts)
print("Transforming train docs...")
trn_term_doc = vec.transform(train_texts)
print("Transforming val docs...")
val_term_doc = vec.transform(val_texts)
print("Transforming test docs...")
test_term_doc = vec.transform(test_texts)

Created Vectorizer TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words=None, strip_accents='unicode', sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function word_tokenize at 0x113a678c8>, use_idf=1,
        vocabulary=None)
Fitting to all docs...
Transforming to train docs...
Transforming to val docs...
Transforming to test docs...


# Model
If you're unfamiliar or want a refresher on SVM's you should check out our [CV tutorial](https://github.com/abhmul/DataScienceTrack/blob/master/CV/Tutorial.ipynb)!

In [38]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.svm import LinearSVC

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual='auto', verbose=0):
        self.C = C
        self.dual = dual
        self.verbose = verbose
        self._clf = None
        print("Creating model with C=%s" % C)

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))
    
    def score(self, x, y):
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.score(x.multiply(self._r), y)
        
    def fit(self, x, y):
        # Check that X and y have correct shape
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
        x_nb = x.multiply(self._r)
        if self.dual == 'auto':
            self.dual = x_nb.shape[0] <= x_nb.shape[1]
        self._clf = LinearSVC(C=self.C, dual=self.dual, verbose=self.verbose)
        self._clf.fit(x_nb, y)
        return self

## Finding optimal parameters
We'll perform a grid search across the C parameter to find the optimal parameter for our dataset.

In [44]:
# Search for the appropriate C
Cs = [1e-2, 1e-1, 1e0, 1e1, 1e2]

best_model = None
best_val = -float("inf")
best_C = None
for C in Cs:
    print("Fitting with C={}".format(C))
    model = NbSvmClassifier(C=C, verbose=0).fit(trn_term_doc, train_labels)
    # Evaluate the model
    val_preds = model.predict(val_term_doc)
    score = np.mean(val_labels == val_preds)

    print("Model had val score of %s" % score)
    if score > best_val:
        print("New maximum score improved from {} to {}".format(best_val, score))
        best_model = model
        best_val = score
        best_C = C
score = best_val
print("Best score with C={} is {}".format(best_C, score))

Fitting with C=0.01
Creating model with C=0.01
Model had val score of 0.8496
New maximum score improved from -inf to 0.8496
Fitting with C=0.1
Creating model with C=0.1
Model had val score of 0.8844
New maximum score improved from 0.8496 to 0.8844
Fitting with C=1.0
Creating model with C=1.0
Model had val score of 0.9112
New maximum score improved from 0.8844 to 0.9112
Fitting with C=10.0
Creating model with C=10.0
Model had val score of 0.9132
New maximum score improved from 0.9112 to 0.9132
Fitting with C=100.0
Creating model with C=100.0
Model had val score of 0.9064
Best score with C=10.0 is 0.9132


## Test score

In [47]:
best_model.score(test_term_doc, test_labels)

0.90932

## Takeaways
From this tutorial, we learned how to work with text data and use a basic embedding. In addition, we realize that deep learning isn't always the way to go! We trained a fast and powerful linear model that achieved ~**91**%!

## Sample Texts

In [13]:
train_pos_sample_ind = np.random.randint(len(train_pos_data[0]))
train_neg_sample_ind = np.random.randint(len(train_neg_data[0]))

print("Positive Sentiment example")
print(train_pos_data[0][train_pos_sample_ind])
print("---------------------------")
print("Negative Sentiment example")
print(train_neg_data[0][train_neg_sample_ind])

Positive Sentiment example
This first two seasons of this comedy series were very strange and they werent very funny and had a drama element where Bill the mother was struggling with all the usual problems in life but that element was a bit depressing and didnt mix well with th comedy elements which is probably why it was dropped. After that it soon became one of the funniest comedy series the BBC have ever made! The chemistry between Bill and Bens characters were very funny and there was always so many brilliant and memorable sketches in each series. The Christmas specials were hilarious and a real treat for Christmas. br br The show came to a stop when the main actor Gary Olsen playing Bill passed away which was very sad because he was a brilliant actor in films such as Up n Under and a very funny man RIPbr br This underrated show has sadly disappeared from our television screens and doesnt to be repeated that often  Though it does appear on UKTV Gold once in a while but it should be

In [14]:
from collections import defaultdict

word_counts = defaultdict(int)

# Compute the frequency of each unique
for text in tqdm(train_texts + val_texts + test_texts):
    # Splits sentences 
    for word in word_tokenize(text):
        word_counts[word] += 1

100%|██████████| 50000/50000 [01:37<00:00, 514.48it/s]


In [50]:
vocab = ['<PAD>'] + sorted(word_counts, key=lambda word: word_counts[word], reverse=True)
word2id = {word: i for i, word in enumerate(vocab)}

# Examine the most common words
print("Number of unique words", len(vocab))
print("Most frequent word: ", vocab[1], "occurs", word_counts[vocab[1]], "times")
print(vocab[:100])

Number of unique words 210325
Most frequent word:  the occurs 572555 times
['<PAD>', 'the', '.', 'a', 'and', 'of', 'to', 'is', 'in', 'I', 'that', 'it', 'br', 'this', 'was', 'The', 'as', 'with', 'movie', 'for', 'film', 'but', 'on', 'are', 'not', 'have', 'his', 'you', 'be', '!', 'one', 'at', 'by', 'he', 'an', 'all', 'who', 'from', 'like', 'its', 'they', 'so', 'or', 'about', 'her', 'just', 'has', '?', 'out', 'This', 'some', 'good', 'more', 'very', '...', 'what', 'up', 'would', 'It', 'can', 'when', 'time', 'if', 'which', 'really', 'only', 'their', 'see', 'were', 'had', 'even', 'story', 'there', 'no', 'my', 'me', 'she', 'than', 'much', 'been', 'get', 'into', 'will', 'other', 'him', 'bad', 'because', 'people', 'do', 'great', 'well', 'most', 'we', 'them', 'first', 'made', 'also', 'movies', 'make', 'how']


In [77]:
np.savez('glove_embeddings.npz', embeddings=embeddings)

In [78]:
glove_embeddings = np.load('glove_embeddings.npz')['embeddings']

In [66]:
def map_texts(texts, word2id):
    return [[word2id[word] for word in word_tokenize(text)] for text in tqdm(texts)]

In [67]:
train_map_text = map_texts(train_texts, word2id)
val_map_text = map_texts(val_texts, word2id)
test_map_text = map_texts(test_texts, word2id)

100%|██████████| 22500/22500 [00:42<00:00, 524.65it/s]
100%|██████████| 2500/2500 [00:04<00:00, 530.67it/s]
100%|██████████| 25000/25000 [00:47<00:00, 531.54it/s]


In [93]:
x_train = keras.preprocessing.sequence.pad_sequences(train_map_text)
x_val = keras.preprocessing.sequence.pad_sequences(val_map_text)
x_test = keras.preprocessing.sequence.pad_sequences(test_map_text)

In [104]:
import keras
from keras import layers
from keras import models

def get_LSTM_model(embedding_matrix):
    inp = layers.Input(shape=(None,))
    x = layers.Embedding(*(embedding_matrix.shape),
                         weights=[embedding_matrix], 
                         trainable=False)(inp)
    x = layers.Bidirectional(layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(50, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(1)(x)
    model = models.Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [105]:
model = get_LSTM_model(glove_embeddings)

In [106]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_6 (Embedding)      (None, None, 300)         63097500  
_________________________________________________________________
bidirectional_5 (Bidirection (None, None, 100)         140400    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 100)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
__________

In [107]:
model.fit(x_train, 
          train_labels,
          validation_data=(x_val, val_labels),
          batch_size=128,
          epochs=20,
          shuffle=True)

Train on 22500 samples, validate on 2500 samples
Epoch 1/20

KeyboardInterrupt: 