# IMDB Movies Sentiment Analysis Using DNN

### Imports:

In [12]:
import os 
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [28]:
import numpy as np

In [39]:
import matplotlib.pyplot as plt
%matplotlib inline

### Data Preprocessing:

In [14]:
# Parallel accumulators filled by the loading loop: one raw review text and
# one integer class id per file read.
texts, labels = [], []
# Names of the per-class sub-directories that the loading loop walks.
label_types = ["neg", "pos"]

#### Label Encoding:
- positive: 1
- negative: 0

In [15]:
# Dataset root (relative to the notebook's working directory) and the
# "train" folder inside it.
data_dir = "./aclImdb/"
train_dir = os.path.join(data_dir, "train")

In [17]:
# Read every ".txt" review under <train_dir>/<label_type>/ and record its text
# together with a binary label ("pos" -> 1, anything else -> 0).
#
# The accumulators are (re)created here so the cell is idempotent: re-running
# it neither appends the corpus a second time nor fails once `labels` has been
# converted to a NumPy array further down the notebook.
labels = []
texts = []
for label_type in label_types:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir() order is filesystem-dependent; sort for a stable load order.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith(".txt"):
            continue
        # Explicit encoding so decoding does not depend on the platform's
        # default locale (reviews are assumed to be UTF-8 -- TODO confirm).
        # `with` closes the handle even if read() raises.
        with open(os.path.join(dir_name, fname), encoding="utf-8") as review_file:
            texts.append(review_file.read())
        labels.append(1 if label_type == "pos" else 0)
print("done..")

done..


### Tokenization:

In [20]:
# Tunable sizes shared by the tokenizer, the padding step and the split.
max_words = 10000            # handed to Tokenizer(num_words=...) just below
maxlen = 100                 # sequence length later given to pad_sequences
training_samples = 200       # number of rows sliced off for training
validation_samples = 10000   # number of rows sliced off for validation

# Build the vocabulary from the loaded reviews, then encode every review as
# a sequence of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [27]:
# Keep the tokenizer's word -> index mapping and report how many distinct
# tokens it contains.
word_index = tokenizer.word_index
print("Unique tokens count:", len(word_index))

Unique tokens count: 87446


In [29]:
# Turn the Python label list into an array, and bring every encoded review
# to a common length of `maxlen` so the result is a single 2-D array.
labels = np.asarray(labels)
data = pad_sequences(sequences, maxlen=maxlen)

In [30]:
# Sanity check: both arrays should report the same first dimension
# (one row / one label per review).
print("Shape of data tensor:", data.shape)
print("Shape of label tensor", labels.shape)

Shape of data tensor: (24275, 100)
Shape of label tensor (24275,)


In [31]:
# Build a random permutation of the row indices.
# A dedicated, seeded generator makes the shuffle (and therefore the
# train/validation split derived from it) identical on every
# Restart & Run All, without touching NumPy's global random state.
SEED = 42
rng = np.random.RandomState(SEED)
indices = np.arange(data.shape[0])
rng.shuffle(indices)
indices

array([ 4890, 17283, 21027, ..., 22144,   216, 11299])

In [32]:
# Shuffle to break the order of negative reviews followed by positive ones.
# Both arrays are re-indexed in a single statement so they can never drift
# apart: when each lived in its own cell, re-running only one of them paired
# reviews with the wrong labels.
data, labels = data[indices], labels[indices]

In [40]:
# Split the shuffled arrays: the first `training_samples` rows are used for
# training, the next `validation_samples` rows for validation.
x_train, y_train = data[:training_samples], labels[:training_samples]
x_val, y_val = (
    array[training_samples:training_samples + validation_samples]
    for array in (data, labels)
)

### Results plotting:

In [37]:
# results plotting function
def plot_results(history):
    """Plot training vs. validation accuracy and loss curves.

    Accuracy is drawn on the current figure; loss is drawn on a new figure
    created afterwards. Training series use blue dots, validation series a
    blue line.

    Parameters
    ----------
    history : object
        Any object exposing a ``history`` mapping of metric name to a
        per-epoch sequence of values (presumably the ``History`` object
        returned by Keras ``model.fit`` -- confirm against the caller).
        It must contain ``loss`` and ``val_loss`` plus an accuracy series
        stored under either ``acc``/``val_acc`` or
        ``accuracy``/``val_accuracy``.

    Raises
    ------
    KeyError
        If a required series is missing from ``history.history``.
    """
    metrics = history.history
    # The accuracy key depends on the Keras version / the metric name given
    # to compile(): older releases log 'acc', newer ones commonly log
    # 'accuracy'. Accept either instead of failing with a KeyError.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.legend()

    # Start a second figure so the loss curves do not overlay the accuracy.
    plt.figure()
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.legend()
    plt.show()