# **Task 0**

In [None]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
# Seed NumPy and TensorFlow with the same value so that repeated runs of
# this notebook produce stable output.
for set_seed in (np.random.seed, tf.random.set_seed):
  set_seed(42)

In [None]:
# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

2.4.1


In [None]:
# Location of the IMDB review archive.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the archive into the current directory (cache_dir='.', no
# sub-directory) and untar it.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='')

# The reviews are read from the 'aclImdb' folder next to the returned path.
download_root = os.path.dirname(dataset)
dataset_dir = os.path.join(download_root, 'aclImdb')

In [None]:
# Inspect the top-level contents of the dataset folder.
os.listdir(dataset_dir)

['test', 'imdb.vocab', 'README', 'imdbEr.txt', 'train']

In [None]:
# Path of the 'train' sub-folder of the dataset, followed by a listing of its contents.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['urls_unsup.txt',
 'urls_neg.txt',
 'neg',
 'unsup',
 'urls_pos.txt',
 'pos',
 'labeledBow.feat',
 'unsupBow.feat']

In [None]:
# Print one positive review to see what the raw text looks like.
sample_file = os.path.join(train_dir, 'pos/1181_9.txt')
# Read the file as UTF-8 explicitly: open() otherwise falls back to the
# platform's default encoding, which can fail on non-ASCII characters.
with open(sample_file, encoding='utf-8') as f:
  print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


In [None]:
# Delete the 'unsup' folder from the training directory before the labelled
# datasets are built from it.
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the removal so that re-running this cell after the folder is already
# gone does not raise FileNotFoundError.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [None]:
batch_size = 32
seed = 42

# Hold out 20% of the labelled training reviews for validation. Both calls
# must use the same seed and validation_split so that the 'training' and
# 'validation' subsets do not overlap.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

# Validation subset; later cells read it as raw_val_ds.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)



Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [None]:
# Number of batches in raw_train_ds: 625 batches * 32 reviews = 20,000 reviews
len(raw_train_ds)
# NOTE(review): the trailing comment here originally read "No.85"; its meaning is unclear - confirm or remove.

625

In [None]:
# Print the first three reviews of one training batch with their labels.
for text_batch, label_batch in raw_train_ds.take(1):
  # Convert each tensor to NumPy once instead of once per printed line.
  reviews = text_batch.numpy()
  labels = label_batch.numpy()
  for i in range(3):
    print("Review", reviews[i])
    print("Label", labels[i])

Review b"Mild Spoilers<br /><br />In the near future, Arnold stars as Ben Richards, a wrongly convicted man coerced into playing 'The Running Man', a deadly TV game show where people have to keep moving to try and escape brutal deaths at the hands of the 'Stalkers'. Of course, people are expected to die eventually and its up to Arnold to prove the system wrong.<br /><br />I haven't read the Stephen King book, but this is a great film regardless, one of Arnold's best. He does what he does best in the action man role, delivering death with unforgettable one-liners. Classics are probably the 'He was a real pain in the neck' after strangling a guy with barb wire, and 'He had to split!', referring to whereabouts he just chain sawed someone vertically. Dawson is perfectly irritating as the TV presenter, and all the 'Stalkers' are suitably camp. The action is violent, but its an action film. That's the point. The film is fast paced, and at 90 minutes it doesn't overstay its welcome. <br /><br

In [None]:
# Show which class name each numeric label stands for.
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to neg
Label 1 corresponds to pos


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [None]:
# Build the test dataset from the 'test' folder with the same batch size.
# NOTE(review): no seed is passed here, unlike the training dataset - confirm
# whether a fixed batch order is needed for the later per-index analysis.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test', 
    batch_size=batch_size)
# 782 batches: 781 full batches of 32 plus a final batch of 8 (781*32 + 8 = 25,000)

Found 25000 files belonging to 2 classes.


In [None]:
def custom_standardization(input_data):
  """Lowercase the text, replace '<br />' tags with spaces, and delete punctuation.

  Args:
    input_data: string tensor holding the raw review text.

  Returns:
    A string tensor with the cleaned-up text.
  """
  # Character class matching every character in string.punctuation.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  # Tags are replaced before punctuation is stripped, because '<', '/' and
  # '>' are themselves punctuation characters.
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [None]:
# Vocabulary cap, output sequence length of the vectorizer, and the
# embedding width used by the CNN models further down.
max_features = 10000
sequence_length = 250
embedding_dim = 128

# Turns raw review strings into fixed-length sequences of integer token ids.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=sequence_length)

In [None]:
# adapt() only needs the review text, so strip the labels from the dataset
# before handing it over.
train_text = raw_train_ds.map(lambda review, _label: review)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
  """Run `text` through vectorize_layer and pass `label` along unchanged.

  Args:
    text: string tensor of review text.
    label: label belonging to `text`.

  Returns:
    A (vectorized_text, label) tuple.
  """
  # Add a trailing axis before calling the layer.
  expanded = tf.expand_dims(text, axis=-1)
  return vectorize_layer(expanded), label

In [None]:
# retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
# Keep only the first review of that batch together with its label.
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
# class_names maps the numeric label to its class name.
print("Label", raw_train_ds.class_names[first_label])
# Show the integer sequence the vectorizer produces for this review.
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b"The 60\xc2\xb4s is a well balanced mini series between historical facts and a good plot. In four deliveries, we follow a north American family, with 3 members. But we don't only see them. We also follow the story of several characters as a black reverend, an extremist student leader, and a soldier in Vietnam. The filmography is just extraordinary. In the first chapters, we see some shots of the Vietnam war, in between the scenes. The next chapter, doesn't start where the last one finished, it starts some time after, giving us a little mystery on what happened. In general, The 60\xc2\xb4s mini series, is a must see, not only for hippies fanatics, but for everyone with little curiosity about the topic.", shape=(), dtype=string)
Label pos
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[   2,    1,    7,    4,   73, 7844, 4199,  200,  188, 1397, 2232,
           3,    4,   49,  111,    8,  695,    1,   71,  821,    4, 2324,
         312,  215, 

In [None]:
# Fetch the vocabulary once and reuse it for the lookups below.
vocabulary = vectorize_layer.get_vocabulary()
print("1287 ---> ", vocabulary[1287])
print(" 313 ---> ", vocabulary[313])
print('Vocabulary size: {}'.format(len(vocabulary)))

1287 --->  silent
 313 --->  night
Vocabulary size: 10000


In [None]:
# Apply vectorize_text to every (text, label) batch of each raw dataset.
train_ds = raw_train_ds.map(vectorize_text) # 80% of the training data: 20,000 reviews
val_ds = raw_val_ds.map(vectorize_text) # 20% of the training data: 5,000 reviews
test_ds = raw_test_ds.map(vectorize_text) # the full test data: 25,000 reviews

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache every dataset and prefetch with an automatically tuned buffer size.
train_ds, val_ds, test_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE)
    for ds in (train_ds, val_ds, test_ds))

In [None]:
# Baseline model: 16-dimensional embedding, averaged over the sequence and
# fed to a single output unit (no activation, so the output is a logit).
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, 16))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The model's last layer is Dense(1) without an activation, so it outputs
# logits: the loss is configured with from_logits=True and the accuracy
# metric uses a decision threshold of 0.0.
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [None]:
epochs = 10
# Validate on the held-out validation split, not on test_ds: the test set
# is reserved for the final model.evaluate() call and must not be looked at
# while training.
history2 = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate the trained baseline on the test dataset; evaluate() yields the
# loss followed by the metric configured in compile().
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.31058719754219055
Accuracy:  0.8731200098991394


In [None]:
# history2.history is a dict of the values recorded during fit(); list its keys.
history_dict = history2.history
history_dict.keys()

dict_keys(['loss', 'binary_accuracy', 'val_loss', 'val_binary_accuracy'])

# **Task 1**

In [None]:
# An integer input holding vocabulary indices; the sequence length is left open.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# The network as an ordered stack of layers:
#   embedding into 'embedding_dim' dimensions -> dropout
#   -> one strided Conv1D (kernel_size 7 == (5 // 2) + 5) -> global max pooling
#   -> vanilla hidden layer -> dropout
#   -> single sigmoid unit named "predictions".
cnn1_layers = [
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.5),
    layers.Conv1D(filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid", name="predictions"),
]

# Thread the input through the stack in order.
x = inputs
for layer in cnn1_layers:
  x = layer(x)
predictions = x

CNN1 = tf.keras.Model(inputs, predictions)

# Binary cross-entropy loss with the Adam optimizer; report accuracy.
CNN1.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Print the layer-by-layer structure and parameter counts of CNN1.
CNN1.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 128)         1280000   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0     

In [None]:
epochs = 10

# Fit the model on the training dataset, validating on the validation dataset.
CNN1.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ffa9451f8d0>

In [None]:
# Evaluate CNN1 on the test dataset (the cell output shows loss and accuracy).
CNN1.evaluate(test_ds)



[0.6941274404525757, 0.8449599742889404]

# **Task 2**

In [None]:
# An integer input holding vocabulary indices; the sequence length is left open.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Same layout as the single-convolution model, but with two stacked strided
# Conv1D layers (kernel_size 6 == (5 // 2) + 5 - 1) before the pooling:
#   embedding into 'embedding_dim' dimensions -> dropout
#   -> Conv1D -> Conv1D -> global max pooling
#   -> vanilla hidden layer -> dropout
#   -> single sigmoid unit named "predictions".
cnn2_layers = [
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.5),
    layers.Conv1D(filters=128, kernel_size=6, strides=3, padding="valid", activation="relu"),
    layers.Conv1D(filters=128, kernel_size=6, strides=3, padding="valid", activation="relu"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid", name="predictions"),
]

# Thread the input through the stack in order.
x = inputs
for layer in cnn2_layers:
  x = layer(x)
predictions = x

CNN2 = tf.keras.Model(inputs, predictions)

# Binary cross-entropy loss with the Adam optimizer; report accuracy.
CNN2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Print the layer-by-layer structure and parameter counts of CNN2.
CNN2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 128)         1280000   
_________________________________________________________________
dropout_4 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         98432     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         98432     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               1651

In [None]:
epochs = 10

# Fit the model on the training dataset, validating on the validation dataset.
CNN2.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: ignored

In [None]:
# Evaluate CNN2 on the test dataset.
CNN2.evaluate(test_ds)

# **Task 3**

In [None]:
# Training subsets of growing size: each one is the first N batches of
# train_ds, with 32 reviews per batch.
train_ds_20 = train_ds.take(125)    #  20%: 125 * 32 =  4,000 reviews
train_ds_40 = train_ds.take(250)    #  40%: 250 * 32 =  8,000 reviews
train_ds_60 = train_ds.take(375)    #  60%: 375 * 32 = 12,000 reviews
train_ds_80 = train_ds.take(500)    #  80%: 500 * 32 = 16,000 reviews
train_ds_100 = train_ds.take(625)   # 100%: 625 * 32 = 20,000 reviews

In [None]:
# An integer input holding vocabulary indices; the sequence length is left open.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Single-convolution architecture:
#   embedding into 'embedding_dim' dimensions -> dropout
#   -> one strided Conv1D (kernel_size 7 == (5 // 2) + 5) -> global max pooling
#   -> vanilla hidden layer -> dropout
#   -> single sigmoid unit named "predictions".
cnn_best_layers = [
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.5),
    layers.Conv1D(filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid", name="predictions"),
]

# Thread the input through the stack in order.
x = inputs
for layer in cnn_best_layers:
  x = layer(x)
predictions = x

CNNbest = tf.keras.Model(inputs, predictions)

# Binary cross-entropy loss with the Adam optimizer; report accuracy.
CNNbest.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
epochs = 10

# Training subsets keyed by the percentage of the training data they contain.
subsets_by_percent = {
    20: train_ds_20,
    40: train_ds_40,
    60: train_ds_60,
    80: train_ds_80,
    100: train_ds_100,
}

# Train a freshly initialised copy of the CNNbest architecture for every
# subset size. Fitting one and the same model instance five times in a row
# would carry the learned weights (and optimizer state) from one run into the
# next, so the learning curves for the different sizes would not be
# comparable.
histories = {}
for percent, subset in subsets_by_percent.items():
  # clone_model creates new layers, and therefore new weights.
  candidate = tf.keras.models.clone_model(CNNbest)
  candidate.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
  # Fit on the subset, validating on the validation dataset.
  histories[percent] = candidate.fit(subset, validation_data=val_ds, epochs=epochs)
  # Report the test loss/accuracy reached with this amount of training data.
  candidate.evaluate(test_ds)

# The last candidate was trained on 100% of the training data; keep it as
# CNNbest for the cells that use the model afterwards.
CNNbest = candidate

# Expose the results under the per-size names used by the following cells.
history_20 = histories[20]
history_40 = histories[40]
history_60 = histories[60]
history_80 = histories[80]
history_100 = histories[100]

history_20_dict = history_20.history
history_40_dict = history_40.history
history_60_dict = history_60.history
history_80_dict = history_80.history
history_100_dict = history_100.history

# Show which metrics were recorded.
history_100_dict.keys()

In [None]:
# Per-epoch training and validation accuracy for every training-subset size.
acc_20, val_acc_20 = history_20_dict['accuracy'], history_20_dict['val_accuracy']
acc_40, val_acc_40 = history_40_dict['accuracy'], history_40_dict['val_accuracy']
acc_60, val_acc_60 = history_60_dict['accuracy'], history_60_dict['val_accuracy']
acc_80, val_acc_80 = history_80_dict['accuracy'], history_80_dict['val_accuracy']
acc_100, val_acc_100 = history_100_dict['accuracy'], history_100_dict['val_accuracy']

# Epoch numbers 1..10 for the x axis of the plot.
# Note: this rebinds `epochs`, which held the integer epoch count until here.
epochs = range(1, 11)

In [None]:
# plt is the matplotlib.pyplot module imported at the top of the notebook.
plt.figure(figsize=(20,10))

# One colour per training-subset size: (size, colour, training, validation).
curves = [
    ('20', 'b', acc_20, val_acc_20),
    ('40', 'g', acc_40, val_acc_40),
    ('60', 'r', acc_60, val_acc_60),
    ('80', 'c', acc_80, val_acc_80),
    ('100', 'm', acc_100, val_acc_100),
]

# Training accuracy is drawn as dots ...
for size, colour, train_acc, _ in curves:
  plt.plot(epochs, train_acc, colour + 'o', label='Training acc_' + size)

# ... and validation accuracy as a line in the same colour. Every curve gets
# its own legend label so the five validation lines can be told apart.
for size, colour, _, val_acc in curves:
  plt.plot(epochs, val_acc, colour, label='Validation acc_' + size)

plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

# **Task 4**

In [None]:
# Model outputs for the test dataset; CNNbest ends in a single sigmoid unit.
y_pred_prob = CNNbest.predict(test_ds)
y_pred_prob

In [None]:
import numpy as np
import pandas as pd

# Put the predictions into a DataFrame with a single 'Probability' column.
df = pd.DataFrame(y_pred_prob, columns = ['Probability'])
df


In [None]:
# Sort so that the highest predicted values come first.
df=df.sort_values(by='Probability', ascending=False)
df

In [None]:
# The 20 rows with the highest predicted values (df is sorted in descending order).
df_pos=df.head(20)
df_pos

In [None]:
# The 20 rows with the lowest predicted values (df is sorted in descending order).
df_neg=df.tail(20)
df_neg

In [None]:
# Index labels of the 20 highest-scoring rows.
index_pos = df_pos.index
index_pos

In [None]:
# Index labels of the 20 lowest-scoring rows.
index_neg = df_neg.index
index_neg