# **Mounting Google Drive**

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Datasets in Google drive**

In [2]:
import os

os.listdir("/content/drive/My Drive/DataSets")

['Amazon_Reviews']

In [0]:
!unzip "/content/drive/My Drive/DataSets/Amazon_Reviews/Amazon Reviews for Sentiment Analysis.zip"

In [0]:
!bunzip2 "/content/train.ft.txt.bz2"

In [0]:
!bunzip2 "/content/test.ft.txt.bz2"

In [0]:
# Moving both extracted files to Dataset folder
import shutil

shutil.move("train.ft.txt", "/content/drive/My Drive/DataSets/Amazon_Reviews")
shutil.move("test.ft.txt", "/content/drive/My Drive/DataSets/Amazon_Reviews")

'/content/drive/My Drive/DataSets/Amazon_Reviews/test.ft.txt'

In [3]:
os.listdir("/content/drive/My Drive/DataSets/Amazon_Reviews")

['test.ft.txt', 'train.ft.txt', 'Amazon Reviews for Sentiment Analysis.zip']

# **Loading Data**

In [0]:
with open("/content/drive/My Drive/DataSets/Amazon_Reviews/train.ft.txt") as f:
  reviews = f.readlines()

In [0]:
# Main Method of Seperating Labels and Text
labels = []
texts = []
for review in reviews:
  line = review.rstrip()
  if line[:10] == '__label__2':
    labels.append(0)
  elif line[:10] == "__label__1":
      labels.append(1)
  texts.append(line[11:])

In [35]:
print(labels[0] , '-- ', texts[0])

0 --  Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^


In [36]:
print("Labels:", len(labels))
print("Texts:", len(texts))

Labels: 3600000
Texts: 3600000


In [0]:
# Optional Method of Seperating Labels and Text
label = []
text = []
for review in reviews:
  line = review.rstrip()
  label.append(line[:line.find(':') + 1])
  text.append(line[line.find(':') + 1 :])

# **1) one-hot coding of the following data using book codes (naive codes) of listing 6.1**

In [0]:
samples = texts[:100] # using only first 100 reviews

In [0]:
import numpy as np


token_index = {}
for sample in samples:
  for word in sample.split():
    if word not in token_index:
      token_index[word] = len(token_index) + 1

max_length = 10
results = np.zeros(
    shape=(len(samples), max_length, max(token_index.values()) + 1))

for i, sample in enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length]:
    index = token_index.get(word)
    results[i, j, index] = 1

In [39]:
print(token_index)
print("Length:", len(token_index))

{'Stuning': 1, 'even': 2, 'for': 3, 'the': 4, 'non-gamer:': 5, 'This': 6, 'sound': 7, 'track': 8, 'was': 9, 'beautiful!': 10, 'It': 11, 'paints': 12, 'senery': 13, 'in': 14, 'your': 15, 'mind': 16, 'so': 17, 'well': 18, 'I': 19, 'would': 20, 'recomend': 21, 'it': 22, 'to': 23, 'people': 24, 'who': 25, 'hate': 26, 'vid.': 27, 'game': 28, 'music!': 29, 'have': 30, 'played': 31, 'Chrono': 32, 'Cross': 33, 'but': 34, 'out': 35, 'of': 36, 'all': 37, 'games': 38, 'ever': 39, 'has': 40, 'best': 41, 'backs': 42, 'away': 43, 'from': 44, 'crude': 45, 'keyboarding': 46, 'and': 47, 'takes': 48, 'a': 49, 'fresher': 50, 'step': 51, 'with': 52, 'grate': 53, 'guitars': 54, 'soulful': 55, 'orchestras.': 56, 'impress': 57, 'anyone': 58, 'cares': 59, 'listen!': 60, '^_^': 61, 'The': 62, 'soundtrack': 63, 'anything.:': 64, "I'm": 65, 'reading': 66, 'lot': 67, 'reviews': 68, 'saying': 69, 'that': 70, 'this': 71, 'is': 72, "'game": 73, "soundtrack'": 74, 'figured': 75, "I'd": 76, 'write': 77, 'review': 78, 

In [40]:
print(results.shape)

(100, 10, 2920)


# **2) Also, provide one-hot coding using Keras built-in function (listing 6.3)**

In [0]:
samples = texts[:100] # using only first 100 reviews

In [42]:
from tensorflow.keras.preprocessing.text import Tokenizer


tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

sequences = tokenizer.texts_to_sequences(samples)

one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

word_index = tokenizer.word_index

print(word_index)
print(f'Found {len(word_index)} unique tokens.')

{'the': 1, 'i': 2, 'and': 3, 'to': 4, 'a': 5, 'of': 6, 'it': 7, 'is': 8, 'this': 9, 'in': 10, 'you': 11, 'that': 12, 'for': 13, 'not': 14, 'on': 15, 'my': 16, 'but': 17, 'was': 18, 'have': 19, 'with': 20, 'book': 21, 'are': 22, 'so': 23, 'all': 24, 'one': 25, 'as': 26, 'very': 27, 'be': 28, 'from': 29, 'game': 30, 'at': 31, 'if': 32, 'me': 33, 'great': 34, 'they': 35, 'up': 36, 'has': 37, 'time': 38, 'just': 39, 'out': 40, 'read': 41, 'or': 42, 'get': 43, 'dvd': 44, 'would': 45, 'an': 46, 'like': 47, 'what': 48, 'music': 49, 'good': 50, 'only': 51, 'there': 52, 'she': 53, 'do': 54, 'who': 55, 'more': 56, 'by': 57, 'some': 58, 'even': 59, 'now': 60, 'when': 61, 'can': 62, "don't": 63, 'old': 64, 'your': 65, 'been': 66, 'much': 67, 'love': 68, 'best': 69, "i'm": 70, 'work': 71, 'no': 72, 'cd': 73, "can't": 74, "it's": 75, 'will': 76, 'we': 77, 'bought': 78, 'had': 79, 'while': 80, 'tv': 81, 'want': 82, 'because': 83, 'make': 84, 'movie': 85, 'any': 86, 'which': 87, 'never': 88, 'then': 8

In [43]:
print(one_hot_results.shape)

(100, 1000)


# **3) provide one-hot coding with hashing (listing 6.4)**

In [0]:
samples = texts[:100] # using only first 100 reviews

In [45]:
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length]:
    index = abs(hash(word)) % dimensionality
    results[i, j, index] = 1.

print(word_index)
print(f'Found {len(word_index)} unique tokens.')

{'the': 1, 'i': 2, 'and': 3, 'to': 4, 'a': 5, 'of': 6, 'it': 7, 'is': 8, 'this': 9, 'in': 10, 'you': 11, 'that': 12, 'for': 13, 'not': 14, 'on': 15, 'my': 16, 'but': 17, 'was': 18, 'have': 19, 'with': 20, 'book': 21, 'are': 22, 'so': 23, 'all': 24, 'one': 25, 'as': 26, 'very': 27, 'be': 28, 'from': 29, 'game': 30, 'at': 31, 'if': 32, 'me': 33, 'great': 34, 'they': 35, 'up': 36, 'has': 37, 'time': 38, 'just': 39, 'out': 40, 'read': 41, 'or': 42, 'get': 43, 'dvd': 44, 'would': 45, 'an': 46, 'like': 47, 'what': 48, 'music': 49, 'good': 50, 'only': 51, 'there': 52, 'she': 53, 'do': 54, 'who': 55, 'more': 56, 'by': 57, 'some': 58, 'even': 59, 'now': 60, 'when': 61, 'can': 62, "don't": 63, 'old': 64, 'your': 65, 'been': 66, 'much': 67, 'love': 68, 'best': 69, "i'm": 70, 'work': 71, 'no': 72, 'cd': 73, "can't": 74, "it's": 75, 'will': 76, 'we': 77, 'bought': 78, 'had': 79, 'while': 80, 'tv': 81, 'want': 82, 'because': 83, 'make': 84, 'movie': 85, 'any': 86, 'which': 87, 'never': 88, 'then': 8

In [46]:
print(results.shape)

(100, 10, 1000)


# 4) **Validate the how much they are similar. Try to maximize the similarity (ideally should be 100% the same). Specify the reason if both are not the same.**

# **5) Try to implement word-embedding using code given in listing (6.7) and shared with me the embedding array as well as the word dictionary**

In [47]:
import numpy as np # converting list to array

texts = np.asarray(texts[:360000])
labels = np.asarray(labels[:360000])

print('Shape of texts tensor:', texts.shape)
print('Shape of label tensor:', labels.shape)

Shape of texts tensor: (360000,)
Shape of label tensor: (360000,)


In [0]:
maxlen = 100               # Cuts off reviews after 100 words
training_samples = 200     # Trains on 200 samples
validation_samples = 10000 # Validates on 10,000 samples
max_words = 10000          # Considers only the top 10,000 words in the dataset

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=max_words)    # max_words = 10000 
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [50]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 262218 unique tokens.


In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

data = pad_sequences(sequences, maxlen=maxlen)  # maxlen = 100

In [52]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data,labels)

print("x_train :",x_train.shape)
print("x_test :",x_test.shape)
print("y_train :",y_train.shape)
print("y_test :",y_test.shape)

x_train : (270000, 100)
x_test : (90000, 100)
y_train : (270000,)
y_test : (90000,)


In [53]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.layers import Embedding


embedding_dim = 100 # 8

model = Sequential()
# Specifies the maximum input length to the Embedding layer so that we can later
# flatten the embedded inputs. After the Embedding layer, the activations have
# shape (samples, maxlen, 8)
model.add(Embedding(10000, embedding_dim, input_length=maxlen))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 10001     
Total params: 1,010,001
Trainable params: 1,010,001
Non-trainable params: 0
_________________________________________________________________


In [54]:
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [55]:
evaluation = model.evaluate(x_test,  y_test,batch_size=32, verbose=2)
print()
print("Test loss :",evaluation[0]*100,"%")
print("Test accuracy :",evaluation[1]*100,"%")

2813/2813 - 4s - loss: 0.8795 - acc: 0.8230

Test loss : 87.94628381729126 %
Test accuracy : 82.29888677597046 %


# **6) From 6.8 code pre-trained word-embeddings**

# **7) Apply RNN to the given text (listing 6.21) and provide output**

# **8) Match the results of RNN (step 7) with the step 5  and also with step 6. Share the output of the analysis.**

In [0]:
print(type(data[5]))
print(data[2])

<class 'str'>
__label__2 Amazing!: This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of Fate" (which means all the more if you've played the game) and the hope in "A Distant Promise" and "Girl who Stole the Star" have been an important inspiration to me personally throughout my teen years. The higher energy tracks like "Chrono Cross ~ Time's Scar~", "Time of the Dreamwatch", and "Chronomantique" (indefinably remeniscent of Chrono Trigger) are all absolutely superb as well.This soundtrack is amazing music, probably the best of this composer's work (I haven't heard the Xenogears soundtrack, so I can't say for sure), and even if you've never played the game, it would be worth twice the price to buy it.I wish I could give it 6 stars.



In [0]:
print(data[2].split())

['__label__2', 'Amazing!:', 'This', 'soundtrack', 'is', 'my', 'favorite', 'music', 'of', 'all', 'time,', 'hands', 'down.', 'The', 'intense', 'sadness', 'of', '"Prisoners', 'of', 'Fate"', '(which', 'means', 'all', 'the', 'more', 'if', "you've", 'played', 'the', 'game)', 'and', 'the', 'hope', 'in', '"A', 'Distant', 'Promise"', 'and', '"Girl', 'who', 'Stole', 'the', 'Star"', 'have', 'been', 'an', 'important', 'inspiration', 'to', 'me', 'personally', 'throughout', 'my', 'teen', 'years.', 'The', 'higher', 'energy', 'tracks', 'like', '"Chrono', 'Cross', '~', "Time's", 'Scar~",', '"Time', 'of', 'the', 'Dreamwatch",', 'and', '"Chronomantique"', '(indefinably', 'remeniscent', 'of', 'Chrono', 'Trigger)', 'are', 'all', 'absolutely', 'superb', 'as', 'well.This', 'soundtrack', 'is', 'amazing', 'music,', 'probably', 'the', 'best', 'of', 'this', "composer's", 'work', '(I', "haven't", 'heard', 'the', 'Xenogears', 'soundtrack,', 'so', 'I', "can't", 'say', 'for', 'sure),', 'and', 'even', 'if', "you've",