In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [2]:
vocab_size = 45000
embedding_dim = 512
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

In [3]:
description = []
labels = []

with open("classifier_data_0.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[0])
        temp = row[1]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            temp = temp.replace(token, ' ')
            temp = temp.replace(' ', ' ')
        description.append(temp)
print(len(labels))
print(len(description))


5000
5000


In [4]:
train_size = int(len(description) * training_portion)

train_description = description[0: train_size]
train_labels = labels[0: train_size]

validation_description = description[train_size:]
validation_labels = labels[train_size:]

print(train_size)
print(len(train_description))
print(len(train_labels))
print(len(validation_description))
print(len(validation_labels))

4000
4000
4000
1000
1000


In [5]:

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_description)
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'chrome': 2,
 '0': 3,
 '1': 4,
 'cc': 5,
 'webcore': 6,
 'build': 7,
 'src': 8,
 'base': 9,
 '2': 10}

In [6]:
train_sequences = tokenizer.texts_to_sequences(train_description)
print(train_sequences[10])

[123, 11, 36, 255, 11, 242, 47, 126, 3, 10, 220, 7364, 59, 56, 117, 6438, 58, 113, 59, 56, 67, 114, 13, 117, 13, 58, 137, 61, 294, 25, 26, 23, 4, 74, 1402, 8871, 8872, 238, 1150, 10, 208, 1328, 7365, 1328, 169, 218, 14, 28, 70, 27, 54, 545, 342, 14, 97, 32, 27, 54, 434, 80, 85, 79, 66, 101, 143, 135, 175, 749, 2519, 115]


In [7]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type).astype(np.float32)
print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))
print(train_padded[10])

70
100
133
100
70
100
[1.230e+02 1.100e+01 3.600e+01 2.550e+02 1.100e+01 2.420e+02 4.700e+01
 1.260e+02 3.000e+00 1.000e+01 2.200e+02 7.364e+03 5.900e+01 5.600e+01
 1.170e+02 6.438e+03 5.800e+01 1.130e+02 5.900e+01 5.600e+01 6.700e+01
 1.140e+02 1.300e+01 1.170e+02 1.300e+01 5.800e+01 1.370e+02 6.100e+01
 2.940e+02 2.500e+01 2.600e+01 2.300e+01 4.000e+00 7.400e+01 1.402e+03
 8.871e+03 8.872e+03 2.380e+02 1.150e+03 1.000e+01 2.080e+02 1.328e+03
 7.365e+03 1.328e+03 1.690e+02 2.180e+02 1.400e+01 2.800e+01 7.000e+01
 2.700e+01 5.400e+01 5.450e+02 3.420e+02 1.400e+01 9.700e+01 3.200e+01
 2.700e+01 5.400e+01 4.340e+02 8.000e+01 8.500e+01 7.900e+01 6.600e+01
 1.010e+02 1.430e+02 1.350e+02 1.750e+02 7.490e+02 2.519e+03 1.150e+02
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.00

In [8]:
validation_sequences = tokenizer.texts_to_sequences(validation_description)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type).astype(np.float32)

print(len(validation_sequences))
print(validation_padded.shape)

1000
(1000, 100)


In [9]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

print(training_label_seq[0])
print(training_label_seq[1])
print(training_label_seq[2])
print(training_label_seq.shape)

print(validation_label_seq[0])
print(validation_label_seq[1])
print(validation_label_seq[2])
print(validation_label_seq.shape)

[44, 1, 2]
[3, 1, 2]
[62, 1, 2]
(4000,)
[]
[]
[]
(1000, 0)


  training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))


In [10]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_description(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_description(train_padded[10]))
print('---')
print(train_description[10])

product version see about version urls if applicable 0 2 149 27other browsers tested firefox ieadd ok fail browsers tested issue safari 3 firefox 3 ok ie 7 okwhat steps reproduce problem 1 open webpage compaq 6715s running vista 2 try scrolling touchpad3 scrolling work not what expected result the page scroll up what happens instead the page move please provide additional information below attach screenshot possible only minor bug ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
---
Product Version      : <see about:version>URLs (if applicable) :0.2.149.27Other browsers tested: Firefox / IEAdd OK FAIL browsers tested issue:Safari 3:    Firefox 3: OK         IE 7:OKWhat steps reproduce problem?1. Open webpage compaq 6715s running vista.2. Try scrolling touchpad3. Scrolling work , not.What expected result?The page scroll up.What happens instead?The page move.Please provide additional information below. Attach screenshot possible.Only minor bug. 


In [11]:
model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size 5000, and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Add a Dense layer with 6 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(6, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 512)         23040000  
_________________________________________________________________
bidirectional (Bidirectional (None, 1024)              4198400   
_________________________________________________________________
dense (Dense)                (None, 512)               524800    
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 3078      
Total params: 27,766,278
Trainable params: 27,766,278
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).