In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds

# Loading Dataset

In [2]:
# Download pretrained embedding | Glove 6B embedding
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2024-09-12 05:29:21--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-09-12 05:29:21--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-09-12 05:29:22--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [3]:
!unzip glove.6B.zip -d glove.6B

Archive:  glove.6B.zip
  inflating: glove.6B/glove.6B.50d.txt  
  inflating: glove.6B/glove.6B.100d.txt  
  inflating: glove.6B/glove.6B.200d.txt  
  inflating: glove.6B/glove.6B.300d.txt  


In [4]:
dataset, info = tfds.load('ag_news_subset', with_info=True, as_supervised=True)
train_dataset = dataset['train']
test_dataset = dataset['test']

Downloading and preparing dataset 11.24 MiB (download: 11.24 MiB, generated: 35.79 MiB, total: 47.03 MiB) to /root/tensorflow_datasets/ag_news_subset/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/120000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ag_news_subset/incomplete.LH6A51_1.0.0/ag_news_subset-train.tfrecord*...: …

Generating test examples...:   0%|          | 0/7600 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ag_news_subset/incomplete.LH6A51_1.0.0/ag_news_subset-test.tfrecord*...:  …

Dataset ag_news_subset downloaded and prepared to /root/tensorflow_datasets/ag_news_subset/1.0.0. Subsequent calls will reuse this data.


# Preparing Data

In [8]:
train_text = [x[0].numpy().decode('utf-8') for x in train_dataset]
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_text)
train_seq = tokenizer.texts_to_sequences(train_text)
train_pad = pad_sequences(train_seq, padding='post')

max_length = train_pad.shape[1]

In [13]:
test_text = [x[0].numpy().decode('utf-8') for x in test_dataset]
tokenizer.fit_on_texts(test_text)
test_seq = tokenizer.texts_to_sequences(test_text)
test_pad = pad_sequences(test_seq, padding='post', maxlen = max_length)

# Apply pretrained word embedding

In [18]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))  # Initialize embedding_matrix with zeros

preparing embedding_matrix

In [20]:
with open('/content/glove.6B/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    #print(values[0])
    word = values[0]
    if word in tokenizer.word_index:
      idx = tokenizer.word_index[word]
      embedding_matrix[idx] = np.array(values[1:], dtype=np.float32)

Building model

In [22]:
model_lstm = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False, input_length = max_length),
    LSTM(32, return_sequences=True),
    LSTM(16),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')
])

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
train_labels = tf.keras.utils.to_categorical([x[1].numpy() for x in train_dataset])
test_labels = tf.keras.utils.to_categorical([x[1].numpy() for x in test_dataset])

In [24]:

history_lstm = model_lstm.fit(train_pad, train_labels, epochs=10, validation_split=0.2)

Epoch 1/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m409s[0m 135ms/step - accuracy: 0.2496 - loss: 1.3864 - val_accuracy: 0.2508 - val_loss: 1.3850
Epoch 2/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 126ms/step - accuracy: 0.2532 - loss: 1.3847 - val_accuracy: 0.2521 - val_loss: 1.3841
Epoch 3/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 134ms/step - accuracy: 0.2503 - loss: 1.3846 - val_accuracy: 0.2467 - val_loss: 1.3835
Epoch 4/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 129ms/step - accuracy: 0.2825 - loss: 1.3621 - val_accuracy: 0.5094 - val_loss: 1.0724
Epoch 5/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 130ms/step - accuracy: 0.5400 - loss: 1.0290 - val_accuracy: 0.6092 - val_loss: 0.9419
Epoch 6/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m451s[0m 133ms/step - accuracy: 0.6216 - loss: 0.9118 - val_accuracy: 0.6665 - val_loss:

In [25]:
model_lstm.evaluate(test_pad, test_labels)

[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 45ms/step - accuracy: 0.5969 - loss: 1.0608


[1.0667942762374878, 0.5902631282806396]