# Comparative Study

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
import tensorflow_hub as hub
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, LSTM, GRU, Bidirectional, Dropout, MaxPooling1D, Flatten, Conv1D

Load Dataset

In [2]:
dataset, info = tfds.load('ag_news_subset', with_info = True, as_supervised = True)

In [3]:
train_data, test_data = dataset['train'], dataset['test']

In [4]:
# build tokenizer and pad_sequences"
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')

In [5]:
train_text = [x[0].numpy().decode('utf-8') for x in train_data]

In [6]:
train_text[:5]

['AMD #39;s new dual-core Opteron chip is designed mainly for corporate computing applications, including databases, Web services, and financial transactions.',
 'Reuters - Major League Baseball\\Monday announced a decision on the appeal filed by Chicago Cubs\\pitcher Kerry Wood regarding a suspension stemming from an\\incident earlier this season.',
 'President Bush #39;s  quot;revenue-neutral quot; tax reform needs losers to balance its winners, and people claiming the federal deduction for state and local taxes may be in administration planners #39; sights, news reports say.',
 'Britain will run out of leading scientists unless science education is improved, says Professor Colin Pillinger.',
 'London, England (Sports Network) - England midfielder Steven Gerrard injured his groin late in Thursday #39;s training session, but is hopeful he will be ready for Saturday #39;s World Cup qualifier against Austria.']

In [7]:
tokenizer.fit_on_texts(train_text)
word_ind = tokenizer.word_index

train_seq = tokenizer.texts_to_sequences(train_text)
train_pad = pad_sequences(train_seq, padding='post')

In [8]:
train_labels = [x[1].numpy() for x in train_data]

In [9]:
train_labels = np.asarray(train_labels)

In [10]:
train_labels = to_categorical(train_labels, num_classes=4)

In [11]:
train_sequence, val_sequence, train_labels, val_labels = train_test_split(train_pad, train_labels, test_size=0.2)

Modeling

In [12]:
vocab_size = 20000
embedding_dim = 64
max_length = train_pad.shape[1]

In [13]:
# DNN model
model_dnn = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax'),
])



In [14]:
# CNN model
model_cnn = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')
])

In [15]:
# LSTM model
model_lstm = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(32, return_sequences=True),
    LSTM(32),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')
])

In [16]:
# Bidirectional LSTM
model_bidirectional = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(32, return_sequences=True)),
    Bidirectional(LSTM(16)),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')
])

In [17]:
models = [model_dnn, model_cnn, model_lstm, model_bidirectional]

for model in models:
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  history = model.fit(train_sequence, train_labels, validation_data = (val_sequence,val_labels), verbose=False)

In [18]:
test_text = [x[0].numpy().decode('utf-8') for x in test_data]
test_labels = [x[1].numpy() for x in test_data]

tokenizer.fit_on_texts(test_text)
test_seq = tokenizer.texts_to_sequences(test_text)
test_pad = pad_sequences(test_seq, padding='post')

test_labels = to_categorical(test_labels, num_classes=4)

In [19]:
model_cnn.evaluate(test_pad,test_labels)

[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.3552 - loss: 2.2287


[2.200000762939453, 0.36460527777671814]

In [None]:
#model_dnn.evaluate(test_pad,test_labels)

In [None]:
model_lstm.evaluate(test_pad,test_labels)

In [None]:
model_bidirectional.evaluate(test_pad,test_labels)