# Text Classifier

## Importing libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import keras
import numpy as np
from keras.datasets import reuters

## Getting data

In [4]:
(x_train, y_train) , (x_test, y_test) = reuters.load_data(num_words= None, test_split=0.2 )

In [5]:
word_index= reuters.get_word_index()

In [None]:
#word_index = reuters.get_word_index(path="reuters_word_index.json")

In [6]:
y_train.shape

(8982,)

In [7]:
x_test.shape

(2246,)

In [8]:
num_classes= max(y_train)+1
print("Number of classes {}".format(num_classes))

Number of classes 46


In [9]:
print(y_train[0])

3


In [10]:
word_index['anger']

5568

In [11]:
index_to_word={}
for key, value in word_index.items():
    index_to_word[value] = key

In [13]:
index_to_word[5568]

'anger'

In [14]:
print(' ' . join([index_to_word[x] for x in x_train[0]]))

the wattie nondiscriminatory mln loss for plc said at only ended said commonwealth could 1 traders now april 0 a after said from 1985 and from foreign 000 april 0 prices its account year a but in this mln home an states earlier and rise and revs vs 000 its 16 vs 000 a but 3 psbr oils several and shareholders and dividend vs 000 its all 4 vs 000 1 mln agreed largely april 0 are 2 states will billion total and against 000 pct dlrs


In [16]:
from keras.preprocessing.text import Tokenizer

max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

In [17]:
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [18]:
print(x_train[0])
print(len(x_train[0]))
print(max(x_train[0]))

[0. 1. 0. ... 0. 0. 0.]
10000
1.0


In [19]:
print(y_train[0])
print(len(y_train[0]))

[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
46


In [21]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [22]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.metrics_names)

['loss', 'acc']


In [23]:
batch_size = 32
epochs = 2

history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
Test loss: 0.8363968149849271
Test accuracy: 0.8058771148974196


**Training accuracy starts from 0.71 to 0.88
Validation accuracy is 0.81 
Test accuracy is 0.80**

## Try another set of sample with "count" instead of"binary"

In [24]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

x_train = tokenizer.sequences_to_matrix(x_train, mode='count')
x_test = tokenizer.sequences_to_matrix(x_test, mode='count')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print(x_train[0])
print(len(x_train[0]))
print(max(x_train[0]))
print(np.argmax(x_train[0]))

[0. 1. 0. ... 0. 0. 0.]
10000
6.0
6


In [25]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
Test loss: 0.8634419657774303
Test accuracy: 0.8161175422974176


**A little improvement in accuracy i.e 0.816**

## Try with "freq"

In [27]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

x_train = tokenizer.sequences_to_matrix(x_train, mode='freq')
x_test = tokenizer.sequences_to_matrix(x_test, mode='freq')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print(x_train[0])
print(len(x_train[0]))
print(max(x_train[0]))

[0.         0.01149425 0.         ... 0.         0.         0.        ]
10000
0.06896551724137931


In [28]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
Test loss: 1.6476916122521446
Test accuracy: 0.5854853072393609


**Decrease in accracy. Is it bad activation method then? Frequency values are too small. As we can see maximum train shows only 0.068. So, its too hard for model to learn **

## Trying "tfidf"

In [30]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

tokenizer.fit_on_sequences(x_train) # one additional step

x_train = tokenizer.sequences_to_matrix(x_train, mode='tfidf')
x_test = tokenizer.sequences_to_matrix(x_test, mode='tfidf')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print(x_train[0])
print(len(x_train[0]))
print(max(x_train[0]))

[0.         0.69309152 0.         ... 0.         0.         0.        ]
10000
6.214608098422191


In [31]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
Test loss: 1.0457381822122385
Test accuracy: 0.8023152270703473


**Check training accuracy is 0.91 and validation accuracy is 0.81. Test accuracy is only 0.80. It shows we have an overfitting between Epoch 1 and epoch 2.**<br>
Our best model seems to be with "count" activation method.