In [1]:
import keras
from keras.datasets import reuters

Using TensorFlow backend.


# for downloading the data
Num words is set to none since we are considering the all the words in the dataset

In [33]:
(x_train,y_train),(x_test,y_test)=reuters.load_data(num_words=None,test_split=0.2)
word_index=reuters.get_word_index()

## Investigating the data

In [34]:
print("No of training samples: {}".format(len(x_train)))
print("No of test samples: {}".format(len(x_test)))
num_class=max(y_train)+1
print("No of classes:{}".format(num_class))

No of training samples: 8982
No of test samples: 2246
No of classes:46


## building a reverse dictionary to see words instead of integers 

In [35]:
print(word_index['in'])

4


In [36]:
ind_word={}
for k,v in word_index.items():
    ind_word[v]=k

In [37]:
print(ind_word[89])
print(ind_word[43])

after
loss


In [38]:
print(' '.join([ind_word[x] for x in x_train[4]]))

the bleached could mln at world as holding for include its i 3 start measures gnp 525 process ccb and nations bleached it 1985 do 000 april 0 a agreed bleached mln in ended cost cts must and ccb tenneco in winter 53 1 mln net diplomats and reorganization group 38 said 49 26 and plastics in this mln ccb field foreign is said bleached 10 3 group 26 38 producers had 4 is bleached mln 1 as equivalent not 145 world york and credits in 20 3 as permits in set board 1 share turnover it than growth pct dlrs


# Data cleaning 

In [39]:
from keras.preprocessing.text import Tokenizer

max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
#onehotencoding
x_train = tokenizer.sequences_to_matrix(x_train, mode='count')
x_test = tokenizer.sequences_to_matrix(x_test, mode='count')
y_train = keras.utils.to_categorical(y_train, num_class)
y_test = keras.utils.to_categorical(y_test, num_class)

In [40]:
print(x_train.shape)
print(x_train[2])
print(y_train.shape)
print(y_train[2])

(8982, 10000)
[0. 1. 0. ... 0. 0. 0.]
(8982, 46)
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


## Building the model

In [41]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation

model=Sequential()
model.add(Dense(512,input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.7))
model.add(Dense(num_class))
model.add(Activation('softmax'))


In [42]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

## Training the model

In [43]:
history=model.fit(x_train,y_train,batch_size=512,epochs=10,verbose=1,validation_split=0.1)
score=model.evaluate(x_test,y_test,batch_size=512,verbose=1)
print("Test_loss:{}".format(score[0]))
print("Test_accuracy:{}".format(score[1]))

Train on 8083 samples, validate on 899 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test_loss:0.8958225287397525
Test_accuracy:0.8081033229827881


## Another way of preprocessing using tf-idf

In [29]:
(x_train,y_train),(x_test,y_test)=reuters.load_data(num_words=None,test_split=0.2)
tokenizer.fit_on_sequences(x_train)
x_train = tokenizer.sequences_to_matrix(x_train, mode='tfidf')
x_test = tokenizer.sequences_to_matrix(x_test, mode='tfidf')
y_train = keras.utils.to_categorical(y_train, num_class)
y_test = keras.utils.to_categorical(y_test, num_class)
print(x_train[0])
print(max(x_train[0]))

[0.         0.69311935 0.         ... 0.         0.         0.        ]
6.2427234182900655


In [30]:
model=Sequential()
model.add(Dense(512,input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.7))
model.add(Dense(num_class))
model.add(Activation('softmax'))

In [31]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [32]:
history=model.fit(x_train,y_train,batch_size=512,epochs=10,verbose=1,validation_split=0.1)
score=model.evaluate(x_test,y_test,batch_size=512,verbose=1)
print("Test_loss:{}".format(score[0]))
print("Test_accuracy:{}".format(score[1]))

Train on 8083 samples, validate on 899 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test_loss:1.096245281205895
Test_accuracy:0.8076580762863159


## Another way of preprocessing using frequency method

In [44]:
(x_train,y_train),(x_test,y_test)=reuters.load_data(num_words=None,test_split=0.2)

x_train = tokenizer.sequences_to_matrix(x_train, mode='freq')
x_test = tokenizer.sequences_to_matrix(x_test, mode='freq')
y_train = keras.utils.to_categorical(y_train, num_class)
y_test = keras.utils.to_categorical(y_test, num_class)
print(x_train[0])
print(max(x_train[0]))

[0.         0.01149425 0.         ... 0.         0.         0.        ]
0.06896551724137931


In [45]:
model=Sequential()
model.add(Dense(512,input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.7))
model.add(Dense(num_class))
model.add(Activation('softmax'))

In [46]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history=model.fit(x_train,y_train,batch_size=512,epochs=10,verbose=1,validation_split=0.1)
score=model.evaluate(x_test,y_test,batch_size=512,verbose=1)
print("Test_loss:{}".format(score[0]))
print("Test_accuracy:{}".format(score[1]))

Train on 8083 samples, validate on 899 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test_loss:1.7766749838687221
Test_accuracy:0.5289403200149536


The 3 methods which were used here are:frequency, count, tf-idf. Since the model is simple, the accuracy which was achieved using tf-idf was way too less. But, in the future, if we tweak the parameters and build a even more complex model using tf-idf, we must be able to achieve better results.