In [2]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# For reproducibility
np.random.seed(1237)
#loading training files
files_train = skds.load_files("datasets\\train",load_content=False)
label_index = files_train.target
label_names = files_train.target_names
labelled_files = files_train.filenames
data_tags = ["filename","category","news"]
data_list = []
# Read and add data from file to a list
i=0
for f in labelled_files:
    data_list.append((f,label_names[label_index[i]],Path(f).read_text()))
    i += 1
 
# We have training data available as dictionary filename, category, data
data = pd.DataFrame.from_records(data_list, columns=data_tags)
data.head(10)

Unnamed: 0,filename,category,news
0,datasets\train\rec.sport.baseball\102736,rec.sport.baseball,From: cubbie@garnet.berkeley.edu ( ...
1,datasets\train\comp.sys.mac.hardware\50485,comp.sys.mac.hardware,From: gnelson@pion.rutgers.edu (Gregory Nelson...
2,datasets\train\sci.crypt\15246,sci.crypt,From: crypt-comments@math.ncsu.edu\nSubject: C...
3,datasets\train\comp.sys.mac.hardware\51904,comp.sys.mac.hardware,From: ()\nSubject: Re: Quadra SCSI Problems??...
4,datasets\train\alt.atheism\53144,alt.atheism,From: keith@cco.caltech.edu (Keith Allan Schne...
5,datasets\train\comp.sys.mac.hardware\50458,comp.sys.mac.hardware,From: taihou@chromium.iss.nus.sg (Tng Tai Hou)...
6,datasets\train\comp.windows.x\66981,comp.windows.x,From: huub@cwi.nl (Huub Bakker)\nSubject: wait...
7,datasets\train\comp.windows.x\67231,comp.windows.x,From: lanzo@tekelec.com (Mark Lanzo)\nSubject:...
8,datasets\train\sci.med\59250,sci.med,Subject: Why isolate it?\nFrom: chinsz@eis.cal...
9,datasets\train\sci.electronics\53591,sci.electronics,From: seema@madvlsi.columbia.edu (Seema Varma)...


In [6]:
# lets take 80% data as training and remaining 20% for test.
train_size = int(len(data) * .8)
 
train_posts = data['news'][:train_size]
train_tags = data['category'][:train_size]
train_files_names = data['filename'][:train_size]
 
test_posts = data['news'][train_size:]
test_tags = data['category'][train_size:]
test_files_names = data['filename'][train_size:]

In [7]:
# 20 news groups
num_labels = 20
vocab_size = 15000
batch_size = 100
 
# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)
 
x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')
 
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [8]:
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [9]:
model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()
 
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=30,
                    verbose=1,
                    validation_split=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               7680512   
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                10260     
__________

In [10]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test accuracy:', score[1])

text_labels = encoder.classes_

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    print(test_files_names.iloc[i])
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label)

Test accuracy: 0.884666374996074
datasets\train\alt.atheism\53114
Actual label:alt.atheism
Predicted label: alt.atheism
datasets\train\comp.graphics\38666
Actual label:comp.graphics
Predicted label: comp.graphics
datasets\train\sci.med\58932
Actual label:sci.med
Predicted label: sci.med
datasets\train\sci.crypt\15212
Actual label:sci.crypt
Predicted label: sci.crypt
datasets\train\comp.os.ms-windows.misc\9695
Actual label:comp.os.ms-windows.misc
Predicted label: comp.os.ms-windows.misc
datasets\train\rec.sport.baseball\104482
Actual label:rec.sport.baseball
Predicted label: rec.sport.baseball
datasets\train\soc.religion.christian\20731
Actual label:soc.religion.christian
Predicted label: misc.forsale
datasets\train\comp.graphics\38583
Actual label:comp.graphics
Predicted label: comp.graphics
datasets\train\rec.sport.hockey\52638
Actual label:rec.sport.hockey
Predicted label: rec.sport.hockey
datasets\train\rec.sport.hockey\52636
Actual label:rec.sport.hockey
Predicted label: rec.sport.

In [11]:
# creates a HDF5 file 'my_model.h5'
model.model.save('my_model.h5')
 
# Save Tokenizer i.e. Vocabulary
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# load our saved model
model = model.model.load_model('my_model.h5')
 
# load tokenizer
tokenizer = Tokenizer()
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

AttributeError: 'Model' object has no attribute 'load_model'