In [None]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

In [None]:
# Mount Google Drive into the Colab filesystem so the paths under
# /content/drive used by the later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# For reproducibility.
# NOTE(review): this seeds NumPy's global RNG only; Keras/TensorFlow weight
# initialisation may need its own seed for fully repeatable training - confirm.
np.random.seed(1237)
import pathlib

# Source file directory. load_files treats each sub-folder name as a category.
path_train = "/content/drive/MyDrive/text classification/bbc"

# load_content=False: only file paths and labels are collected here; the
# article text is read explicitly below.
files_train = skds.load_files(path_train, load_content=False)

label_index = files_train.target        # integer class id per file
label_names = files_train.target_names  # class id -> category name
labelled_files = files_train.filenames  # path of every file

data_tags = ["filename", "category", "news"]

# Read each article and pair it with its path and category name.
# zip keeps file and label aligned without a hand-maintained counter, and the
# explicit encoding avoids depending on the platform's default locale.
data_list = [
    (file_path, label_names[class_id], pathlib.Path(file_path).read_text(encoding="utf-8"))
    for file_path, class_id in zip(labelled_files, label_index)
]

# The dataset as a DataFrame with one row per article: filename, category, news
data = pd.DataFrame.from_records(data_list, columns=data_tags)
data

Unnamed: 0,filename,category,news
0,/content/drive/MyDrive/text classification/bbc...,tech,Net regulation 'still possible'\n\nThe blurrin...
1,/content/drive/MyDrive/text classification/bbc...,business,VW considers opening Indian plant\n\nVolkswage...
2,/content/drive/MyDrive/text classification/bbc...,business,Court rejects $280bn tobacco case\n\nA US gove...
3,/content/drive/MyDrive/text classification/bbc...,business,AstraZeneca hit by drug failure\n\nShares in A...
4,/content/drive/MyDrive/text classification/bbc...,business,J&J agrees $25bn Guidant deal\n\nPharmaceutica...
...,...,...,...
1515,/content/drive/MyDrive/text classification/bbc...,politics,Labour chooses Manchester\n\nThe Labour Party ...
1516,/content/drive/MyDrive/text classification/bbc...,politics,Report attacks defence spending\n\nThe Ministr...
1517,/content/drive/MyDrive/text classification/bbc...,tech,"Rich pickings for hi-tech thieves\n\nViruses, ..."
1518,/content/drive/MyDrive/text classification/bbc...,entertainment,McCririck out of Big Brother show\n\nRacing pu...


In [None]:
# Positional 80/20 split: the first 80% of rows are used for training and
# the remaining rows are held out for testing.
train_size = int(.8 * len(data))

split_columns = ('news', 'category', 'filename')

train_posts, train_tags, train_files_names = (
    data[column].iloc[:train_size] for column in split_columns
)

test_posts, test_tags, test_files_names = (
    data[column].iloc[train_size:] for column in split_columns
)

In [None]:
# Hyper-parameters shared with the model cells below.
vocab_size = 15000   # number of columns in the bag-of-words matrices
batch_size = 100
num_labels = 4       # 4 news groups

# Build the vocabulary from the training posts only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)

# Turn both splits into TF-IDF weighted matrices using that same vocabulary.
x_train, x_test = (
    tokenizer.texts_to_matrix(posts, mode='tfidf')
    for posts in (train_posts, test_posts)
)

# Binarise the category names; the encoder is fitted on the training tags
# and then applied to both splits.
encoder = LabelBinarizer().fit(train_tags)
y_train, y_test = (encoder.transform(tags) for tags in (train_tags, test_tags))

In [None]:
# Feed-forward classifier: three Dense -> ReLU -> Dropout(0.3) stages of
# decreasing width, followed by a softmax layer over the categories.
model = Sequential()

for depth, units in enumerate((512, 256, 128)):
    if depth == 0:
        # Only the first layer declares the input width (one column per vocabulary slot).
        model.add(Dense(units, input_shape=(vocab_size,)))
    else:
        model.add(Dense(units))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))

# Output layer: one probability per label.
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# validation_split=0.1 holds out 10% of the training data for per-epoch validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
    validation_split=0.1,
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               7680512   
_________________________________________________________________
activation_3 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
activation_4 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)              

In [None]:
# Evaluate on the held-out split; score is [loss, accuracy].
score = model.evaluate(
    x_test,
    y_test,
    batch_size=batch_size,
    verbose=1,
)

print('Test accuracy:', score[1])
print(score)

# Category names in the column order produced by the encoder.
text_labels = encoder.classes_
print(text_labels)

# Spot-check the first ten test articles, one prediction at a time.
for i in range(10):
    # Add a leading batch axis so the single row has shape (1, vocab_size).
    single_row = np.expand_dims(x_test[i], axis=0)
    prediction = model.predict(single_row)
    best_class = np.argmax(prediction[0])
    predicted_label = text_labels[best_class]
    print(predicted_label)
    print(test_files_names.iloc[i])
    print(prediction)
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label)

Test accuracy: 0.9671052694320679
[0.1782637983560562, 0.9671052694320679]
['business' 'entertainment' 'politics' 'tech']
tech
/content/drive/MyDrive/text classification/bbc/tech/283.txt
[[3.5508184e-14 6.0780220e-16 4.2555460e-15 1.0000000e+00]]
Actual label:tech
Predicted label: tech
entertainment
/content/drive/MyDrive/text classification/bbc/entertainment/076.txt
[[9.0399063e-11 1.0000000e+00 1.3545637e-09 1.0047584e-08]]
Actual label:entertainment
Predicted label: entertainment
entertainment
/content/drive/MyDrive/text classification/bbc/entertainment/205.txt
[[2.4268488e-04 9.8494339e-01 3.6677278e-03 1.1146266e-02]]
Actual label:entertainment
Predicted label: entertainment
business
/content/drive/MyDrive/text classification/bbc/business/148.txt
[[9.9979109e-01 2.5054817e-07 1.9407533e-04 1.4652585e-05]]
Actual label:business
Predicted label: business
business
/content/drive/MyDrive/text classification/bbc/business/161.txt
[[9.9975365e-01 2.4006132e-08 2.4549730e-04 8.6029343e-07

In [None]:
# Creates the HDF5 file 'bbc.h5' holding the trained model.
# model.save() returns None, so its result must not be assigned back to
# `model` - doing so would discard the in-memory model.
model.save('bbc.h5')

# Save Tokenizer i.e. Vocabulary, so new text can be vectorised the same way later
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load our saved model
from keras.models import load_model
model = load_model('bbc.h5')

# load tokenizer
# The unpickled object replaces any previous tokenizer, so no placeholder
# Tokenizer() needs to be created first.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Display the class order used for the one-hot labels.
# NOTE(review): `encoder` is the in-memory object from the training cell; it is
# not restored from disk here, so this line fails on a fresh kernel.
encoder.classes_ #LabelBinarizer

array(['business', 'entertainment', 'politics', 'tech'], dtype='<U13')

In [None]:
# These are the labels we stored from our training.
# The order is very important here: position k must name the class of output
# column k, i.e. the same order as encoder.classes_ shown above.
labels = np.array(['business', 'entertainment', 'politics', 'tech'])

# Files to classify; add more paths to this list to predict several at once.
test_files = ["/content/drive/My Drive/text classification/news.txt"]

# Read every file's text (explicit encoding avoids locale-dependent decoding).
x_data = [Path(t_f).read_text(encoding="utf-8") for t_f in test_files]

# Vectorise with the tokenizer fitted during training so columns line up
# with what the model expects.
x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')

# Pair each file with its own row. The previous version indexed test_files
# with a counter that was never incremented, so every prediction was reported
# against the first file.
for t_f, x_t in zip(test_files, x_tokenized):
    prediction = model.predict(np.array([x_t]))
    predicted_label = labels[np.argmax(prediction[0])]
    print(prediction)
    print("File ->", t_f, "Predicted label: " + predicted_label)

[[0.8029984  0.03982589 0.04494743 0.1122283 ]]
File -> /content/drive/My Drive/text classification/news.txt Predicted label: business
