Code adapted from: https://www.opencodez.com/python/text-classification-using-keras.htm

Further info can be found here: https://github.com/MinorJinx/WebsiteTextClassifier

In [0]:
# Remove GoogleColab generated directories
!rm -r sample_data/

In [2]:
# Download 20Newsgroups dataset
# !wget http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz
!wget https://github.com/MinorJinx/WebsiteTextClassifier/raw/master/20news-bydate.tar.gz

--2019-11-29 23:05:07--  https://github.com/MinorJinx/WebsiteTextClassifier/raw/master/20news-bydate.tar.gz
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/MinorJinx/WebsiteTextClassifier/master/20news-bydate.tar.gz [following]
--2019-11-29 23:05:07--  https://raw.githubusercontent.com/MinorJinx/WebsiteTextClassifier/master/20news-bydate.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14464277 (14M) [application/octet-stream]
Saving to: ‘20news-bydate.tar.gz’


2019-11-29 23:05:08 (116 MB/s) - ‘20news-bydate.tar.gz’ saved [14464277/14464277]



In [3]:
# Uncompress datset
!tar -xzf 20news-bydate.tar.gz
!ls

20news-bydate.tar.gz  20news-bydate-test  20news-bydate-train


In [0]:
import pandas as pd
import numpy as np
import os, pickle
from keras.preprocessing.text import Tokenizer
from keras.models import load_model, Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

In [0]:
# Suppresses TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# For reproducibility
np.random.seed(1237)

# Source file directory
path_train = "20news-bydate-train"

files_train = skds.load_files(path_train,load_content=False)

label_index = files_train.target
label_names = files_train.target_names
labelled_files = files_train.filenames

data_tags = ["filename","category","news"]
data_list = []

# Read and add data from file to a list (*Added cp1252 encoding)
i=0
for f in labelled_files:
    data_list.append((f,label_names[label_index[i]],Path(f).read_text(encoding='cp1252')))
    i += 1

# We have training data available as dictionary filename, category, data
data = pd.DataFrame.from_records(data_list, columns=data_tags)

In [0]:
# Seperate 80% data as training and remaining 20% for test.
train_size = int(len(data) * .8)
 
train_posts = data['news'][:train_size]
train_tags = data['category'][:train_size]
train_files_names = data['filename'][:train_size]
 
test_posts = data['news'][train_size:]
test_tags = data['category'][train_size:]
test_files_names = data['filename'][train_size:]

In [0]:
# 20 news groups
num_labels = 20
vocab_size = 15000
batch_size = 100
 
# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)
 
x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')
 
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [0]:
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [12]:
model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()
 
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
# history = model.fit(x_train, y_train,
#                     batch_size=batch_size,
#                     epochs=30,
#                     verbose=1,
#                     validation_split=0.1)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 512)               7680512   
_________________________________________________________________
activation_10 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 512)               262656    
_________________________________________________________________
activation_11 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 20)               

In [13]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
 
print('Test accuracy:', score[1], '\n')
 
text_labels = encoder.classes_
 
for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    #print(test_files_names.iloc[i])
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label)

Test accuracy: 0.888643392855964 

Actual label:alt.atheism
Predicted label: alt.atheism
Actual label:comp.graphics
Predicted label: comp.graphics
Actual label:sci.med
Predicted label: sci.med
Actual label:sci.crypt
Predicted label: sci.crypt
Actual label:comp.os.ms-windows.misc
Predicted label: comp.os.ms-windows.misc
Actual label:rec.sport.baseball
Predicted label: rec.sport.baseball
Actual label:soc.religion.christian
Predicted label: sci.med
Actual label:comp.graphics
Predicted label: comp.graphics
Actual label:rec.sport.hockey
Predicted label: rec.sport.hockey
Actual label:rec.sport.hockey
Predicted label: rec.sport.baseball


In [0]:
# creates a HDF5 file 'my_model.h5'
# model.model.save('my_model.h5')
 
# Save Tokenizer i.e. Vocabulary
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
!wget https://github.com/MinorJinx/WebsiteTextClassifier/raw/master/my_model.h5

--2019-11-29 23:15:09--  https://github.com/MinorJinx/WebsiteTextClassifier/raw/master/my_model.h5
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/MinorJinx/WebsiteTextClassifier/master/my_model.h5 [following]
--2019-11-29 23:15:10--  https://raw.githubusercontent.com/MinorJinx/WebsiteTextClassifier/master/my_model.h5
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 95473968 (91M) [application/octet-stream]
Saving to: ‘my_model.h5’


2019-11-29 23:15:12 (252 MB/s) - ‘my_model.h5’ saved [95473968/95473968]



In [17]:
# load our saved model
model = load_model('my_model.h5')
 
# load tokenizer
tokenizer = Tokenizer()
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# List the lables created from training
encoder.classes_

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype='<U24')

In [0]:
# These are the labels we stored from our training, in exact order
labels = np.array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
 'talk.politics.misc', 'talk.religion.misc'])

Text websites are found here: http://lite.cnn.io/en

Change the website variable below to classify it's content.

In [0]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

website = 'http://lite.cnn.io/en/article/h_d9b1226cbd5978f4859d9fa318ac5883'

page = urlopen(website).read()
soup = BeautifulSoup(page)
body = soup.find_all('p')
file = open('websiteText.txt', 'w')
for p in body:
  file.write(str(p.text))
file.close()

In [20]:
# To test against files from 20newsGroup
# test_files = ["20news-bydate-test/talk.politics.mideast/77250"]

# Array that contains out website text from above
test_files = ["websiteText.txt"]

x_data = []
for t_f in test_files:
    t_f_data = Path(t_f).read_text()
    x_data.append(t_f_data)
 
x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')
 
i=0
for x_t in x_tokenized:
    prediction = model.predict(np.array([x_t]))
    predicted_label = labels[np.argmax(prediction[0])]
    second_guess = labels[np.argpartition(prediction[0], -2)[-2:][0]] # Gets index of second best prediction
    third_guess = labels[np.argpartition(prediction[0], -3)[-3:][0]] # Gets index of second best prediction
    print("File:\t\t", test_files[i])
    print("Predicted label:", predicted_label, "  Confidence:", np.max(prediction[0]))
    print("Second Guess:   ", second_guess, "  Confidence:", np.partition(prediction[0].flatten(), -2)[-2])
    print("Third Guess:    ", third_guess, "  Confidence:", np.partition(prediction[0].flatten(), -3)[-3])
    i += 1

File:		 websiteText.txt
Predicted label: talk.politics.misc   Confidence: 0.98500293
Second Guess:    talk.politics.guns   Confidence: 0.014995642
Third Guess:     soc.religion.christian   Confidence: 9.215289e-07


In [0]:
%pycat websiteText.txt

In [0]:
%pycat 20news-bydate-test/talk.politics.mideast/77250