In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Concatenate
from tensorflow.keras.models import Model

In [2]:
# tqdm provides progress bars; install it with `pip install tqdm` if it is missing.
from tqdm import tqdm

In [3]:
# Only these three newsgroups are loaded from the 20 Newsgroups corpus.
categories = [
    'comp.sys.mac.hardware',
    'rec.motorcycles',
    'sci.electronics',
]

# Fetch the training subset, restricted to the categories above.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
)

# Report the class names and the number of documents that were loaded.
print(newsgroups_train.target_names, len(newsgroups_train.data), sep="\n")

['comp.sys.mac.hardware', 'rec.motorcycles', 'sci.electronics']
1767


In [4]:
# Peek at lines 10-14 of the first post instead of printing the whole document.
first_post_lines = newsgroups_train.data[0].split("\n")
print(first_post_lines[10:15])

[': |> In article <2514@tekgen.bv.tek.com> davet@interceptor.cds.tek.com (Dave Tharp CDS) writes:', ': |> >In article <1993Apr15.171757.10890@i88.isc.com> jeq@lachman.com (Jonathan E. Quist) writes:', ': |> >>Rolls-Royce owned by a non-British firm?', ': |> >>', ': |> >>Ye Gods, that would be the end of civilization as we know it.']


In [5]:
# List the fields available on the loaded dataset object.
print(newsgroups_train.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [6]:
# Print the length of each dataset field, one per line, in the order:
# data, filenames, target_names, target.
print(
    *(
        len(field)
        for field in (
            newsgroups_train.data,
            newsgroups_train.filenames,
            newsgroups_train.target_names,
            newsgroups_train.target,
        )
    ),
    sep="\n",
)

1767
1767
3
1767


In [7]:
# Show the category names of the loaded dataset.
print(newsgroups_train.target_names)

['comp.sys.mac.hardware', 'rec.motorcycles', 'sci.electronics']


In [8]:
# Display the raw string of the first post; as the cell's last expression it is
# shown as a repr, so newlines appear as "\n" escapes.
newsgroups_train.data[0]

'From: npet@bnr.ca (Nick Pettefar)\nSubject: Re: Happy Easter!\nNntp-Posting-Host: bmdhh299\nOrganization: BNR Europe Ltd, Maidenhead, UK\nX-Newsreader: TIN [version 1.1 PL8]\nLines: 37\n\nkevinh, on the Tue, 20 Apr 1993 13:23:01 GMT wibbled:\n\n: In article <1993Apr19.154020.24818@i88.isc.com>, jeq@lachman.com (Jonathan E. Quist) writes:\n: |> In article <2514@tekgen.bv.tek.com> davet@interceptor.cds.tek.com (Dave Tharp CDS) writes:\n: |> >In article <1993Apr15.171757.10890@i88.isc.com> jeq@lachman.com (Jonathan E. Quist) writes:\n: |> >>Rolls-Royce owned by a non-British firm?\n: |> >>\n: |> >>Ye Gods, that would be the end of civilization as we know it.\n: |> >\n: |> >  Why not?  Ford owns Aston-Martin and Jaguar, General Motors owns Lotus\n: |> >and Vauxhall.  Rover is only owned 20% by Honda.\n: |> \n: |> Yes, it\'s a minor blasphemy that U.S. companies would ?? on the likes of A.M.,\n: |> Jaguar, or (sob) Lotus.  It\'s outright sacrilege for RR to have non-British\n: |> ownership

In [9]:
%%time
labels = newsgroups_train.target
texts = newsgroups_train.data

MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 1000

tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)

CPU times: user 408 ms, sys: 9.31 ms, total: 417 ms
Wall time: 422 ms


In [10]:
# Show the first ten word ids of document 0, followed by its raw text.
print(sequences[0][:10], texts[0], sep="\n")

[12, 1265, 270, 70, 769, 1233, 20, 33, 1207, 4237]
From: npet@bnr.ca (Nick Pettefar)
Subject: Re: Happy Easter!
Nntp-Posting-Host: bmdhh299
Organization: BNR Europe Ltd, Maidenhead, UK
X-Newsreader: TIN [version 1.1 PL8]
Lines: 37

kevinh, on the Tue, 20 Apr 1993 13:23:01 GMT wibbled:

: In article <1993Apr19.154020.24818@i88.isc.com>, jeq@lachman.com (Jonathan E. Quist) writes:
: |> In article <2514@tekgen.bv.tek.com> davet@interceptor.cds.tek.com (Dave Tharp CDS) writes:
: |> >In article <1993Apr15.171757.10890@i88.isc.com> jeq@lachman.com (Jonathan E. Quist) writes:
: |> >>Rolls-Royce owned by a non-British firm?
: |> >>
: |> >>Ye Gods, that would be the end of civilization as we know it.
: |> >
: |> >  Why not?  Ford owns Aston-Martin and Jaguar, General Motors owns Lotus
: |> >and Vauxhall.  Rover is only owned 20% by Honda.
: |> 
: |> Yes, it's a minor blasphemy that U.S. companies would ?? on the likes of A.M.,
: |> Jaguar, or (sob) Lotus.  It's outright sacrilege for RR to have

In [11]:
# Keep a handle on the tokenizer's word -> id mapping and report how many
# distinct tokens it contains.
word_index = tokenizer.word_index
print("Unique tokens:", len(word_index))

Unique tokens: 25458


In [12]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH ids so the documents form
# one 2-D array. The padding/truncating modes are the library defaults, spelled
# out here: shorter sequences are zero-padded at the front, longer ones are cut
# from the front.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='pre',
)

In [25]:
# Check the padded matrix shape, then look at a 50-id window of document 0.
print(data.shape)
print(data[0, 680:730])

(1767, 1000)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0   12 1265  270   70  769 1233   20   33 1207
 4237   52   47   49 3173   25  270 2009  999 1735  146  107  402  471
  238   34   34 1052   21 1390 3928   15]


In [15]:
# One-hot encode the integer class ids into an (n_samples, n_classes) matrix.
labels = np.asarray(labels)
# Encode only while `labels` is still a 1-D vector of class ids. Without this
# guard, re-running the cell would one-hot encode the already-encoded matrix
# and silently produce a 3-D array.
if labels.ndim == 1:
    # Pass the class count explicitly so it does not depend on the largest
    # label that happens to be present.
    labels = to_categorical(labels, num_classes=len(newsgroups_train.target_names))
labels

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

In [16]:
# Report the shapes of the document matrix and the label matrix.
for caption, array in (('Data shape:', data), ('Label shape:', labels)):
    print(caption, array.shape)

Data shape: (1767, 1000)
Label shape: (1767, 3)


In [50]:
# Shuffle the samples and hold out a fraction of them for validation.
VALIDATION_SPLIT = 0.2
SPLIT_SEED = 42  # fixed seed so a fresh-kernel run always produces the same split

# Apply the same permutation to documents and labels so rows stay aligned.
# A local Generator is used so NumPy's global random state is left untouched.
# Note: `data` and `labels` are overwritten, so re-running this cell permutes
# the already-shuffled arrays again; restart the kernel to reproduce the split.
indices = np.random.default_rng(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split at an explicit index rather than slicing with -nb_validation_samples:
# if that count is 0, data[:-0] is empty and data[-0:] is the whole array,
# which would leave the training set empty.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

print(x_train.shape)
print(y_train.shape)

# Per-class sample counts (column sums of the one-hot labels) for each split.
print('Number of catrgories in traing and validation set ')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))


(1414, 1000)
(1414, 3)
Number of catrgories in traing and validation set 
[460. 478. 476.]
[118. 120. 115.]
