In [33]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential,Model
from keras.layers import Input,Dense,Dropout

In [17]:
# Fetch the predefined train and test splits of the 20 newsgroups corpus.
newsdata, newstest = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
# List the fields exposed by the returned dataset object.
print(newsdata.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [5]:
# Report how many training documents and how many topic names were loaded.
for label, collection in (
    ('number of train data: ', newsdata.data),
    ('number of news topic: ', newsdata.target_names),
):
    print(label, len(collection))

number of train data:  11314
number of news topic:  20


In [10]:
# Pair every training document with its integer topic label in a single frame.
data = pd.DataFrame(
    dict(email=newsdata.data, target=newsdata.target)
)

In [11]:
# Show the frame as this cell's output; the rendering below elides the middle rows.
data

Unnamed: 0,email,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [16]:
# Count the distinct email texts; a value equal to the row count means no duplicates.
print('without duplicate data: ', data.email.nunique())

without duplicate data:  11314


In [19]:
# Training texts/labels come from the frame; test texts/labels come straight
# from the fetched test split.
x_train, y_train = data['email'], data['target']
x_test, y_test = newstest.data, newstest.target

# Vocabulary cap handed to the tokenizer, and the number of newsgroup topics.
max_words, num_classes = 10000, 20

In [25]:
## mode = binary, count, tfidf, freq
def preprocess(train, test, mode, num_words=None):
    """Vectorize train and test texts into bag-of-words matrices.

    The tokenizer is fitted on ``train`` only and then applied to both
    collections, so the test matrix is built from the vocabulary learned on
    the training texts.

    Parameters
    ----------
    train, test
        Collections of raw documents (the notebook passes a pandas Series
        and a list of strings).
    mode : str
        Vectorization mode: one of 'binary', 'count', 'tfidf', 'freq'.
    num_words : int, optional
        Vocabulary size cap passed to ``Tokenizer``. When omitted, the
        notebook-level ``max_words`` constant is used, as before.

    Returns
    -------
    tuple
        ``(X_train, X_test, index_word)``: the two document matrices and the
        tokenizer's index-to-word mapping.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported modes.
    """
    valid_modes = ('binary', 'count', 'tfidf', 'freq')
    # Fail fast: otherwise an unknown mode is only reported after the whole
    # training corpus has already been tokenized.
    if mode not in valid_modes:
        raise ValueError(
            'Unknown vectorization mode: {!r} (expected one of {})'.format(
                mode, ', '.join(valid_modes)
            )
        )

    # Resolve the default lazily so the global is only read when needed.
    if num_words is None:
        num_words = max_words

    t = Tokenizer(num_words=num_words)
    t.fit_on_texts(train)
    X_train = t.texts_to_matrix(train, mode=mode)
    X_test = t.texts_to_matrix(test, mode=mode)
    return X_train, X_test, t.index_word


In [26]:
# Vectorize the raw texts with TF-IDF weighting.
X_train, X_test, idx_to_word = preprocess(x_train, x_test, 'tfidf')

# One-hot encode from the original integer labels rather than from
# y_train / y_test themselves. Re-running this cell then stays idempotent
# instead of one-hot encoding an already one-hot matrix.
y_train = to_categorical(data['target'], num_classes)
y_test = to_categorical(newstest.target, num_classes)

### Sequential Model

In [30]:
def sequence(X_train,y_train):
    """Build an uncompiled feed-forward classifier with the Sequential API.

    Layers, in order: Dense(256, relu) -> Dropout(0.5) -> Dense(128, relu)
    -> Dropout(0.5) -> Dense(num_classes, softmax).

    Parameters
    ----------
    X_train
        Feature matrix; only ``X_train.shape[1]`` is read, to size the input
        layer (the same way ``functional`` does).
    y_train
        Unused; kept so the signature mirrors ``functional``.

    Returns
    -------
    The ``Sequential`` model, not yet compiled.
    """
    # Size the input from the data instead of the global max_words, so the
    # model always matches the matrix it will be trained on.
    n_features = X_train.shape[1]

    model = Sequential()
    model.add(Dense(256, input_shape=(n_features,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    return model

In [35]:
# Build the Sequential network, then configure it for multi-class training.
model = sequence(X_train, y_train)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for three epochs, holding out 10% of the rows for validation.
model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=128,
    verbose=1,
    validation_split=0.1,
)

Train on 10182 samples, validate on 1132 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1a4ef22470>

### Functional API Model

In [43]:
def functional(X_train,y_train):
    """Build an uncompiled feed-forward classifier with the functional API.

    The input width is taken from ``X_train.shape[1]``. Two hidden stages
    (256 then 128 relu units, each followed by 0.5 dropout) feed a softmax
    layer with ``num_classes`` outputs. ``y_train`` is not read.
    """
    inputs = Input(shape=(X_train.shape[1],))

    # Thread the tensor through each Dense + Dropout stage in turn.
    hidden = inputs
    for units in (256, 128):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.5)(hidden)

    outputs = Dense(num_classes, activation='softmax')(hidden)
    return Model(inputs=inputs, outputs=outputs)

In [44]:
# Build the functional-API network, then configure it for multi-class training.
model = functional(X_train, y_train)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for three epochs, holding out 10% of the rows for validation.
model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=128,
    verbose=1,
    validation_split=0.1,
)

Train on 10182 samples, validate on 1132 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1a4f73e048>