In [14]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
%matplotlib inline
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


# Load the training split of the 20 Newsgroups corpus and report its contents.
newsdata = fetch_20newsgroups(subset='train')
print(newsdata.keys())

print(f'train sample num: {len(newsdata.data)}')
print(f'total theme num: {len(newsdata.target_names)}')
print(newsdata.target_names)
print(newsdata.target)
# Uncomment to look at one raw post:
# print(newsdata.data[0])

# One row per post: the raw text in 'email', the integer class id in 'target'.
data = pd.DataFrame({'email': newsdata.data, 'target': newsdata.target})
data.head()  # first five rows; a bare expression, so nothing is printed here

# Uncomment to plot how many posts each class has:
# data['target'].value_counts().plot(kind='bar')

# Load the test split.
newsdata_test = fetch_20newsgroups(subset='test', shuffle=True)

# Texts and labels for both splits.
train_email, train_label = data['email'], data['target']
test_email, test_label = newsdata_test.data, newsdata_test.target

max_words = 10000  # vocabulary cap handed to the Tokenizer (num_words)
num_classes = 20   # width of the one-hot label vectors

def prepare_data(train_data,test_data,mode):
    """Encode train and test texts as document-term matrices.

    A fresh ``Tokenizer`` limited to ``max_words`` is fitted on the training
    texts only, and that same tokenizer is then used to encode both splits.

    Args:
        train_data: iterable of training texts; also used to fit the tokenizer.
        test_data: iterable of test texts.
        mode: value forwarded to ``Tokenizer.texts_to_matrix`` (the callers in
            this file pass 'binary', 'count', 'tfidf' or 'freq').

    Returns:
        A tuple ``(train_matrix, test_matrix, index_word)`` where
        ``index_word`` is the fitted tokenizer's ``index_word`` attribute.
    """
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_data)
    # Encode the train split first, then the test split, with the same tokenizer.
    train_matrix, test_matrix = (
        tokenizer.texts_to_matrix(texts, mode=mode)
        for texts in (train_data, test_data)
    )
    return train_matrix, test_matrix, tokenizer.index_word


# Binary (word present / absent) encoding of both splits, plus the tokenizer's
# index -> word lookup.
X_train, X_test, index_to_word = prepare_data(train_email, test_email, 'binary')

# One-hot encode the integer class ids of both splits.
y_train, y_test = (
    to_categorical(labels, num_classes)
    for labels in (train_label, test_label)
)

def fit_and_evaluate(X_train,y_train,X_test,y_test,epochs=5,batch_size=128):
    """Train a small feed-forward classifier and return its test accuracy.

    The input width and the number of output units are read from the data
    (``X_train.shape[1]`` and ``y_train.shape[1]``) instead of module-level
    constants, so the function works for any feature count or label set.

    Args:
        X_train: 2-D array of training features, one row per sample.
        y_train: 2-D one-hot array of training labels, one row per sample.
        X_test: test features with the same number of columns as ``X_train``.
        y_test: one-hot test labels with the same number of columns as
            ``y_train``.
        epochs: number of training epochs (default 5).
        batch_size: batch size used for both training and evaluation
            (default 128).

    Returns:
        The second value returned by ``model.evaluate`` on the test data,
        i.e. the 'accuracy' metric the model is compiled with.
    """
    n_features = X_train.shape[1]
    n_classes = y_train.shape[1]

    model = Sequential([
        Dense(256, input_shape=(n_features,), activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),
    ])
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    # 10% of the training rows are held out for per-epoch validation.
    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_split=0.1,
    )

    # evaluate() returns [loss, accuracy]; only the accuracy is reported.
    _, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=0)
    return accuracy


# Train one model per Tokenizer matrix mode and report each test accuracy.
modes = ['binary', 'count', 'tfidf', 'freq']

for mode in modes:
    # Re-encode both splits for this mode; the one-hot labels do not change.
    X_train, X_test, _ = prepare_data(train_email, test_email, mode)
    score = fit_and_evaluate(X_train, y_train, X_test, y_test)
    # Keep a space between the mode name and the label so the line reads
    # "binary mode accuracy: ..." rather than "binarymode accuracy: ...".
    print(f'{mode} mode accuracy: {score}')


dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
train sample num: 11314
total theme num: 20
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
[7 4 4 ... 3 1 8]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
binarymode accuracy:  0.8285979628562927
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
countmode accuracy:  0.8207647204399109
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
tfidfmode accuracy:  0.835767388343811
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
freqmode accuracy:  0.6776420474052429
