In [1]:
import numpy as np
from scipy.sparse import csc_matrix, hstack
# Load the 20 Newsgroups data set (training and test splits).
from sklearn.datasets import fetch_20newsgroups
# Fetch the train and test splits of 20 Newsgroups, passing the same random_state to both.
train = fetch_20newsgroups(subset='train', random_state=1)
test = fetch_20newsgroups(subset='test', random_state=1)

In [2]:
import pandas as pd
# Load the preprocessed post embeddings from "~"-separated CSV files.
# Column 0 holds the saved row index, so only columns 1..512 are kept;
# each split is stored as a CSC sparse matrix.
train_posts_embed = csc_matrix(
    pd.read_csv("train_posts_embed.csv", sep="~").iloc[:, 1:513]
)
test_posts_embed = csc_matrix(
    pd.read_csv("test_posts_embed.csv", sep="~").iloc[:, 1:513]
)


In [3]:
# Extracting features from text files
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
# Unigram + bigram counts (capped at 200,000 features) re-weighted with TF-IDF.
# The pipeline is fitted on the training posts only and then applied to the test posts.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(ngram_range=(1, 2), max_features=200000)),
        ('tfidf', TfidfTransformer(use_idf=True)),
    ]
)
X_train_tfidf = text_clf.fit_transform(train.data)
X_test_tfidf = text_clf.transform(test.data)

# Support vector machine (SVM) classifier

In [4]:
# Display the repr of the TF-IDF training matrix (shape, dtype, stored-element count).
X_train_tfidf

<11314x200000 sparse matrix of type '<class 'numpy.float64'>'
	with 3489004 stored elements in Compressed Sparse Row format>

In [5]:
from sklearn.linear_model import SGDClassifier
# Linear SVM baseline: hinge loss with an L2 penalty, trained by SGD on the TF-IDF features.
svm = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=60,
    random_state=42,
)
svm.fit(X_train_tfidf, train.target)
predicted = svm.predict(X_test_tfidf)
# Fraction of test posts whose predicted label equals the true label.
(predicted == test.target).mean()


#Out[5]:
#0.8364312267657993

0.8364312267657993

In [6]:
# Number of output classes, read from the loaded dataset instead of being
# hard-coded, so it stays correct if the set of newsgroups is ever changed.
num_labels = len(train.target_names)

In [7]:
from sklearn.preprocessing import LabelBinarizer
# Binarize the integer newsgroup labels into indicator rows.
# The encoder is fitted on the training labels and reused for the test labels.
encoder = LabelBinarizer().fit(train.target)
y_train, y_test = (encoder.transform(split.target) for split in (train, test))

In [8]:
# Merge the text-embedding features and the TF-IDF features column-wise.
# .toarray() returns a plain ndarray rather than the discouraged np.matrix that
# .todense() produces, and float32 (Keras' default float type) halves the
# memory needed for the dense result.
# NOTE(review): the dense matrices are still very large (every TF-IDF column is
# materialized); confirm the host has enough RAM before running this cell.
train_posts = hstack([train_posts_embed, X_train_tfidf], dtype=np.float32).toarray()
test_posts = hstack([test_posts_embed, X_test_tfidf], dtype=np.float32).toarray()

# DNN Classifier


In [9]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
# Feed-forward classifier: two ReLU hidden layers (256 and 64 units), each
# followed by dropout at rate 0.3, then a softmax output with num_labels units.
model = Sequential([
    Dense(256, input_shape=(train_posts.shape[1],)),
    Activation('relu'),
    Dropout(0.3),
    Dense(64),
    Activation('relu'),
    Dropout(0.3),
    Dense(num_labels),
    Activation('softmax'),
])
model.summary()

# Categorical cross-entropy matches the softmax output and the binarized targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               51331328  
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                1300      
__________

In [10]:
# Train for 2 epochs of 80 steps each on the merged dense features.
# No validation is run here: the validation_steps=1, validation_split=0.2 and
# validation_data=(test_posts, y_test) options are not enabled.
history = model.fit(
    train_posts,
    y_train,
    steps_per_epoch=80,
    epochs=2,
    verbose=1,
)
 

#Epoch 1/2
#80/80 [==============================] - 2724s 34s/step - loss: 0.8221 - acc: 0.8835
#Epoch 2/2
#80/80 [==============================] - 2483s 31s/step - loss: 0.0224 - acc: 0.9986

Epoch 1/2
Epoch 2/2


In [12]:

# Evaluate on the test split in batches of 500; with the compiled metrics,
# score[0] is the loss and score[1] is the accuracy.
score = model.evaluate(test_posts, y_test, batch_size=500, verbose=1)

print('Test accuracy:', score[1])


#7532/7532 [==============================] - 41s 5ms/step
#Test accuracy: 0.8647105682421667

Test accuracy: 0.8647105682421667
