#### Just run the cells marked with star[****] to produce features and labels:

## Importing 20_Newsgroups Public Dataset:

In [0]:
#***************************************************************************************
import numpy as np
import pickle
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')
#combining training and testing dataset
train_x,y_train = train.data,train.target 
test_x,y_test = test.data,test.target
data = train_x+test_x
targets = np.array(y_train.tolist()+y_test.tolist())
#***************************************************************************************

In [0]:
#A glimpse into individual Document/email:
data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [0]:
#News Genres:
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [0]:
#Shape of training and testing data:
y_train.shape,y_test.shape

((11314,), (7532,))

## Tokenizing and cleaning the text using NLTK

In [0]:
import nltk
import re 
import pickle
from nltk.corpus import wordnet 
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
stopwords = list(stopwords.words('english'))+['From','Subject','Organization','Lines','\n']
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

## This cell is refining the dataset by removing punctuations, and doing afterwards tokenization and lemmatization.
* I have already run this cell and stored refined training and testing datsets in pickle format [__See cell below this cell__].

In [0]:
def lemmatization(text_tuple_list):
    new_text = []
    for word,tag in text_tuple_list:
        if word not in stopwords: #stopwords removal
            if tag.startswith('J'):
                new_text.append(lemmatizer.lemmatize(word,wordnet.ADJ)) #giving the proper Part-of-speech
            elif tag.startswith('V'):
                new_text.append(lemmatizer.lemmatize(word,wordnet.VERB)) 
            elif tag.startswith('N'):
                new_text.append(lemmatizer.lemmatize(word,wordnet.NOUN))
            elif tag.startswith('R'):
                new_text.append(lemmatizer.lemmatize(word,wordnet.ADV))
            else:
                new_text.append(lemmatizer.lemmatize(word,wordnet.NOUN))
    return new_text

def get_words_only(doc): #doc: the entire dataset.
    ref_text = [] 
    for text in doc: #text:individual email.
        ref_text.append(lemmatization(nltk.pos_tag(tokenizer.tokenize(' '.join(re.findall(r'\w+',text)))))) 
        # Above code-line: Puntuation removal,Tokenization & Lemmatization in respective order.
    return ref_text
ref_data = get_words_only(data)

In [0]:
# with open('refined_data.pkl','wb') as fp:
#     pickle.dump(ref_data,fp)
with open('refined_data.pkl','rb') as fp:
    ref_data = pickle.load(fp)

In [0]:
# Unrefined data:
data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [0]:
# After Refining:
' '.join(ref_data[0])

'lerxst wam umd edu thing WHAT car Nntp Posting Host rac3 wam umd edu University Maryland College Park 15 I wonder anyone could enlighten car I saw day It 2 door sport car look late 60 early 70 It call Bricklin The door really small In addition front bumper separate rest body This I know If anyone tellme model name engine spec year production car make history whatever info funky look car please e mail Thanks IL bring neighborhood Lerxst'

## Word2Vec using Gensim:

In [0]:
import gensim
# import word2vec model from gensim
# In here we are making 32 length vectors for words

In [0]:
model = gensim.models.Word2Vec(ref_data, min_count=2, size = 32,workers=4) # Training documents
model.save('TrainedModel')
# model = gensim.models.Word2Vec.load('TrainedModel')
list(model.wv.vocab.keys())[:10] #Listing out some words in the vocabulary

['lerxst',
 'wam',
 'umd',
 'edu',
 'thing',
 'WHAT',
 'car',
 'Nntp',
 'Posting',
 'Host']

In [0]:
model['university'] # Vector of length 32 for word "university"

  """Entry point for launching an IPython kernel.


array([-1.310268  , -1.3606578 , -0.23493813,  0.8143682 ,  0.17923371,
       -0.11570183, -0.5684594 , -0.77626234, -0.29843172,  1.8071996 ,
       -1.3042477 ,  0.45951122,  0.18712486, -1.35923   ,  1.6233407 ,
        0.04698589,  0.8572935 ,  0.31643814, -2.1183898 , -0.7802545 ,
       -0.69086695,  0.36233494, -1.2134007 , -1.4227774 , -0.6366987 ,
        0.8359502 , -0.68532044, -0.6712709 , -0.23325507, -0.92290527,
       -0.795331  , -0.446155  ], dtype=float32)

In [0]:
# Making all sentences of same length:
from sklearn.decomposition import PCA
pca = PCA(n_components=10, whiten=True) 
## Restricting the sentence length to 10 words for avoiding computational Complexity.

## Getting Features for our model

In [0]:
# Features:
def get_train_WordVectors(doc):
    text2vec = []
    for text_list in doc:
        temp=[]
        for word in text_list:
            if word in train_model.wv.vocab.keys():
                temp.append(train_model[word]) # Using vectors for words
        # getting features of dim:10*32
        text2vec.append(np.transpose(pca.fit_transform(np.transpose(np.array(temp))))) 
    return np.array(text2vec)
features = get_train_WordVectors(ref_data) 

  


In [0]:
# Splitting into train and test:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
for train_index, test_index in sss.split(features, targets):
    train_features, test_features = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

## Shapes for training and testing data:

In [0]:
train_features.shape,test_features.shape ## Here 32 is the length of word vectors

((13192, 10, 32), (5654, 10, 32))

In [0]:
train_features[0][:2]

array([[ 0.23186775, -0.6520217 , -0.4004753 , -1.1607968 ,  0.03996227,
         0.08741288,  0.7559615 , -0.40436098, -0.6398477 , -1.4138429 ,
         1.8241627 ,  0.746278  , -0.8856913 ,  0.25858217,  0.16521806,
        -1.9235377 ,  0.11031622, -1.2854174 ,  1.5676588 ,  0.28131175,
         1.2927779 ,  0.6763326 ,  0.19645643,  2.588946  , -0.5486543 ,
        -0.8266543 , -0.38063905, -1.2012979 ,  0.22351465, -0.63498795,
         0.18617636,  1.1252905 ],
       [ 0.721735  , -1.5837886 ,  0.17612976, -1.3189217 , -0.68496716,
         0.15538706,  1.0982243 ,  1.321961  , -0.08356301,  0.61828643,
        -0.51905197, -0.97500336,  0.22576787, -1.5622381 ,  0.46475992,
        -1.5012553 , -0.98459566,  0.6308646 , -0.35756305, -0.48749956,
         1.1084393 , -0.40805563, -0.0325205 , -1.5285044 , -1.3362103 ,
         0.8110064 ,  1.092042  ,  0.5011803 ,  1.5817442 ,  0.02947482,
         1.0058678 ,  1.8208686 ]], dtype=float32)

## I have stored all these files, so you need not to run all above code.
#### Just run the  cell below

In [0]:
#***************************************************************************************
# with open('train_features.pkl','wb') as fp:
#     pickle.dump(train_features,fp)
# with open('test_features.pkl','wb') as fp:
#     pickle.dump(test_features,fp)
# with open('y_train.pkl','wb') as fp:
#     pickle.dump(train_features,fp)
# with open('y_test.pkl','wb') as fp:
#     pickle.dump(test_features,fp)
with open('train_features.pkl','rb') as fp:
    train_features = pickle.load(fp)
with open('test_features.pkl','rb') as fp:
    test_features = pickle.load(fp)
with open('y_train.pkl','rb') as fp:
    y_train = pickle.load(fp)
with open('y_test.pkl','rb') as fp:
    y_test = pickle.load(fp)
#***************************************************************************************

In [4]:
train_features.shape

(13192, 10, 32)

## Training the model using CuDNNLSTM:

In [0]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,CuDNNLSTM

In [6]:
model = Sequential()
model.add(CuDNNLSTM(128,input_shape = (train_features.shape[1:]),return_sequences = True))
model.add(Dropout(0.2))

model.add(CuDNNLSTM(128))
model.add(Dropout(0.2))

model.add(Dense(32,activation = 'relu'))
model.add(Dropout(0.2))

model.add(Dense(20,activation = 'softmax'))

opt = tf.keras.optimizers.Adam(lr = 1e-3,decay = 1e-5)
model.compile(optimizer = opt,loss = "sparse_categorical_crossentropy" , metrics=['accuracy'])
model.fit(train_features,y_train,epochs = 10,validation_data = (test_features,y_test))

Train on 13192 samples, validate on 5654 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbbdfeb6978>