In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
import os
import time
import copy
import pickle
import random
from random import shuffle 
import matplotlib.pyplot as plt
from scipy import spatial

from sklearn.manifold import TSNE
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
# from keras.utils.np_utils import to_categorical
from gensim.models import Word2Vec
from multiprocessing import cpu_count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
# from torchtext.data import Example
# import torchtext

torch.manual_seed(10)


%load_ext autoreload
%autoreload 2
this = sys.modules[__name__]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [0]:
def read_preTrained(file_name):
    with open(file_name,'r') as f:
        word_vocab = set() # not using list to avoid duplicate entry
        word2vector = {}
        for line in f:
            line_ = line.strip() #Remove white space
            words_Vec = line_.split()
            if len(words_Vec) > 301:
                continue
            try:
                word_vocab.add(words_Vec[0])
                word2vector[words_Vec[0]] = np.array(words_Vec[1:],dtype=float)
            except ValueError:
                continue
    return word_vocab,word2vector

def read_numpy_files():
    """Instead of running the entire pipeline at all times."""
    filename = os.path.join(curDir, 'data', 'train_test_data.dat') ##@
    with open(filename, 'rb') as handle:
        x_train = pickle.load(handle)
        y_train = pickle.load(handle)
        x_test = pickle.load(handle)
        y_test = pickle.load(handle)
        int_category = pickle.load(handle)
        category_int = pickle.load(handle)

    return (x_train, y_train), (x_test, y_test), int_category, category_int

def seed_everything(seed=10):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [0]:
document_max_num_words = 15
emb_dim = 300
firstTime = False
cache = True
reuters = False
# reader.generate_categories(reuters)
# class_names = reader.categories # works
print('reading data...')
print('# of dimensions is: ', emb_dim)
(x_train, y_train), (x_test, y_test), int_category, category_int = read_numpy_files()
# vocabulary = np.insert(np.load('./data/vocabulary.npy'), 0, '')
vocabulary = np.load('./data/vocabulary.npy')
categories = np.load('./data/categories.npy')
print('Training data size: ', x_train.shape, y_train.shape)
print('Testing data size: ', x_test.shape, y_test.shape)
print('Vocabulary size: ', len(vocabulary))
print('Categories: ', len(categories), categories)
# print(int_category)
# word_to_ix = {word: i for i, word in enumerate(vocabulary)}
# bad_score = ['COLLEGE', 'ENVIRONMENT', 'FIFTY', 'HEALTHY LIVING', 'LATINO VOICES', 'MONEY', 'PARENTS', 'TASTE', 'WORLD NEWS']
# x_train = reader.vectorize_idx(x_train, word_to_ix, document_max_num_words)
# y_train = np.array(list(y_train.values()))
# x_test = reader.vectorize_idx(x_test, word_to_ix, document_max_num_words)
# y_test = np.array(list(y_test.values()))
# reader.save_data(x_train, y_train, x_test, y_test, int_category, category_int)

reading data...
# of dimensions is:  300
Training data size:  (160682, 15) (160682, 40)
Testing data size:  (40171, 15) (40171, 40)
Vocabulary size:  55618
Categories:  40 ['ARTS' 'ARTS & CULTURE' 'BLACK VOICES' 'BUSINESS' 'COLLEGE' 'COMEDY'
 'CRIME' 'CULTURE & ARTS' 'DIVORCE' 'EDUCATION' 'ENTERTAINMENT'
 'ENVIRONMENT' 'FIFTY' 'FOOD & DRINK' 'GOOD NEWS' 'GREEN' 'HEALTHY LIVING'
 'HOME & LIVING' 'IMPACT' 'LATINO VOICES' 'MEDIA' 'MONEY' 'PARENTING'
 'PARENTS' 'POLITICS' 'QUEER VOICES' 'RELIGION' 'SCIENCE' 'SPORTS' 'STYLE'
 'STYLE & BEAUTY' 'TASTE' 'TECH' 'TRAVEL' 'WEDDINGS' 'WEIRD NEWS'
 'WELLNESS' 'WOMEN' 'WORLD NEWS' 'WORLDPOST']


In [0]:
# vocab, w2v = read_preTrained("./model/glove_files/glove.840B.300d.txt")
# print("Total Words in DataSet:",len(vocab))
# emb_dim = w2v[list(vocab)[0]].shape[0]
# matrix_len = len(vocabulary)
# weights_matrix = np.zeros((matrix_len, emb_dim))
# words_not_found = 0
# n_words = []

# for i, word in enumerate(vocabulary):
#     try: 
#         weights_matrix[i] = w2v[word]
#     except KeyError:
#         weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
#         words_not_found += 1
#         n_words.append(word)

# print('words not found:', words_not_found)
# print('random embedded words percentage = %.2f%%'%(words_not_found/(matrix_len)*100))
# import gc
# w2v.clear() 
# del w2v, vocab
# gc.collect()
# len(n_words)
# n_words[:50]
# np.savez_compressed('./data/weights_matrix_840B', weights_matrix)
weights_matrix = np.load('./data/weights_matrix_840B.npz')['arr_0']
weights_matrix = torch.tensor(weights_matrix, dtype=torch.float64)
print(weights_matrix.size())

torch.Size([55618, 300])


In [0]:
class LSTMTopic(nn.Module):

    def __init__(self, weights_matrix, lstm_out, dense_out, num_categories, noEmbTrain=True, uselast=True, maxpool=False):
        super(LSTMTopic, self).__init__()
        self.hidden_dim = lstm_out[1]
        self.uselast = uselast
        self.maxpool = maxpool
        num_embeddings, embedding_dim = weights_matrix.size()
        self.embedding = nn.Embedding.from_pretrained(weights_matrix, freeze=noEmbTrain)
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm1 = nn.LSTM(embedding_dim, self.hidden_dim , 3, batch_first=True, dropout=0.5)
        self.lstm2 = nn.LSTM(embedding_dim, self.hidden_dim , 1, batch_first=True)
        # The linear layer that maps from hidden state space to tag space
        # self.dense= nn.Linear(self.hidden_dim , dense_out)
        # self.relu = nn.ReLU()
        self.drop = nn.Dropout(0.5)
        self.out = nn.Linear(self.hidden_dim , num_categories)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        x = self.embedding(inputs)
        x, (h, c) = self.lstm2(x.float())
        #x, _ = nn.utils.rnn.pad_packed_sequence(x)
        x = self.drop(x)
        # x, _ = self.lstm2(x)
        # x, _ = self.lstm3(x)
        if self.uselast:
            x = x[:,-1].view(len(x), -1)
        elif self.maxpool:
            x = torch.transpose(x, 1, 2)#torch.Size([batch, hidden, seq])
            x = torch.tanh(x)
            x, indices = F.max_pool1d(x,x.size(2), return_indices=True)
            x = torch.tanh(x)
            x = x.squeeze(2)
        else:
            x = torch.mean(x, dim=1)
        # x = self.relu(self.dense(x))
        # x = self.drop(x)
        x = self.out(x)
        x = self.softmax(x)
        return x

In [0]:
num_categories = len(categories)
batch_size=128 
n_lr = 5e-3
decay = 1e-7
n_epochs = 10
lstm_out = [128,64,32]
dense_layer = 64
SEED = 10
n_splits = 9

In [0]:
beg = time.time()
model = LSTMTopic(weights_matrix, lstm_out, dense_layer, num_categories,True,False)
model.load_state_dict(torch.load(os.path.join(curDir,'ModelData','Pytorch_Model','model_state_dict_v3_1.pth')))
print("Loading time = %.2fs"%(time.time()-beg))

Loading time = 0.24s


In [0]:
CPU = True
if CPU:
    # device = torch.device("cpu")
    model.cpu()
    x_tt = torch.tensor(x_train, dtype=torch.long)
    y_tt = torch.tensor(y_train, dtype=torch.float32)
    y_pp = model(x_tt)
    x_t = torch.tensor(x_test, dtype=torch.long)
    y_t = torch.tensor(y_test, dtype=torch.float32)
    beg = time.time()
    y_p = model(x_t)
else:
    model.cuda()
    x_tt = torch.tensor(x_train, dtype=torch.long).cuda()
    y_tt = torch.tensor(y_train, dtype=torch.float32).cuda()
    y_pp = model(x_tt).detach()
    x_t = torch.tensor(x_test, dtype=torch.long).cuda()
    y_t = torch.tensor(y_test, dtype=torch.float32).cuda()
    beg = time.time()
    y_p = model(x_t).detach()
    
print('Predicted Train accuracy ',(y_pp.argmax(dim=1)==y_tt.argmax(dim=1)).sum().cpu().numpy()/len(y_pp))
print('Predicted Test accuracy ',(y_p.argmax(dim=1)==y_t.argmax(dim=1)).sum().cpu().numpy()/len(y_p))
print("prediction time = %.2fs"%(time.time()-beg))

Predicted Train accuracy  0.7165332769071832
Predicted Test accuracy  0.567349580543178
prediction time = 2.75s


In [0]:
beg = time.time()
y_p = model(x_t)
print('Predicted Test accuracy ',(y_p.argmax(dim=1)==y_t.argmax(dim=1)).sum().cpu().numpy()/len(y_p))
print("prediction time = %.2fs"%(time.time()-beg))

Predicted Test accuracy  0.5686938338602474
prediction time = 3.04s


In [0]:
f1_score(y_t.argmax(dim=1).cpu(), y_p.argmax(dim=1).cpu(),list(int_category.values()),average='weighted')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0

In [0]:
print('Train statistics:')
print('accuracy %s' % accuracy_score(y_pp.argmax(dim=1).cpu(), y_tt.argmax(dim=1).cpu()))
print(classification_report(y_tt.argmax(dim=1).cpu(), y_pp.argmax(dim=1).cpu(),target_names=list(int_category.values())))
print('Test statistics:')
print('accuracy %s' % accuracy_score(y_p.argmax(dim=1).cpu(), y_t.argmax(dim=1).cpu()))
print(classification_report(y_t.argmax(dim=1).cpu(), y_p.argmax(dim=1).cpu(),target_names=list(int_category.values())))

Train statistics:
accuracy 0.7165332769071832
                precision    recall  f1-score   support

          ARTS       0.52      0.39      0.44      1245
ARTS & CULTURE       0.49      0.42      0.45      1086
  BLACK VOICES       0.68      0.63      0.66      3664
      BUSINESS       0.67      0.63      0.65      4763
       COLLEGE       0.64      0.53      0.58       915
        COMEDY       0.70      0.57      0.63      4086
         CRIME       0.73      0.74      0.74      2708
CULTURE & ARTS       0.67      0.50      0.57       867
       DIVORCE       0.79      0.76      0.78      2717
     EDUCATION       0.53      0.39      0.45       806
 ENTERTAINMENT       0.78      0.83      0.81     12818
   ENVIRONMENT       0.66      0.48      0.56      1068
         FIFTY       0.48      0.30      0.37      1125
  FOOD & DRINK       0.75      0.84      0.79      4998
     GOOD NEWS       0.52      0.45      0.48      1134
         GREEN       0.56      0.54      0.55      2082
H