In [1]:
from sklearn.datasets import fetch_20newsgroups

features_all = fetch_20newsgroups(subset='all').data
targets_all = fetch_20newsgroups(subset='all').target

In [2]:
from nltk import tokenize
import re
import os

# drop  \, ", '
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

reviews = []

for text in features_all:

    text = clean_str(text.encode('ascii','ignore').decode("utf-8"))
    
    sentences = tokenize.sent_tokenize(text) # list of string senteces

    reviews.append(sentences)

In [3]:
len(reviews)

18846

In [5]:
print(reviews[0])
len(reviews[0])

['from: mamatha devineni ratnam <mr47+@andrew.cmu.edu>\nsubject: pens fans reactions\norganization: post office, carnegie mellon, pittsburgh, pa\nlines: 12\nnntp-posting-host: po4.andrew.cmu.edu\n\n\n\ni am sure some bashers of pens fans are pretty confused about the lack\nof any kind of posts about the recent pens massacre of the devils.', 'actually,\ni am  bit puzzled too and a bit relieved.', 'however, i am going to put an end\nto non-pittsburghers relief with a bit of praise for the pens.', 'man, they\nare killing those devils worse than i thought.', 'jagr just showed you why\nhe is much better than his regular season stats.', 'he is also a lot\nfo fun to watch in the playoffs.', 'bowman should let jagr have a lot of\nfun in the next couple of games since the pens are going to beat the pulp out of jersey anyway.', 'i was very disappointed not to see the islanders lose the final\nregular season game.', 'pens rule!!', '!']


10

In [12]:
import unicodedata
import re

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    # s = re.sub(r"([,.!?])", r" \1 ", s)
    s = re.sub(r"[^a-zA-Z0-9]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

reviews_clean = [[normalize_string(sent) for sent in doc if normalize_string(sent)] for doc in reviews]

In [13]:
print(reviews_clean[0])
len(reviews_clean[0])

['from mamatha devineni ratnam mr47 andrew cmu edu subject pens fans reactions organization post office carnegie mellon pittsburgh pa lines 12 nntp posting host po4 andrew cmu edu i am sure some bashers of pens fans are pretty confused about the lack of any kind of posts about the recent pens massacre of the devils', 'actually i am bit puzzled too and a bit relieved', 'however i am going to put an end to non pittsburghers relief with a bit of praise for the pens', 'man they are killing those devils worse than i thought', 'jagr just showed you why he is much better than his regular season stats', 'he is also a lot fo fun to watch in the playoffs', 'bowman should let jagr have a lot of fun in the next couple of games since the pens are going to beat the pulp out of jersey anyway', 'i was very disappointed not to see the islanders lose the final regular season game', 'pens rule']


9

In [14]:
from nltk.corpus import stopwords
from collections import Counter

stoplist = stopwords.words('english')

class Lang():
    def __init__(self, docs, stoplist, min_count=30):
        self.docs = docs
        self.stoplist = stoplist
        self.min_count = min_count
        self.word2idx = {"<PAD>": 0}
        self.idx2word = {0: "<PAD>"}
        self.n_words = self.process_sents()

    def process_sents(self):
        words = []
        for doc in self.docs:
            for sent in doc:
                words += sent.split(' ')

        cc = 1
        counter = Counter(words)
        for word, num in counter.items():
            if num > self.min_count and word not in self.stoplist:
                self.word2idx[word] = cc
                self.idx2word[cc] = word
                cc += 1
        return cc
    
input_lang = Lang(reviews_clean, stoplist)

In [15]:
assert input_lang.n_words == len(input_lang.word2idx)
input_lang.n_words

12706

In [16]:
for i in range(5):
    print(input_lang.idx2word[i])

<PAD>
andrew
cmu
edu
subject


In [71]:
## load pre-trained word2vec
with open('/data/charley/crawl-300d-2M.pkl', 'rb') as pf:
    fb_w2v = pickle.load(pf)
    
embed_dim = 300
embed_matrix = np.zeros((input_lang.n_words, embed_dim))
flag = 0
for word, idx in input_lang.word2idx.items():
    word_vec = fb_w2v.get(word)
    if word_vec is not None:
        embed_matrix[idx] = word_vec
        flag += 1
print("There are {} words with pre-trained vector in vocab_size = {}.".format(flag, len(input_lang.word2idx)))

There are 11851 words with pre-trained vector in vocab_size = 12706.


In [72]:
np.save('EmbeddingMatrix_han', embed_matrix) # saved as EmbeddingMatrix.npy

In [75]:
embed_matrix.shape

(12706, 300)

In [19]:
input_docs = []
for doc in reviews_clean:
    doc_idx = [[input_lang.word2idx[word] for word in sent.split(' ') if word in input_lang.word2idx] 
               for sent in doc]
    input_docs.append(doc_idx)
print(len(input_docs)) # num of docs
print(len(input_docs[0])) # num of sentences in doc-1
print(len(input_docs[0][0])) # num of words in sentence-1 of doc-1

18846
9
35


In [21]:
print(input_docs[100])

[[2246, 310, 4, 1441, 302, 3917, 4338, 8, 512, 4339, 487, 1782, 587, 15, 1183, 306, 1056, 2410, 3810, 3, 469, 3810, 3, 147, 4340, 147, 163, 1176, 1067, 4341, 202, 588, 525, 2386, 409, 1381], [1062, 24, 1686, 24, 556, 809, 3994, 3311, 996, 4342], [1580, 202, 1406, 121, 4132, 4343, 30, 4344, 4345, 604, 456, 1118, 343], [4231, 4346, 4347, 4348, 578, 776, 2023, 3584, 4231], [4349, 498, 4350, 4351, 454, 4149, 4352, 4353, 4354, 4355, 4356, 4325, 2292, 3999, 4357, 2023, 342, 4132, 4343], [130, 2260, 4358, 2022, 3987, 371, 259, 4359, 787], [654, 121, 4360, 163, 4361, 4362, 4363], [2260]]


In [30]:
# drop sentencs without word 
input_docs_new = []
for doc in input_docs:
    input_docs_new.append([sent for sent in doc if sent])
print(input_docs_new[100])

[[2246, 310, 4, 1441, 302, 3917, 4338, 8, 512, 4339, 487, 1782, 587, 15, 1183, 306, 1056, 2410, 3810, 3, 469, 3810, 3, 147, 4340, 147, 163, 1176, 1067, 4341, 202, 588, 525, 2386, 409, 1381], [1062, 24, 1686, 24, 556, 809, 3994, 3311, 996, 4342], [1580, 202, 1406, 121, 4132, 4343, 30, 4344, 4345, 604, 456, 1118, 343], [4231, 4346, 4347, 4348, 578, 776, 2023, 3584, 4231], [4349, 498, 4350, 4351, 454, 4149, 4352, 4353, 4354, 4355, 4356, 4325, 2292, 3999, 4357, 2023, 342, 4132, 4343], [130, 2260, 4358, 2022, 3987, 371, 259, 4359, 787], [654, 121, 4360, 163, 4361, 4362, 4363], [2260]]


In [31]:
doc_lengths = [len(doc) for doc in input_docs_new]
sent_lengths = [len(sent) for doc in input_docs_new for sent in doc]
len(doc_lengths), len(sent_lengths)

(18846, 289497)

In [32]:
import numpy as np

def print_info(doc_lengths):
    print(np.mean(doc_lengths), np.std(doc_lengths), np.min(doc_lengths), np.max(doc_lengths))

print_info(doc_lengths)
print_info(sent_lengths)

15.36119070359758 30.04536205310365 1 1186
11.00158550865812 24.322507329258436 1 7031


In [47]:
num_docs = len(input_docs_new) # 18846
max_sents = 75 # 15 + 30x2
max_sent_length = 60

data = np.zeros((num_docs, max_sents, max_sent_length), dtype='int32')
sentences_per_document = np.zeros(num_docs, dtype='int32') # num of sentences of a doc
words_per_sentence = np.zeros((num_docs, max_sents), dtype='int32')

for i, doc in enumerate(input_docs_new):
    doc_len = min(len(doc), max_sents)
    doc = doc[:doc_len]
    sentences_per_document[i] = doc_len
    for j, sent in enumerate(doc):
        sent_len = min(len(sent), max_sent_length)
        data[i, j, :sent_len] = sent[:sent_len]
        words_per_sentence[i, j] = sent_len
        
data.shape

(18846, 75, 60)

In [48]:
data[100, :10, :10]

array([[2246,  310,    4, 1441,  302, 3917, 4338,    8,  512, 4339],
       [1062,   24, 1686,   24,  556,  809, 3994, 3311,  996, 4342],
       [1580,  202, 1406,  121, 4132, 4343,   30, 4344, 4345,  604],
       [4231, 4346, 4347, 4348,  578,  776, 2023, 3584, 4231,    0],
       [4349,  498, 4350, 4351,  454, 4149, 4352, 4353, 4354, 4355],
       [ 130, 2260, 4358, 2022, 3987,  371,  259, 4359,  787,    0],
       [ 654,  121, 4360,  163, 4361, 4362, 4363,    0,    0,    0],
       [2260,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [50]:
sentences_per_document[100], words_per_sentence[100]

(8, array([36, 10, 13,  9, 19,  9,  7,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], dtype=int32))

In [52]:
## split data
x_train, sents_doc_train, words_sent_train, y_train = \
            data[:11314], sentences_per_document[:11314], words_per_sentence[:11314], targets_all[:11314]
x_test, sents_doc_test, words_sent_test, y_test = \
            data[11314:], sentences_per_document[11314:], words_per_sentence[11314:], targets_all[11314:]
print(x_train.shape, sents_doc_train.shape, words_sent_train.shape, y_train.shape)

(11314, 75, 60) (11314,) (11314, 75) (11314,)


In [53]:
import pickle
newspaper_han_data = {'trian': {1:x_train, 2:sents_doc_train, 3:words_sent_train, 4:y_train}, 
                      'test': {1:x_test, 2:sents_doc_test, 3:words_sent_test, 4:y_test},
                      'lang': input_lang}

with open('newspaper_han.pkl', 'wb') as f:
    pickle.dump(newspaper_han_data, f)

In [56]:
from torch.utils.data import Dataset

class NewsPaperDatasets(Dataset):
    def __init__(self, src, src_sents, src_words, tgt):
        self.src = src
        self.src_sents = src_sents
        self.src_words = src_words
        self.tgt = tgt

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return self.src[idx], self.src_sents[idx], self.src_words[idx], self.tgt[idx]

In [59]:
import torch
try_loader = torch.utils.data.DataLoader(
                    NewsPaperDatasets(x_train[:16], sents_doc_train[:16], words_sent_train[:16], y_train[:16]),
                    num_workers = 2,
                    batch_size = 4,
                    shuffle = True,
                    drop_last = True)

for batch in try_loader:
    src, src_sents, src_words, tgt = batch
    print(src.size())
    print(src_sents)
    print(src_words.size())
    print(tgt)
    break

torch.Size([4, 75, 60])
tensor([4, 2, 9, 8], dtype=torch.int32)
torch.Size([4, 75])
tensor([ 4, 11, 19, 12])


In [65]:
train_loader = torch.utils.data.DataLoader(
                    NewsPaperDatasets(x_train, sents_doc_train, words_sent_train, y_train),
                    num_workers = 2,
                    batch_size = 64,
                    shuffle = True,
                    drop_last = True)

test_loader = torch.utils.data.DataLoader(
                    NewsPaperDatasets(x_test, sents_doc_test, words_sent_test, y_test),
                    num_workers = 2,
                    batch_size = 64,
                    shuffle = True,
                    drop_last = True)

In [62]:
from Model_HAN import *

model = HierarchialAttentionNetwork(n_classes = 20, 
                                    vocab_size = input_lang.n_words, 
                                    emb_size = 100, 
                                    word_rnn_size = 128, 
                                    sentence_rnn_size = 128, 
                                    word_rnn_layers = 2,
                                    sentence_rnn_layers = 2, 
                                    word_att_size = 128, 
                                    sentence_att_size = 128, 
                                    dropout = 0.5, 
                                    embed_weights = None)


In [61]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)

HierarchialAttentionNetwork(
  (sentence_attention): SentenceAttention(
    (word_attention): WordAttention(
      (embeddings): Embedding(12706, 100)
      (word_rnn): GRU(100, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
      (word_attention): Linear(in_features=256, out_features=128, bias=True)
      (word_context_vector): Linear(in_features=128, out_features=1, bias=False)
      (dropout): Dropout(p=0.5)
    )
    (sentence_rnn): GRU(256, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
    (sentence_attention): Linear(in_features=256, out_features=128, bias=True)
    (sentence_context_vector): Linear(in_features=128, out_features=1, bias=False)
    (dropout): Dropout(p=0.5)
  )
  (fc): Linear(in_features=256, out_features=20, bias=True)
  (dropout): Dropout(p=0.5)
)

In [70]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

model.train()

for i, batch in enumerate(train_loader):
    src, src_sents, src_words, tgt = [subtensor.long().to(device) for subtensor in batch]
    outputs, _, _ = model(src, src_sents, src_words)
    loss = criterion(outputs, tgt)
    loss.backward()
    optimizer.step()
    print(loss.item())
    



3.0030689239501953
2.9946494102478027
3.002408504486084
3.0123629570007324
3.008464813232422
3.013972759246826
3.0314745903015137
3.0051140785217285
2.9838218688964844
2.9835832118988037
3.0121264457702637
3.006166458129883
2.9987599849700928
2.967792510986328
3.021367073059082
3.0165843963623047
2.975843906402588
3.0029866695404053
2.987004280090332
2.9833061695098877
2.9648263454437256
3.010542154312134
2.9773077964782715
3.0179946422576904
2.936753749847412
2.9322447776794434
3.010312795639038
2.998314380645752
2.9591684341430664
2.967848300933838
2.937185525894165
2.9367311000823975
2.9368481636047363
2.8597543239593506
2.9494822025299072
2.9266867637634277
2.920616865158081
2.8704466819763184
2.9774718284606934
2.9136242866516113
2.977733850479126
2.981370449066162
3.0409085750579834
2.8914735317230225
2.860506772994995
2.9757823944091797
2.948040008544922
3.08793044090271
3.0192861557006836
3.5264792442321777
3.216702699661255
3.179327964782715
2.7487082481384277
3.05507016181945