In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
from torch.autograd import Variable

from gensim.models import Word2Vec, KeyedVectors
import nltk
from nltk.corpus import brown
from nltk import ngrams, RegexpTokenizer, FreqDist


import random, numpy as np, re
from collections import Counter, OrderedDict
from copy import deepcopy
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


def glove_to_word2vec(glove_input_file='data/glove/glove.6B.300d.txt',
                      word2vec_output_file='data/glove.6B.300d.txt.word2vec'):
    """Convert a GloVe text file into word2vec text format on disk.

    Args:
        glove_input_file: path of the GloVe vectors file to read.
        word2vec_output_file: path the converted file is written to.

    The defaults reproduce the paths this notebook originally hard-coded,
    so calling the function with no arguments behaves as before.
    """
    # Imported lazily so the notebook can be used without the conversion script
    # when the converted file already exists.
    from gensim.scripts.glove2word2vec import glove2word2vec
    glove2word2vec(glove_input_file, word2vec_output_file)


## Prepare the data

In [2]:
# Re-join every Brown-corpus sentence (a sequence of tokens) into one
# space-separated string.
sents = [" ".join(words) for words in brown.sents()]

def tokens2ids(sent, word_to_ix, max_sent_len=50):
    """Map a tokenised sentence to a padded array of vocabulary ids.

    Args:
        sent: iterable of token strings.
        word_to_ix: mapping token -> integer id. Its '<PAD>' entry (0 if absent)
            is used for padding and its '<UNK>' entry for tokens not in the map.
        max_sent_len: minimum length of the result; shorter sentences are
            right-padded up to it. Longer sentences are returned unpadded and
            untruncated.

    Returns:
        1-D int64 numpy array of ids.

    Raises:
        KeyError: a token is missing from `word_to_ix` and there is no '<UNK>' entry.
    """
    # Pad with the real padding id rather than an arbitrary sentinel, so the
    # ids stay valid row indices for an embedding table built from word_to_ix.
    pad_id = word_to_ix.get('<PAD>', 0)
    unk_id = word_to_ix.get('<UNK>')

    ids = []
    for token in sent:
        if token in word_to_ix:
            ids.append(word_to_ix[token])
        elif unk_id is not None:
            ids.append(unk_id)
        else:
            raise KeyError(token)

    # A negative count yields an empty list, so long sentences are left as they are.
    ids.extend([pad_id] * (max_sent_len - len(ids)))
    return np.asarray(ids, dtype=np.int64)

def getTrain(sents):
    """Tokenise sentences, build the vocabulary and encode every sentence as ids.

    Args:
        sents: iterable of sentence strings.

    Returns:
        (id_sents, vocab, word_to_ix) where
        - id_sents is a numpy array with one padded id row per kept sentence,
        - vocab is the set of distinct tokens,
        - word_to_ix maps '<PAD>' -> 0, '<UNK>' -> 1 and every token to an id >= 2.
    """
    tokenize = RegexpTokenizer(r'[a-z][a-z\']*').tokenize
    t_sents = [tokenize(sent.lower()) for sent in sents]
    # Keeping only sentences shorter than 40 tokens guarantees each one fits
    # within tokens2ids' default max_sent_len of 50, so all rows have equal length.
    t_sents = [tokens for tokens in t_sents if len(tokens) < 40]

    vocab = set(token for tokens in t_sents for token in tokens)

    # Create the mapping with the special symbols first, then add the words.
    # (Assigning into word_to_ix before it existed raised UnboundLocalError.)
    # Sorting makes the ids reproducible across runs; set order is not.
    word_to_ix = {'<PAD>': 0, '<UNK>': 1}
    for i, word in enumerate(sorted(vocab)):
        word_to_ix[word] = i + 2

    id_sents = np.asarray([tokens2ids(tokens, word_to_ix) for tokens in t_sents])
    return id_sents, vocab, word_to_ix

In [None]:
# Encode the corpus, then derive the inverse (id -> word) lookup table.
id_sents, vocab, word2index = getTrain(sents)
train_data = id_sents
index2word = dict((idx, word) for word, idx in word2index.items())

In [None]:
def pad_to_batch(batch, pad_id=None):
    """Right-pad the inputs of a batch to a common length and stack them.

    Args:
        batch: sequence of (x, y) pairs. Each x is a 2-D integer tensor whose
            second dimension is the sequence length; each y is a tensor.
        pad_id: id used to fill the padded positions. Defaults to the
            module-level word2index['<PAD>'].

    Returns:
        (inputs, targets): the padded x tensors concatenated along dim 0, and
        the y tensors concatenated and flattened to 1-D.
    """
    if pad_id is None:
        pad_id = word2index['<PAD>']

    x, y = zip(*batch)
    max_x = max(s.size(1) for s in x)

    x_p = []
    for seq in x:
        missing = max_x - seq.size(1)
        if missing > 0:
            # new_full builds the padding with the same dtype and device as
            # `seq`, so no separate (CPU/CUDA) LongTensor alias is needed.
            padding = seq.new_full((1, missing), pad_id)
            seq = torch.cat([seq, padding], 1)
        x_p.append(seq)
    return torch.cat(x_p), torch.cat(y).view(-1)

def getBatch(batch_size,train_data):
    """Shuffle `train_data` in place and yield it in consecutive mini-batches.

    Args:
        batch_size: maximum number of examples per batch (must be positive).
        train_data: mutable sequence or numpy array of examples. It is
            reordered in place when iteration starts.

    Yields:
        Slices of `train_data` of length `batch_size`; the last one may be
        shorter. Nothing is yielded for empty data.
    """
    n_examples = len(train_data)
    if isinstance(train_data, np.ndarray):
        # random.shuffle swaps elements with `a[i], a[j] = a[j], a[i]`. On a
        # multi-dimensional ndarray those elements are views, so the swap
        # overwrites one row with the other and silently duplicates data.
        # Shuffle an index list instead and reorder through a fancy-index copy.
        order = list(range(n_examples))
        random.shuffle(order)
        if n_examples > 1:
            train_data[:] = train_data[order]
    else:
        random.shuffle(train_data)

    for start in range(0, n_examples, batch_size):
        yield train_data[start:start + batch_size]
        
def prepare_sequence(seq, to_index):
    """Convert a token sequence into a 1-D tensor of int64 ids.

    Args:
        seq: iterable of tokens.
        to_index: mapping token -> id; must contain "<UNK>" if `seq` may hold
            tokens that are not in the mapping.

    Returns:
        A LongTensor (wrapped in Variable, as elsewhere in this notebook) with
        one id per token. It is created on the CPU.

    Raises:
        KeyError: an unknown token is met and `to_index` has no "<UNK>" entry.
    """
    idxs = [to_index[w] if w in to_index else to_index["<UNK>"] for w in seq]
    # torch.LongTensor replaces the bare `LongTensor` name, which is not
    # defined anywhere in this notebook.
    return Variable(torch.LongTensor(idxs))

In [None]:
class CNNClassifier(nn.Module):
    """Convolutional sentence classifier.

    Token ids are embedded, passed through one 2-D convolution per kernel
    size (each spanning the full embedding width), max-pooled over time,
    concatenated and projected to `output_size` log-probabilities.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        output_size: number of output classes.
        kernel_dim: output channels of every convolution.
        kernel_sizes: heights (in tokens) of the convolution windows.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size,embedding_dim,output_size,kernel_dim=100,kernel_sizes=(3,4,5),dropout=0.5):
        super(CNNClassifier,self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # kernel_size = (K, D): each filter covers K consecutive tokens and
        # the whole embedding dimension.
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes)*kernel_dim, output_size)

    def init_weights(self,pretrained_word_vectors,is_static=False):
        """Replace the embedding table with pretrained vectors.

        Args:
            pretrained_word_vectors: numpy array used as the new embedding
                weight (converted to float32).
            is_static: when True the embedding is frozen (requires_grad=False).

        The embedding parameter object is replaced, so call this before
        building an optimizer from `parameters()`.
        """
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False

    def forward(self, inputs,is_training=False):
        """Return per-class log-probabilities for a batch of id sequences.

        Args:
            inputs: integer tensor of token ids, one row per example. The
                sequence length must be at least the largest kernel size.
            is_training: when True, dropout is applied to the pooled features.

        Returns:
            Tensor with one row of `output_size` log-probabilities per example.
        """
        inputs = self.embedding(inputs).unsqueeze(1) # (B,1,T,D)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs] #[(N,Co,W), ...]*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] #[(N,Co), ...]*len(Ks)

        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated) # (N,len(Ks)*Co)
        out = self.fc(concated)
        # Normalise over the class dimension explicitly; calling log_softmax
        # without `dim` relies on a deprecated implicit choice.
        return F.log_softmax(out, dim=1)

In [None]:
word2vec_path = ""
def load_embed_model(filename='data/glove.6B.300d.txt.word2vec'):
    """Load pretrained word vectors stored in word2vec text format.

    Args:
        filename: path of the word2vec-format text file. The default is the
            300-dimensional file written by glove_to_word2vec(); the rest of
            the notebook (random fallback vectors, classifier embedding size)
            assumes 300 dimensions, so the 100-dimensional file must not be
            loaded here.

    Returns:
        The gensim KeyedVectors object holding the vectors.
    """
    from gensim.models import KeyedVectors
    # load the Stanford GloVe model
    embed_model = KeyedVectors.load_word2vec_format(filename, binary=False)
    return embed_model

# Alternative source: the binary GoogleNews word2vec vectors.
# model = KeyedVectors.load_word2vec_format('../dataset/GoogleNews-vectors-negative300.bin', binary=True)

# Bind the embeddings to `embed_model`, the global name sent2embed() reads.
# `model` stays available as an alias because the following cells index the
# embeddings through it before the name is rebound to the classifier.
embed_model = load_embed_model()
model = embed_model

In [None]:
# Display the number of entries in the embedding model's `index2word` collection.
len(model.index2word)

In [None]:
# Build the embedding initialisation matrix: row i holds the vector of the
# word whose id is i.
pretrained = []

for i in range(len(word2index)):
    # Look the word up by id. Indexing word2index (word -> id) with an int
    # always failed, so every row used to fall through to the random branch.
    word = index2word.get(i)
    if word is not None and word in model:
        pretrained.append(model[word])
    else:
        # No pretrained vector (e.g. '<PAD>', '<UNK>', rare words): random
        # initialisation. 300 matches the embedding_dim given to CNNClassifier.
        pretrained.append(np.random.randn(300))

pretrained_vectors = np.vstack(pretrained)


## Train


In [None]:
# Training hyper-parameters used by the cells below.
EPOCH=5  # number of passes over train_data in the training loop
BATCH_SIZE=50  # batch size handed to getBatch
KERNEL_SIZES = [3,4,5]  # convolution window heights passed to CNNClassifier
KERNEL_DIM = 100  # output channels per convolution in CNNClassifier
LR = 0.001  # learning rate given to the Adam optimizer

In [None]:
# Build the classifier, load the pretrained embeddings, then create the loss
# and optimizer. The order matters: init_weights replaces the embedding
# parameter and .cuda() moves parameters, so the optimizer is created last.
# NOTE(review): `target2index` and `USE_CUDA` are not defined in the visible
# notebook; they must exist before this cell runs. TODO confirm their source.
# 300 is the embedding dimension and must match the width of pretrained_vectors.
model = CNNClassifier(len(word2index),300,len(target2index),KERNEL_DIM,KERNEL_SIZES)
model.init_weights(pretrained_vectors) # initialize embedding matrix using pretrained vectors

if USE_CUDA:
    model = model.cuda()
    
# forward() already returns log_softmax output; CrossEntropyLoss applies
# log_softmax again before the negative log-likelihood.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=LR)

In [None]:
# Train for EPOCH passes, reporting the mean loss of the batches seen since
# the previous report every 100 batches.
for epoch in range(EPOCH):
    losses=[]
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        inputs,targets = pad_to_batch(batch)

        model.zero_grad()
        preds = model(inputs,True)  # True enables the dropout branch in forward()

        loss = loss_function(preds,targets)
        # The loss is a scalar tensor; .item() extracts the Python float.
        # (`loss.data.tolist()[0]` fails because tolist() on a 0-dim tensor
        # returns a plain number, not a list.)
        losses.append(loss.item())
        loss.backward()

        # Optional gradient clipping (disabled):
        #for param in model.parameters():
        #    param.grad.data.clamp_(-3, 3)

        optimizer.step()

        if i % 100==0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch,EPOCH,np.mean(losses)))
            losses=[]

In [None]:
# Evaluate: count the test examples whose highest-scoring class equals the target.
# NOTE(review): `test_data` is not defined in the visible notebook; each item
# is assumed to be an (input, target) pair holding a single example — confirm.
accuracy = 0  # running count of correct predictions (not yet a ratio)
for test in test_data:
    pred = model(test[0]).max(1)[1]  # index of the maximum along dim 1
    pred = pred.data.tolist()[0]
    target = test[1].data.tolist()[0][0]  # indexing implies a nested (2-D) target
    if pred == target:
        accuracy+=1

# Percentage of correct predictions; raises ZeroDivisionError if test_data is empty.
print(accuracy/len(test_data)*100)

In [None]:
# Tokenizer matching runs of lowercase letters and apostrophes that start with a letter.
tokenizer = RegexpTokenizer(r'[a-z][a-z\']*').tokenize
# tokenized = [tokenizer(rev) for rev in train]

def sent2embed(sent):
    """Return the embedding vectors of a sentence's tokens as one array.

    Args:
        sent: sentence string.

    Returns:
        numpy array with one row per token, looked up in the module-level
        `embed_model`. An empty sentence gives an empty array.

    Raises:
        KeyError: a token has no vector in `embed_model`.
    """
    # The pattern only matches lowercase letters, so lowercase first (as
    # getTrain does); otherwise "This" would be tokenised as "his".
    tokens = tokenizer(sent.lower())
    embed_matrix = np.asarray([embed_model[token] for token in tokens])

    return embed_matrix

In [None]:
# Try sent2embed on a short sentence and display the shape of the resulting array.
m = sent2embed("this is my life")
m.shape