In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

from gensim.models import Word2Vec
from nltk.corpus import treebank, brown
from nltk import ngrams, RegexpTokenizer, FreqDist
import numpy as np




def glove_to_word2vec(glove_input_file='data/glove/glove.6B.300d.txt',
                      word2vec_output_file='data/glove.6B.300d.txt.word2vec'):
    """Convert a GloVe text file into word2vec text format.

    Thin wrapper around gensim's ``glove2word2vec`` script.

    Args:
        glove_input_file: path of the GloVe vectors file to read.
            Defaults to the 300-dimensional 6B file.
        word2vec_output_file: path the converted file is written to.
            Pass e.g. 'data/glove.6B.100d.txt.word2vec' to produce a file
            with a different dimensionality.

    Returns:
        None. The converted vectors are written to ``word2vec_output_file``.
    """
    # Imported inside the function so gensim's script module is only needed
    # when a conversion is actually requested.
    from gensim.scripts.glove2word2vec import glove2word2vec
    glove2word2vec(glove_input_file, word2vec_output_file)


In [None]:
nchannels = 1 # for word embeddings
n_hid_filt = 16 ## total guess
embed_size = 100
stride = 1

class _netD(nn.Module):
    def __init__(self, ngpu):
        super(_netD, self).__init__()
        self.main = nn.Sequential(
            # X is (nchannels) x 64 x 64
            # in, out, kernal, stride, padding
            nn.Conv2d(nchannels, n_hid_filt, (3, embed_size) , stride,  bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            
            # state size. (n_hid_filt) x 32 x 32
            nn.Conv2d(n_hid_filt, n_hid_filt * 2, (3, embed_size), stride,  bias=False),
            nn.BatchNorm2d(n_hid_filt * 2),
            nn.LeakyReLU(0.2, inplace=True),
            
            # state size. (n_hid_filt*2) x 16 x 16
            nn.Conv2d(n_hid_filt * 2, n_hid_filt * 4, (3, embed_size), stride,  bias=False),
            nn.BatchNorm2d(n_hid_filt * 4),
            nn.LeakyReLU(0.2, inplace=True),
            
            # state size. (n_hid_filt*4) x 8 x 8
            nn.Conv2d(n_hid_filt * 4, n_hid_filt * 8, (3, embed_size), stride,  bias=False),
            nn.BatchNorm2d(n_hid_filt * 8),
            nn.LeakyReLU(0.2, inplace=True),
            
            # state size. (n_hid_filt*8) x 4 x 4
            nn.Conv2d(n_hid_filt * 8, 1, (3, embed_size), stride,  bias=False),
            nn.Sigmoid()
        )

    def forward(self, X):
        if isinstanchannelse(X.data, torch.cuda.FloatTensor) and self.ngpu > 1:
            output = nn.parallel.data_parallel(self.main, X, range(self.ngpu))
        else:
            output = self.main(X)

        return output.view(-1, 1).squeeze(1)

In [None]:
# Build a discriminator *instance*; binding the bare class would make
# netD.parameters() / netD(...) below fail. ngpu=1 means no data parallelism.
netD = _netD(ngpu=1)

# The training loop refers to the loss as `criterion`; `criteria` is kept as
# an alias so any code using the earlier name keeps working.
criterion = nn.MSELoss()
criteria = criterion

# Pre-allocated input buffer. The training loop resizes it to each incoming
# batch (X.resize_as_(real)), so this initial shape is only a placeholder.
# NOTE(review): _netD's first conv kernel is (3, embed_size), i.e. it spans
# embed_size along the LAST axis, whereas this buffer puts embedSize on
# axis 2 -- confirm the intended layout of the batches fed to netD.
X = torch.FloatTensor(batchSize, 1, embedSize, maxSentLen)

# Target buffer, filled with real_label / fake_label during training.
label = torch.FloatTensor(batchSize)
real_label = 1
fake_label = 0

# The training loop steps `optimizerD`; `optimizer` is kept as an alias.
optimizerD = optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer = optimizerD


In [None]:
# Adversarial training loop.
# Relies on names that are not defined in this part of the notebook and must
# come from other cells: numEpoch, dataloader, opt (cuda, outf), noise, nz,
# netG, optimizerG, fixed_noise, vutils, as well as netD, X, label,
# criterion and optimizerD.
for epoch in range(numEpoch):
    for i, data in enumerate(dataloader, 0):
        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        # train with real
        netD.zero_grad()
        real, _ = data
        batch_size = real.size(0)
        if opt.cuda:
            real = real.cuda()

        # Reuse the pre-allocated buffers: reshape X to this batch and copy
        # the data in, then fill the targets with the "real" label.
        X.resize_as_(real).copy_(real)

        label.resize_(batch_size).fill_(real_label)
        inputv = Variable(X)
        labelv = Variable(label)

        output = netD(inputv)
        errD_real = criterion(output, labelv)
        errD_real.backward()
        D_x = output.data.mean()

        # train with fake
        noise.resize_(batch_size, nz, 1, 1).normal_(0, 1)
        noisev = Variable(noise)
        fake = netG(noisev)
        labelv = Variable(label.fill_(fake_label))
        # detach() keeps this backward pass from propagating into netG.
        output = netD(fake.detach())
        errD_fake = criterion(output, labelv)
        errD_fake.backward()
        D_G_z1 = output.data.mean()
        errD = errD_real + errD_fake
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        labelv = Variable(label.fill_(real_label))  # fake labels are real for generator cost
        output = netD(fake)
        errG = criterion(output, labelv)
        errG.backward()
        D_G_z2 = output.data.mean()
        optimizerG.step()

        # Report progress against numEpoch, the bound this loop iterates over.
        # NOTE(review): `.data[0]` on a scalar loss is the pre-0.4 PyTorch
        # idiom; newer releases expect `.item()` -- confirm the target version.
        print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
              % (epoch, numEpoch, i, len(dataloader),
                 errD.data[0], errG.data[0], D_x, D_G_z1, D_G_z2))
        if i % 100 == 0:
            # Save the current real batch; it is unpacked above as `real`.
            vutils.save_image(real,
                    '%s/real_samples.png' % opt.outf,
                    normalize=True)
            fake = netG(fixed_noise)
            vutils.save_image(fake.data,
                    '%s/fake_samples_epoch_%03d.png' % (opt.outf, epoch),
                    normalize=True)

    # do checkpointing
    torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch))
    torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch))

In [None]:
# Train one Word2Vec model per NLTK corpus, in order: brown -> b, treebank -> t.
# (movie_reviews is currently left out: mr = Word2Vec(movie_reviews.sents()))
b, t = [Word2Vec(corpus.sents()) for corpus in (brown, treebank)]

In [None]:
# sentence -> word2vec -> make 2D
# Compare the five nearest neighbours of 'company' in the brown-trained (b)
# and treebank-trained (t) models. The query goes through `.wv` (the model's
# KeyedVectors): calling most_similar directly on the Word2Vec model is
# deprecated and no longer available in gensim 4.
print(b.wv.most_similar('company', topn=5))
print(t.wv.most_similar('company', topn=5))



In [None]:
# Materialize every treebank sentence into an in-memory list.
x = [*treebank.sents()]

In [None]:
# Number of entries in x (displayed as the cell output).
len(x)

In [26]:
from gensim.models import KeyedVectors
# Load pre-trained GloVe vectors from a word2vec-format text file
# (binary=False); the filename indicates the 100-dimensional 6B set.
# NOTE(review): glove_to_word2vec()'s default paths point at the 300d file,
# not this 100d one -- confirm how this file is produced.
filename = 'data/glove.6B.100d.txt.word2vec'
embed_model = KeyedVectors.load_word2vec_format(filename, binary=False)


In [28]:
# Sanity check: inspect the Python type returned by a single-word lookup.
type(embed_model['son'])


numpy.ndarray

In [32]:
# Tokenizer callable: the bound `tokenize` method of a RegexpTokenizer whose
# pattern matches a lowercase letter followed by any run of lowercase letters
# or apostrophes. Uppercase letters, digits and punctuation are not matched.
tokenizer = RegexpTokenizer(r'[a-z][a-z\']*').tokenize
# tokenized = [tokenizer(rev) for rev in train]

def sent2embed(sent):
    """Turn a sentence into a matrix of word embeddings, one row per token.

    The sentence is lowercased before tokenizing because the module-level
    ``tokenizer`` pattern only matches lowercase letters; without this,
    "This" would be tokenized as "his".

    Args:
        sent: the sentence as a string.

    Returns:
        numpy array of shape (num_tokens, embed_model.vector_size). A
        sentence that yields no tokens gives an array of shape
        (0, embed_model.vector_size), so the result is always 2-D.

    Raises:
        Whatever ``embed_model[token]`` raises for a token it does not know
        (presumably KeyError for gensim KeyedVectors -- not handled here).
    """
    tokens = tokenizer(sent.lower())
    if not tokens:
        # np.asarray([]) would be 1-D with shape (0,); keep the result 2-D.
        # float32 is assumed to match the dtype gensim stores vectors in.
        return np.empty((0, embed_model.vector_size), dtype=np.float32)

    return np.asarray([embed_model[token] for token in tokens])

In [33]:
# Smoke test: embed a four-word sentence and display the shape of the result.
m = sent2embed("this is my life")
m.shape

(4, 100)