In [1]:
import argparse

import os
import sys
import json
import random
import nltk
import numpy as np
import re
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)
PAD_TOKEN = '_PAD_'
UNK_TOKEN = '_UNK_'



# Methods for loading SST data

def sentiment2label(x):
  if x >= 0 and x <= 0.2:
    return 0
  elif x > 0.2 and x <= 0.4:
    return 1
  elif x > 0.4 and x <= 0.6:
    return 2
  elif x > 0.6 and x <= 0.8:
    return 3
  elif x > 0.8 and x <= 1:
    return 4
  else:
    raise ValueError('Improper sentiment value {}'.format(x))


def read_dictionary_txt_with_phrase_ids(dictionary_path, phrase_ids_path, labels_path=None):
  print('Reading data dictionary_path={} phrase_ids_path={} labels_path={}'.format(
    dictionary_path, phrase_ids_path, labels_path))

  with open(phrase_ids_path) as f:
    phrase_ids = set(line.strip() for line in f)

  with open(dictionary_path) as f:
    examples_dict = dict()
    for line in f:
      parts = line.strip().split('|')
      phrase = parts[0]
      phrase_id = parts[1]

      if phrase_id not in phrase_ids:
        continue

      example = dict()
      example['phrase'] = phrase.replace('(', '-LRB').replace(')', '-RRB-')
      example['tokens'] = example['phrase'].split(' ')
      example['example_id'] = phrase_id
      example['label'] = None
      examples_dict[example['example_id']] = example

  if labels_path is not None:
    with open(labels_path) as f:
      for i, line in enumerate(f):
        if i == 0:
          continue
        parts = line.strip().split('|')
        phrase_id = parts[0]
        sentiment = float(parts[1])
        label = sentiment2label(sentiment)

        if phrase_id in examples_dict:
          examples_dict[phrase_id]['label'] = label

  examples = [ex for _, ex in examples_dict.items()]

  print('Found {} examples.'.format(len(examples)))

  return examples


def build_vocab(datasets):
  vocab = dict()
  vocab[PAD_TOKEN] = len(vocab)
  vocab[UNK_TOKEN] = len(vocab)
  for data in datasets:
    for example in data:
      for word in example['tokens']:
        if word not in vocab:
          vocab[word] = len(vocab)

  print('Vocab size: {}'.format(len(vocab)))

  return vocab


class TokenConverter(object):
  def __init__(self, vocab):
    self.vocab = vocab
    self.unknown = 0

  def convert(self, token):
    if token in self.vocab:
      id = self.vocab.get(token)
    else:
      id = self.vocab.get(UNK_TOKEN)
      self.unknown += 1
    return id


def convert2ids(data, vocab):
  converter = TokenConverter(vocab)
  for example in data:
    example['tokens'] = list(map(converter.convert, example['tokens']))
  print('Found {} unknown tokens.'.format(converter.unknown))
  return data


def load_data_and_embeddings(data_path, phrase_ids_path, embeddings_path):
  labels_path = os.path.join(data_path, 'sentiment_labels.txt')
  dictionary_path = os.path.join(data_path, 'dictionary.txt')
  train_data = read_dictionary_txt_with_phrase_ids(dictionary_path, os.path.join(phrase_ids_path, 'phrase_ids.train.txt'), labels_path)
  validation_data = read_dictionary_txt_with_phrase_ids(dictionary_path, os.path.join(phrase_ids_path, 'phrase_ids.dev.txt'), labels_path)
  test_data = read_dictionary_txt_with_phrase_ids(dictionary_path, os.path.join(phrase_ids_path, 'phrase_ids.test.txt'))
  vocab = build_vocab([train_data, validation_data, test_data])
  vocab, embeddings = load_embeddings('./glove.840B.300d.txt', vocab, cache=True)
  train_data = convert2ids(train_data, vocab)
  validation_data = convert2ids(validation_data, vocab)
  test_data = convert2ids(test_data, vocab)
  return train_data, validation_data, test_data, vocab, embeddings


def load_embeddings(path, vocab, cache=False, cache_path=None):
  rows = []
  new_vocab = [UNK_TOKEN]

  if cache_path is None:
    cache_path = path + '.cache'

  # Use cache file if it exists.
  if os.path.exists(cache_path):
    path = cache_path

  print("Reading embeddings from {}".format(path))

  # first pass over the embeddings to vocab and relevant rows
  with open(path) as f:
    for line in f:
      word, row = line.split(' ', 1)
      if word == UNK_TOKEN:
        raise ValueError('The unk token should not exist w.in embeddings.')
      if word in vocab:
        rows.append(line)
        new_vocab.append(word)

  # optionally save relevant rows to cache file.
  if cache and not os.path.exists(cache_path):
    with open(cache_path, 'w') as f:
      for line in rows:
        f.write(line)
      print("Cached embeddings to {}".format(cache_path))

  # turn vocab list into a dictionary
  new_vocab = {w: i for i, w in enumerate(new_vocab)}

  print('New vocab size: {}'.format(len(new_vocab)))

  assert len(rows) == len(new_vocab) - 1

  # create embeddings matrix
  embeddings = np.zeros((len(new_vocab), 300), dtype=np.float32)
  for i, line in enumerate(rows):
    embeddings[i+1] = list(map(float, line.strip().split(' ')[1:]))

  return new_vocab, embeddings


# Batch Iterator

def prepare_data(data):
  # pad data
  maxlen = max(map(len, data))
  data = [ex + [0] * (maxlen-len(ex)) for ex in data]

  # wrap in tensor
  return torch.LongTensor(data)


def prepare_labels(labels):
  try:
    return torch.LongTensor(labels)
  except:
    return labels


def batch_iterator(dataset, batch_size, forever=False):
  dataset_size = len(dataset)
  order = None
  nbatches = dataset_size // batch_size

  def init_order():
    return random.sample(range(dataset_size), dataset_size)

  def get_batch(start, end):
    batch = [dataset[ii] for ii in order[start:end]]
    data = prepare_data([ex['tokens'] for ex in batch])
    labels = prepare_labels([ex['label'] for ex in batch])
    example_ids = [ex['example_id'] for ex in batch]
    return data, labels, example_ids

  order = init_order()

  while True:
    for i in range(nbatches):
      start = i*batch_size
      end = (i+1)*batch_size
      yield get_batch(start, end)

    if nbatches*batch_size < dataset_size:
      yield get_batch(start, end)

    if not forever:
      break
    
    order = init_order()


# Models

class BagOfWordsModel(nn.Module):
  def __init__(self, embeddings):
    super(BagOfWordsModel, self).__init__()
    self.embed = nn.Embedding(embeddings.shape[0], embeddings.shape[1], sparse=True)
    self.embed.weight.data.copy_(torch.from_numpy(embeddings))
    self.classify = nn.Linear(embeddings.shape[1], 5)

  def forward(self, x):
    return self.classify(self.embed(x).sum(1))


# Utility Methods

def checkpoint_model(step, val_err, model, opt, save_path):
  save_dict = dict(
    step=step,
    val_err=val_err,
    model_state_dict=model.state_dict(),
    opt_state_dict=opt.state_dict())
  torch.save(save_dict, save_path)


def load_model(model, opt, load_path):
  load_dict = torch.load(load_path)
  step = load_dict['step']
  val_err = load_dict['val_err']
  model.load_state_dict(load_dict['model_state_dict'])
  opt.load_state_dict(load_dict['opt_state_dict'])
  return step, val_err

class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        self.embedding_v.weight.data.uniform_(-1, 1) # init
        self.embedding_u.weight.data.uniform_(0, 0) # init
        #self.out = nn.Linear(projection_dim,vocab_size)
    def forward(self, center_words,target_words, outer_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        outer_embeds = self.embedding_u(outer_words) # B x V x D
        
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # Bx1xD * BxDx1 => Bx1
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # BxVxD * BxDx1 => BxV
        
        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        
        return nll # negative log likelihood
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds 
    
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

USE_CUDA = torch.cuda.is_available()
gpus = [0]

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [107]:
train_data, validation_data, test_data, vocab, embeddings = \
load_data_and_embeddings("./stanfordSentimentTreebank", "./", './glove.840B.300d.txt')

Reading data dictionary_path=./stanfordSentimentTreebank/dictionary.txt phrase_ids_path=./phrase_ids.train.txt labels_path=./stanfordSentimentTreebank/sentiment_labels.txt
Found 159274 examples.
Reading data dictionary_path=./stanfordSentimentTreebank/dictionary.txt phrase_ids_path=./phrase_ids.dev.txt labels_path=./stanfordSentimentTreebank/sentiment_labels.txt
Found 24772 examples.
Reading data dictionary_path=./stanfordSentimentTreebank/dictionary.txt phrase_ids_path=./phrase_ids.test.txt labels_path=None
Found 46663 examples.
Vocab size: 21703
Reading embeddings from ./glove.840B.300d.txt.cache
New vocab size: 18640
Found 8758 unknown tokens.
Found 1255 unknown tokens.
Found 21205 unknown tokens.


In [74]:
#data = [[d.split(':')[1][:-1], d.split(':')[0]] for d in data]
data=[[d['phrase'],d['label']] for d in train_data]
validation=[[d['phrase'],d['label']] for d in validation_data]

In [75]:
X, y = list(zip(*data))
X = list(X)
X_val, y_val=list(zip(*validation))
X_val=list(X_val)

In [76]:
for i, x in enumerate(X):
    X[i] = re.sub('\d', '#', str(x)).split()
    
for i, x in enumerate(X_val):
    X_val[i] = re.sub('\d', '#', str(x)).split()

In [77]:
vocab = list(set(flatten(X)))

In [78]:
word2index={'<PAD>': 0, '<UNK>': 1}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

target2index = {}

for cl in set(y):
    if target2index.get(cl) is None:
        target2index[cl] = len(target2index)

index2target = {v:k for k, v in target2index.items()}

In [105]:
X_p, y_p = [], []
X_val_p=[]
y_val_p=[]

for pair in zip(X,y):
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_p.append(Variable(LongTensor([target2index[pair[1]]])).view(1, -1))
for pair in zip(X_val,y_val):
    X_val_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_val_p.append(Variable(LongTensor([target2index[pair[1]]])).view(1, -1))
    
data_p = list(zip(X_p, y_p))
random.shuffle(data_p)

train_data = data_p[: int(len(data_p) * 0.9)]
test_data = data_p[int(len(data_p) * 0.9):]
val_data=list(zip(X_val_p, y_val_p))

In [37]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [111]:
pretrained = []

for key in word2index.keys():
    try:
        pretrained.append(model[word2index[key]])
        
    except:
        pretrained.append(np.random.randn(300))
        
pretrained_vectors = np.vstack(pretrained)

In [55]:
class  CNNClassifier(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier,self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])

        # kernal_size = (K,D) 
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)
    
    
    def init_weights(self, pretrained_word_vectors, is_static=False):
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False


    def forward(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1) # (B,1,T,D)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs] #[(N,Co,W), ...]*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] #[(N,Co), ...]*len(Ks)

        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated) # (N,len(Ks)*Co)
        out = self.fc(concated) 
        return F.log_softmax(out,1)

In [56]:
EPOCH = 5
BATCH_SIZE = 50
KERNEL_SIZES = [1]
KERNEL_DIM = 100
LR = 0.001

In [57]:
model = CNNClassifier(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES)
model.init_weights(pretrained_vectors) # initialize embedding matrix using pretrained vectors


loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

KeyboardInterrupt: 

In [None]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch
        
def pad_to_batch(batch):
    x,y = zip(*batch)
    max_x = max([s.size(1) for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
            x_p.append(torch.cat([x[i], Variable(LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return torch.cat(x_p), torch.cat(y).view(-1)

In [20]:
for epoch in range(EPOCH):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs,targets = pad_to_batch(batch)
        
        model.zero_grad()
        preds = model(inputs, True)
        
        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist()[0])
        loss.backward()
        
        #for param in model.parameters():
        #    param.grad.data.clamp_(-3, 3)
        
        optimizer.step()
        
        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

[0/5] mean_loss : 1.68
[0/5] mean_loss : 1.35
[0/5] mean_loss : 1.28
[0/5] mean_loss : 1.26
[0/5] mean_loss : 1.25
[0/5] mean_loss : 1.22
[0/5] mean_loss : 1.20
[0/5] mean_loss : 1.20
[0/5] mean_loss : 1.18
[0/5] mean_loss : 1.18
[0/5] mean_loss : 1.20
[0/5] mean_loss : 1.13
[0/5] mean_loss : 1.14
[0/5] mean_loss : 1.14
[0/5] mean_loss : 1.14
[0/5] mean_loss : 1.12
[0/5] mean_loss : 1.10
[0/5] mean_loss : 1.08
[0/5] mean_loss : 1.09
[0/5] mean_loss : 1.10
[0/5] mean_loss : 1.10
[0/5] mean_loss : 1.05
[0/5] mean_loss : 1.06
[0/5] mean_loss : 1.06
[0/5] mean_loss : 1.07
[0/5] mean_loss : 1.04
[0/5] mean_loss : 1.05
[0/5] mean_loss : 1.02
[0/5] mean_loss : 1.03
[0/5] mean_loss : 1.00
[0/5] mean_loss : 1.01
[0/5] mean_loss : 1.01
[1/5] mean_loss : 0.87
[1/5] mean_loss : 0.97
[1/5] mean_loss : 0.97
[1/5] mean_loss : 0.95
[1/5] mean_loss : 0.96
[1/5] mean_loss : 0.95
[1/5] mean_loss : 0.97
[1/5] mean_loss : 0.94
[1/5] mean_loss : 0.97
[1/5] mean_loss : 0.94
[1/5] mean_loss : 0.96
[1/5] mean_

KeyboardInterrupt: 

In [101]:
def test(model,test_data):   
    accuracy = 0
    for test in test_data:
        pred = model_stored(test[0]).max(1)[1]
        pred = pred.data.tolist()[0]
        target = test[1].data.tolist()[0][0]
        if pred == target:
            accuracy += 1

    print(accuracy/len(test_data) * 100)

In [102]:
test(model,dev_data)

43.6401305876444


In [173]:
torch.save(model,'CNN_word2vec')

  "type " + obj.__name__ + ". It won't be checked "


In [70]:
model_stored=torch.load('CNN_word2vec')

In [104]:
test(model_stored,val_data)

47.464879702890364
