In [7]:
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.feature_extraction.text import CountVectorizer
from sentiment_data import read_sentiment_examples, WordEmbeddings, read_word_embeddings
from torch.utils.data import Dataset, DataLoader
import time
import argparse
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from BOWmodels import SentimentDatasetBOW, NN2BOW, NN3BOW
from DANmodels import SentimentDatasetDAN, DAN



In [8]:
import os
train_data = os.path.join("data", "train.txt")
dev_data = os.path.join("data", "dev.txt")

# Load the training and development data
train_exs = read_sentiment_examples(train_data)
dev_exs = read_sentiment_examples(dev_data)

train_exs

[['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '``', 'conan', "''", 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.']; label=1,
 ['the', 'gorgeously', 'elaborate', 'continuation', 'of', '``', 'the', 'lord', 'of', 'the', 'rings', "''", 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'can', 'not', 'adequately', 'describe', 'co-writer\\/director', 'peter', 'jackson', "'s", 'expanded', 'vision', 'of', 'j.r.r.', 'tolkien', "'s", 'middle-earth', '.']; label=1,
 ['singer\\/composer', 'bryan', 'adams', 'contributes', 'a', 'slew', 'of', 'songs', '--', 'a', 'few', 'potential', 'hits', ',', 'a', 'few', 'more', 'simply', 'intrusive', 'to', 'the', 'story', '--', 'but', 'the', 'whole', 'package', 'certainly', 'captures', 'the', 'intended', ',', 'er', ',', 'spirit', 'of', 'the', 'piece', '.']; label=1,
 ['yet

In [9]:
train_data = SentimentDatasetBOW("data/train.txt")
dev_data = SentimentDatasetBOW("data/dev.txt")
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(dev_data, batch_size=16, shuffle=False)

for x, y in train_loader:
    print(x, y)
    break

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]]) tensor([1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])


In [10]:
examples = read_sentiment_examples("data/train.txt")
sentences = [" ".join(ex.words) for ex in examples]
sentences

["the rock is destined to be the 21st century 's new `` conan '' and that he 's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .",
 "the gorgeously elaborate continuation of `` the lord of the rings '' trilogy is so huge that a column of words can not adequately describe co-writer\\/director peter jackson 's expanded vision of j.r.r. tolkien 's middle-earth .",
 'singer\\/composer bryan adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of the piece .',
 'yet the act is still charming here .',
 "whether or not you 're enlightened by any of derrida 's lectures on `` the other '' and `` the self , '' derrida is an undeniably fascinating and playful fellow .",
 'just the labour involved in creating the layered richness of the imagery in this chiaroscuro of madness and light is astonishing .',
 'part of the charm of 

In [11]:
# Load dataset
start_time = time.time()

word_embeddings = read_word_embeddings("data/glove.6B.50d-relativized.txt")
train_data = SentimentDatasetDAN("data/train.txt", word_embeddings)
dev_data = SentimentDatasetDAN("data/dev.txt", word_embeddings)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(dev_data, batch_size=16, shuffle=False)


Read in 14923 vectors of size 50


In [14]:
for x, y in train_loader:
    print(x, y)
    break

tensor([[   -1,    -1,  7430,    25,   737,  7302,     7,   240,    58,    18,
             9,  4979,  3659,     5,  1050,     3,    33,    -1,  1442,    92,
          1004,  5998,    10,   634,     4,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [ 1126,    46,     5,     2,    93,  4311,  8044,     5,     9,   686,
          2853,   561,  1425,     6,   287,     4,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  383,   286,  9579,     6,    29,    20,  4023,  6062,    20,    21,
           877,     6,    29,     4,     0,     0,     0,     0,     0,     0,
    

In [32]:
word_embeddings.word_indexer.get_object(-1)

In [34]:
sentence = [" ".join(ex.words) for ex in examples][0]
word_indices = [word_embeddings.word_indexer.index_of(word) for word in sentence.split()]
word_indices

[2,
 922,
 15,
 5476,
 6,
 29,
 2,
 3296,
 508,
 10,
 48,
 27,
 -1,
 26,
 7,
 13,
 19,
 10,
 203,
 6,
 149,
 9,
 7215,
 141,
 1128,
 70,
 -1,
 -1,
 3,
 -1,
 1167,
 -1,
 44,
 -1,
 -1,
 4]

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.feature_extraction.text import CountVectorizer
from sentiment_data import read_sentiment_examples, WordEmbeddings, read_word_embeddings
from torch.utils.data import Dataset, DataLoader
import time
import argparse
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from BOWmodels import SentimentDatasetBOW, NN2BOW, NN3BOW
from DANmodels import SentimentDatasetDAN, DAN

In [17]:
train_data = SentimentDatasetDAN("data/train.txt", None)
dev_data = SentimentDatasetDAN("data/dev.txt", None)

In [19]:
train_data.sentences_idx[0], len(train_data.sentences_idx[0])

(tensor([ 0,  1,  2,  3,  4,  5,  0,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  8,
         16,  4, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 52)

In [16]:
word_embeddings = read_word_embeddings("data/glove.6B.300d-relativized.txt")
train_data = SentimentDatasetDAN("data/train.txt", word_embeddings)

Read in 14923 vectors of size 300


In [20]:
train_data.sentences_idx[0], len(train_data.sentences_idx[0])

(tensor([ 0,  1,  2,  3,  4,  5,  0,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  8,
         16,  4, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 52)