In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



In [2]:
V_data = [1,2,3]
V = torch.Tensor(V_data)

In [3]:
V

tensor([1., 2., 3.])

In [4]:
M_Data = [[1,2,3], [4,5,6]]
M = torch.Tensor(M_Data)
print(M)

tensor([[1., 2., 3.],
        [4., 5., 6.]])


In [5]:
T_data = [[[1,2], [3,4]],[[5,6], [7,8]]]

In [7]:
T = torch.Tensor(T_data)
print(T)

tensor([[[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]]])


In [8]:
x = torch.randn((3,4,5))
print(x)

tensor([[[ 2.2430, -0.0071,  1.3282, -1.3933,  0.4622],
         [-0.2764, -1.2210,  0.8431,  1.0022,  0.1400],
         [-1.1555,  0.9144, -0.6386, -0.2809, -2.0007],
         [-0.4383, -1.9135,  1.5961, -0.0737,  0.1087]],

        [[ 0.5461, -0.4756, -0.4275,  1.6071, -0.5357],
         [-0.1147,  0.6570, -1.0857, -1.2134,  1.5178],
         [ 1.0895,  1.1816,  1.8049,  0.5686, -0.1804],
         [-0.1040, -0.1954, -0.1030, -0.7803, -0.6796]],

        [[ 1.3413,  1.9875, -0.1457,  0.2885,  0.2464],
         [ 0.1643, -0.5070, -0.5306,  0.4973, -1.0910],
         [ 0.1839, -0.6231,  0.5895,  0.6499, -0.7294],
         [-0.7590, -1.0900,  1.9198,  0.6902, -1.1122]]])


In [15]:
x = autograd.Variable(torch.Tensor([1,2,3]), requires_grad = True)
y = autograd.Variable(torch.Tensor([4,5,6]), requires_grad = True)
z = x + y
print(z.grad_fn)

<AddBackward0 object at 0x11ae4fc50>


In [19]:
var_x = torch.randn((2,2),requires_grad = True)
var_x = torch.randn((2,2),requires_grad = True)
var_z = var_x + var_y
print(var_z.grad_fn)
var_z_data = var_z.data
print(var_z.data)

<AddBackward0 object at 0x11ae4f0b8>
tensor([[ 1.7441,  2.1293],
        [-1.1714,  2.0727]])


In [21]:
lin = nn.Linear(5, 3)
data = torch.randn(2, 5, requires_grad = True)
print(lin(data))

tensor([[ 0.9869, -0.5094, -0.0087],
        [ 0.5162,  0.8296, -0.7762]], grad_fn=<AddmmBackward>)


In [22]:
data = torch.randn(2, 2, requires_grad = True)
print(data)
print(F.relu(data))


tensor([[-0.4593,  0.8147],
        [ 1.1913, -2.4868]], requires_grad=True)
tensor([[0.0000, 0.8147],
        [1.1913, 0.0000]], grad_fn=<ReluBackward0>)


In [23]:
data = torch.randn(5, requires_grad=True)
print(data)
print(F.softmax(data))
print(F.softmax(data).sum())
print(F.log_softmax(data))


tensor([ 0.5426,  0.5777, -1.2939, -0.3694,  1.4929], requires_grad=True)
tensor([0.1929, 0.1998, 0.0307, 0.0775, 0.4990], grad_fn=<SoftmaxBackward>)
tensor(1., grad_fn=<SumBackward0>)
tensor([-1.6455, -1.6104, -3.4819, -2.5574, -0.6951],
       grad_fn=<LogSoftmaxBackward>)


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [27]:
data = [('me gusta comer en la cafeteria'.split(), "SPANISH"), 
       ('give it to me'.split(), "ENGLISH"), 
       ('No cre0 que sea una buena idea'.split(),  "SPANISH"),
       ('No it is not a food idea to get lost at sea'.split(), 'ENGLISH')]
test_data = [('yo creo que si'.split(), "SPANISH"), 
            ('it is lost on me'.split(), "ENGLISH")]

In [28]:
WORD_TO_IX = {}
for sent, _ in data + test_data:
    for word in sent:
        if word not in WORD_TO_IX:
            WORD_TO_IX[word]= len(WORD_TO_IX)
print(WORD_TO_IX)
            


{'food': 19, 'a': 18, 'give': 6, 'si': 25, 'que': 11, 'en': 3, 'No': 9, 'la': 4, 'una': 13, 'sea': 12, 'is': 16, 'idea': 15, 'me': 0, 'it': 7, 'at': 22, 'creo': 24, 'cre0': 10, 'not': 17, 'gusta': 1, 'buena': 14, 'comer': 2, 'on': 26, 'cafeteria': 5, 'yo': 23, 'to': 8, 'lost': 21, 'get': 20}


In [29]:
VOCAB_SIZE = len(WORD_TO_IX)
NUM_BABEL = 2


In [30]:
class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        self.linear = nn.Linear(vocab_size, num_labels)
    
    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec))
    
        
    
    

In [37]:
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)


def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])


In [38]:
model = BoWClassifier(NUM_BABEL, VOCAB_SIZE)
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.1502,  0.0295, -0.0327,  0.1416, -0.0519,  0.0789, -0.1835,  0.0651,
         -0.1791, -0.1154, -0.0140, -0.0544,  0.0622,  0.0272,  0.0845, -0.1395,
          0.0005,  0.1077,  0.1912,  0.1428, -0.0058,  0.0038, -0.0926, -0.0165,
          0.0803,  0.1407,  0.0045],
        [-0.1011, -0.1380, -0.0752,  0.1185, -0.1134,  0.1280, -0.0991, -0.0676,
         -0.1642, -0.0101, -0.1823,  0.1239, -0.0853,  0.1793,  0.1206,  0.0864,
         -0.1885, -0.0652, -0.1353, -0.0986, -0.0971, -0.1765,  0.0415,  0.0471,
         -0.0818,  0.1187, -0.0441]], requires_grad=True)
Parameter containing:
tensor([-0.0365, -0.0277], requires_grad=True)


In [39]:
sample = data[0]
bow_vector = make_bow_vector(sample[0], WORD_TO_IX)
log_probs = model(bow_vector)
print(log_probs)

tensor([[-0.4786, -0.9666]], grad_fn=<LogSoftmaxBackward>)


  import sys


In [40]:
label_to_ix = {'SPANISH':0, 'ENGLISH':1}

In [41]:
for instance, lebel in test_data:
    bow_vec = make_bow_vector(instance, word_to_ix=WORD_TO_IX)
    log_probs = model(bow_vec)
    print(log_probs)
print(next(model.parameters())[:,WORD_TO_IX["creo"]])

tensor([[-0.7270, -0.6604]], grad_fn=<LogSoftmaxBackward>)
tensor([[-0.3733, -1.1662]], grad_fn=<LogSoftmaxBackward>)
tensor([ 0.0803, -0.0818], grad_fn=<SelectBackward>)


  import sys


In [42]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [43]:
for epoch in range(10):
    for instance, label in data:
        model.zero_grad()
        bow_vec = make_bow_vector(instance, WORD_TO_IX)
        target = make_target(label, label_to_ix)
        log_probs = model(bow_vec)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

  import sys


In [45]:
print(loss)

tensor(0.0537, grad_fn=<NllLossBackward>)


In [46]:
for instance, label in test_data:
    bow_vec = make_bow_vector(instance, WORD_TO_IX)
    print(bow_vec)
    log_probs = model(bow_vec)
    print(log_probs)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 1., 1., 0.]])
tensor([[-0.4690, -0.9825]], grad_fn=<LogSoftmaxBackward>)
tensor([[1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 1.]])
tensor([[-1.2923, -0.3211]], grad_fn=<LogSoftmaxBackward>)


  import sys


In [47]:
word_to_ix = {"hello":0, "world":1}


In [50]:
embeds = nn.Embedding(2, 5)
# 2 words in vocab, 5 dimensional embedding
lookup_tensor = torch.LongTensor([word_to_ix['hello']])
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 0.1127, -0.8782,  1.7724, -0.0710,  0.4250]],
       grad_fn=<EmbeddingBackward>)


In [59]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

In [65]:
trigrams = [([test_sentence[i], test_sentence[i+1]], test_sentence[i+2]) for i in range(len(test_sentence)-2)]

In [66]:
trigrams

[(['When', 'forty'], 'winters'),
 (['forty', 'winters'], 'shall'),
 (['winters', 'shall'], 'besiege'),
 (['shall', 'besiege'], 'thy'),
 (['besiege', 'thy'], 'brow,'),
 (['thy', 'brow,'], 'And'),
 (['brow,', 'And'], 'dig'),
 (['And', 'dig'], 'deep'),
 (['dig', 'deep'], 'trenches'),
 (['deep', 'trenches'], 'in'),
 (['trenches', 'in'], 'thy'),
 (['in', 'thy'], "beauty's"),
 (['thy', "beauty's"], 'field,'),
 (["beauty's", 'field,'], 'Thy'),
 (['field,', 'Thy'], "youth's"),
 (['Thy', "youth's"], 'proud'),
 (["youth's", 'proud'], 'livery'),
 (['proud', 'livery'], 'so'),
 (['livery', 'so'], 'gazed'),
 (['so', 'gazed'], 'on'),
 (['gazed', 'on'], 'now,'),
 (['on', 'now,'], 'Will'),
 (['now,', 'Will'], 'be'),
 (['Will', 'be'], 'a'),
 (['be', 'a'], "totter'd"),
 (['a', "totter'd"], 'weed'),
 (["totter'd", 'weed'], 'of'),
 (['weed', 'of'], 'small'),
 (['of', 'small'], 'worth'),
 (['small', 'worth'], 'held:'),
 (['worth', 'held:'], 'Then'),
 (['held:', 'Then'], 'being'),
 (['Then', 'being'], 'asked

In [62]:
vocab = set(test_sentence)
word_to_ix = {word : i for i, word in enumerate(vocab)}

In [56]:
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, congext_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(1, -1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)
        return log_probs

    

In [63]:
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(1, -1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)
        return log_probs

In [64]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr = 0.001)



In [75]:
for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in triframs:
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in variables
        context_idxs = list(map(lambda w:word_to_ix[w], context))
        print(context_idxs)
        context_var = torch.LongTensor(context_idxs)
        
        # Step 2. Recall that torch *accumulates* gradients.  Before passing in a new instance,
        # you need to zero out the gradients from the old instance
        model.zero_grad()
        
        # Step 3. Run the forward pass, getting log probabilities over next words
        log_prob = model(context_var)
        loss = loss_function(log_prob, torch.LongTensor([word_to_ix[target]]))
        
        loss.backward()
        optimizer.step()
        total_loss += loss.data
    
    losses.append(total_loss)
print(losses)

[30, 33]
[33, 55]
[55, 11]
[11, 40]
[40, 2]
[2, 50]
[50, 89]
[89, 23]
[23, 12]
[12, 10]
[10, 31]
[31, 2]
[2, 21]
[21, 94]
[94, 35]
[35, 95]
[95, 4]
[4, 3]
[3, 65]
[65, 47]
[47, 91]
[91, 18]
[18, 7]
[7, 48]
[48, 76]
[76, 84]
[84, 44]
[44, 52]
[52, 96]
[96, 36]
[36, 74]
[74, 26]
[26, 29]
[29, 90]
[90, 32]
[32, 0]
[0, 2]
[2, 63]
[63, 51]
[51, 27]
[27, 0]
[0, 93]
[93, 85]
[85, 52]
[52, 2]
[2, 75]
[75, 56]
[56, 77]
[77, 19]
[19, 42]
[42, 13]
[13, 86]
[86, 12]
[12, 22]
[22, 6]
[6, 41]
[41, 59]
[59, 67]
[67, 54]
[54, 92]
[92, 34]
[34, 24]
[24, 28]
[28, 82]
[82, 8]
[8, 20]
[20, 73]
[73, 2]
[2, 21]
[21, 61]
[61, 1]
[1, 38]
[38, 80]
[80, 43]
[43, 16]
[16, 78]
[78, 71]
[71, 52]
[52, 83]
[83, 15]
[15, 39]
[39, 81]
[81, 45]
[45, 92]
[92, 66]
[66, 81]
[81, 53]
[53, 49]
[49, 5]
[5, 14]
[14, 63]
[63, 60]
[60, 64]
[64, 46]
[46, 72]
[72, 25]
[25, 87]
[87, 48]
[48, 58]
[58, 17]
[17, 69]
[69, 38]
[38, 70]
[70, 57]
[57, 89]
[89, 9]
[9, 2]
[2, 68]
[68, 37]
[37, 69]
[69, 38]
[38, 62]
[62, 88]
[30, 33]
[33, 5

  if sys.path[0] == '':


[6, 41]
[41, 59]
[59, 67]
[67, 54]
[54, 92]
[92, 34]
[34, 24]
[24, 28]
[28, 82]
[82, 8]
[8, 20]
[20, 73]
[73, 2]
[2, 21]
[21, 61]
[61, 1]
[1, 38]
[38, 80]
[80, 43]
[43, 16]
[16, 78]
[78, 71]
[71, 52]
[52, 83]
[83, 15]
[15, 39]
[39, 81]
[81, 45]
[45, 92]
[92, 66]
[66, 81]
[81, 53]
[53, 49]
[49, 5]
[5, 14]
[14, 63]
[63, 60]
[60, 64]
[64, 46]
[46, 72]
[72, 25]
[25, 87]
[87, 48]
[48, 58]
[58, 17]
[17, 69]
[69, 38]
[38, 70]
[70, 57]
[57, 89]
[89, 9]
[9, 2]
[2, 68]
[68, 37]
[37, 69]
[69, 38]
[38, 62]
[62, 88]
[30, 33]
[33, 55]
[55, 11]
[11, 40]
[40, 2]
[2, 50]
[50, 89]
[89, 23]
[23, 12]
[12, 10]
[10, 31]
[31, 2]
[2, 21]
[21, 94]
[94, 35]
[35, 95]
[95, 4]
[4, 3]
[3, 65]
[65, 47]
[47, 91]
[91, 18]
[18, 7]
[7, 48]
[48, 76]
[76, 84]
[84, 44]
[44, 52]
[52, 96]
[96, 36]
[36, 74]
[74, 26]
[26, 29]
[29, 90]
[90, 32]
[32, 0]
[0, 2]
[2, 63]
[63, 51]
[51, 27]
[27, 0]
[0, 93]
[93, 85]
[85, 52]
[52, 2]
[2, 75]
[75, 56]
[56, 77]
[77, 19]
[19, 42]
[42, 13]
[13, 86]
[86, 12]
[12, 22]
[22, 6]
[6, 41]
[41, 59

In [78]:

CONTEXT_SIZE = 2 # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process. Computational processes are abstract
beings that inhabit computers. As they evolve, processes manipulate other abstract
things called data. The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()
word_to_ix = { word: i for i, word in enumerate(set(raw_text)) }
data = []
for i in range(2, len(raw_text) - 2):
    context = [ raw_text[i-2], raw_text[i-1], raw_text[i+1], raw_text[i+2] ]
    target = raw_text[i]
    data.append( (context, target) )
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [79]:
class CBOW(nn.Module):
    
    def __init__(self):
        pass
    
    def forward(self, inputs):
        pass

In [81]:

# create your model and train.  here are some functions to help you make the data ready for use by your module
def make_context_vector(context, word_to_ix):
    idxs = list(map(lambda w: word_to_ix[w], context))
    print(idxs)
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

make_context_vector(data[0][0], word_to_ix) # example

[4, 26, 30, 42]


tensor([ 4, 26, 30, 42])

In [93]:
lstm = nn.LSTM(3, 3)
inputs = [torch.randn(1, 3) for _ in range(5)]
inputs

[tensor([[1.5634, 0.9127, 0.6512]]),
 tensor([[-0.1519,  0.6428, -0.1296]]),
 tensor([[-0.2298,  0.5477,  0.3164]]),
 tensor([[0.7692, 0.6142, 2.1560]]),
 tensor([[-0.2368,  1.2858, -0.1171]])]

In [94]:
hidden = (torch.randn(1,1,3), torch.randn(1,1,3))
hidden

(tensor([[[ 2.5431,  0.0763, -1.2461]]]), tensor([[[0.4701, 1.1985, 0.6401]]]))

In [108]:
lstm = nn.LSTM(3, 3)
inputs = [torch.randn(1, 3) for _ in range(5)]

hidden = (torch.randn(1,1,3), torch.randn(1,1,3))

for i in inputs:
    out, hidden = lstm(i.view(1, 1, -1), hidden)
print(inputs)

inputs = torch.cat(inputs).view(len(inputs), 1, -1)

hidden = (torch.randn(1,1,3), torch.randn(1,1,3))


[tensor([[ 0.7198,  0.5704, -1.1909]]), tensor([[-2.3201,  1.5620,  0.2520]]), tensor([[ 0.4881, -0.1031,  0.6183]]), tensor([[ 0.6253, -1.3380, -1.4148]]), tensor([[-0.5018, -1.2962,  0.7409]])]


In [109]:
# out, hidden = lstm(inputs, hidden)
hidden

(tensor([[[-0.7615, -0.4533,  0.5973]]]),
 tensor([[[-0.2464, -0.2153,  0.0371]]]))

In [110]:
hidden[0].shape

torch.Size([1, 1, 3])

In [111]:
out, hidden = lstm(inputs, hidden)

In [112]:
out

tensor([[[-0.0457, -0.1372,  0.1554]],

        [[ 0.2805, -0.1643,  0.0381]],

        [[ 0.2576,  0.0371,  0.1937]],

        [[ 0.2866, -0.0131,  0.1137]],

        [[ 0.4438,  0.0524,  0.1285]]], grad_fn=<StackBackward>)

In [113]:
hidden

(tensor([[[0.4438, 0.0524, 0.1285]]], grad_fn=<StackBackward>),
 tensor([[[0.8040, 0.1381, 0.3358]]], grad_fn=<StackBackward>))

In [126]:
def prepare_sequence(seq, to_ix):
    idxs = list(map(lambda w:to_ix[w], seq))
    tensor = torch.LongTensor(idxs)
    return tensor

In [127]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
tag_to_ix = {'DET':0, 'NN':1, "V":2}


In [128]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [135]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
    
    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_dim), torch.zeros(1,1,self.hidden_dim))
    
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores
    
        
    
        
        

In [179]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [180]:
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_score = model(inputs)
tag_score



tensor([[-0.7822, -1.1816, -1.4448],
        [-0.8182, -1.2010, -1.3552],
        [-0.8409, -1.1557, -1.3710],
        [-0.7655, -1.2714, -1.3687],
        [-0.7772, -1.2004, -1.4304]], grad_fn=<LogSoftmaxBackward>)

In [181]:
training_data[0][0]

['The', 'dog', 'ate', 'the', 'apple']

In [182]:
for epoch in range(300): # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        
        # Step 1. Remember that Pytorch accumulates gradients.  We need to clear them out
        # before each instance
        model.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM, detaching it from its
        # history on the last instance.
        model.hidden = model.init_hidden()
    
        # Step 2. Get our inputs ready for the network, that is, turn them into Variables
        # of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
    
        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)
    
        # Step 4. Compute the loss, gradients, and update the parameters by calling
        # optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()




In [183]:
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_score = model(inputs)
tag_score



tensor([[-0.5617, -0.8879, -4.0039],
        [-3.5406, -0.0377, -4.8264],
        [-3.5701, -4.1509, -0.0449],
        [-0.0321, -3.8560, -4.5592],
        [-4.3360, -0.0166, -5.6835]], grad_fn=<LogSoftmaxBackward>)

In [176]:
training_data
    

[(['The', 'dog', 'ate', 'the', 'apple'], ['DET', 'NN', 'V', 'DET', 'NN']),
 (['Everybody', 'read', 'that', 'book'], ['NN', 'V', 'DET', 'NN'])]

In [184]:
# Bidirectional LSTM Conditional Random Field for Named-Entity Recognition

In [185]:
# Helper functions to make the code more readable.
def to_scalar(var):
    # returns a python float
    return var.view(-1).data.tolist()[0]

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return to_scalar(idx)

# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
    

In [245]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim//2, num_layers=1, bidirectional=True)
        
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        
        # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
        self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
        
        # These two statements enforce the constraint that we never transfer *to* the start tag,
        # and we never transfer *from* the stop tag (the model would probably learn this anyway,
        # so this enforcement is likely unimportant)
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()
    
    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim), torch.randn(2, 1, self.hidden_dim))
    
    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
        
        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas
        # Iterate through the sentence
        for feat in feats:
            alphas_t = [] # The forward variables at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of the previous tag
                emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the edge (i -> next_tag)
                # before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the scores.
                alphas_t.append(log_sum_exp(next_tag_var))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha
    
    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats
 
    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = autograd.Variable( torch.Tensor([0]) )
        tags = torch.cat( [torch.LongTensor([self.tag_to_ix[START_TAG]]), tags] )
        for i, feat in enumerate(feats):
            score = score + self.transitions[tags[i+1], tags[i]] + feat[tags[i+1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score
    
    def _viterbi_decode(self, feats):
        backpointers = []
        
        # Initialize the viterbi variables in log space
        init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0
        
        # forward_var at step i holds the viterbi variables for step i-1 
        forward_var = autograd.Variable(init_vvars)
        for feat in feats:
            bptrs_t = [] # holds the backpointers for this step
            viterbivars_t = [] # holds the viterbi variables for this step
            
            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the previous step,
                # plus the score of transitioning from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id])
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)
        
        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]
        
        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG] # Sanity check
        best_path.reverse()
        return path_score, best_path
 
    def neg_log_likelihood(self, sentence, tags):
        self.hidden = self.init_hidden()
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score
        
    def forward(self, sentence): # dont confuse this with _forward_alg above.
        self.hidden = self.init_hidden()
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)
        
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq


In [246]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Make up some training data
training_data = [ (
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
) ]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
            
tag_to_ix = { "B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4 }

In [247]:
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)


In [248]:
# Check predictions before training
precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]])
model(precheck_sent)

TypeError: 'Tensor' object is not callable

In [238]:
precheck_sent

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [239]:
precheck_tags

tensor([0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])

In [240]:
[tag_to_ix[t] for t in training_data[0][1]]

[0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2]

In [241]:
training_data[0][1]

['B', 'I', 'I', 'I', 'O', 'O', 'O', 'B', 'I', 'O', 'O']

In [242]:
model(precheck_sent)

TypeError: 'Tensor' object is not callable

In [243]:
len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM

(17, {'<START>': 3, '<STOP>': 4, 'B': 0, 'I': 1, 'O': 2}, 5, 4)

In [249]:
len(tag_to_ix)

5