In [63]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import json
import numpy as np
from tqdm import tqdm
torch.manual_seed(1)

<torch._C.Generator at 0x25b83ed4cd0>

In [242]:
# Device selection.
# The original cell probed for CUDA and then immediately overwrote the result
# with the string "cpu", so everything ran on the CPU. That default is kept,
# but the choice is now a single explicit expression and `device` is always a
# torch.device. Set FORCE_CPU to False to use a GPU when one is available.
FORCE_CPU = True
device = torch.device(
    "cuda" if torch.cuda.is_available() and not FORCE_CPU else "cpu"
)

In [7]:
# Load the tagged NER splits (train / validation / test).
# Each file is read inside a `with` block so its handle is closed as soon as
# the JSON has been parsed; the original left all three files open for the
# lifetime of the kernel. The NER_*_file names stay bound (to closed files)
# so any later cell that mentions them still resolves.
with open("./processed/NER_train_tagged.json") as NER_train_file:
    NER_train_json = json.load(NER_train_file)

with open("./processed/NER_val_tagged.json") as NER_val_file:
    NER_val_json = json.load(NER_val_file)

with open("./processed/NER_test_tagged.json") as NER_test_file:
    NER_test_json = json.load(NER_test_file)

In [16]:
# KeyedVectors provides the loader for pretrained word2vec-format vectors.
from gensim.models import KeyedVectors
# Load the pretrained GoogleNews vectors from the binary word2vec file.
# `word2vec` is used below both as a membership test (`word in word2vec`) and
# as a lookup (`word2vec[word]`).
# NOTE(review): presumably 300-dimensional (file name, and the 300 passed to
# get_embeds in train) -- confirm against the file actually on disk.
word2vec = KeyedVectors.load_word2vec_format('./vector_models/GoogleNews-vectors-negative300.bin', binary=True)

In [227]:
## Helper Functions
def argmax(vec):
    """Return, as a plain Python int, the column index of the largest entry.

    `vec` is reduced along dimension 1, and the result is converted with
    `.item()`, so it must contain exactly one row.
    """
    row_max = torch.max(vec, dim=1)
    return row_max.indices.item()

def prepare_sequence(seq, to_ix):
    """Map every item of `seq` through `to_ix` and return a long tensor.

    A lookup failure in `to_ix` propagates to the caller unchanged.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

def get_embeds(sentence,embed_model,embedding_dim):
    """Turn a tokenised sentence into a (len(sentence), embedding_dim) tensor.

    Args:
        sentence: iterable of word strings.
        embed_model: mapping-like object supporting `word in embed_model` and
            `embed_model[word]` (e.g. the loaded word2vec vectors).
        embedding_dim: length of each word vector; used for the zero vector
            substituted for out-of-vocabulary words and for the empty case.

    Returns:
        A float32 tensor on the notebook-level `device`, one row per word.
        Words missing from `embed_model` become all-zero rows. An empty
        sentence yields a tensor of shape (0, embedding_dim).
    """
    # One shared zero vector for every out-of-vocabulary word. float32 keeps
    # the rows homogeneous; the original mixed float64 zeros with the model's
    # own vectors, which changed the dtype of the whole tensor.
    oov_vector = np.zeros(embedding_dim, dtype=np.float32)

    word_vectors = [
        np.asarray(embed_model[word], dtype=np.float32)
        if word in embed_model
        else oov_vector
        for word in sentence
    ]

    if word_vectors:
        # Stack into a single contiguous array first: building a tensor from
        # a Python list of ndarrays goes through a slow element-wise path.
        stacked = np.stack(word_vectors)
    else:
        # Keep the embedding dimension even when there are no words.
        stacked = np.zeros((0, embedding_dim), dtype=np.float32)

    return torch.from_numpy(stacked).to(device)

def log_sum_exp(vec):
    """Numerically stable log(sum(exp(vec))) for the CRF forward algorithm.

    Args:
        vec: tensor of shape (1, N).

    Returns:
        A 0-dimensional tensor, the same shape the previous hand-written
        max-shift implementation returned.
    """
    # torch.logsumexp applies the max-shift internally and stays on the
    # tensor's device; the earlier version went through argmax(...).item(),
    # which forces a host round-trip on every call.
    return torch.logsumexp(vec, dim=1).squeeze(0)

def generate_tags_to_idx(data):
    """Build a label -> index mapping from a tagged dataset.

    Args:
        data: iterable of entries, each with a 'labels' sequence of tag names.

    Returns:
        dict mapping every distinct label to an index in sorted label order,
        followed by the "<START>" and "<STOP>" markers, which get the next
        two indices.
    """
    START_TAG = "<START>"
    STOP_TAG = "<STOP>"

    # Sort the distinct labels: iterating a set of strings directly gives an
    # order that can change from one interpreter run to the next, which made
    # the tag indices (and anything trained against them) non-reproducible.
    unique_labels = sorted({label for entry in data for label in entry['labels']})

    label_dict = {label: index for index, label in enumerate(unique_labels)}
    label_dict[START_TAG] = len(label_dict)
    label_dict[STOP_TAG] = len(label_dict)
    return label_dict

In [243]:
# No nn.Embedding layer here: inputs are precomputed word2vec vectors.
class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM encoder with a linear-chain CRF output layer.

    The model consumes precomputed word vectors (one row per token) instead
    of owning an embedding table. Emission scores come from the BiLSTM
    followed by a linear layer; `transitions[i, j]` is the learned score of
    moving *to* tag i *from* tag j.

    All tensors created inside the methods are placed on the device of the
    module's own parameters / inputs, so the model keeps working after
    `model.to(...)` regardless of any notebook-level `device` variable.
    """

    START_TAG = "<START>"
    STOP_TAG = "<STOP>"

    def __init__(self, tag_to_ix, embedding_model, hidden_dim):
        """
        Args:
            tag_to_ix: dict mapping tag names (including "<START>" and
                "<STOP>") to consecutive integer indices.
            embedding_model: word-vector lookup; it is stored on the instance
                and probed once with the word 'hello' to find the vector size.
            hidden_dim: total BiLSTM output size (both directions); must be
                even because each direction gets hidden_dim // 2 units.

        Raises:
            ValueError: if hidden_dim is odd.
        """
        super(BiLSTM_CRF, self).__init__()
        if hidden_dim % 2 != 0:
            # With an odd value the BiLSTM would emit hidden_dim - 1 features
            # and the linear layer below would fail at the first forward pass.
            raise ValueError("hidden_dim must be even, got %r" % (hidden_dim,))

        self.embed_model = embedding_model
        self.embedding_dim = len(embedding_model['hello'])
        self.hidden_dim = hidden_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.lstm = nn.LSTM(self.embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters. Entry i,j is the score of
        # transitioning *to* i *from* j. Created on the default device;
        # `model.to(device)` moves it together with the other parameters.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag.
        self.transitions.data[tag_to_ix[self.START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[self.STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair for a batch of one sentence."""
        # NOTE(review): the initial state is drawn at random on every call,
        # so two forward passes over the same sentence can differ slightly.
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        """Log partition function: log-sum-exp of the scores of all tag paths.

        Args:
            feats: (sentence_length, tagset_size) emission scores.

        Returns:
            0-dimensional tensor.
        """
        start_ix = self.tag_to_ix[self.START_TAG]
        stop_ix = self.tag_to_ix[self.STOP_TAG]

        # START_TAG has all of the score.
        forward_var = torch.full((1, self.tagset_size), -10000.,
                                 device=feats.device)
        forward_var[0][start_ix] = 0.

        for feat in feats:
            # scores[next, prev] = alpha[prev] + transition(prev -> next)
            #                      + emission[next]
            # Every next tag is computed in one broadcasted expression
            # instead of a Python loop over the tagset.
            scores = forward_var + self.transitions + feat.view(-1, 1)
            forward_var = torch.logsumexp(scores, dim=1).view(1, -1)

        terminal_var = forward_var + self.transitions[stop_ix]
        return torch.logsumexp(terminal_var, dim=1).squeeze(0)

    def _get_lstm_features(self, sentence):
        """Run the BiLSTM and project to tag space.

        Args:
            sentence: (sentence_length, embedding_dim) tensor of word vectors.

        Returns:
            (sentence_length, tagset_size) emission scores.
        """
        # Follow the module's parameters rather than a global device name.
        model_device = self.transitions.device
        self.hidden = tuple(h.to(model_device) for h in self.init_hidden())
        # The LSTM expects (seq_len, batch, features); the batch size is 1.
        embeds = sentence.unsqueeze(1).to(torch.float32).to(model_device)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        return self.hidden2tag(lstm_out)

    def _score_sentence(self, feats, tags):
        """Score of one given tag sequence.

        Args:
            feats: (sentence_length, tagset_size) emission scores.
            tags: long tensor with one tag index per token.

        Returns:
            Tensor of shape (1,).

        Raises:
            ValueError: if `tags` and `feats` disagree on sentence length.
        """
        if tags.shape[0] != feats.shape[0]:
            raise ValueError(
                "got %d tags for %d tokens" % (tags.shape[0], feats.shape[0]))

        start = torch.tensor([self.tag_to_ix[self.START_TAG]],
                             dtype=torch.long, device=feats.device)
        tags = torch.cat([start, tags.to(feats.device)])
        prev_tags, next_tags = tags[:-1], tags[1:]

        transition_score = self.transitions[next_tags, prev_tags].sum()
        emission_score = feats.gather(1, next_tags.unsqueeze(1)).sum()
        stop_score = self.transitions[self.tag_to_ix[self.STOP_TAG], tags[-1]]
        return (transition_score + emission_score + stop_score).view(1)

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path.

        Args:
            feats: (sentence_length, tagset_size) emission scores.

        Returns:
            (path_score, best_path): a 0-dimensional tensor and a list of tag
            indices, one per token.
        """
        start_ix = self.tag_to_ix[self.START_TAG]
        stop_ix = self.tag_to_ix[self.STOP_TAG]
        backpointers = []

        # Initialize the viterbi variables in log space.
        forward_var = torch.full((1, self.tagset_size), -10000.,
                                 device=feats.device)
        forward_var[0][start_ix] = 0

        for feat in feats:
            # scores[next, prev]: best score ending in `prev` plus the
            # transition prev -> next. Emissions are added after the max
            # because they do not depend on the previous tag.
            scores = forward_var + self.transitions
            best_scores, best_prev = torch.max(scores, dim=1)
            backpointers.append(best_prev.tolist())
            forward_var = (best_scores + feat).view(1, -1)

        # Transition to STOP_TAG.
        terminal_var = forward_var + self.transitions[stop_ix]
        best_tag_id = torch.max(terminal_var, 1)[1].item()
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we do not want to return that to the caller).
        start = best_path.pop()
        assert start == start_ix  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Training loss: log partition function minus the gold path score.

        Returns a tensor of shape (1,).
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Decode a sentence: returns (path_score, list_of_tag_indices)."""
        # Get the emission scores from the BiLSTM.
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [244]:
# Total BiLSTM output width; the model gives each direction HIDDEN_DIM // 2.
HIDDEN_DIM = 4
# Tag vocabulary (plus <START>/<STOP>) derived from the training split.
tag_to_ix=generate_tags_to_idx(NER_train_json)
# NOTE: despite the name, this is the length of one word vector (the
# embedding dimension), not a vocabulary size. It is not used in this cell.
vocab_size=len(word2vec['hello'])
# The model looks words up in `word2vec` instead of learning an embedding.
model = BiLSTM_CRF(tag_to_ix, word2vec, HIDDEN_DIM)
# Plain SGD with a small L2 penalty (weight_decay) over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [245]:
def train(model,optimizer,epochs,train_data):
    """Train `model` one sentence at a time on `train_data`.

    Args:
        model: a BiLSTM_CRF instance; its own `tag_to_ix`, `embed_model` and
            `embedding_dim` are used, so labels and word vectors are always
            the ones the model was built with.
        optimizer: optimizer over `model.parameters()`.
        epochs: number of full passes over `train_data`.
        train_data: sequence of entries with a space-separated 'text' string
            and a 'labels' list (one tag name per token).

    Raises:
        KeyError: if an entry contains a label the model does not know.
    """
    # Use the mapping the model was constructed with instead of rebuilding
    # one from the data; the two must agree for the loss to be meaningful.
    tag_to_ix = model.tag_to_ix
    model.to(device)  # Move model to GPU if available

    # Embed every sentence once up front so the epochs below only do tensor
    # work. The model's own embedding source and dimension are used rather
    # than the notebook-level `word2vec` and a hard-coded 300.
    embedded = [
        get_embeds(entry['text'].split(" "), model.embed_model, model.embedding_dim)
        for entry in tqdm(train_data, desc="PREPPING EMBEDDINGS ")
    ]

    for epoch in tqdm(range(epochs)):
        for entry, sentence_in in tqdm(zip(train_data, embedded),
                                       total=len(train_data)):
            # Step 1. PyTorch accumulates gradients, so clear them before
            # each instance.
            model.zero_grad()

            # Step 2. Turn the gold tags into a tensor of tag indices.
            targets = torch.tensor([tag_to_ix[t] for t in entry['labels']],
                                   dtype=torch.long).to(device)

            # Step 3. Forward pass: negative log-likelihood of the gold tags.
            loss = model.neg_log_likelihood(sentence_in, targets)

            # Step 4. Backpropagate and update the parameters.
            loss.backward()
            optimizer.step()

In [246]:
# One pass over the training split. The output recorded below ends in a
# KeyboardInterrupt after 113 of 8019 sentences, so that run did not finish.
train(model,optimizer=optimizer,epochs=1,train_data=NER_train_json)

PREPPING EMBEDDINGS : 100%|██████████| 8019/8019 [00:14<00:00, 542.76it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

0




1




2




3




4




5




6




7





8


  0%|          | 9/8019 [00:04<1:11:49,  1.86it/s][A

9




10




11




12




13




14




15




16




17




18




19




20




21




22




23




24




25




26




27




28




29




30




31




32




33




34




35




36




37




38




39




40




41




42




43




44




45




46




47




48




49




50




51




52




53




54




55




56




57




58




59




60




61




62




63




64




65




66




67




68




69




70




71




72




73




74




75




76




77




78




79





80


  1%|          | 81/8019 [01:01<1:17:53,  1.70it/s][A

81




82




83




84




85




86




87




88




89




90




91




92




93




94




95




96




97




98




99




100




101




102




103




104




105




106




107




108




109




110




111




112




113


  1%|▏         | 113/8019 [01:21<1:35:17,  1.38it/s]
  0%|          | 0/1 [01:21<?, ?it/s]


KeyboardInterrupt: 

In [247]:
from sklearn.metrics import f1_score

def test(model, test_data,count):
    """Token-level macro F1 of `model` on the first `count` entries.

    Args:
        model: a BiLSTM_CRF instance; its `tag_to_ix`, `embed_model` and
            `embedding_dim` are used for label and word-vector lookup.
        test_data: iterable of entries with a space-separated 'text' string
            and a 'labels' list (one tag name per token).
        count: maximum number of entries to evaluate.

    Returns:
        Macro-averaged F1 over all evaluated tokens, as computed by
        sklearn.metrics.f1_score.
    """
    true_labels = []
    predicted_labels = []
    tag_to_ix = model.tag_to_ix
    # Use the notebook-level device. The previous local re-probe for CUDA
    # could disagree with it and split the model and its inputs across
    # devices.
    model.to(device)

    with torch.no_grad():
        for index, entry in enumerate(test_data):
            if index >= count:
                break

            # The model consumes word vectors, not raw tokens, so embed the
            # sentence exactly as training does.
            tokens = entry['text'].split(" ")
            sentence_in = get_embeds(tokens, model.embed_model, model.embedding_dim)

            # Viterbi decoding returns the best path as a list of tag indices.
            _, predicted_tags = model(sentence_in)

            true_labels.extend(tag_to_ix[t] for t in entry['labels'])
            predicted_labels.extend(predicted_tags)

    return f1_score(true_labels, predicted_labels, average='macro')

In [152]:
# Macro F1 on the first 100 test entries.
# NOTE(review): this cell's execution count (152) is lower than those of the
# model and test() cells (243-247), so the recorded output was probably
# produced by an earlier version of the code -- re-run before trusting it.
test(model,NER_test_json,100)

0.035716272600834494

In [153]:
# Macro F1 on the first 100 validation entries.
# NOTE(review): this cell's execution count (153) is lower than those of the
# model and test() cells (243-247), so the recorded output was probably
# produced by an earlier version of the code -- re-run before trusting it.
test(model,NER_val_json,100)

0.033894926899042124