In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import json
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score
torch.manual_seed(1)

<torch._C.Generator at 0x27e8afd8c50>

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device="cpu"

In [3]:
NER_train_file = open("./processed/NER_train_tagged.json")
NER_train_json = json.load(NER_train_file)

NER_val_file = open("./processed/NER_val_tagged.json")
NER_val_json = json.load(NER_val_file)

NER_test_file = open("./processed/NER_test_tagged.json")
NER_test_json = json.load(NER_test_file)

In [4]:
from gensim.models import KeyedVectors
word2vec = KeyedVectors.load_word2vec_format('.vector_models/GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
def get_embeds(sentence,embed_model,embedding_dim):
    word_vectors = []
    for word in sentence:
        # Check if the word exists in the Word2Vec model's vocabulary
        if word in embed_model:
            # Get the word vector for the current word
            word_vector = embed_model[word]
            # Append the word vector to the list
            word_vectors.append(word_vector)
        else:
            # If the word is not in the vocabulary, append a zero vector
            word_vectors.append(np.zeros(embedding_dim))
    
    # Convert the list of word vectors to a tensor
    embeds_tensor = torch.tensor(word_vectors).to(device)
    return embeds_tensor

# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

def generate_tags_to_idx(data):
    unique_labels = []
    for entry in data:
        labels = entry['labels']
        for label in labels:
            unique_labels.append(label)
    unique_labels=list(set(unique_labels))
    label_dict = {}  
    for label_index in range(len(unique_labels)):
        label_dict[unique_labels[label_index]] =label_index
    return label_dict

# trying keras

In [6]:
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, Bidirectional, Reshape
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
from sklearn.metrics import f1_score
import keras.backend as K
import keras as k

In [22]:
!pip show tensorflow

Name: tensorflow
Version: 2.10.1
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: c:\users\deeptanshu barman\appdata\local\packages\pythonsoftwarefoundation.python.3.10_qbz5n2kfra8p0\localcache\local-packages\python310\site-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, keras-preprocessing, libclang, numpy, opt-einsum, packaging, protobuf, setuptools, six, tensorboard, tensorflow-estimator, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wrapt
Required-by: 




In [20]:
!pip cache purge

Files removed: 788


In [23]:
!pip install --upgrade tensorflow==2.12.0

'sudo' is not recognized as an internal or external command,
operable program or batch file.


In [15]:
from tensorflow_addons.layers import CRF

In [8]:
tag_to_idx=generate_tags_to_idx(NER_train_json)

In [9]:
def sentences_and_tags(data):
    sentences=[]
    labels=[]
    for entry in data:
        sentences.append(entry['text'])
        labels.append(entry['labels'])
    return sentences,labels

In [10]:
# Function to preprocess your sentence and tags
def preprocess(sentence, tags, word2vec, max_len):
    # Tokenize the sentence
    tokens = sentence.split()
    # Convert tokens to word embeddings using Word2Vec
    embeddings = [word2vec[token] if token in word2vec else np.zeros(300) for token in tokens]
    # Pad embeddings to max_len
    padded_embeddings = embeddings[:max_len] + [np.zeros(300)] * (max_len - len(embeddings))
    # Convert tags to indices
    tag_indices = [tag_to_idx[tag] for tag in tags]
    return np.array(padded_embeddings), np.array(tag_indices)

# Define your BiLSTM-CRF model
def create_model(max_len, num_tags):
    input_layer = Input(shape=(max_len, 300))

    # Bidirectional LSTM layer
    lstm_layer = Bidirectional(LSTM(units=50, return_sequences=True))(input_layer)

    # CRF layer
    crf_layer = CRF(num_tags)
    # Output layer
    _, output_layer, _, _ = crf_layer(lstm_layer)

    # Create and compile the model
    model = Model(input_layer, output_layer)
    model.compile(optimizer='adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
    return model

In [11]:
# Function to preprocess your data
def preprocess_data(sentences, tags, word2vec, tag_to_idx, max_len):
    padded_embeddings = []
    tag_indices = []
    for sentence, tag in zip(sentences, tags):
        embeddings = [word2vec[token] if token in word2vec else np.zeros(300) for token in sentence.split()]
        padded_embeddings.append(embeddings[:max_len] + [np.zeros(300)] * (max_len - len(embeddings)))
        tag_indices.append([tag_to_idx[t] for t in tag][:max_len] + [0] * (max_len - len(tag)))
    return np.array(padded_embeddings), np.array(tag_indices)

# Function to calculate F1 score
def calculate_f1_score(y_true, y_pred):
    y_true_flat = y_true.flatten()
    y_pred_flat = y_pred.flatten()
    return f1_score(y_true_flat, y_pred_flat, average='macro')

# Prepare your training data
# sentences_train, tags_train are your training data
# sentences_val, tags_val are your validation data
# Assume you have your word2vec embeddings and tag_to_idx dictionary
# max_len is the maximum length of your sentences

sentences_train,tags_train=sentences_and_tags(NER_train_json)
sentences_val,tags_val=sentences_and_tags(NER_val_json)
# sentences_test,tags_val=sentences_and_tags(NER_test_json)

num_tags=len(tag_to_idx)
max_len=75

padded_embeddings_train, tag_indices_train = preprocess_data(sentences_train, tags_train, word2vec, tag_to_idx, max_len)
padded_embeddings_val, tag_indices_val = preprocess_data(sentences_val, tags_val, word2vec, tag_to_idx, max_len)

In [12]:
# Define your BiLSTM-CRF model
model = create_model(max_len, num_tags)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 75, 300)]         0         
                                                                 
 bidirectional (Bidirectiona  (None, 75, 100)          140400    
 l)                                                              
                                                                 
 crf (CRF)                   [(None, 75),              3510      
                              (None, 75, 27),                    
                              (None,),                           
                              (27, 27)]                          
                                                                 
Total params: 143,910
Trainable params: 143,910
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train the model
epochs = 10  # Define number of epochs
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    # Fit the model on training data
    model.fit(padded_embeddings_train, tag_indices_train, epochs=1, batch_size=1, verbose=1)
    
    # Evaluate the model on validation data
    y_pred_val = model.predict(padded_embeddings_val)
    f1_val = calculate_f1_score(tag_indices_val, y_pred_val)
    print(f"F1 Score (Validation): {f1_val}")

Epoch 1/10


AttributeError: in user code:

    File "C:\Users\Deeptanshu Barman\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Deeptanshu Barman\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras_contrib\losses\crf_losses.py", line 54, in crf_loss  *
        crf, idx = y_pred._keras_history[:2]

    AttributeError: 'Tensor' object has no attribute '_keras_history'
