In [None]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd

# Paths to your datasets
train_data_path = 'combined_data_train.csv'
test_data_path = 'combined_data_test.csv'
dev_data_path = 'combined_data_dev.csv'


def _read_token_csv(path):
    """Read one token-per-row CSV, keeping every cell as the literal text in the file.

    With pandas' defaults, cells such as "NA", "null", "nan" or "None" are
    parsed as missing values (float NaN) and purely numeric chunks may be
    parsed as numbers. Downstream code joins tokens with ``" ".join(...)``,
    which requires every token to be a ``str``, so both conversions are
    disabled here.

    Args:
        path: Location of a CSV file with ``Token`` and ``Label`` columns.

    Returns:
        A DataFrame whose cells are all strings (empty cells become ``""``).
    """
    return pd.read_csv(path, dtype=str, keep_default_na=False)


# Load the datasets
train_data = _read_token_csv(train_data_path)
test_data = _read_token_csv(test_data_path)
dev_data = _read_token_csv(dev_data_path)

# Example of examining the first few rows of the training data
print(train_data.head())

                                               Token               Label
0                                               Step                   O
1                                                  1     B-EXAMPLE_LABEL
2                                                  .                   O
3                                         tert-Butyl  B-REACTION_PRODUCT
4  ((3S,6R)-6-(fluoromethyl)tetrahydro-2H-pyran-3...  I-REACTION_PRODUCT


In [None]:
import nltk
from nltk.corpus import wordnet
# Fetch the WordNet corpus that wordnet.synsets() reads in get_synonym() below.
nltk.download('wordnet')

def get_synonym(word):
    """Return a single-token WordNet synonym for ``word``, or ``word`` unchanged.

    Candidates are the lemma names of every synset WordNet returns for
    ``word``. A candidate is rejected when it is

    * the word itself in any letter case (e.g. "water" for "Water"), or
    * a multiword lemma (NLTK writes these with underscores, e.g.
      "carbon_dioxide"), because the caller replaces exactly one token that
      carries exactly one label.

    The alphabetically smallest remaining candidate is returned, so the result
    is the same on every run. (Taking the first element of a ``set`` of strings
    is not, because string hashing is randomised per Python process.)

    Args:
        word: Token to look up. Non-string or empty values are returned as-is.

    Returns:
        The chosen synonym, or ``word`` when no candidate qualifies.
    """
    # Missing cells (e.g. NaN) and empty strings cannot be looked up in WordNet.
    if not isinstance(word, str) or not word:
        return word

    lowered = word.lower()
    candidates = {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    usable = [
        name for name in candidates
        if name.lower() != lowered and "_" not in name
    ]
    # min() gives a stable pick regardless of set iteration order.
    return min(usable) if usable else word

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ananthasubb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from transformers import AutoTokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import numpy as np
import random

# Seed Python's `random` module, which decides which training tokens are
# replaced by synonyms, so that choice is repeatable when the cell is re-run.
SEED = 42
random.seed(SEED)

# Adjust the probability as needed
SYNONYM_PROB = 0.36

# Constants
MAX_LEN = 128  # Adjust based on the length of your sentences
BERT_MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
NUM_CLASSES = len(train_data['Label'].unique())  # Number of unique labels

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

def _group_into_sentences(data, replace_token=None):
    """Group token/label rows into space-joined sentences, splitting on "." rows.

    A row whose token is exactly "." closes the current sentence; the "." row
    itself (token and label) is not included in the output. Tokens left over
    after the last "." are emitted as a final sentence instead of being
    dropped, and empty groups (leading or consecutive "." rows) are skipped.

    Args:
        data: Mapping/DataFrame with parallel ``'Token'`` and ``'Label'`` columns.
        replace_token: Optional callable applied to every non-"." token before
            it is stored. The sentence-boundary check always uses the original
            token, so a replacement can never create or remove a boundary.

    Returns:
        ``(sentences, labels)``: two equally long lists of strings, where
        ``labels[i]`` holds one space-separated label per token of
        ``sentences[i]``.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    def flush():
        # Only emit groups that actually contain tokens.
        if current_sentence:
            sentences.append(" ".join(current_sentence))
            labels.append(" ".join(current_labels))
        current_sentence.clear()
        current_labels.clear()

    # Iterating the two columns directly avoids building a Series per row.
    for token, label in zip(data['Token'], data['Label']):
        if token == ".":
            flush()
            continue
        if replace_token is not None:
            token = replace_token(token)
        current_sentence.append(token)
        current_labels.append(label)

    # Keep a trailing sentence that has no closing "." row.
    flush()
    return sentences, labels


def concatenate_tokens_with_synonyms(data, synonym_prob=0.99, rng=None):
    """Build sentences from token rows, randomly swapping tokens for synonyms.

    Args:
        data: Mapping/DataFrame with ``'Token'`` and ``'Label'`` columns.
        synonym_prob: Probability that a token is passed through
            ``get_synonym``; each token is decided independently.
        rng: Optional object with a ``random()`` method (e.g.
            ``random.Random(seed)``) used for the decisions. Defaults to the
            module-level ``random`` generator.

    Returns:
        ``(sentences, labels)`` as described in ``_group_into_sentences``.
        Labels are never altered by the replacement.
    """
    rng = random if rng is None else rng

    def maybe_replace(token):
        # Replace token with a synonym with a certain probability
        return get_synonym(token) if rng.random() < synonym_prob else token

    return _group_into_sentences(data, maybe_replace)


# Function to concatenate tokens into sentences
def concatenate_tokens(data):
    """Build sentences from token rows without any augmentation.

    Args:
        data: Mapping/DataFrame with ``'Token'`` and ``'Label'`` columns.

    Returns:
        ``(sentences, labels)`` as described in ``_group_into_sentences``.
    """
    return _group_into_sentences(data)

# Switch: when True, the test and dev text is augmented the same way as training text
APPLY_SYNONYM_REPLACEMENT_TO_TEST_DEV = False  # Set to True to apply synonym replacement

# Training split: synonym replacement is always applied
train_sentences, train_label_sentences = concatenate_tokens_with_synonyms(train_data, SYNONYM_PROB)

# Test split first, then dev split; both follow the same switch
if APPLY_SYNONYM_REPLACEMENT_TO_TEST_DEV:
    test_sentences, test_label_sentences = concatenate_tokens_with_synonyms(test_data, SYNONYM_PROB)
    dev_sentences, dev_label_sentences = concatenate_tokens_with_synonyms(dev_data, SYNONYM_PROB)
else:
    test_sentences, test_label_sentences = concatenate_tokens(test_data)
    dev_sentences, dev_label_sentences = concatenate_tokens(dev_data)

# Tokenization and padding
def tokenize_and_pad(sentences):
    """Encode sentences to word-piece ids and pad them to ``MAX_LEN`` columns.

    Truncation is delegated to the tokenizer so that an over-long sentence
    still ends with the closing special token; cutting the id list afterwards
    would chop that token off. Shorter sequences are right-padded with the
    tokenizer's own padding id.

    NOTE(review): the ids are word pieces plus special tokens, whereas the
    labels built elsewhere in this notebook are one per whitespace token, so
    column ``i`` of the ids and of the labels need not describe the same
    word. Confirm whether that alignment is intended.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded ids: one row per
        sentence, ``MAX_LEN`` ids per row.
    """
    input_ids = [
        tokenizer.encode(
            sentence,
            add_special_tokens=True,
            truncation=True,
            max_length=MAX_LEN,
        )
        for sentence in sentences
    ]
    return pad_sequences(
        input_ids,
        maxlen=MAX_LEN,
        truncating='post',
        padding='post',
        value=tokenizer.pad_token_id,
    )

# Turn each split's sentences into a fixed-width matrix of token ids
train_inputs, test_inputs, dev_inputs = [
    tokenize_and_pad(split_sentences)
    for split_sentences in (train_sentences, test_sentences, dev_sentences)
]

# Label encoding and one-hot encoding
# The encoder only learns the labels that occur in the training split.
label_encoder = LabelEncoder().fit(train_data['Label'])

def encode_labels(label_sentences):
    """Convert space-separated label strings into padded integer id rows.

    Each label sentence is split on whitespace and mapped through
    ``label_encoder``. Rows are right-padded with the id of the ``'O'`` label
    and, when longer than ``MAX_LEN``, truncated at the END. ``pad_sequences``
    truncates at the start by default, which would keep a different part of a
    long sentence than the inputs, which are truncated with
    ``truncating='post'``.

    Args:
        label_sentences: Iterable of strings, one space-separated label per token.

    Returns:
        Whatever ``pad_sequences`` returns: one row per sentence with
        ``MAX_LEN`` label ids.

    Raises:
        ValueError: Propagated from ``label_encoder.transform`` when a label
            was not seen during fitting.
    """
    # Padding positions are filled with the 'O' (outside) class id.
    pad_id = label_encoder.transform(['O'])[0]
    encoded_labels = [label_encoder.transform(label.split()) for label in label_sentences]
    return pad_sequences(
        encoded_labels,
        maxlen=MAX_LEN,
        padding='post',
        truncating='post',
        value=pad_id,
    )

# One-hot encode the padded label ids of every split
train_labels, test_labels, dev_labels = [
    to_categorical(encode_labels(split_label_sentences), num_classes=NUM_CLASSES)
    for split_label_sentences in (train_label_sentences, test_label_sentences, dev_label_sentences)
]

# Checking shapes of the outputs
for _split_inputs, _split_labels in (
    (train_inputs, train_labels),
    (test_inputs, test_labels),
    (dev_inputs, dev_labels),
):
    print(_split_inputs.shape, _split_labels.shape)


2023-12-11 14:03:14.237632: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 14:03:14.237777: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 14:03:14.239302: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-11 14:03:14.248560: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(2022, 128) (2022, 128, 25)
(1102, 128) (1102, 128, 25)
(200, 128) (200, 128, 25)


In [None]:
from transformers import TFBertModel
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Constants
MAX_LEN = 128  # Should be the same as used in preprocessing
NUM_CLASSES = train_labels.shape[-1]  # Last axis of the 3-D one-hot label array

# Load BERT model
BERT_MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
bert_model = TFBertModel.from_pretrained(BERT_MODEL_NAME, from_pt=True)

# Model architecture: BERT encoder -> BiLSTM -> per-position softmax
input_ids = Input(shape=(MAX_LEN,), dtype='int32')
attention_mask = Input(shape=(MAX_LEN,), dtype='int32')
# Element 0 of the BERT output is the per-token hidden state sequence.
bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]
bilstm = Bidirectional(LSTM(units=50, return_sequences=True))(bert_output)
output = TimeDistributed(Dense(units=NUM_CLASSES, activation='softmax'))(bilstm)

model = Model(inputs=[input_ids, attention_mask], outputs=output)
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=3e-5),
    metrics=['accuracy'],
)

# Summary of the model
model.summary()


2023-12-11 14:03:29.940945: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13775 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:27:00.0, compute capability: 7.5
2023-12-11 14:03:30.485170: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification mo

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1083102   ['input_1[0][0]',             
 )                           ngAndCrossAttentions(last_   72         'input_2[0][0]']             
                             hidden_state=(None, 128, 7                                           
                             68),                                                             

In [None]:
# Training parameters
EPOCHS = 50  # Adjust as needed
BATCH_SIZE = 32  # Adjust as needed

# Training the model
# Per-epoch validation uses the dev split, so the test split is not consulted
# while training is being monitored.
# NOTE(review): the attention masks are all ones, so padded positions are
# attended to like real tokens. Consider a mask built from the padding and, if
# adopted, use it for every fit/evaluate/predict call.
history = model.fit(
    [train_inputs, np.ones_like(train_inputs)],  # Assuming full attention masks
    train_labels,
    validation_data=([dev_inputs, np.ones_like(dev_inputs)], dev_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)

Epoch 1/50


2023-12-11 14:03:59.961469: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2023-12-11 14:04:01.627456: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fbae43e64c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-11 14:04:01.627588: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2023-12-11 14:04:01.638555: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1702321441.872657 2228762 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [None]:
# Evaluate on the test set
# The split passed here must match the "Test" wording of the printed message.
# NOTE(review): the attention mask is all ones, matching how the model is fed
# elsewhere in this notebook; padded positions are therefore not masked out.
test_loss, test_accuracy = model.evaluate(
    [test_inputs, np.ones_like(test_inputs)],  # Assuming full attention masks
    test_labels,
    batch_size=BATCH_SIZE
)

print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

In [None]:
import matplotlib.pyplot as plt

# One figure, two side-by-side panels: loss on the left, accuracy on the right
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Plotting training and validation loss
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Plotting training and validation accuracy
accuracy_ax.plot(history.history['accuracy'], label='Train Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Training and Validation Accuracy')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

plt.show()

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import numpy as np

# Run the model on the test inputs (all-ones attention mask) and keep,
# for every position, the index of the highest-scoring class
test_predictions = model.predict([test_inputs, np.ones_like(test_inputs)])
test_predictions = test_predictions.argmax(axis=-1)

# Collapse the one-hot test labels back to integer class ids the same way
true_test_labels = test_labels.argmax(axis=-1)

In [None]:
from sklearn.metrics import classification_report

# Decode the predictions and true labels
decoded_predictions = [label_encoder.inverse_transform(pred) for pred in test_predictions]
decoded_true_labels = [label_encoder.inverse_transform(true_label) for true_label in true_test_labels]

# Number of real (non-padding) label positions in each test sentence.
# Label rows are right-padded with 'O' up to MAX_LEN; scoring those padding
# positions would count them as correctly predicted 'O' tokens and inflate
# every aggregate metric.
real_lengths = [
    min(len(label_sentence.split()), MAX_LEN)
    for label_sentence in test_label_sentences
]

# Flatten the lists for classification report, keeping only real positions
flat_predictions = [
    label
    for sublist, length in zip(decoded_predictions, real_lengths)
    for label in sublist[:length]
]
flat_true_labels = [
    label
    for sublist, length in zip(decoded_true_labels, real_lengths)
    for label in sublist[:length]
]

# Extract unique labels present in predictions and true labels
unique_labels = sorted(set(flat_predictions + flat_true_labels))

# Generate classification report
# `labels` is passed explicitly so each row name in `target_names` is tied to
# its class; zero_division=0 reports undefined scores as 0 without warnings.
report = classification_report(
    flat_true_labels,
    flat_predictions,
    labels=unique_labels,
    target_names=unique_labels,
    zero_division=0,
)

print(report)

In [None]:
# Same report again, but undefined precision/recall values are shown as 1
report = classification_report(
    y_true=flat_true_labels,
    y_pred=flat_predictions,
    target_names=unique_labels,
    zero_division=1,
)
print(report)