In [None]:
import pandas as pd
import tensorflow as tf
# Run tf.function bodies eagerly (op by op) so they can be stepped through and
# printed; this trades speed for debuggability.
tf.config.run_functions_eagerly(True)

# Upper bound on len(seq) + 1 (joining space) + len(SMILE) for a row to be kept.
MAX_PAIR_LENGTH = 4096

df = pd.read_csv('Data/ai_data.csv', index_col = [0])
df_original = df.copy()  # untouched copy of the raw table

# Select the columns and drop incomplete rows in one expression: this yields a
# fresh frame instead of mutating a slice in place (which triggers pandas'
# SettingWithCopyWarning).
df = df[['seq', 'SMILE', 'InChI']].dropna()

# Length of "<seq> <SMILE>" for every remaining row, as a one-column frame.
tmp = df[['seq', 'SMILE']].agg(' '.join, axis=1).str.len().to_frame('length')
df = df[tmp.length <= MAX_PAIR_LENGTH].copy()

# Vocabulary tables for the custom tokenizers. The `seq` column of seq_map and
# the `smile` column of smile_map are read when the vocabularies are built.
seq_map = pd.read_csv('Data/seq_map.csv', index_col = [0])
smile_map = pd.read_csv('Data/smile_map.csv', index_col = [0])
inchi_map = pd.read_csv('Data/inchi_map.csv', index_col = [0])


In [None]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# max_length = 4096
class CustomTokenizer:
    """Map tokens to integer ids (and back) using an ordered vocabulary.

    A token's id is its position in ``vocabulary``; if a token is repeated,
    its last position wins. The special tokens ``[START]``, ``[END]``,
    ``[SEP]`` and ``[PAD]`` are looked up by name; the id attribute of a
    special token that is absent is ``None``, except ``[PAD]`` which is
    appended to the vocabulary when missing.

    Attributes:
        vocab: dict token -> id.
        idx_to_word: dict id -> token.
        start_token_id, end_token_id, sep_token_id, pad_token_id: ids of the
            special tokens (see above).
        vocab_size: one more than the largest id, i.e. the size needed for an
            embedding table or a one-hot vector.
    """

    def __init__(self, vocabulary):
        vocabulary = list(vocabulary)
        # Create vocabulary mappings
        self.vocab = {word: idx for idx, word in enumerate(vocabulary)}
        next_id = len(vocabulary)
        # Guarantee a padding token. The id is taken from the vocabulary length
        # (not the dict length) so it cannot collide with an existing id when
        # the vocabulary contains repeated tokens.
        if '[PAD]' not in self.vocab:
            self.vocab['[PAD]'] = next_id
            next_id += 1
        self.idx_to_word = {idx: word for word, idx in self.vocab.items()}

        # Handle special tokens
        self.start_token_id = self.vocab.get('[START]')
        self.end_token_id = self.vocab.get('[END]')
        self.sep_token_id = self.vocab.get('[SEP]')
        self.pad_token_id = self.vocab.get('[PAD]')
        # Every id in self.vocab is strictly below this value.
        self.vocab_size = next_id

    def encode(self, input_text):
        """Encode whitespace-separated tokens, prefixed with the start id.

        Tokens missing from the vocabulary are mapped to the ``[END]`` id.
        No end token is appended.
        """
        fallback_id = self.end_token_id
        input_ids = [self.vocab.get(token, fallback_id) for token in input_text.split()]
        return [self.start_token_id] + input_ids

    def encode_test(self, input_text):
        """Encode ``input_text`` element by element (per character for a str).

        Unknown elements are mapped to the ``[END]`` id. No start or end
        token is added.
        """
        fallback_id = self.end_token_id
        return [self.vocab.get(token, fallback_id) for token in list(input_text)]

    def decode(self, token_ids):
        """Join the tokens for ``token_ids`` into one string.

        Start, end and separator ids are dropped; ``[PAD]`` is kept as
        literal text. Ids that are not in the vocabulary are skipped instead
        of raising.
        """
        skipped = (self.start_token_id, self.end_token_id, self.sep_token_id)
        pieces = []
        for token_id in token_ids:
            if token_id in skipped:
                continue
            word = self.idx_to_word.get(token_id)
            if word is not None:
                pieces.append(word)
        return ''.join(pieces)

    def vocab_output_vector(self, token):
        """Return a one-hot float vector of length ``vocab_size`` for ``token``.

        Unknown tokens are encoded as ``[END]``.
        """
        output_vector = np.zeros(self.vocab_size)
        token_id = self.vocab.get(token, self.end_token_id)  # Default to [END] if token not found
        output_vector[token_id] = 1
        return output_vector

# Vocabularies for the source (`seq`) and target (`SMILE`) tokenizers.
# dict.fromkeys() de-duplicates while keeping the order of the CSV rows, so
# token ids are identical on every run. list(set(...)) orders strings by
# their per-process randomised hash, which would silently change the ids
# between kernel sessions and invalidate any saved model.
vi = list(dict.fromkeys(seq_map.seq)) + ['[PAD]','[END]']
vt = list(dict.fromkeys(smile_map.smile)) + ['[START]','[END]', '[PAD]']

source = CustomTokenizer(vi)
target = CustomTokenizer(vt)


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Embedding, LayerNormalization, Dropout
import numpy as np

# Scaled Dot-Product Attention
def scaled_dot_product_attention(query, key, value, mask):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        query, key, value: tensors whose last two axes are matrix-multiplied;
            the last axis of ``key`` supplies d_k.
        mask: tensor broadcastable to the attention logits, with 1 marking
            positions to suppress, or ``None`` for no masking.

    Returns:
        (output, attention_weights): the weighted values and the softmax
        weights over the last axis.
    """
    key_depth = tf.cast(tf.shape(key)[-1], tf.float32)
    logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_depth)

    # Masked positions get a large negative logit, i.e. ~0 weight after softmax.
    if mask is not None:
        logits = logits + (mask * -1e9)

    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, value), weights

# Multi-Head Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Query, key and value are each projected to ``d_model`` features, split
    into ``num_heads`` heads of ``d_model // num_heads`` features, attended
    independently, re-merged and passed through a final dense layer.

    Args:
        d_model: feature size of the projections and of the output.
        num_heads: number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: if ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # With a remainder, num_heads * depth != d_model and the reshape in
        # split_heads() would either fail or fold features into the sequence
        # axis, so reject the configuration up front.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) to (batch, num_heads, seq, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, query, key, value, mask):
        """Return ``(output, attention_weights)`` for the given inputs.

        ``mask`` is forwarded unchanged to ``scaled_dot_product_attention``
        (1 = suppress, or ``None``).
        """
        batch_size = tf.shape(query)[0]

        query = self.wq(query)
        key = self.wk(key)
        value = self.wv(value)

        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        output, attention_weights = scaled_dot_product_attention(query, key, value, mask)
        # Undo split_heads: (batch, heads, seq, depth) -> (batch, seq, d_model).
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        output = tf.reshape(output, (batch_size, -1, self.d_model))

        return self.dense(output), attention_weights

# Point-wise Feed-Forward Network
def point_wise_feed_forward_network(d_model, dff):
    """Build the two-layer feed-forward sub-network used in each block.

    A ReLU dense layer with ``dff`` units followed by a linear dense layer
    with ``d_model`` units.
    """
    hidden = Dense(dff, activation='relu')
    projection = Dense(d_model)
    return tf.keras.Sequential([hidden, projection])

# Encoder Block
class EncoderBlock(tf.keras.layers.Layer):
    """One encoder layer: self-attention then feed-forward.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is layer-normalised.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, mask, training):
        """Apply the block to ``x``; ``mask`` is passed to the self-attention."""
        # Self-attention sub-layer (query = key = value = x).
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        normed_attention = self.layernorm1(x + attended)

        # Feed-forward sub-layer.
        transformed = self.dropout2(self.ffn(normed_attention), training=training)
        return self.layernorm2(normed_attention + transformed)

# Decoder Block
class DecoderBlock(tf.keras.layers.Layer):
    """One decoder layer: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual
    connection and layer normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, x, enc_output, look_ahead_mask, padding_mask, training):
        """Apply the block.

        Args:
            x: decoder-side input.
            enc_output: encoder output used as key and value of the
                cross-attention.
            look_ahead_mask: mask for the self-attention over ``x``.
            padding_mask: mask for the cross-attention over ``enc_output``.
            training: forwarded to the dropout layers.
        """
        # Self-attention over the decoder input.
        self_attended, _ = self.mha1(x, x, x, look_ahead_mask)
        self_attended = self.dropout1(self_attended, training=training)
        after_self = self.layernorm1(x + self_attended)

        # Cross-attention: queries from the decoder, keys/values from the encoder.
        cross_attended, _ = self.mha2(after_self, enc_output, enc_output, padding_mask)
        cross_attended = self.dropout2(cross_attended, training=training)
        after_cross = self.layernorm2(after_self + cross_attended)

        # Feed-forward sub-layer.
        transformed = self.dropout3(self.ffn(after_cross), training=training)
        return self.layernorm3(after_cross + transformed)

# Encoder
class Encoder(tf.keras.layers.Layer):
    """Token embedding + sinusoidal positions + a stack of ``EncoderBlock``s.

    Args:
        num_layers: number of encoder blocks.
        d_model: embedding / model feature size.
        num_heads: attention heads per block.
        dff: hidden width of each block's feed-forward network.
        vocab_size: number of rows of the embedding table.
        rate: dropout rate.
        max_position: number of positions in the pre-computed positional
            table; input sequences must not be longer than this.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, rate=0.1, max_position=5000):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_encoding = self.positional_encoding(max_position, d_model)

        self.enc_layers = [EncoderBlock(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = Dropout(rate)

    def positional_encoding(self, position, d_model):
        """Return a float32 tensor of shape (1, position, d_model).

        Even feature columns hold the sine of the angle, odd columns the
        cosine.
        """
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        return tf.cast(angle_rads[np.newaxis, ...], tf.float32)

    def get_angles(self, position, i, d_model):
        """Return position / 10000^(2*(i//2)/d_model) for each (position, i)."""
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return position * angle_rates

    def call(self, x, mask, training):
        """Encode token ids ``x`` (batch, seq_len); ``mask`` goes to every block."""
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        # Add the first seq_len rows of the positional table to the embeddings.
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, mask=mask, training=training)

        return x

# Decoder
class Decoder(tf.keras.layers.Layer):
    """Token embedding + sinusoidal positions + a stack of ``DecoderBlock``s.

    Args:
        num_layers: number of decoder blocks.
        d_model: embedding / model feature size.
        num_heads: attention heads per block.
        dff: hidden width of each block's feed-forward network.
        vocab_size: number of rows of the embedding table.
        rate: dropout rate.
        max_position: number of positions in the pre-computed positional
            table; decoder inputs must not be longer than this. The table
            used to be fixed at 598 rows, which made the addition in
            ``call`` fail for longer targets. Each row depends only on its
            own position, so a larger table gives identical values for the
            positions that were already covered.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, rate=0.1, max_position=5000):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_encoding = self.positional_encoding(max_position, d_model)

        self.dec_layers = [DecoderBlock(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = Dropout(rate)

    def positional_encoding(self, position, d_model):
        """Return a float32 tensor of shape (1, position, d_model).

        Even feature columns hold the sine of the angle, odd columns the
        cosine.
        """
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        return tf.cast(angle_rads[np.newaxis, ...], tf.float32)

    def get_angles(self, position, i, d_model):
        """Return position / 10000^(2*(i//2)/d_model) for each (position, i)."""
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return position * angle_rates

    def call(self, x, enc_output, look_ahead_mask, padding_mask, training):
        """Decode token ids ``x`` (batch, seq_len) against ``enc_output``.

        ``look_ahead_mask`` and ``padding_mask`` are forwarded to every
        block's self-attention and cross-attention respectively.
        """
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        # Add the first seq_len rows of the positional table to the embeddings.
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output=enc_output, look_ahead_mask=look_ahead_mask, padding_mask=padding_mask, training=training)

        return x

# Transformer Model (Encoder-Decoder)
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer followed by an LSTM and a linear head.

    The model builds its own attention masks from the token ids, using the
    pad ids of the ``source`` and ``target`` tokenizers.

    Args:
        num_layers, d_model, num_heads, dff, rate: forwarded to both the
            ``Encoder`` and the ``Decoder``.
        input_vocab_size: encoder embedding rows.
        target_vocab_size: decoder embedding rows and number of output logits.
        source: tokenizer for the encoder input; its ``pad_token_id`` is read.
        target: tokenizer for the decoder input; its ``pad_token_id`` is read.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, rate=0.1, source=None, target=None):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, rate)
        self.lstm_layer = LSTM(units=d_model, return_sequences=True)
        # No softmax here: the head emits logits.
        self.final_layer = Dense(target_vocab_size)
        self.source = source
        self.target = target

    def create_padding_mask(self, seq, padding_token):
        """Return a float mask of shape (batch_size, 1, 1, seq_len).

        Entries are 1.0 where ``seq`` equals ``padding_token`` and 0.0
        elsewhere.
        """
        seq = tf.cast(tf.math.equal(seq, padding_token), tf.float32)
        return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

    def create_look_ahead_mask(self, size):
        """Return a (size, size) mask with 1.0 strictly above the diagonal."""
        mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask  # (seq_len, seq_len)

    def create_masks(self, inp, tar):
        """Build the three masks used by ``call``.

        Returns:
            (enc_padding_mask, combined_look_ahead_mask, dec_padding_mask)

        Raises:
            ValueError: if the model was created without tokenizers, since
                the pad ids are then unknown.
        """
        if self.source is None or self.target is None:
            raise ValueError(
                "Transformer needs both `source` and `target` tokenizers to "
                "build padding masks"
            )

        # Encoder padding mask
        enc_padding_mask = self.create_padding_mask(inp, self.source.pad_token_id)

        # Cross-attention attends over encoder positions, so its mask must
        # flag padding in the *source* ids using the *source* pad id. It is
        # therefore the same mask as the encoder's.
        dec_padding_mask = enc_padding_mask

        # Look-ahead mask (for autoregressive decoding) combined with the
        # target padding mask; tf.maximum keeps a position masked if either
        # mask flags it.
        look_ahead_mask = self.create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = self.create_padding_mask(tar, self.target.pad_token_id)
        combined_look_ahead_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

        return enc_padding_mask, combined_look_ahead_mask, dec_padding_mask

    def call(self,inputs, training):
        """Run the model.

        Args:
            inputs: pair ``(inp, right_shifted_tar)``; the source token ids
                and the decoder input (right-shifted target) token ids.
            training: whether it's training (boolean)

        Returns:
            final_output: logits (before softmax) for the next token predictions
        """
        inp, right_shifted_tar = inputs

        # Create masks internally
        enc_padding_mask, look_ahead_mask, dec_padding_mask = self.create_masks(inp, right_shifted_tar)

        # Encoder output
        enc_output = self.encoder(inp, mask=enc_padding_mask, training=training)

        # Decoder output (taking right-shifted target as input)
        dec_output = self.decoder(right_shifted_tar, enc_output=enc_output, look_ahead_mask=look_ahead_mask, padding_mask=dec_padding_mask, training=training)
        lstm_output = self.lstm_layer(dec_output)
        # Final linear layer for logits (prediction)
        final_output = self.final_layer(lstm_output)

        return final_output


In [None]:
# Toy batch used only to run the model once right after construction.
# NOTE(review): assumes ids 0..7 exist in both vocabularies -- confirm.
input_sequence = tf.constant([[1, 2, 3, 4, 0]])         # source sequence (encoder input)
target_sequence = tf.constant([[1, 5, 6, 7, 0]])        # target sequence (expected output)
right_shifted_target = tf.constant([[0, 1, 5, 6, 7]])   # target shifted right by one (decoder input)

# Active model configuration.
input_vocab_size = source.vocab_size
target_vocab_size = target.vocab_size
num_layers = 1
d_model = 32
num_heads = 8
dff = 1024
# Original note attached to this configuration: "ClcC".

# Other configurations tried earlier (same vocab sizes, num_heads = 8, dff = 1024):
#   num_layers = 2, d_model = 48
#   num_layers = 4, d_model = 64 (d_model = 5000 also noted)  -> "performance increased again"
#   num_layers = 3, d_model = 64 (d_model = 5000 also noted)  -> "increased performance"

# Instantiate the transformer model. The tokenizers are passed in so the
# model can derive its padding masks from their pad ids.
# (A `with tf.device('/device:GPU:0'):` wrapper was used here at some point.)
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    source=source,
    target=target,
)

# One forward pass on the toy batch; the result holds logits (pre-softmax).
output_logits = transformer([input_sequence, right_shifted_target], training=False)


In [None]:
from IPython.display import clear_output
import math

def convert_output_to_probabilities(out):
  """One-hot encode a 2-D array of target token ids.

  Args:
    out: integer array of shape (batch_size, seq_length).

  Returns:
    Integer array of shape (batch_size, seq_length, target.vocab_size) with a
    single 1 per (row, position) at the index given by ``out``.
  """
  n_rows, n_steps = out.shape
  depth = target.vocab_size

  encoded = np.zeros((n_rows, n_steps, depth), dtype=int)

  # Row/column index grids with the same shape as `out`, so each id selects
  # exactly one cell along the last axis.
  row_idx, step_idx = np.indices((n_rows, n_steps))
  encoded[row_idx, step_idx, out] = 1
  return encoded

def infer(inp):
  """Greedily decode up to 100 target tokens for ``inp[0]`` and print the text.

  Decoding starts from ``[START]`` and stops early when the model emits
  ``[END]`` or ``[PAD]``; the stop token is not part of the printed text.
  Previously the loop always ran all 100 steps and relied on splitting the
  decoded string on '[END]', which never matched because ``target.decode``
  drops that token.

  NOTE: a later cell defines another ``infer``; once that cell has run it
  replaces this function.

  Args:
    inp: array of source token ids; only the first row is used.
  """
  prompt_len = 100
  stop_ids = (target.end_token_id, target.pad_token_id)
  # The source row is the same at every step, so add the batch axis once.
  input_sequence = np.expand_dims(inp[0], axis=0)
  output_sequence = []
  rpred =  np.array([[target.start_token_id]])
  for _ in range(prompt_len):
    predictions = transformer([input_sequence, rpred], training=False)
    # Logits of the last decoder position; argmax of the logits equals the
    # argmax of their softmax.
    predicted_token = tf.argmax(predictions[:, -1, :], axis=-1).numpy()[0]
    if predicted_token in stop_ids:
      break
    output_sequence.append(predicted_token)
    rpred = np.concatenate([rpred, np.array([[predicted_token]])], axis=-1)
  print(target.decode(output_sequence))

def max_pad(inpu_data):
  """Return the largest second-axis length among the arrays in ``inpu_data``."""
  return max(block.shape[1] for block in inpu_data)

def pad_data(inpu_data, value=None):
  """Pad every array in ``inpu_data`` to a common width and stack them.

  Each element is post-padded with ``value`` up to the widest second axis
  found by ``max_pad``. The list is updated in place with the padded arrays,
  which are then concatenated along the first axis.

  Args:
    inpu_data: list of 2-D (or higher) arrays.
    value: padding value handed to ``pad_sequences``.

  Returns:
    A single array containing all padded rows.
  """
  width = max_pad(inpu_data)
  for position, block in enumerate(inpu_data):
    inpu_data[position] = pad_sequences(block, maxlen=width, padding='post', value=value)
  return np.concatenate(inpu_data)

# cat == 1 -> one-hot targets with CategoricalCrossentropy
# otherwise -> integer targets with SparseCategoricalCrossentropy
cat = 0
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# Both losses take raw logits (from_logits=True).
# NOTE(review): only the cat == 1 branch uses the Adam(learning_rate=0.01)
# object above; the other branch passes the string 'adam' and so gets the
# Keras default optimizer settings -- confirm this is intended.
if cat == 1:
  compile_kwargs = dict(
      optimizer=optimizer,
      loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
  )
else:
  compile_kwargs = dict(
      optimizer='adam',
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  )

transformer.compile(metrics=['accuracy'], **compile_kwargs)


# Build the training arrays from `df`, one chunk of rows at a time.
#
# For a row with an n-character SMILES and a longer `seq`, every SMILES
# character is repeated r = len(seq) // n times ("stretched") and n + 1
# examples are produced, one per prefix length k = 0..n:
#   encoder input  inp : the encoded `seq`
#   decoder input  rout: [START] + first k stretched characters
#   target         out : [START] + first k+1 stretched characters
#                        (for k = n: all characters followed by [END])
#
# NOTE(review): every `out` row starts with [START] exactly like its `rout`
# row, so wherever both are unpadded the target at position t is the same
# token as the decoder input at position t. The toy example earlier in the
# notebook instead shifts the decoder input by one step -- confirm that this
# alignment is intended.
batch = 221170*2  # number of chunks `df` is divided into
input_data,  routput_data , output_data= [],[],[]
frac = len(df) / batch  # rows per chunk (may be fractional)
for i in range(0,int(batch)):
  print(frac)
  print(i*frac, (i + 1)* frac)
  tmp = df[int(i*frac) : int((i + 1)* frac )].copy()

  # Start every chunk with empty lists: pad_data() below rebinds these names
  # to arrays, which have no .append(), so a second iteration would fail.
  input_data, routput_data, output_data = [], [], []
  for j, row in tmp.iterrows():
    smile_len = len(row.SMILE)
    # Only rows whose `seq` is strictly longer than a non-empty SMILES are
    # used (an empty SMILES would divide by zero below).
    if smile_len == 0 or len(row.seq) <= smile_len:
      continue
    repeat = len(row.seq) // smile_len

    # Encode once per row; each example only needs a slice of these.
    source_ids = source.encode_test(row.seq)
    stretched = [target.encode_test(ch)[0] for ch in row.SMILE for _ in range(repeat)]
    prefixes = [[target.start_token_id] + stretched[:k * repeat] for k in range(smile_len + 1)]

    inp = [source_ids] * (smile_len + 1)
    rout = prefixes
    out = prefixes[1:] + [prefixes[-1] + [target.end_token_id]]

    inp = pad_sequences(inp, padding = 'post', value=source.pad_token_id)
    out = pad_sequences(out, padding = 'post', value=target.pad_token_id)
    if cat == 1:
      out = convert_output_to_probabilities(out)
    # Decoder inputs are padded to the same length as the targets.
    rout = pad_sequences(rout, maxlen = len(out[-1]), padding = 'post', value=target.pad_token_id)
    input_data.append(inp)
    output_data.append(out)
    routput_data.append(rout)

  if not input_data:
    # No usable row in this chunk (pad_data() cannot handle an empty list);
    # move on to the next chunk.
    continue

  input_data = pad_data(input_data, value = source.pad_token_id)
  output_data = pad_data(output_data, value=target.pad_token_id)
  routput_data = pad_data(routput_data,value=target.pad_token_id)
  break
# Only the first non-empty chunk is prepared because of the `break` above.
# For full training, remove the `break` and call transformer.fit(...) inside
# the loop, right after the pad_data() calls.

In [None]:
# Destination of the trained model (a mounted Google Drive folder on Colab).
MODEL_SAVE_PATH = '/content/drive/MyDrive/Ai-Projects/Sanjevni/Models/cross_pharma.keras'

# Train on the first GPU: encoder input and decoder input go in as a pair,
# the targets are the labels.
with tf.device('/device:GPU:0'):
  transformer.fit(
      x=[input_data, routput_data],
      y=output_data,
      epochs=100,
      batch_size=32,
  )
transformer.save(MODEL_SAVE_PATH)


In [None]:
from IPython.display import clear_output

def remove_consecutive_duplicates(s):
    """Collapse every run of identical adjacent characters in ``s`` to one.

    Example: "aaabbbca" -> "abca". An empty string is returned unchanged
    (it used to raise IndexError).

    Args:
        s: input string.

    Returns:
        The string with consecutive duplicates removed.
    """
    kept = []
    for char in s:
        # Keep a character only when it differs from the last one kept.
        if not kept or char != kept[-1]:
            kept.append(char)
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(kept)

def infer(inp):
  """Greedily decode a target string for the last row of ``inp``.

  At most ``inp.shape[1]`` tokens are generated, starting from ``[START]``.
  Decoding stops early when the model emits ``[END]`` or ``[PAD]``
  (previously generation continued past the end of the sequence and the
  extra tokens ended up in the result). The decoded text is passed through
  ``remove_consecutive_duplicates`` before being returned.

  This definition replaces the ``infer`` defined in an earlier cell.

  Args:
    inp: 2-D array of source token ids; only the last row is used.

  Returns:
    The predicted string ('' if nothing was generated before a stop token).
  """
  prompt_len = inp.shape[1]
  stop_ids = (target.end_token_id, target.pad_token_id)
  # The source row is the same at every step, so add the batch axis once.
  input_sequence = np.expand_dims(inp[-1], axis=0)
  output_sequence = []
  rpred =  np.array([[target.start_token_id]])
  for i in range(0,prompt_len):
    print(i)
    predictions = transformer([input_sequence, rpred], training=False)
    # Logits of the last decoder position; argmax of the logits equals the
    # argmax of their softmax.
    predicted_token = tf.argmax(predictions[:, -1, :], axis=-1).numpy()[0]
    clear_output()
    if predicted_token in stop_ids:
      break
    output_sequence.append(predicted_token)
    rpred = np.concatenate([rpred, np.array([[predicted_token]])], axis=-1)
  decoded = target.decode(output_sequence)
  # Nothing to de-duplicate when no token was produced.
  if not decoded:
    return decoded
  return remove_consecutive_duplicates(decoded)
# `inp` is not defined in this cell: it is the global left behind by the
# data-preparation loop (the padded source array of the last row it
# processed), so that cell must have been run first.
pred = infer(inp)
print(pred)

# Best of Luck, Have fun