<a href="https://colab.research.google.com/github/Luly7/RT/blob/main/RT_Roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install huggingface_hub[hf_xet]
!pip install transformers
!pip install pandas transformers tensorflow

import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
import numpy as np
import pandas as pd
#Load your dataset
csv_path="/content/drive/MyDrive/CS6480/GNN-RT/data/SMRT_train_set.txt"
df=pd.read_csv(csv_path)
df.head()


# 1. Load tokenizer and model
model_name = "roberta-base"  # Or ChemBERTa if you have it
tokenizer = RobertaTokenizer.from_pretrained(model_name)
transformer_model = TFRobertaModel.from_pretrained(model_name)

# 2. Define RT prediction model
class RTModel(tf.keras.Model):
    def __init__(self, transformer_model):
        super(RTModel, self).__init__()
        self.transformer = transformer_model
        self.dense = tf.keras.layers.Dense(1)  # Output RT

    def call(self, inputs):
        outputs = self.transformer(**inputs).last_hidden_state  # <-- Proper unpack
        pooled_output = tf.reduce_mean(outputs, axis=1)  # Simple mean pooling
        rt_prediction = self.dense(pooled_output)
        return rt_prediction

# 3. Instantiate model
rt_model = RTModel(transformer_model)

# 4. Tokenize all SMILES
tokenized_inputs = tokenizer(
    list(df['smiles']),
    padding=True,
    truncation=True,
    return_tensors="tf"
)

# 5. Tokenize
#inputs = tokenizer(smiles_data, return_tensors="tf", padding=True, truncation=True)

# 6. Create a TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask']
    },
    tf.constant(df['rt'].values, dtype=tf.float32)
))
# Shuffle, batch, prefetch
batch_size = 16
train_dataset = dataset.shuffle(len(df)).batch(batch_size).prefetch(tf.data.AUTOTUNE)# Only one sample now

# 7. Compile
rt_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss='mse')

# 8. Train (this is just for demo, with one data point!)
rt_model.fit(dataset, epochs=10)


# 8. Save the model
rt_model.save('/content/rt_model_final')



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

KeyError: 'smiles'

In [10]:
# Install the necessary packages
!pip install huggingface_hub
!pip install transformers
!pip install pandas tensorflow

# Imports
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, TFRobertaModel

# 1. Load your dataset
csv_path = "/content/drive/MyDrive/CS6480/GNN-RT/data/SMRT_test_set.txt"  # Update your path if needed
df = pd.read_csv(csv_path)

print(df.head())

# Check if the columns are properly named
assert 'smiles' in df.columns, "Missing 'smiles' column!"
assert 'rt' in df.columns, "Missing 'rt' (retention time) column!"

# 2. Load tokenizer and pre-trained model
model_name = "roberta-base"  # or "seyonec/ChemBERTa-zinc-base-v1" if using ChemBERTa
tokenizer = RobertaTokenizer.from_pretrained(model_name)
transformer_model = TFRobertaModel.from_pretrained(model_name)

# 3. Define RT prediction model
class RTModel(tf.keras.Model):
    def __init__(self, transformer):
        super(RTModel, self).__init__()
        self.transformer = transformer
        self.dense = tf.keras.layers.Dense(1)  # Output RT

    def call(self, inputs):
        outputs = self.transformer(**inputs).last_hidden_state
        pooled_output = tf.reduce_mean(outputs, axis=1)  # Mean pooling across tokens
        return self.dense(pooled_output)

# 4. Instantiate your model
rt_model = RTModel(transformer_model)

# 5. Tokenize all SMILES
tokenized_inputs = tokenizer(
    list(df['smiles']),
    padding=True,
    truncation=True,
    return_tensors="tf"
)

# 6. Create TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask']
    },
    tf.constant(df['rt'].values, dtype=tf.float32)
))

# Shuffle and batch the dataset
batch_size = 16
train_dataset = dataset.shuffle(len(df)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# 7. Compile model
rt_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss='mse')

# 8. Train model
rt_model.fit(train_dataset, epochs=10)

# 9. Save the trained model
rt_model.save('/content/rt_model_final')


                                          smiles\tRT
0  O=C(c1ccc2c(c1)N(CC(O)=NCc1ccco1)C(=O)[C@@H]1C...
1  CCCOc1nc2ccccc2nc1N1CCC[C@H](C(O)=Nc2ccc(OC)c(...
2  CCCOc1nc2ccccc2nc1N1CCC[C@H](C(O)=Nc2cccc(CC)c...
3      CN1C2CCC1CC(OC(=O)[C@H](CO)c1ccccc1)C2\t667.1
4                 OC1CCCCC1N1CCC(c2ccccc2)CC1\t679.1


AssertionError: Missing 'smiles' column!