In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import os
import torch
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

In [5]:
# Load the GROVER tokenizer and masked-LM model; both come from the same checkpoint id.
GROVER_CHECKPOINT = "PoetschLab/GROVER"
tokenizer = AutoTokenizer.from_pretrained(GROVER_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(GROVER_CHECKPOINT)
# Switch to evaluation mode for inference; as the cell's last expression this also displays the model.
model.eval()

Some weights of the model checkpoint at PoetschLab/GROVER were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(609, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [18]:
# Output location for the GROVER embeddings and the time log (created if it does not exist yet)
output_folder_path = "/.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/grover"
os.makedirs(output_folder_path, exist_ok=True)

# Input location: the folder that is scanned for dataset CSV files
datasets_folder = '/.../.../.../Unsupervised_Embedding_Evaluation/datasets/'


In [22]:
# File names (not full paths) of every CSV located directly in the datasets folder.
# NOTE: os.listdir returns entries in arbitrary order, so positions in this list are not stable.
csv_files = list(filter(lambda name: name.endswith('.csv'), os.listdir(datasets_folder)))

# One {'file_name', 'time_taken'} record per processed file is appended here by the processing loop.
time_records = list()

In [23]:
# Display the CSV file names found in the datasets folder
csv_files

['test_human_enhancers_ensembl.csv',
 'train_demo_coding_vs_intergenomic_seqs.csv',
 'test_demo_coding_vs_intergenomic_seqs.csv',
 'train_human_enhancers_ensembl.csv',
 'train_human_enhancers_cohn.csv',
 'train_human_nontata_promoters.csv',
 'test_human_nontata_promoters.csv',
 'test_human_enhancers_cohn.csv']

In [24]:
# Drop the dataset that is not embedded in this run. Selecting by name rather than by
# list position keeps the cell idempotent (re-running it no longer drops a further file)
# and independent of the arbitrary ordering returned by os.listdir.
# NOTE(review): presumably excluded because it was already embedded in an earlier run - confirm.
EXCLUDED_CSV_FILES = {'test_human_enhancers_ensembl.csv'}
csv_files = [name for name in csv_files if name not in EXCLUDED_CSV_FILES]
csv_files

['train_demo_coding_vs_intergenomic_seqs.csv',
 'test_demo_coding_vs_intergenomic_seqs.csv',
 'train_human_enhancers_ensembl.csv',
 'train_human_enhancers_cohn.csv',
 'train_human_nontata_promoters.csv',
 'test_human_nontata_promoters.csv',
 'test_human_enhancers_cohn.csv']

In [25]:
# Embedding function
def calculate_mean_embedding(sequence):
    """Return the mean of the model's last-layer hidden states for `sequence`.

    Uses the notebook-level `tokenizer` and `model`.

    Parameters
    ----------
    sequence : str or list of str
        A single sequence, or a batch of sequences. In a batch, shorter
        sequences are padded; padded positions are excluded from the mean.

    Returns
    -------
    numpy.ndarray
        The averaged hidden-state vector with singleton dimensions squeezed
        out: one vector for a single sequence, one row per sequence for a
        batch of several.

    Notes
    -----
    Every position the tokenizer marks as attended contributes to the mean,
    which includes any special tokens the tokenizer adds.
    """
    # Truncate explicitly to what the position embeddings can address. Relying on
    # truncation=True alone silently does nothing when the tokenizer has no
    # configured maximum length, and over-long inputs then fail inside the model.
    max_tokens = min(tokenizer.model_max_length, model.config.max_position_embeddings)
    inputs = tokenizer(
        sequence,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_tokens,
    )
    # Keep the inputs on the same device as the model so the call also works off-CPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        hidden_states = model(**inputs, output_hidden_states=True).hidden_states[-1]

    # Attention-mask weighted mean: identical to a plain mean when nothing is padded
    # (single sequence), and correct when a padded batch is passed.
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        attention_mask = torch.ones(hidden_states.shape[:2], device=hidden_states.device)
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden_states * weights).sum(dim=1) / token_counts

    embedding_mean = pooled.squeeze().cpu().numpy()
    return embedding_mean

In [26]:
# Process each file: embed every sequence in its 'seq' column, save the embeddings as
# a .npy file in the output folder, and log how long the embedding step took.
tqdm.pandas()  # registers Series.progress_apply; once is enough, so it is done outside the loop
time_log_path = os.path.join(output_folder_path, "grover_time_calculation.csv")

for csv_file in csv_files:
    file_path = os.path.join(datasets_folder, csv_file)
    df = pd.read_csv(file_path)
    print(df.shape)

    # perf_counter is monotonic, so the elapsed time is unaffected by system clock changes.
    start_time = time.perf_counter()
    df['mean_embedding'] = df['seq'].progress_apply(calculate_mean_embedding)
    time_taken = time.perf_counter() - start_time

    # Stack the per-sequence vectors into one 2-D numeric array before saving. Saving
    # the Series values directly writes an object array, which np.load can only read
    # back with allow_pickle=True.
    embeddings = df['mean_embedding'].tolist()
    if embeddings:
        embedding_matrix = np.stack(embeddings)
    else:
        # An empty CSV still yields a well-formed (0, hidden_size) file.
        embedding_matrix = np.empty((0, model.config.hidden_size), dtype=np.float32)

    embedding_file_path = os.path.join(output_folder_path, os.path.splitext(csv_file)[0] + "_GROVER_Embeddings.npy")
    np.save(embedding_file_path, embedding_matrix)
    print(f"Embeddings saved to: {embedding_file_path}")

    # Replace any record left by an earlier run of this cell instead of duplicating it,
    # and write the log immediately so a failure on a later file does not lose the timings.
    time_records[:] = [record for record in time_records if record['file_name'] != csv_file]
    time_records.append({'file_name': csv_file, 'time_taken': time_taken})
    pd.DataFrame(time_records).to_csv(time_log_path, index=False)

# Save time log (also covers the case where no file was processed)
time_df = pd.DataFrame(time_records)
time_df.to_csv(time_log_path, index=False)

(75000, 2)


100%|██████████| 75000/75000 [2:37:31<00:00,  7.94it/s]  


Embeddings saved to: /.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/grover/train_demo_coding_vs_intergenomic_seqs_GROVER_Embeddings.npy
(25000, 2)


100%|██████████| 25000/25000 [52:39<00:00,  7.91it/s]


Embeddings saved to: /.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/grover/test_demo_coding_vs_intergenomic_seqs_GROVER_Embeddings.npy
(123872, 2)


100%|██████████| 123872/123872 [4:58:59<00:00,  6.90it/s]  


Embeddings saved to: /.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/grover/train_human_enhancers_ensembl_GROVER_Embeddings.npy
(20843, 2)


100%|██████████| 20843/20843 [1:15:30<00:00,  4.60it/s]


Embeddings saved to: /.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/grover/train_human_enhancers_cohn_GROVER_Embeddings.npy
(27097, 2)


100%|██████████| 27097/27097 [1:06:14<00:00,  6.82it/s]


Embeddings saved to: /.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/grover/train_human_nontata_promoters_GROVER_Embeddings.npy
(9034, 2)


100%|██████████| 9034/9034 [22:02<00:00,  6.83it/s]


Embeddings saved to: /.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/grover/test_human_nontata_promoters_GROVER_Embeddings.npy
(6948, 2)


100%|██████████| 6948/6948 [24:51<00:00,  4.66it/s]

Embeddings saved to: /.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/grover/test_human_enhancers_cohn_GROVER_Embeddings.npy



