In [None]:
#!pip3 install transformers -U 

In [1]:
from transformers import AutoTokenizer, AutoModel
import os
import torch
from Bio import SeqIO
import math
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import time
from tqdm import tqdm

In [2]:
tokenizer = AutoTokenizer.from_pretrained("RaphaelMourad/Mistral-DNA-v0.1", trust_remote_code=True) # Same as DNABERT2
model = AutoModel.from_pretrained("RaphaelMourad/Mistral-DNA-v0.1", trust_remote_code=True)

model.safetensors:   0%|          | 0.00/420M [00:00<?, ?B/s]

In [3]:
import os
os.getcwd()

'/.../.../.../Unsupervised_Embedding_Evaluation'

In [4]:
os.chdir('/.../.../.../Unsupervised_Embedding_Evaluation/datasets/')

In [8]:
datasets_folder = '/.../.../.../Unsupervised_Embedding_Evaluation/datasets/'
csv_files = [file for file in os.listdir(datasets_folder) if file.endswith('.csv')]

In [9]:
csv_files

['test_human_enhancers_ensembl.csv',
 'train_human_ensembl_regulatory.csv',
 'test_human_ensembl_regulatory.csv',
 'train_demo_coding_vs_intergenomic_seqs.csv',
 'test_demo_coding_vs_intergenomic_seqs.csv',
 'train_human_enhancers_ensembl.csv',
 'train_human_ocr_ensembl.csv',
 'test_human_ocr_ensembl.csv',
 'train_human_enhancers_cohn.csv',
 'train_human_nontata_promoters.csv',
 'test_human_nontata_promoters.csv',
 'test_human_enhancers_cohn.csv']

In [10]:
from tqdm import tqdm

# Define a function to calculate mean embedding
def calculate_mean_embedding(sequence):
    inputs = tokenizer(sequence, return_tensors='pt')["input_ids"]
    with torch.no_grad():
        hidden_states = model(inputs)[0]  # [1, sequence_length, 768]

    # Embedding with mean pooling
    embedding_mean = torch.mean(hidden_states, dim=1).squeeze()  # Calculate mean along the sequence length
    # Convert torch tensor to numpy array
    embedding_mean_np = embedding_mean.detach().numpy()

    return embedding_mean_np


In [2]:
# Folder path to save the embeddings and the time calculation file
output_folder_path = "/.../.../.../Unsupervised_Embedding_Evaluation/Embeddings/mistral"
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# List to store the time taken for each file
time_records = []

# Apply the calculate_mean_embedding function to each CSV file
for csv_file in csv_files:
    file_path = os.path.join(datasets_folder, csv_file)
    df = pd.read_csv(file_path)
    print(df.shape)
    # Measure the time taken to calculate the mean embeddings
    start_time = time.time()
    tqdm.pandas()
    df['mean_embedding'] = df['seq'].progress_apply(calculate_mean_embedding)
    end_time = time.time()
    time_taken = end_time - start_time

    # Save the time taken for the current file
    time_records.append({'file_name': csv_file, 'time_taken': time_taken})

    # Save the 'mean_embedding' column as a numpy array
    embedding_file_path = os.path.join(output_folder_path, os.path.splitext(csv_file)[0] + "_Mistral_Embeddings.npy")
    np.save(embedding_file_path, df['mean_embedding'].values)

    print(f"Embeddings saved successfully at: {embedding_file_path}")

# Save the time calculation records to a CSV file
time_df = pd.DataFrame(time_records)
time_csv_path = os.path.join(output_folder_path, "mistral_time_calculation.csv")
time_df.to_csv(time_csv_path, index=False)

print(f"Time calculations saved successfully at: {time_csv_path}")
