In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
import numpy as np
from transformers import EsmModel, EsmTokenizer
from sklearn.random_projection import SparseRandomProjection
from sklearn.preprocessing import StandardScaler

In [None]:
# Load Dataset: read the CSV at `data_path` into a pandas DataFrame.
data_path = './cleaned_dataset.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# ----------------- Step 1: Feature Extraction -----------------
# Protein embeddings: the tokenizer and the encoder are both loaded from the
# same pretrained ESM checkpoint.
esm_checkpoint = "facebook/esm2_t6_8M_UR50D"
tokenizer = EsmTokenizer.from_pretrained(esm_checkpoint)
model = EsmModel.from_pretrained(esm_checkpoint)

def get_protein_embedding(uniprot_id):
    """Return a mean-pooled ESM embedding for the given string.

    The input is tokenized (special tokens included), passed through the
    module-level ESM ``model`` with gradient tracking disabled, and the final
    hidden states are averaged over dimension 1 before conversion to NumPy.

    NOTE(review): the tokenizer is given ``uniprot_id`` itself, so the
    characters of the identifier are what gets embedded. If an amino-acid
    sequence was intended, it has to be looked up first -- confirm with the
    dataset owner.

    Args:
        uniprot_id: String handed to the tokenizer.

    Returns:
        NumPy array holding the pooled hidden state with singleton
        dimensions squeezed out.
    """
    encoded = tokenizer(uniprot_id, return_tensors="pt", add_special_tokens=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Average over the token axis; no mask is applied, so the special tokens
    # added by the tokenizer contribute to the mean as well.
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Build a lookup from each distinct UniProt_ID to its embedding.
# Best effort: an ID whose embedding raises is reported and left out of the
# dictionary rather than aborting the whole run.
protein_embeddings = {}
unique_uniprot_ids = data['UniProt_ID'].unique()
for uniprot_id in unique_uniprot_ids:
    try:
        embedding = get_protein_embedding(uniprot_id)
    except Exception as e:
        print(f"Error for {uniprot_id}: {e}")
    else:
        protein_embeddings[uniprot_id] = embedding

In [None]:
# ----------------- Step 2: Placeholder Chemical Features -----------------
# Generate random embeddings for chemicals
def generate_random_projections(cids, embedding_dim=256, seed=42):
    """Generate reproducible placeholder embeddings for chemical identifiers.

    A uniformly random matrix with one row per identifier is drawn and passed
    through a ``SparseRandomProjection``; each identifier is then mapped to
    its row of the result. The vectors are placeholders derived purely from
    random numbers, not from any property of the chemical.

    Both the random matrix and the projector are driven by ``seed``, so
    repeated calls with the same arguments return the same embeddings.
    (Previously the matrix came from the unseeded global NumPy RNG, which
    made every run produce different features.)

    Args:
        cids: Sized iterable (list or array) of hashable identifiers.
        embedding_dim: Number of columns in the random matrix and number of
            components of the projection.
        seed: Seed used for the random matrix and for the projector.

    Returns:
        dict mapping each identifier to its projected row. Empty when
        ``cids`` is empty. If an identifier occurs more than once, the row
        of its last occurrence is used.
    """
    n_cids = len(cids)
    if n_cids == 0:
        # Nothing to embed; fitting the projector on zero samples would fail.
        return {}

    rng = np.random.RandomState(seed)
    random_matrix = rng.rand(n_cids, embedding_dim)

    random_projector = SparseRandomProjection(n_components=embedding_dim, random_state=seed)
    random_embeddings = random_projector.fit_transform(random_matrix)

    # Later duplicates overwrite earlier ones, matching a last-index lookup.
    return {cid: random_embeddings[idx] for idx, cid in enumerate(cids)}

# Keep only the rows that have a kiba_score, then collect the distinct
# non-missing pubchem_cid values found among them.
has_kiba_score = data['kiba_score'].notna()
valid_data = data[has_kiba_score]
unique_cids = valid_data['pubchem_cid'].dropna().unique()

# One placeholder embedding per distinct chemical identifier.
chemical_embeddings = generate_random_projections(unique_cids)

In [None]:
# Combine protein and chemical features into one vector per dataset row,
# collecting the matching kiba_score as the regression target.
features = []
targets = []

for _, row in data.iterrows():
    kiba_score = row['kiba_score']
    # pandas represents a missing score as NaN, which is never `None`; an
    # `is not None` test would let NaN targets through, so test with pd.isna.
    if pd.isna(kiba_score):
        continue

    # A row is usable only when both of its embeddings are available: the
    # protein lookup misses IDs whose embedding failed, and the chemical
    # lookup misses CIDs that are absent from chemical_embeddings.
    protein = protein_embeddings.get(row['UniProt_ID'])
    chemical = chemical_embeddings.get(row['pubchem_cid'])
    if protein is None or chemical is None:
        continue

    combined = np.concatenate((protein, chemical))
    features.append(combined)
    targets.append(kiba_score)

In [None]:
# Write the collected feature vectors and targets to a single archive, stored
# under the array names 'features' and 'targets'.
np.savez(
    'processed_dataset_w_negatives.npz',
    features=features,
    targets=targets,
)