In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from transformers import EsmModel, EsmTokenizer
import torch

from sklearn.random_projection import SparseRandomProjection
from datetime import datetime
import time
from sklearn.decomposition import PCA

In [None]:
# !pip install scikit-learn
# !pip install transformers
# !pip install torch

In [3]:
# Load dataset
data = pd.read_csv('./cleaned_dataset.csv')

In [4]:
# ----------------- Step 1: Feature Extraction -----------------
# Protein Embeddings using ESM
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D")

def get_protein_embedding(uniprot_id):
    """Compute embeddings for protein sequences using ESM model."""
    inputs = tokenizer(uniprot_id, return_tensors="pt", add_special_tokens=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over sequence tokens
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

# Generate embeddings for proteins
protein_embeddings = {}
for uniprot_id in data['UniProt_ID'].unique():
    try:
        protein_embeddings[uniprot_id] = get_protein_embedding(uniprot_id)
    except Exception as e:
        print(f"Error for {uniprot_id}: {e}")

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# ----------------- Step 2: Placeholder Chemical Features -----------------
# Generate random embeddings for chemicals
def generate_random_projections(cids, embedding_dim=256):
    """Generate random embeddings using SparseRandomProjection."""
    random_projector = SparseRandomProjection(n_components=embedding_dim, random_state=42)
    cid_indices = {cid: idx for idx, cid in enumerate(cids)}
    random_matrix = np.random.rand(len(cids), embedding_dim)
    random_embeddings = random_projector.fit_transform(random_matrix)
    return {cid: random_embeddings[cid_indices[cid]] for cid in cids}

# Filter out rows where kiba_score is NaN
valid_data = data.dropna(subset=['kiba_score'])
unique_cids = valid_data['pubchem_cid'].dropna().unique()

chemical_embeddings = generate_random_projections(unique_cids)

# Combine protein and chemical features
features = []
targets = []

for _, row in valid_data.iterrows():
    if row['kiba_score'] is not None and row['pubchem_cid'] in chemical_embeddings:
        protein = protein_embeddings.get(row['UniProt_ID'], None)
        chemical = chemical_embeddings.get(row['pubchem_cid'], None)
        if protein is not None and chemical is not None:
            combined = np.concatenate((protein, chemical))
            features.append(combined)
            targets.append(row['kiba_score'])

In [None]:
X = np.array(features)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Apply variance thresholding
selector = VarianceThreshold(threshold=0.01)  # Remove features with variance below 0.01
features_reduced = selector.fit_transform(X)

X = features_reduced


In [8]:
y = np.array(targets)

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Get the current date and time
start_time = time.time()

# Train Random Forest Regressor
model = RandomForestRegressor(n_estimators=20, max_depth=20, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Evaluate performance
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

end_time = time.time()

print(f"Validation MSE: {mse}")
print(f"Validation R^2: {r2}")

Validation MSE: 706066523546.5475
Validation R^2: -0.48390028153030284


In [None]:
execution_time = (end_time - start_time) / 60
print(f"Execution time: {execution_time:.2f} minutes")

In [None]:
from joblib import dump

# Save model to a file
dump(model, "random_forest_model.joblib")
print("Model saved as 'random_forest_model.joblib'")

In [None]:
import lightgbm as lgb

lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_model.fit(X_train, y_train)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
from joblib import Parallel, delayed

# Define a progress bar wrapper
def fit_with_progress(estimator, X, y):
    n_estimators = estimator.n_estimators
    with tqdm(total=n_estimators) as pbar:
        def update(*args):
            pbar.update(1)

        Parallel(n_jobs=-1, backend='threading')(
            delayed(estimator._fit_single_estimator)(
                estimator, X, y, random_state, update
            )
            for random_state in estimator.random_state
        )
    return estimator

# Example
model = RandomForestRegressor(n_estimators=100, random_state=42)
fit_with_progress(model, X_train, y_train)


In [None]:
# Predict Missing Scores
# Prepare rows with missing kiba_score
missing_features = []

for _, row in data[data['kiba_score'].isnull()].iterrows():
    protein = protein_embeddings.get(row['UniProt_ID'], None)
    chemical = chemical_embeddings.get(row['pubchem_cid'], None)
    if protein is not None and chemical is not None:
        combined = np.concatenate((protein, chemical))
        missing_features.append(combined)

# Predict missing scores
missing_scores = model.predict(np.array(missing_features))
print("Missing kiba_score values predicted!")

# Save predicted results
data.loc[data['kiba_score'].isnull(), 'kiba_score'] = missing_scores
data.to_csv("dataset_with_estimated_kiba_scores.csv", index=False)

print("Dataset with estimated kiba_score values saved!")