### Check GPU hardware

In [None]:
!nvidia-smi

### Save hardware configuration

In [None]:
# Create the summary file
!echo "Hardware Summary" > hardware_summary.txt

# Fetch and write CPU Information
!echo "\nCPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!lscpu | egrep 'Model name|Socket|Thread|CPU\(s\)' >> hardware_summary.txt

# Fetch and write Total RAM Information
!echo "\nTotal RAM Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!free -h | grep Mem | awk '{print $2}' >> hardware_summary.txt


# Fetch and write GPU Information
!echo "\nGPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt

# If you have a Nvidia GPU
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv >> hardware_summary.txt

# Alternatively for other GPUs
# !lspci | grep VGA >> hardware_summary.txt


### Install D-SCRIPT

In [4]:
!pip install dscript

Collecting dscript
  Downloading dscript-0.2.8-py3-none-any.whl.metadata (474 bytes)
Collecting seaborn (from dscript)
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading dscript-0.2.8-py3-none-any.whl (38 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn, dscript
Successfully installed dscript-0.2.8 seaborn-0.13.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Download trained models

In [5]:
# Download Human D-SCRIPT model 
!wget http://cb.csail.mit.edu/cb/dscript/data/models/human_v1.sav

--2025-02-12 16:38:50--  http://cb.csail.mit.edu/cb/dscript/data/models/human_v1.sav
Resolving cb.csail.mit.edu (cb.csail.mit.edu)... 128.52.131.233
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.52.131.233|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cb.csail.mit.edu/cb/dscript/data/models/human_v1.sav [following]
--2025-02-12 16:38:54--  https://cb.csail.mit.edu/cb/dscript/data/models/human_v1.sav
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.52.131.233|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://cb.csail.mit.edu/dscript/data/models/human_v1.sav [following]
--2025-02-12 16:38:55--  http://cb.csail.mit.edu/dscript/data/models/human_v1.sav
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.52.131.233|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cb.csail.mit.edu/dscript/data/models/human_v1.sav [following]
--202

### Download sequence and interaction files for test datasets

In [6]:
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta


--2025-02-12 16:39:10--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8003::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 812468 (793K) [text/plain]
Saving to: ‘ecoli_test.tsv’


2025-02-12 16:39:12 (3.47 MB/s) - ‘ecoli_test.tsv’ saved [812468/812468]

--2025-02-12 16:39:12--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8002::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5413511 (5.2M) [text/plain]
Saving to: ‘eco

### Generate embeddings

In [16]:
!dscript embed --seqs TEST_seqs.fasta -o TEST_emb.h5 -d 0

[2025-02-12-21:34:40] # Using CPU
[2025-02-12-21:34:40] # Loading Model...
  state_dict = torch.load(state_dict_path)
[2025-02-12-21:34:40] # Loading Sequences...
100%|███████████████████████████████████| 6879/6879 [00:00<00:00, 181564.63it/s]
[2025-02-12-21:34:41] # 6879 Sequences Loaded
[2025-02-12-21:34:41] # Approximate Storage Required (varies by average sequence length): ~55.032000000000004GB
[2025-02-12-21:34:41] # Storing to TEST_emb.h5...
100%|█████████████████████████████████████| 6879/6879 [1:15:01<00:00,  1.53it/s]


### Evaluate on test dataset

In [1]:
# Evaluate with Human D-SCRIPT model 
!dscript evaluate --model human_v1.sav --test ecoli_test.tsv --embedding ecoli.h5 --outfile ecoli_dscript


[2025-02-13-00:39:54] Using CPU
  model = torch.load(model_path, map_location=torch.device("cpu")).cpu()
  0%|                                                  | 0/7138 [00:00<?, ?it/s]
multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/insybio/.pyenv/versions/3.10.12/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/insybio/.pyenv/versions/3.10.12/lib/python3.10/site-packages/dscript/utils.py", line 53, in _hdf5_load_partial_func
    with h5py.File(file_path, "r") as fi:
  File "/home/insybio/.local/lib/python3.10/site-packages/h5py/_hl/files.py", line 561, in __init__
    fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
  File "/home/insybio/.local/lib/python3.10/site-packages/h5py/_hl/files.py", line 235, in make_fid
    fid = h5f.open(name, flags, fapl=fapl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", lin

In [1]:
import os
import pandas as pd
import subprocess
import time
import gc

# Define file paths
input_file = 'TEST_ppis_r.tsv'
model = 'human_v1.sav'
embedding = 'TEST_emb.h5'
output_file = 'TEST_dscript.predictions.tsv'

# Create a subfolder for chunk files and predictions
output_dir = 'TEST_chunks_and_predictions_2'
os.makedirs(output_dir, exist_ok=True)

# Load the dataset
df = pd.read_csv(input_file, sep='\t')

# Define chunk size (adjust based on memory)
chunk_size = 100  # Modify as needed
num_chunks = len(df) // chunk_size + (1 if len(df) % chunk_size != 0 else 0)

# Prepare the final output file
final_output_path = os.path.join(output_dir, output_file)

# Iterate through each chunk, process it, and merge results
for i in range(num_chunks):
    # Create chunk of data
    chunk = df.iloc[i * chunk_size: (i + 1) * chunk_size]
    chunk_file = os.path.join(output_dir, f"TEST_test_chunk_{i+1}.tsv")

    # Save the chunk to a file
    chunk.to_csv(chunk_file, sep='\t', index=False, header=False)

    # Define prediction file (it will be created by D-SCRIPT)
    predictions_file = os.path.join(output_dir, f"TEST_test_chunk_{i+1}.predictions.tsv")

    # Run the dscript evaluate command on the chunk
    command = [
        'dscript', 'evaluate',
        '--model', model,
        '--test', chunk_file,
        '--embedding', embedding,
        '--outfile', predictions_file  # Let dscript handle the extension
    ]

    # Execute the command
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Check if the predictions file was created
    if os.path.exists(predictions_file):
        # Append predictions to the final output file
        with open(predictions_file, 'r') as pred_file, open(final_output_path, 'a') as final_file:
            final_file.writelines(pred_file.readlines())

        # Remove processed chunk file (not predictions)
        os.remove(chunk_file)
        print(f"Processed and merged chunk {i+1}.")
    else:
        print(f"Error: Predictions file for chunk {i+1} was not created. stderr: {result.stderr.decode()}")

    # Manually clear memory
    del chunk
    gc.collect()  # Force garbage collection

    # Pause to prevent memory overload
    time.sleep(5)

print(f"Final predictions saved to {final_output_path}")







  model = torch.load(model_path
100%|██████████| 184/184 [00:13<00:00, 13.47it/s]
Predicting pairs:  79%|███████▉  | 79/100 [00:16<00:06,  3.24it/s]9606.ENSP00000416097 x 9606.ENSP00000355533 - The size of tensor a (4967) must match the size of tensor b (2000) at non-singleton dimension 3
  model = torch.load(model_path
100%|██████████| 182/182 [00:11<00:00, 16.00it/s]
Predicting pairs: 100%|██████████| 100/100 [00:27<00:00,  3.68it/s]606.ENSP00000296755 x 9606.ENSP00000435466 - The size of tensor a (2468) must match the size of tensor b (2000) at non-singleton dimension 2

  model = torch.load(model_path
100%|██████████| 180/180 [00:09<00:00, 19.02it/s]
  return F.conv2d(
Predicting pairs: 100%|██████████| 100/100 [00:28<00:00,  3.45it/s]606.ENSP00000353362 x 9606.ENSP00000324463 - The size of tensor a (2146) must match the size of tensor b (2000) at non-singleton dimension 3

  model = torch.load(model_path
100%|██████████| 182/182 [00:07<00:00, 25.20it/s]
  return F.conv2d(
Predicti

In [3]:
import os
import glob

# Define paths
input_dir = 'TEST_chunks_and_predictions_2'  # Folder where chunk predictions are stored
output_file = 'TEST_dscript_merged.predictions.tsv'  # Final merged file

# Get all prediction files (sorted to maintain order)
prediction_files = sorted(glob.glob(os.path.join(input_dir, '*.predictions.tsv.predictions.tsv')))

# Merge all prediction files
with open(output_file, 'w') as outfile:
    for file in prediction_files:
        with open(file, 'r') as infile:
            outfile.writelines(infile.readlines())

print(f"Final merged predictions saved to {output_file}")


Final merged predictions saved to TEST_dscript_merged.predictions.tsv


### Read prediction file

In [4]:
import pandas as pd
df = pd.read_csv('TEST_dscript_merged.predictions.tsv', sep='\t', header=None)
print(df)
df = df.drop_duplicates(subset=[0, 1])
print(df)


                          0                     1  2         3
0      9606.ENSP00000361032  9606.ENSP00000301761  0  0.004176
1      9606.ENSP00000305810  9606.ENSP00000394699  0  0.004176
2      9606.ENSP00000364475  9606.ENSP00000362584  0  0.004185
3      9606.ENSP00000365176  9606.ENSP00000353854  0  0.004175
4      9606.ENSP00000267884  9606.ENSP00000478677  0  0.004239
...                     ...                   ... ..       ...
34973  9606.ENSP00000261489  9606.ENSP00000256151  1  0.004177
34974  9606.ENSP00000376024  9606.ENSP00000345680  1  0.004248
34975  9606.ENSP00000341844  9606.ENSP00000318914  1  0.004176
34976  9606.ENSP00000252542  9606.ENSP00000252542  1  0.809190
34977  9606.ENSP00000193322  9606.ENSP00000372193  1  0.004187

[34978 rows x 4 columns]
                          0                     1  2         3
0      9606.ENSP00000361032  9606.ENSP00000301761  0  0.004176
1      9606.ENSP00000305810  9606.ENSP00000394699  0  0.004176
2      9606.ENSP00000364475  

### Compute performance metrics

In [7]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, roc_auc_score, confusion_matrix, average_precision_score, matthews_corrcoef

import numpy as np

# Extract prediction and true labels
y_true = df.iloc[:, 2]
y_prob = df.iloc[:, 3]
y_pred = np.round(y_prob)

# Accuracy
acc = accuracy_score(y_true, y_pred) * 100  # Multiply by 100 to convert to percentage

# Precision
prec = precision_score(y_true, y_pred) * 100

# Recall
rec = recall_score(y_true, y_pred) * 100

# Specificity
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
spec = (tn / (tn + fp)) * 100

# MCC score
mcc = matthews_corrcoef(y_true, y_pred)

# F1 score
f1 = f1_score(y_true, y_pred) * 100

# F2 score (using beta=2 to prioritize recall)
f2 = fbeta_score(y_true, y_pred, beta=2) * 100

# AUC-ROC
auroc = roc_auc_score(y_true, y_prob) * 100

# AUPRC
auprc = average_precision_score(y_true, y_prob) * 100

# Print metrics with 2 decimal places
print(f'Accuracy: {acc:.2f}%, Precision: {prec:.2f}%, Recall: {rec:.2f}%, Specificity: {spec:.2f}%, MCC: {mcc:.2f}, F1-Score: {f1:.2f}%, F2-Score: {f2:.2f}%, AUC-ROC: {auroc:.2f}%, AUPRC: {auprc:.2f}%')




Accuracy: 65.88%, Precision: 66.31%, Recall: 6.61%, Specificity: 98.17%, MCC: 0.12, F1-Score: 12.02%, F2-Score: 8.06%, AUC-ROC: 53.70%, AUPRC: 43.23%
