### Check GPU hardware

In [None]:
!nvidia-smi

### Save hardware configuration

In [2]:
# Create the summary file
!echo "Hardware Summary" > hardware_summary.txt

# Fetch and write CPU Information
!echo "\nCPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!lscpu | egrep 'Model name|Socket|Thread|CPU\(s\)' >> hardware_summary.txt

# Fetch and write Total RAM Information
!echo "\nTotal RAM Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!free -h | grep Mem | awk '{print $2}' >> hardware_summary.txt


# Fetch and write GPU Information
!echo "\nGPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt

# If you have a Nvidia GPU
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv >> hardware_summary.txt

# Alternatively for other GPUs
# !lspci | grep VGA >> hardware_summary.txt


### Install D-SCRIPT

In [1]:
!pip install dscript


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
# Set OMP_NUM_THREADS to "1", limiting OpenMP to single-threaded operation
os.environ["OMP_NUM_THREADS"] = "1"

### Download trained models

In [3]:
# Download Human Topsy-Turvy model
!wget http://cb.csail.mit.edu/cb/dscript/data/models/topsy_turvy_v1.sav

--2025-02-13 11:14:41--  http://cb.csail.mit.edu/cb/dscript/data/models/topsy_turvy_v1.sav
Resolving cb.csail.mit.edu (cb.csail.mit.edu)... 128.52.131.233
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.52.131.233|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cb.csail.mit.edu/cb/dscript/data/models/topsy_turvy_v1.sav [following]
--2025-02-13 11:14:45--  https://cb.csail.mit.edu/cb/dscript/data/models/topsy_turvy_v1.sav
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.52.131.233|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://cb.csail.mit.edu/dscript/data/models/topsy_turvy_v1.sav [following]
--2025-02-13 11:14:46--  http://cb.csail.mit.edu/dscript/data/models/topsy_turvy_v1.sav
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.52.131.233|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cb.csail.mit.edu/dscript/data/models/

### Download sequence and interaction files for test datasets

In [4]:
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta


--2025-02-13 11:14:54--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8000::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 812468 (793K) [text/plain]
Saving to: ‘ecoli_test.tsv’


2025-02-13 11:14:55 (4.15 MB/s) - ‘ecoli_test.tsv’ saved [812468/812468]

--2025-02-13 11:14:55--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8003::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5413511 (5.2M) [text/plain]
Saving to: ‘eco

### Generate embeddings

In [None]:
!dscript embed --seqs ecoli.fasta -o ecoli.h5 -d 0

[2025-02-13-11:15:24] # Using CPU
[2025-02-13-11:15:24] # Loading Model...
  state_dict = torch.load(state_dict_path)
[2025-02-13-11:15:25] # Loading Sequences...
100%|█████████████████████████████████| 17722/17722 [00:00<00:00, 297802.31it/s]
[2025-02-13-11:15:26] # 17722 Sequences Loaded
[2025-02-13-11:15:26] # Approximate Storage Required (varies by average sequence length): ~141.776GB
[2025-02-13-11:15:26] # Storing to ecoli.h5...
 15%|█████▌                              | 2721/17722 [16:21<2:10:27,  1.92it/s]

### Evaluate on test dataset

In [None]:
# Evaluate with Human Topsy_turvy model 
!dscript evaluate --model topsy_turvy_v1.sav --test ecoli_test.tsv --embedding ecoli.h5 --outfile ecoli_topsy_turvy -d 0

In [2]:
import os
import pandas as pd
import subprocess
import time
import gc

# Define file paths
input_file = 'TEST_ppis.tsv'
model = 'topsy_turvy_v1.sav'
embedding = 'TEST_emb.h5'
output_file = 'TEST_dscript.predictions.tsv'

# Create a subfolder for chunk files and predictions
output_dir = 'TEST_chunks_and_predictions_TT'
os.makedirs(output_dir, exist_ok=True)

# Load the dataset
df = pd.read_csv(input_file, sep='\t')

# Define chunk size (adjust based on memory)
chunk_size = 100  # Modify as needed
num_chunks = len(df) // chunk_size + (1 if len(df) % chunk_size != 0 else 0)

# Prepare the final output file
final_output_path = os.path.join(output_dir, output_file)

# Iterate through each chunk, process it, and merge results
for i in range(num_chunks):
    # Create chunk of data
    chunk = df.iloc[i * chunk_size: (i + 1) * chunk_size]
    chunk_file = os.path.join(output_dir, f"TEST_test_chunk_{i+1}.tsv")

    # Save the chunk to a file
    chunk.to_csv(chunk_file, sep='\t', index=False, header=False)

    # Define prediction file (it will be created by D-SCRIPT)
    predictions_file = os.path.join(output_dir, f"TEST_test_chunk_{i+1}.predictions.tsv")

    # Run the dscript evaluate command on the chunk
    command = [
        'dscript', 'evaluate',
        '--model', model,
        '--test', chunk_file,
        '--embedding', embedding,
        '--outfile', predictions_file  # Let dscript handle the extension
    ]

    # Execute the command
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Check if the predictions file was created
    if os.path.exists(predictions_file):
        # Append predictions to the final output file
        with open(predictions_file, 'r') as pred_file, open(final_output_path, 'a') as final_file:
            final_file.writelines(pred_file.readlines())

        # Remove processed chunk file (not predictions)
        os.remove(chunk_file)
        print(f"Processed and merged chunk {i+1}.")
    else:
        print(f"Error: Predictions file for chunk {i+1} was not created. stderr: {result.stderr.decode()}")

    # Manually clear memory
    del chunk
    gc.collect()  # Force garbage collection

    # Pause to prevent memory overload
    time.sleep(5)

print(f"Final predictions saved to {final_output_path}")

  model = torch.load(model_path
100%|██████████| 182/182 [00:05<00:00, 31.29it/s]
Predicting pairs: 100%|██████████| 100/100 [00:22<00:00,  4.41it/s]606.ENSP00000347184 x 9606.ENSP00000321684 - The size of tensor a (3142) must match the size of tensor b (2000) at non-singleton dimension 2

  model = torch.load(model_path
100%|██████████| 190/190 [00:07<00:00, 24.05it/s]
Predicting pairs: 100%|██████████| 100/100 [00:19<00:00,  5.17it/s]606.ENSP00000369129 x 9606.ENSP00000374407 - The size of tensor a (2871) must match the size of tensor b (2000) at non-singleton dimension 2

  model = torch.load(model_path
100%|██████████| 188/188 [00:10<00:00, 17.92it/s]
Predicting pairs: 100%|██████████| 100/100 [00:23<00:00,  4.17it/s]606.ENSP00000459615 x 9606.ENSP00000348498 - The size of tensor a (2839) must match the size of tensor b (2000) at non-singleton dimension 33

  model = torch.load(model_path
100%|██████████| 186/186 [00:09<00:00, 19.10it/s]
Predicting pairs: 100%|██████████| 100/100 [

In [3]:
import os
import glob

# Define paths
input_dir = 'TEST_chunks_and_predictions_TT'  # Folder where chunk predictions are stored
output_file = 'TEST_TT_merged.predictions.tsv'  # Final merged file

# Get all prediction files (sorted to maintain order)
prediction_files = sorted(glob.glob(os.path.join(input_dir, '*.predictions.tsv.predictions.tsv')))

# Merge all prediction files
with open(output_file, 'w') as outfile:
    for file in prediction_files:
        with open(file, 'r') as infile:
            outfile.writelines(infile.readlines())

print(f"Final merged predictions saved to {output_file}")

Final merged predictions saved to TEST_TT_merged.predictions.tsv


### Read prediction file

In [5]:
import pandas as pd
df = pd.read_csv('TEST_TT_merged.predictions.tsv', sep='\t', header=None)
df.columns = ['id_1', 'id_2', 'true_label', 'predicted_label']
print(df)


                       id_1                  id_2  true_label  predicted_label
0      9606.ENSP00000317955  9606.ENSP00000263923           1         0.129160
1      9606.ENSP00000297265  9606.ENSP00000343742           1         0.111380
2      9606.ENSP00000353944  9606.ENSP00000304161           1         0.321380
3      9606.ENSP00000300283  9606.ENSP00000464030           1         0.078945
4      9606.ENSP00000357292  9606.ENSP00000310935           1         0.379990
...                     ...                   ...         ...              ...
34357  9606.ENSP00000404848  9606.ENSP00000342675           1         0.023157
34358  9606.ENSP00000332592  9606.ENSP00000385852           1         0.489470
34359  9606.ENSP00000294623  9606.ENSP00000322016           1         0.698080
34360  9606.ENSP00000220592  9606.ENSP00000355279           1         0.628780
34361  9606.ENSP00000320503  9606.ENSP00000362720           1         0.159400

[34362 rows x 4 columns]


### Compute performance metrics

In [6]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, roc_auc_score, confusion_matrix, average_precision_score, matthews_corrcoef

import numpy as np

# Extract prediction and true labels
y_true = df.iloc[:, 2]
y_prob = df.iloc[:, 3]
y_pred = np.round(y_prob)

# Accuracy
acc = accuracy_score(y_true, y_pred) * 100  # Multiply by 100 to convert to percentage

# Precision
prec = precision_score(y_true, y_pred) * 100

# Recall
rec = recall_score(y_true, y_pred) * 100

# Specificity
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
spec = (tn / (tn + fp)) * 100

# MCC score
mcc = matthews_corrcoef(y_true, y_pred)

# F1 score
f1 = f1_score(y_true, y_pred) * 100

# F2 score (using beta=2 to prioritize recall)
f2 = fbeta_score(y_true, y_pred, beta=2) * 100

# AUC-ROC
auroc = roc_auc_score(y_true, y_prob) * 100

# AUPRC
auprc = average_precision_score(y_true, y_prob) * 100

# Print metrics with 2 decimal places
print(f'Accuracy: {acc:.2f}%, Precision: {prec:.2f}%, Recall: {rec:.2f}%, Specificity: {spec:.2f}%, MCC: {mcc:.2f}, F1-Score: {f1:.2f}%, F2-Score: {f2:.2f}%, AUC-ROC: {auroc:.2f}%, AUPRC: {auprc:.2f}%')




Accuracy: 67.97%, Precision: 55.74%, Recall: 29.48%, Specificity: 87.89%, MCC: 0.21, F1-Score: 38.57%, F2-Score: 32.55%, AUC-ROC: 60.94%, AUPRC: 45.32%
