In [13]:
import os
# Set CUDA_VISIBLE_DEVICES to "2" for this kernel process.
# NOTE(review): presumably this limits CUDA-aware libraries to the GPU with index 2
# and has to run before any such library is imported — confirm on the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [14]:
import csv
import pandas as pd
import json
from datetime import datetime
from translator_deep_purpose import predict_dti

# Run-level configuration: a single timestamp, captured once, tags this run's outputs.
current_date_time = datetime.now()

# Compact, sortable stamp (YYYYMMDD_HHMMSS) rendered via the datetime format protocol.
formatted_date_time = format(current_date_time, "%Y%m%d_%H%M%S")

# Timestamped JSON path for prediction results.
# NOTE(review): not referenced by any later cell visible in this notebook.
results_path = f"data/prediction_results_{formatted_date_time}.json"

# Sample size for drugs and targets; -1 means "use all of them".
# NOTE(review): not referenced by any later cell visible in this notebook.
sample_count = -1

In [15]:
# Read the preprocessed drug and target tables from the local data folder.
drugs_df = pd.read_csv('./data/Processed_Drugs_SMILES.csv')
targets_df = pd.read_csv('./data/Processed_Targets_Seq.csv')

# Reduce each table to a two-column array so that iterating it yields
# (identifier, representation) rows: (drugID, smiles) and (targetID, sequence).
# NOTE(review): `sample_count` from the configuration cell is not applied here,
# so every row of both tables is always used.
drugs_pairs = drugs_df.loc[:, ['drugID', 'smiles']].values
targets_pairs = targets_df.loc[:, ['targetID', 'sequence']].values

In [16]:
import csv

# Names of the pre-trained models handed to predict_dti. Only uncommented
# entries are run; the commented ones are kept as a menu of alternatives.
models = [
    # BindingDB
    "MPNN_CNN_BindingDB",
    # "CNN_CNN_BindingDB",
    # "Morgan_CNN_BindingDB",
    # "Transformer_CNN_BindingDB",
    # "Daylight_AAC_BindingDB",
    # "Morgan_AAC_BindingDB",
    # # BindingDB IC50
    # "CNN_CNN_BindingDB_IC50",
    # "Morgan_CNN_BindingDB_IC50",
    # "Morgan_AAC_BindingDB_IC50",
    # "MPNN_CNN_BindingDB_IC50",
    # "Daylight_AAC_BindingDB_IC50",
    # # Model pre-trained on DAVIS
    # "MPNN_CNN_DAVIS",
    # "CNN_CNN_DAVIS",
    # "Morgan_CNN_DAVIS",
    # "Daylight_AAC_DAVIS",
    # "Morgan_AAC_DAVIS",
    # # Model pre-trained on KIBA
    # "MPNN_CNN_KIBA",
    # "Daylight_AAC_KIBA",
    # "Morgan_AAC_KIBA",
    # "Morgan_CNN_KIBA", # Error 'list' object has no attribute 'float'
]

# One record per successfully scored (model, drug, target) triple.
results = []
# Number of pairs for which predict_dti raised; reported at the end.
failed_count = 0
time_start = datetime.now()

# Single timestamped CSV that receives every prediction of this run.
csv_filename = f"data/combined_results_{formatted_date_time}.csv"

# Rows are streamed to the file while predicting (instead of after all loops
# finish) so that an interrupted or crashed run keeps the scores computed so far.
with open(csv_filename, "w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Model", "DrugID", "TargetID", "PredictedBindingScore"])

    # Score every drug against every target with every selected model.
    for model in models:
        print("")
        print(f"                MODEL {model}")

        for drug_id, drug_smile in drugs_pairs:
            print(f"       MODEL {model} - DRUG {drug_id}")

            for target_id, target_sequence in targets_pairs:
                try:
                    res = predict_dti(drug_smile, target_sequence, model)
                    score = res['score']
                except Exception as e:
                    # Best effort: report the failing pair and keep going.
                    failed_count += 1
                    print(f"⚠️ Error predicting for [{model}] {drug_id} - {target_id} = {e}")
                    continue

                results.append({
                    "model": model,
                    "drug": drug_id,
                    "target": target_id,
                    "score": score,
                })
                csv_writer.writerow([model, drug_id, target_id, score])

            # Push the finished drug's rows out of the Python buffer.
            csv_file.flush()

# Report the real (timestamped) output path rather than a fixed file name.
print(f"Results saved in {csv_filename} ({len(results)} rows, {failed_count} failed pairs).")
print(f"⏱️ Total runtime: {datetime.now() - time_start}")


                MODEL MPNN_CNN_BindingDB
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00091
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00093
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00104
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00114
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00115
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00116
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00117
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00118
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00119
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00120
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00121
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00122
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00123
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00125
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00126
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00127
       MODEL MPNN_CNN_BindingDB - DRUG drugbank:DB00128
      

In [None]:
from translator_deep_purpose import predict_dti, utils
import csv
# sklearn.metrics does not export `concordance_index_score` (importing it raises
# ImportError), so the concordance index is implemented locally below.
from sklearn.metrics import roc_auc_score, average_precision_score


def concordance_index_score(y_true, y_score):
    """Return the concordance index of `y_score` with respect to `y_true`.

    Over all sample pairs whose true values differ, this is the fraction whose
    predicted scores are ordered the same way as the true values; pairs with
    tied predictions count as one half.

    Args:
        y_true: iterable of numeric ground-truth values.
        y_score: iterable of numeric predictions, same length as `y_true`.

    Returns:
        The concordance index as a float, or NaN when no pair of samples has
        differing true values.

    Raises:
        ValueError: if the two inputs differ in length.
    """
    y_true = list(y_true)
    y_score = list(y_score)
    if len(y_true) != len(y_score):
        raise ValueError("y_true and y_score must have the same length")

    concordant = 0.0
    comparable = 0
    # Plain pairwise scan: quadratic in the number of labelled pairs.
    for i in range(len(y_true)):
        for j in range(i + 1, len(y_true)):
            true_diff = y_true[i] - y_true[j]
            if true_diff == 0:
                continue
            comparable += 1
            score_diff = y_score[i] - y_score[j]
            if score_diff == 0:
                concordant += 0.5
            elif (true_diff > 0) == (score_diff > 0):
                concordant += 1.0
    return concordant / comparable if comparable else float("nan")


def _metric_or_nan(metric, labels, scores, metric_name, model_name):
    """Return metric(labels, scores); on ValueError report it and return NaN."""
    try:
        return metric(labels, scores)
    except ValueError as e:
        print(f"⚠️ {metric_name} not computable for [{model_name}]: {e}")
        return float("nan")


def _mean_of_finite(values):
    """Average the non-NaN entries of `values`; NaN when there are none."""
    finite = [v for v in values if v == v]
    return sum(finite) / len(finite) if finite else float("nan")


# ... Your previous code ...

# Ground truth has to be supplied before this cell runs: `y_true` must be a
# mapping {(drug_id, target_id): true_label}. Pairs missing from the mapping are
# still predicted and recorded, but are left out of the metrics.
try:
    true_labels = dict(y_true)
except NameError:
    raise NameError(
        "Define `y_true` as a mapping {(drug_id, target_id): true_label} before running this cell."
    ) from None

# One entry per model (ranking metrics need a whole vector of predictions, so
# they are computed once per model rather than once per drug-target pair).
cindex_scores = []
auroc_scores = []
auprc_scores = []

# Get predictions for all models / all drugs / all targets in a results object
for model in models:
    print("")
    print(f"                MODEL {model}")

    model_labels = []
    model_scores = []
    model_records = []

    for drug_id, drug_smile in drugs_pairs:
        print(f"       MODEL {model} - DRUG {drug_id}")

        for target_id, target_sequence in targets_pairs:
            try:
                res = predict_dti(drug_smile, target_sequence, model)
                score = res['score']
                # assumes res['score'] is a scalar that float() accepts — TODO confirm
                numeric_score = float(score)
            except Exception as e:
                print(f"⚠️ Error predicting for [{model}] {drug_id} - {target_id} = {e}")
                continue

            pair = (drug_id, target_id)
            if pair in true_labels:
                model_labels.append(true_labels[pair])
                model_scores.append(numeric_score)

            model_records.append({
                "model": model,
                "drug": drug_id,
                "target": target_id,
                "score": score,
            })

    # Model-level metrics over every labelled pair that was scored.
    if model_labels:
        cindex = _metric_or_nan(concordance_index_score, model_labels, model_scores, "CI", model)
        # NOTE(review): AUROC / AUPRC presumably need binary labels with both
        # classes present; any ValueError from sklearn is reported and stored as NaN.
        auroc = _metric_or_nan(roc_auc_score, model_labels, model_scores, "AUROC", model)
        auprc = _metric_or_nan(average_precision_score, model_labels, model_scores, "AUPRC", model)
    else:
        print(f"⚠️ No labelled pairs scored for [{model}]; metrics set to NaN")
        cindex = auroc = auprc = float("nan")

    cindex_scores.append(cindex)
    auroc_scores.append(auroc)
    auprc_scores.append(auprc)

    # Keep the original record layout: each row carries its model's metrics.
    for record in model_records:
        record.update({"cindex": cindex, "auroc": auroc, "auprc": auprc})
    results.extend(model_records)

# ... Your code to save results ...

# Print evaluation metrics
print("Concordance Index (CI) scores:", cindex_scores)
print("AUROC scores:", auroc_scores)
print("AUPRC scores:", auprc_scores)

# Print average scores (NaN entries are ignored; NaN when nothing is computable)
print("Average CI:", _mean_of_finite(cindex_scores))
print("Average AUROC:", _mean_of_finite(auroc_scores))
print("Average AUPRC:", _mean_of_finite(auprc_scores))

In [None]:
import csv
from datetime import datetime
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score

# Names of the pre-trained models handed to predict_dti. Only uncommented
# entries are run; the commented ones are kept as a menu of alternatives.
models = [
    # BindingDB
    "MPNN_CNN_BindingDB",
    # "CNN_CNN_BindingDB",
    # "Morgan_CNN_BindingDB",
    # "Transformer_CNN_BindingDB",
    # "Daylight_AAC_BindingDB",
    # "Morgan_AAC_BindingDB",
    # # BindingDB IC50
    # "CNN_CNN_BindingDB_IC50",
    # "Morgan_CNN_BindingDB_IC50",
    # "Morgan_AAC_BindingDB_IC50",
    # "MPNN_CNN_BindingDB_IC50",
    # "Daylight_AAC_BindingDB_IC50",
    # # Model pre-trained on DAVIS
    # "MPNN_CNN_DAVIS",
    # "CNN_CNN_DAVIS",
    # "Morgan_CNN_DAVIS",
    # "Daylight_AAC_DAVIS",
    # "Morgan_AAC_DAVIS",
    # # Model pre-trained on KIBA
    # "MPNN_CNN_KIBA",
    # "Daylight_AAC_KIBA",
    # "Morgan_AAC_KIBA",
    # "Morgan_CNN_KIBA", # Error 'list' object has no attribute 'float'
]

# One record per successfully scored (model, drug, target) triple.
results = []
time_start = datetime.now()

# The output name is fixed up front (run start time) because rows are streamed
# to the file during prediction, so an interrupted run keeps its partial results.
formatted_date_time = time_start.strftime("%Y%m%d_%H%M%S")
csv_filename = f"data/combined_results_{formatted_date_time}.csv"

with open(csv_filename, "w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Model", "DrugID", "TargetID", "PredictedBindingScore"])

    # Get predictions for all models / all drugs / all targets
    for model in models:
        print("")
        print(f"                MODEL {model}")

        for drug_id, drug_smile in drugs_pairs:
            print(f"       MODEL {model} - DRUG {drug_id}")

            for target_id, target_sequence in targets_pairs:
                try:
                    res = predict_dti(drug_smile, target_sequence, model)
                    score = res['score']
                except Exception as e:
                    # Best effort: report the failing pair and keep going.
                    print(f"⚠️ Error predicting for [{model}] {drug_id} - {target_id} = {e}")
                    continue

                results.append({
                    "model": model,
                    "drug": drug_id,
                    "target": target_id,
                    "score": score,
                })
                csv_writer.writerow([model, drug_id, target_id, score])

            # Push the finished drug's rows out of the Python buffer.
            csv_file.flush()

# Report the real (timestamped) output path rather than a fixed file name.
print(f"Results saved in {csv_filename}.")
print(f"⏱️ Total runtime: {datetime.now() - time_start}")

# Load the CSV file with results
df = pd.read_csv(csv_filename)

# Evaluate model performance. The CSV columns are the header names written
# above ("Model", "PredictedBindingScore"), not the keys of the `results` dicts.
evaluation_results = []
label_column = 'true_label'

if label_column not in df.columns:
    # The file written above carries no ground truth, so there is nothing to
    # score against; say so explicitly instead of failing with a KeyError.
    print(
        f"⚠️ Skipping evaluation: {csv_filename} has no '{label_column}' column. "
        "Add the true labels to the results before evaluating."
    )
else:
    for model in models:
        model_results = df[df['Model'] == model]
        y_true = model_results[label_column]
        y_pred = model_results['PredictedBindingScore']

        # Calculate metrics
        auroc = roc_auc_score(y_true, y_pred)
        # NOTE(review): rounding assumes the score is a probability-like value in
        # [0, 1] and that labels are 0/1 — confirm for the selected model.
        f1 = f1_score(y_true, y_pred.round())

        evaluation_results.append({
            'Model': model,
            'AUROC': auroc,
            'F1 Score': f1
        })

    # Create a DataFrame from the evaluation results and save to CSV
    evaluation_df = pd.DataFrame(evaluation_results)
    evaluation_csv_filename = f"data/model_evaluation_{formatted_date_time}.csv"
    evaluation_df.to_csv(evaluation_csv_filename, index=False)

    print(f"Model evaluation results saved in {evaluation_csv_filename}.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Per-drug summary of the predicted binding scores in the combined T2D results.
csv_filename = "./data/combined_resultsT2D_20230815_143317.csv"
df = pd.read_csv(csv_filename)

# One row per DrugID, one column per summary statistic.
summary_stats = ['mean', 'median', 'min', 'max']
scores_by_drug = df.groupby('DrugID')['PredictedBindingScore']
grouped = scores_by_drug.agg(summary_stats)

# Persist the table (DrugID is kept as the index column) and echo it.
statistics_filename = "./data/statistics_resultsT2D.csv"
grouped.to_csv(statistics_filename)

print(grouped)

In [None]:
import pandas as pd

# For every drug: report its lowest- and highest-scoring target, then draw the
# distribution of its predicted binding scores.
csv_filename = "./data/combined_resultsT2D_20230815_143317.csv"
df = pd.read_csv(csv_filename)

score_column = 'PredictedBindingScore'
grouped = df.groupby('DrugID')

# DrugID -> full row (a Series) holding that drug's extreme score.
min_pairs = {}
max_pairs = {}
for drug_id, group_data in grouped:
    drug_scores = group_data[score_column]
    min_pairs[drug_id] = group_data.loc[drug_scores.idxmin()]
    max_pairs[drug_id] = group_data.loc[drug_scores.idxmax()]

# Report in DrugID order.
sorted_drug_ids = sorted(min_pairs.keys())

print("Min Pairs:")
for drug_id in sorted_drug_ids:
    pair = min_pairs[drug_id]
    print(f"DrugID: {drug_id}, TargetID: {pair['TargetID']}, Min Score: {pair['PredictedBindingScore']}")

print("\nMax Pairs:")
for drug_id in sorted_drug_ids:
    pair = max_pairs[drug_id]
    print(f"DrugID: {drug_id}, TargetID: {pair['TargetID']}, Max Score: {pair['PredictedBindingScore']}")

# One histogram per drug.
# NOTE(review): `plt` is not imported in this cell; it is taken from an earlier
# cell's `import matplotlib.pyplot as plt`.
for drug_id, group_data in grouped:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(group_data[score_column], bins=20, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of PredictedBindingScore for DrugID: {drug_id}')
    ax.set_xlabel('PredictedBindingScore')
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

In [None]:
import pandas as pd

# Load the CSV file
csv_filename = "./data/combined_resultsT2D_20230815_143317.csv"
df = pd.read_csv(csv_filename)

# Drop the 'Model' column
df = df.drop(columns=['Model'])

# Order rows by drug, and within each drug from the highest predicted score to
# the lowest. Rows without a DrugID are left out of the output.
df_sorted = (
    df.dropna(subset=['DrugID'])
      .sort_values(by=['DrugID', 'PredictedBindingScore'], ascending=[True, False])
)

# Save the results to a new CSV file. The pandas CSV writer is used instead of
# hand-built lines so that values containing commas or quotes are escaped
# properly; missing values are written as "nan".
# NOTE(review): later cells read 'knownTarget' and 'Ranking' columns from this
# same path, which are not written here — re-running this cell overwrites any
# file that contains them.
output_filename = "./data/sorted_resultsT2D.csv"
df_sorted[['DrugID', 'TargetID', 'PredictedBindingScore']].to_csv(
    output_filename, index=False, na_rep="nan"
)

print("Sorted results saved in sorted_resultsT2D.csv.")

In [None]:
import pandas as pd

# Load the combined_results CSV file
csv_filename = "./data/combined_resultsT2D_20230815_143317.csv"
df = pd.read_csv(csv_filename)

# Rows without a score cannot be a minimum or maximum and do not contribute to
# the statistics, so they are set aside first.
scored = df.dropna(subset=['PredictedBindingScore'])
scores_by_drug = scored.groupby('DrugID')['PredictedBindingScore']

# Group by DrugID and calculate distribution statistics (one row per drug)
grouped = scores_by_drug.agg(['mean', 'median', 'min', 'max'])

# Find the corresponding target for min and max scores. idxmin / idxmax return
# one row label per drug, in the same drug order as `grouped`, so the result
# has exactly one row per drug and no duplicated min/max columns (merging the
# per-drug statistics with per-pair min/max tables produced min_x/min_y/
# max_x/max_y columns and multiplied rows whenever scores were tied).
grouped['minTarget'] = scored.loc[scores_by_drug.idxmin(), 'TargetID'].to_numpy()
grouped['maxTarget'] = scored.loc[scores_by_drug.idxmax(), 'TargetID'].to_numpy()

# Turn DrugID back into a column and keep four decimal places
result_df = grouped.reset_index().round(4)

# Print all drugs with statistics
print(result_df)

# Write the result to a CSV file
result_filename = "./data/statistics_results_formatted.csv"
result_df.to_csv(result_filename, index=False)

print("Formatted results saved to:", result_filename)

In [None]:
# calc total # of DTI pairs in the test set, total # of known DTI binding pairs, and the percentage
import pandas as pd

# Read the ranked drug-target table.
csv_filename = "./data/sorted_resultsT2D.csv"
df = pd.read_csv(csv_filename)

# A pair counts as present when its TargetID cell is non-null, and as a known
# binding pair when its knownTarget cell is non-null.
target_column = df['TargetID']
known_column = df['knownTarget']

total_DTI_pairs = target_column.count()
print("Total # of DTI pairs:", total_DTI_pairs)

total_known_binding_pairs = known_column.count()
print("Total # of known binding pairs:", total_known_binding_pairs)

# Share of known binding pairs among all pairs, as a percentage.
known_fraction = total_known_binding_pairs / total_DTI_pairs
percentage_known_binding_pairs = known_fraction * 100
print("Percentage of known binding pairs / total DTI pairs (%):", percentage_known_binding_pairs)

In [None]:
# Freq of ranking (known DTI pairs)
import pandas as pd
import matplotlib.pyplot as plt

# Read the ranked table and keep only rows whose 'knownTarget' cell is filled in.
csv_filename = "./data/sorted_resultsT2D.csv"
df = pd.read_csv(csv_filename).dropna(subset=['knownTarget'])

# Number of known pairs found at each ranking position.
ranking_frequency = df.groupby('Ranking').size()

# Bar chart of those counts.
ax = ranking_frequency.plot(kind='bar')
ax.set_xlabel('Ranking')
ax.set_ylabel('Frequency')
ax.set_title('Frequency of Ranking')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the CSV file
csv_filename = "./data/sorted_resultsT2D.csv"
df = pd.read_csv(csv_filename)

# Remove rows with empty cells in 'knownTarget' column
df = df.dropna(subset=['knownTarget'])

# Define the bins for aggregation. With right-closed intervals the edges
# [0, 11, 22, 34] give (0, 11], (11, 22], (22, 34], i.e. rankings 1-11, 12-22
# and 23-34, which is what the labels say. (Left-closed intervals put rank 11
# in the second bucket, rank 22 in the third, and dropped rank 34.)
bins = [0, 11, 22, 34]
labels = ['[1,11]', '[12,22]', '[23,34]']

# Add a new column 'RankingBucket' based on the bins
df['RankingBucket'] = pd.cut(df['Ranking'], bins=bins, labels=labels, right=True)

# Frequency of each ranking bucket; observed=False keeps empty buckets on the axis
ranking_bucket_frequency = df.groupby('RankingBucket', observed=False).size()

# Calculate the total frequency for percentage calculation
total_freq = ranking_bucket_frequency.sum()

# Plot the frequency of ranking buckets
ax = ranking_bucket_frequency.plot(kind='bar', figsize=(8, 6))
plt.xlabel('Ranking')
plt.ylabel('Frequency')
plt.title('Frequency of Ranking Buckets')

# Make the y-axis integer
ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))

# Add the count and percentage on top of each bar
for bar in ax.patches:
    # Bar heights come back as floats; show them as whole counts.
    count = int(bar.get_height())
    # Guard the percentage when there are no known pairs at all.
    percentage = (count / total_freq) * 100 if total_freq else 0.0
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{count}\n({percentage:.1f}%)',
             ha='center', va='bottom')

plt.show()

In [None]:
# freq of ranking by drug
import pandas as pd
import matplotlib.pyplot as plt

# Read the ranked table and keep only rows whose 'knownTarget' cell is filled in.
csv_filename = "./data/sorted_resultsT2D.csv"
df = pd.read_csv(csv_filename).dropna(subset=['knownTarget'])

# Drugs as rows, ranking positions as columns; each cell counts the non-null
# predicted scores for that combination (0 where there are none).
drug_ranking_frequency = df.pivot_table(
    index='DrugID',
    columns='Ranking',
    values='PredictedBindingScore',
    aggfunc='count',
    fill_value=0,
)

# Stacked bars: one bar per drug, one segment per ranking position.
ax = drug_ranking_frequency.plot(kind='bar', stacked=True)
ax.set_xlabel('Drug')
ax.set_ylabel('Frequency')
ax.set_title('Frequency of Ranking by Drug')

# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Ranking', loc='upper left', bbox_to_anchor=(1, 1))

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the CSV file
csv_filename = "./data/sorted_resultsT2D.csv"
df = pd.read_csv(csv_filename)

# Remove rows with empty cells in the 'knownTarget' column
df = df.dropna(subset=['knownTarget'])

# Define the ranking buckets: rankings 1-11, 12-22 and 23-34. The labels now
# state the ranges that are actually used (the second and third labels used to
# read [12,23] and [24,34] while the code grouped 12-22 and 23-34).
bucket_edges = [0, 11, 22, 34]
bucket_labels = ['1-One: [1,11]', '2-Two: [12,22]', '3-Three: [23,34]']

# Map each ranking to a bucket using right-closed intervals (0, 11], (11, 22],
# (22, 34]. A ranking outside 1-34 or a missing ranking gets no bucket and is
# left out of the counts, instead of aborting the cell.
df['RankingBucket'] = pd.cut(df['Ranking'], bins=bucket_edges, labels=bucket_labels, right=True)

# Group by DrugID and RankingBucket, and count the frequencies
# (observed=False keeps every bucket as a column, even when it is empty)
drug_bucket_frequency = (
    df.groupby(['DrugID', 'RankingBucket'], observed=False).size().unstack(fill_value=0)
)

# Plot the frequency of ranking buckets by drug
ax = drug_bucket_frequency.plot(kind='bar', stacked=True)
plt.xlabel('Drug')
plt.ylabel('Frequency')
plt.title('Frequency of Ranking Buckets for Known DTI Bindings')

# Move the legend to the right side
plt.legend(title='Ranking Buckets', loc='upper left', bbox_to_anchor=(1, 1))

plt.show()

In [None]:
# Wilcoxon rank-sum (Mann-Whitney U) test
# Purpose: for comparing two independent groups (known DTI binding pairs vs. unknown binding pairs) when the sample sizes are imbalanced and we're interested in assessing whether the distributions of rankings differ significantly
# comparing the rankings of known binding pairs (top x rankings) against the rankings of the unknown binding pairs (the test will provide insight into whether the known binding pairs tend to have different rankings than the unknown binding pairs within each drug)
# H0: the distribution of rankings for known binding pairs is NOT significantly different from the distribution of rankings for unknown binding pairs within each drug
# H1: the distribution of rankings for known binding pairs is significantly different from the distribution of rankings for unknown binding pairs within each drug
# Interpretation: if the Wilcoxon rank-sum test yields a significant p-value for a particular drug, it indicates that the distribution of rankings for known binding pairs is significantly different from the distribution of rankings for unknown binding pairs within that drug. This suggests that the known binding pairs tend to have different rankings.

# Results: across all drugs, the distributions of rankings are NOT significantly different...

import pandas as pd
from scipy.stats import ranksums

# Load the dataset
data = pd.read_csv("./data/sorted_resultsT2D.csv")

# Get unique DrugIDs
unique_drugs = data['DrugID'].unique()

# Significance level
alpha = 0.05

# Loop through unique drugs and perform Wilcoxon rank-sum test
for drug in unique_drugs:
    drug_data = data[data['DrugID'] == drug]

    # Split this drug's rankings by whether 'knownTarget' is filled in;
    # missing rankings are dropped so they cannot turn the p-value into NaN.
    is_known = drug_data['knownTarget'].notnull()
    known_ranking = drug_data.loc[is_known, 'Ranking'].dropna()
    unknown_ranking = drug_data.loc[~is_known, 'Ranking'].dropna()

    print(f"Drug: {drug}")

    # The test needs observations in both groups. Without this guard an empty
    # group yields a NaN p-value, which the comparison below would report as
    # "not significantly different".
    if known_ranking.empty or unknown_ranking.empty:
        print("Skipped: the test needs at least one known and one unknown pair with a ranking.")
        print("-" * 40)
        continue

    statistic, p_value = ranksums(known_ranking, unknown_ranking)

    print(f"Statistic: {statistic}")
    print(f"P-value: {p_value}")

    if p_value < alpha:
        print("The distributions of rankings are significantly different.")
    else:
        print("The distributions of rankings are not significantly different.")

    print("-" * 40)

In [None]:
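# Wilcoxon rank-sum (Mann-Whitney U) test on the predicted scores (the code below calls scipy.stats.ranksums; the known and unknown groups are independent, so this is the rank-sum test, not the paired signed-rank test)
# H0: there is NO significant difference between the predicted scores for known binding pairs and the predicted scores for unknown binding pairs within each drug
# H1: the predicted scores for known binding pairs are significantly higher than the predicted scores for unknown binding pairs within each drug (NOTE: ranksums as called below is two-sided, so a significant result indicates a difference in either direction)
# Interpretation: if the Wilcoxon rank-sum test yields a significant p-value for a particular drug, it indicates that the predicted scores for known binding pairs differ significantly from the predicted scores for unknown binding pairs within that drug; when the known pairs are the higher-scoring group, this suggests that the model's performance is better in terms of ranking known binding pairs higher.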

# Results: only Repaglinide shows a significant difference between the distributions of predicted scores for known and unknown binding pairs (p-value: 0.02810804014715179);
# all the other drugs have no significant difference in the distributions of predicted scores between known and unknown binding pairs

import pandas as pd
from scipy.stats import ranksums

# Load the dataset
data = pd.read_csv("./data/sorted_resultsT2D.csv")

# Get unique DrugIDs
unique_drugs = data['DrugID'].unique()

# Significance level
alpha = 0.05

# Loop through unique drugs and perform Wilcoxon rank-sum test
for drug in unique_drugs:
    drug_data = data[data['DrugID'] == drug]

    # Split this drug's predicted scores by whether 'knownTarget' is filled in;
    # missing scores are dropped so they cannot turn the p-value into NaN.
    is_known = drug_data['knownTarget'].notnull()
    known_scores = drug_data.loc[is_known, 'PredictedBindingScore'].dropna()
    unknown_scores = drug_data.loc[~is_known, 'PredictedBindingScore'].dropna()

    print(f"Drug: {drug}")

    # The test needs observations in both groups. Without this guard an empty
    # group yields a NaN p-value, which the comparison below would report as
    # "no significant difference".
    if known_scores.empty or unknown_scores.empty:
        print("Skipped: the test needs at least one known and one unknown pair with a score.")
        print("-" * 40)
        continue

    statistic, p_value = ranksums(known_scores, unknown_scores)

    print(f"Statistic: {statistic}")
    print(f"P-value: {p_value}")

    if p_value < alpha:
        print("The distributions of predicted scores for known and unknown binding pairs are significantly different.")
    else:
        print("There is no significant difference in the distributions of predicted scores between known and unknown binding pairs.")

    print("-" * 40)