In [2]:
import pandas as pd
import random
from itertools import product
import numpy as np

In [3]:
# Step 1: Load Dataset
data_path = './cleaned_dataset.csv' 
data = pd.read_csv(data_path)

In [4]:
all_proteins = data['UniProt_ID'].unique()
all_chemicals = data['pubchem_cid'].unique()

In [5]:
# Set the number of synthetic pairs to generate (adjust as needed)
num_negatives = len(data)  # Same as the number of positive pairs for balance

# Generate synthetic negative pairs
np.random.seed(42)  # For reproducibility
synthetic_proteins = np.random.choice(all_proteins, num_negatives, replace=True)
synthetic_chemicals = np.random.choice(all_chemicals, num_negatives, replace=True)

# Combine into a DataFrame
negative_pairs = pd.DataFrame({
    'UniProt_ID': synthetic_proteins,
    'pubchem_cid': synthetic_chemicals,
    'kiba_score': 0,  # Assign 0 as the binding score for negatives
    'kiba_score_estimated': True,  # Indicate these are synthetic
    'binding': 0  # Set binding to 0 for negatives
})

In [6]:
# Ensure No Overlap 
positive_pairs = data[['UniProt_ID', 'pubchem_cid']].drop_duplicates()
negative_pairs = negative_pairs.merge(positive_pairs, on=['UniProt_ID', 'pubchem_cid'], how='left', indicator=True)
negative_pairs = negative_pairs[negative_pairs['_merge'] == 'left_only'].drop(columns=['_merge'])

In [7]:
# Add Binding Column to Positive Samples
positive_samples = data.copy()
positive_samples['binding'] = 1

In [8]:
# Combine Positives and Negatives
augmented_data = pd.concat([positive_samples, negative_pairs], ignore_index=True)

In [9]:
# Shuffle the dataset
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
print(len(data))
print(len(data)*2)
print(len(augmented_data))

1095027
2190054
2189620


In [11]:
# Save to CSV
output_path = 'augmented_dataset_with_estimates.csv'
augmented_data.to_csv(output_path, index=False)

print(f"Augmented dataset saved with {len(augmented_data)} samples at '{output_path}'.")

Augmented dataset saved with 2189620 samples at 'augmented_dataset_with_estimates.csv'.


In [12]:
print(augmented_data.head(10))

  UniProt_ID  pubchem_cid  kiba_score  kiba_score_estimated  binding
0     O95835     53357987       0.000                  True        0
1     Q9Y5Z0    122197438    4360.000                  True        1
2     O14649     89734200     625.000                  True        1
3     Q14831      9838340       0.000                  True        0
4     P03956      9956692   10100.000                  True        1
5     O75907     57397281      23.800                  True        1
6     Q8IYD1     90058690       0.000                  True        0
7     P58295     44286547       0.000                  True        0
8     C7C422    135311268       0.407                  True        1
9     P09874       690181      89.300                  True        1
