In [1]:
import pandas as pd
import pickle
from numpy import random
import os

In [2]:
# Location of the input CSV that the whole notebook works from
f = "/nas/longleaf/home/oem/Kuhlman-Rotation/ThermoMPNN/ssm_preproc.csv"

# Main dataframe; later cells merge cluster ids onto it and split it
df = pd.read_csv(filepath_or_buffer=f)

In [3]:

# Put binder sequences into fasta for clustering
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord  # public home of SeqRecord (SeqIO only re-exports it)

# One record per parent: keep the first row seen for each ssm_parent instead of
# looping over every row of the dataframe and skipping the repeats.
first_rows = df.drop_duplicates(subset='ssm_parent', keep='first')

# Maps ssm_parent name -> SeqRecord, in order of first appearance in df
records = {}
for name, binder_seq in zip(first_rows['ssm_parent'], first_rows['binder_seq']):
    # 'X' characters in the binder sequence are written out as '-'
    records[name] = SeqRecord(Seq(binder_seq.replace('X', '-')), id=name, name=name, description='')

In [4]:
# Write every collected parent record to a single FASTA file
fasta_records = list(records.values())
with open("../dataset_splits/ssm_fasta.fasta", 'w') as fasta_out:
    SeqIO.write(fasta_records, fasta_out, 'fasta')

# NOTE(review): the example command below still points at the FireProt files,
# not at the FASTA written above -- presumably an equivalent command was run on
# ssm_fasta.fasta to produce the cluster table read in the next cell; confirm.
# !mmseqs2 easy-cluster ../data/fireprot/fireprot_proteins.fasta ../data/fireprot/fireprot_proteins ../data/fireprot/tmp --min-seq-id 0.25

In [5]:
# Cluster table: tab-separated, no header row; the two columns are named
# 'cluster' and 'member' here.
clust = pd.read_csv(
    "/nas/longleaf/home/oem/Kuhlman-Rotation/ssm_dataset/fasta-clusters/_cluster.tsv",
    sep="\t",
    header=None,
    names=['cluster', 'member'],
)


In [6]:
# Distinct cluster identifiers, and how many rows of the cluster table fall in each
cs = clust['cluster'].unique()
vcs = clust['cluster'].value_counts()

In [7]:
# Merge the cluster_id onto the main dataframe
#
# Guarded so that re-running this cell is a no-op. Without the guard a second
# run would merge again, producing suffixed duplicates (cluster_x / cluster_y,
# member_x / member_y) so that `df.cluster` no longer exists, and would also
# overwrite ssm_all.csv with the already-merged frame.
if 'cluster' not in df.columns:
    # Export of the full dataset, written before the cluster columns are attached
    df.to_csv("/nas/longleaf/home/oem/Kuhlman-Rotation/ssm_dataset/splits/ssm_all.csv")

    # Left join keeps every row of df; parents absent from the cluster table get NaN
    df = df.merge(clust, left_on='ssm_parent', right_on='member', how='left')
df.shape

(326131, 26)

In [8]:
# Generate splits (80/10/10) at the cluster level: whole clusters are assigned
# to train / test / validation, never individual rows.

# Seed the global NumPy generator (kept for any other code relying on it)
random.seed(0)

# The sampling below uses its own explicitly seeded generator, passed through
# `random_state`, so the source of randomness is visible at each call and does
# not depend on the implicit global state.
split_rng = random.RandomState(0)

# Get a column of unique clusters
clusters = pd.DataFrame({'cluster': df.cluster.unique()})

# Make the training dataset: 80% of the clusters
train_clust = clusters.sample(frac=0.80, random_state=split_rng)

# Get the clusters that did not end up in the training set.
# `sample` keeps the row labels of `clusters`, so dropping those labels is an
# exact anti-join and preserves the original row order.
not_in_train = clusters.drop(index=train_clust.index)

# Sample half of the remaining clusters for the test set
test_clust = not_in_train.sample(frac=0.5, random_state=split_rng)

# Whatever is left over is the validation set
val_clust = not_in_train.drop(index=test_clust.index)

# Every cluster must have landed in exactly one of the three groups
assert len(train_clust) + len(test_clust) + len(val_clust) == len(clusters)

In [9]:
# Attach the actual data rows to each group of clusters
train_df = pd.merge(train_clust, df, on='cluster', how='inner')
test_df = pd.merge(test_clust, df, on='cluster', how='inner')
val_df = pd.merge(val_clust, df, on='cluster', how='inner')

# Dataset Sizes
for label, frame in (
    ("Training", train_df),
    ("Test", test_df),
    ("Validation", val_df),
):
    print(f"{label} Dataset Size:\t{frame.shape}")
print()

# The three splits together should account for every row of df;
# this prints True when they do.
n_split_rows = len(train_df) + len(test_df) + len(val_df)
print(n_split_rows == len(df))

Training Dataset Size:	(272757, 26)
Test Dataset Size:	(21266, 26)
Validation Dataset Size:	(32108, 26)

True


In [10]:
# Distinct ssm_parent names present in each split
train_names, test_names, val_names = (
    split_df['ssm_parent'].unique() for split_df in (train_df, test_df, val_df)
)

In [11]:
# Parent names of each split, keyed by split name
splits = dict(train=train_names, val=val_names, test=test_names)

# Serialize the split assignment to disk
splits_path = "/nas/longleaf/home/oem/Kuhlman-Rotation/ssm_dataset/splits/ssm_splits.pkl"
with open(splits_path, "wb") as pkl_out:
    pickle.dump(splits, pkl_out)

In [12]:
# Write each split's rows to its own CSV (train, then test, then validation)
for frame, csv_path in (
    (train_df, "/nas/longleaf/home/oem/Kuhlman-Rotation/ssm_dataset/splits/ssm_train.csv"),
    (test_df, "/nas/longleaf/home/oem/Kuhlman-Rotation/ssm_dataset/splits/ssm_test.csv"),
    (val_df, "/nas/longleaf/home/oem/Kuhlman-Rotation/ssm_dataset/splits/ssm_val.csv"),
):
    frame.to_csv(csv_path)

In [13]:
# Per-split containers of binder names; only the entry for the split selected
# below gets filled in by this cell.
split_binder_names = dict(train=[], val=[], test=[], all=[])

wt_binder_seqs = dict()
mut_rows = dict()

# Split to work on: one of the keys of `splits`, or 'all' for their union
split = 'test'

if split != 'all':
    split_binder_names[split] = splits[split]
else:
    # Concatenate the names of every split into one flat list
    all_names = [name for names in splits.values() for name in names]
    split_binder_names[split] = all_names


In [14]:
# Names recorded for the currently selected split
wt_binder_names = split_binder_names[split]

In [15]:
# Rows of df whose sketch_kd value equals False
test = df.loc[df['sketch_kd'].eq(False)]

In [16]:
# (rows, columns) of the sketch_kd == False subset
test.shape


(319528, 26)

In [17]:
# (rows, columns) of the full dataframe, for comparison with the subset above
df.shape

(326131, 26)

In [18]:
# Number of rows excluded by the filter, i.e. rows whose sketch_kd is not False
len(df) - len(test)

6603