Prepares PDB files of analyzed sequences for SPACE2 structural analysis. Generates a CSV file listing the PDB file names and locations for each clustered-reduction CSV provided as input.

Contains separate sections for single-reduction, full-folder, and multi-folder preparation.

## Setup

In [None]:
import sys
# Add the directory holding the project's .py calculation files to the import path.
# Replace the path below with the directory containing those files on your machine.
sys.path.append('/Users/isaacdaviet/Desktop/thesis/python versions')
import SPACE2_analysis as sp2

## Assign PDB file names to final cluster files

### Extract pdb files from individual cluster csv file

Organizes PDB files based on the provided cluster information. For a given .csv file, this creates a new folder for each cluster indicated, organized by label type and priority status, and saves a CSV file that uses the output folder as the column name and lists the associated .pdb files for all sequences found in the cluster.


In [None]:
### Input Files & Folders
# Folder holding the .pdb structure files for all sequences.
# Each file name must contain the iSeq identifier.
pdb_folder = r'/Users/isaacdaviet/Desktop/mason_igfold_models/mason_igfold_models/igfold_outfile'

# CSV file with the final clusters of one specific reduction.
cluster_csv = r'/Users/isaacdaviet/Desktop/results/clustering/dbscan_clusters/correlation_clusters/csv_files/UMAP_Mason-correlation-3-25-0.0-1-2_dbscanClusters-0.15-20.csv'

# Destination folder passed to organize_pdb.
output_folder = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis'


### Organizing Parameters
# Which cluster column to read: the original ('Cluster') or the adjusted
# clusters ('adjusted_clusters').
cluster_column = 'Cluster'

# File-name template: '#' marks where the unique iSeq identifier goes and
# 'LABELID' marks where the label ('AgPos'/'AgNeg') is substituted.
pdb_naming_format = 'mHER_H3_LABELID_unique_fv_ISEQ#_igfold.pdb'

# Set to 'binders' or 'non binders' to analyze only one label type.
labels_to_organize = 'all'

# Priority values to leave out, given as lists (one per label type).
nonbinder_priorities_to_ignore = ['out_of_bounds', 'unclustered']
binder_priorities_to_ignore = ['unclustered']


# Run the organization for this single cluster CSV.
sp2.organize_pdb(
    cluster_csv=cluster_csv,
    pdb_folder=pdb_folder,
    output_folder=output_folder,
    cluster_column=cluster_column,
    pdb_naming_format=pdb_naming_format,
    labels_to_organize=labels_to_organize,
    binder_priorities_to_ignore=binder_priorities_to_ignore,
    nonbinder_priorities_to_ignore=nonbinder_priorities_to_ignore,
)

### Extract pdb files from entire cluster csv folder

Uses the `organize_pdb` function to iterate through all cluster CSV files found in a given folder and writes the organized folders to the given output folder.


In [None]:
### Input Folders
# Folder containing the cluster CSV files to iterate over.
csv_folder = r'/Users/isaacdaviet/Desktop/results/clustering/dbscan_clusters/correlation_clusters/csv_files'

# Folder containing the .pdb structure files.
pdb_folder = r'/Users/isaacdaviet/Desktop/mason_igfold_models/mason_igfold_models/igfold_outfile'

# Destination folder passed to space2_prep_folders.
output_folder = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/correlation_clusters'


### Organizing Parameters
# Use the original 'Cluster' column or the 'adjusted_cluster' column.
# NOTE(review): the single-file cell spells the adjusted column
# 'adjusted_clusters' - confirm the exact name against the CSV header.
# Must be a plain string: a trailing comma after the literal would silently
# turn this into the 1-tuple ('Cluster',).
cluster_column = 'Cluster'

# File-name template: '#' marks where the unique iSeq identifier goes and
# 'LABELID' marks where the label ('AgPos'/'AgNeg') is substituted.
pdb_naming_format = 'mHER_H3_LABELID_unique_fv_ISEQ#_igfold.pdb'

# 'binders' or 'non binders' analyzes a single label type; the other cells in
# this notebook use 'all'.
labels_to_organize = 'binders'

binder_priorities_to_ignore = ['unclustered']  # as list

nonbinder_priorities_to_ignore = ['out_of_bounds', 'unclustered']  # as list

# Process every cluster CSV found in csv_folder.
sp2.space2_prep_folders(
    csv_folder=csv_folder,
    pdb_folder=pdb_folder,
    output_folder=output_folder,
    cluster_column=cluster_column,
    pdb_naming_format=pdb_naming_format,
    labels_to_organize=labels_to_organize,
    binder_priorities_to_ignore=binder_priorities_to_ignore,
    nonbinder_priorities_to_ignore=nonbinder_priorities_to_ignore,
)


### Extract pdb files from multiple folders of similar naming format

Prepares all folders for SPACE2 using the `space2_prep_folders` and `organize_pdb` functions above. Since previous steps in the workflow can create a separate subfolder for each distance metric, this cell automatically iterates over all of those subfolders.


In [None]:
# Folder containing all IgFold models in .pdb file format.
pdb_folder = r'/Users/isaacdaviet/Desktop/mason_igfold_models/mason_igfold_models/igfold_outfile'


# Placeholder token that appears in both folder-format strings below; it is
# swapped for each entry of 'what_to_replace_it_with'.
what_to_replace_in_folder_format_strings = 'IDMETRIC'

# Values substituted for the placeholder token, one per subfolder.
what_to_replace_it_with = ['correlation', 'cosine', 'euclidean', 'hamming', 'manhattan']

# Folder templates containing the placeholder token.
input_folder_format = r'/Users/isaacdaviet/Desktop/results/clustering/dbscan_clusters/IDMETRIC_clusters/csv_files'
output_folder_format = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/pdb_lists/IDMETRIC_DBsc_PDBs'


# Organizing parameters, given the same values as in the single-CSV cell.
cluster_column = 'Cluster'
pdb_naming_format = 'mHER_H3_LABELID_unique_fv_ISEQ#_igfold.pdb'
labels_to_organize = 'all'
nonbinder_priorities_to_ignore = ['out_of_bounds', 'unclustered']
binder_priorities_to_ignore = ['unclustered']


# The first five arguments are positional; their order must stay as written.
sp2.prep_all_folders_for_space2(
    what_to_replace_it_with,
    pdb_folder,
    input_folder_format,
    output_folder_format,
    what_to_replace_in_folder_format_strings,
    cluster_column=cluster_column,
    pdb_naming_format=pdb_naming_format,
    labels_to_organize=labels_to_organize,
    binder_priorities_to_ignore=binder_priorities_to_ignore,
    nonbinder_priorities_to_ignore=nonbinder_priorities_to_ignore,
)