Perform SPACE2 analysis of either manual PCA or DBSCAN UMAP clusters using the cluster files generated in previous sections. Produces a unique summary file for each reduction/cluster file, containing the SPACE2 results and a basic summary, and an 'all_summaries' file containing overviews and analysis of all the summary files within a given folder.

## Imports and setups

In [None]:
import sys
sys.path.append('/Users/isaacdaviet/Desktop/thesis/python_versions')
# replace with directory containing the .py calculation files below
import SPACE2_analysis as sp2
import pandas as pd


## Extract PDB lists of unique clusters from csv file and cluster them using SPACE2

Calculates SPACE2 structural configuration clusters and saves separate results csv and summary xl file 

### Single pdb_list file analysis

In [None]:
cluster_pdbs_file = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/pdb_lists/correlation_DBsc_PDB/UMAP_Mason-correlation-3-25-0.0-1-2_DBsc-0.15-20_ClstrPDBs.csv'
# Single reduction clusters with assigned pdb file

clusters_to_analyze = ['high_Binder0']
# Use 'all' to analyze all clusters in file, or specify specific cluster name to analyze

output_file = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/pdb_lists/test.xlsx'
# Full output file path + name with .xlsx extension

priorities_to_analyze = ['all']

### SPACE2 parameters
cdr_selection = ['CDRH3']
chain_selection = ['fwH']
rmsd_threshold = 1.25 # recommended default of 1.25 Angstrom
algorithm = 'agglomerative'


# Run the SPACE2 clustering on the selected clusters, then pass the result
# through the summary step before saving.
structural_df = sp2.SPACE2_clustering(
    cluster_pdbs_file,
    clusters_to_analyze,
    priorities_to_analyze,
    cdr_selection=cdr_selection,
    chain_selection=chain_selection,
    # Pass the threshold configured above; a second hard-coded 1.25 here
    # would silently ignore any change made to `rmsd_threshold`.
    rmsd_threshold=rmsd_threshold,
    algorithm=algorithm,
)
structural_df = sp2.SPACE2_summary(structural_df)

sp2.save_SPACE2_results(structural_df, output_file=output_file, shorten_antibody_names='y')

### Full folder analysis

In [None]:
cluster_pdbs_folder = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/pdb_lists/correlation_DBsc_PDB'
# Input folder: one file per reduction, each listing clusters with assigned pdb files

output_folder = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/structural_clusters/correlation'
# Folder the results are written to

### Cluster filters
clusters_to_analyze = ['all'] # Do not change
priorities_to_analyze = ['all']

### SPACE2 parameters
cdrs = ['CDRH3']
chain = ['fwH']
rmsd_threshold = 1.25 # recommended default of 1.25 Angstrom
algorithm = 'agglomerative'
n_jobs = 1 # recommended default of 1

shorten_structural_cluster_names = 'y'
# 'y' simplifies the names of the resulting files


# Every argument is positional, so the order below must be kept as is.
sp2.SPACE2_folder_analysis(
    cluster_pdbs_folder,
    output_folder,
    clusters_to_analyze,
    priorities_to_analyze,
    cdrs,
    chain,
    rmsd_threshold,
    algorithm,
    n_jobs,
    shorten_structural_cluster_names,
)

### Multiple folder analysis

To iterate through multiple folders that follow a similar path and naming format, e.g. when each distance metric is kept in a separate folder

In [None]:
### Input folders to analyze
input_folder_format = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/pdb_lists/IDMETRIC_DBsc_PDB'
output_folder_format = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/structural_clusters/IDMETRIC'
what_to_replace_in_folder_format_strings = 'IDMETRIC'
# placeholder in the two folder format strings; it is swapped for each entry of the list below
what_to_replace_it_with = ['correlation', 'cosine', 'euclidean', 'hamming', 'manhattan']
# values substituted for the 'what_to_replace_in_folder_format_strings' placeholder, one per iteration

### Cluster Filters
clusters_to_analyze = ['all'] # Please note that the function removes the priority level of a cluster from the pdb.csv column name, so do not include it
priorities_to_analyze = ['all']

### SPACE2 parameters
cdr_selection = ['CDRH3']
chain_selection = ['fwH']
rmsd_threshold = 1.25 # recommended default of 1.25 Angstrom
algorithm = 'agglomerative'
n_jobs = 1 # recommended default of 1


shorten_structural_cluster_names = 'y'
# 'y' simplifies the names of the resulting files


# The first four arguments are positional; the rest are passed by keyword.
sp2.space2_analyze_multiple_folders(
    what_to_replace_it_with,
    input_folder_format,
    output_folder_format,
    what_to_replace_in_folder_format_strings,
    clusters_to_analyze=clusters_to_analyze,
    priorities_to_analyze=priorities_to_analyze,
    cdr_selection=cdr_selection,
    chain_selection=chain_selection,
    rmsd_threshold=rmsd_threshold,
    algorithm=algorithm,
    n_jobs=n_jobs,
    shorten_structural_cluster_names=shorten_structural_cluster_names,
)

## Editing summary sheets of resulting excel files

Adds additional details to summary sheet generated in previous function. Only do if editing was not already included in save_SPACE2_results function

#### Single Summary File

In [None]:
xl_file = r'/Users/isaacdaviet/Desktop/results/test.xlsx'
# Path of the single results workbook handed to edit_xl_summary

sp2.edit_xl_summary(xl_file)

#### Single Folder of Summary Files

In [None]:
folder = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/structural_clusters/correlation'
# Folder handed to edit_xl_summary_folder; presumably every summary workbook inside is edited — verify against that function

sp2.edit_xl_summary_folder(folder)

#### Multiple Folders to Iterate

Similar formatting rules as above

In [None]:
folders_to_edit = ['correlation', 'cosine', 'euclidean', 'hamming', 'manhattan']
# Values substituted into the path template below, one folder per entry

folder_format = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/structural_clusters/IDMETRIC'
# Path template. Named `folder_format` rather than `format` so the builtin
# format() is not shadowed for the rest of the session.

replacement = 'IDMETRIC'
# Placeholder inside `folder_format` that is swapped for each entry

# The loop variable is deliberately not called `folder`, so a `folder` path
# configured in another cell is not overwritten by running this one.
for metric in folders_to_edit:
    target_folder = folder_format.replace(replacement, metric)
    sp2.edit_xl_summary_folder(target_folder)
    print(f'Finished edits to {metric} folder')

## Create All Summaries excel file

Generates an 'all_summaries' file providing further cross reduction analysis through combination of all 'summary' files contained in given folder  

### UMAP Clusters

In [None]:
structural_clusters_folder = r'/Users/isaacdaviet/Desktop/results/SPACE2_analysis/PCA_manual_clusters'
# NOTE(review): this cell sits under the 'UMAP Clusters' heading but the path points at PCA_manual_clusters — confirm the intended folder
pc_cutoffs_to_explore =[100, 90, 80, 60, 50, 40]
# Percentage cutoffs of interest, recommend using [100, 90, 80, 60, 50, 40]

sp2.create_all_summaries_xl(structural_clusters_folder)

all_summaries_xl = f'{structural_clusters_folder}/all_summaries.xlsx'
# presumably the workbook written by create_all_summaries_xl above — verify against that function
for cutoff in pc_cutoffs_to_explore:
    # all_overviews is rebound on every pass, so only the value returned for the last cutoff is kept after the loop
    all_overviews = sp2.all_summaries_overview(all_summaries_xl, pc_cutoff = cutoff)

### PCA  manual clusters

In [None]:
python_files_folder = '/Users/isaacdaviet/Desktop/thesis/python_versions' # replace with directory containing the .py calculation files below
import sys
sys.path.append(python_files_folder)
import importlib
import SPACE2_analysis as sp2

# Reload the module so that edits made to SPACE2_analysis since it was first imported are picked up
importlib.reload(sp2)

# Input is a single results workbook; presumably the PCA counterpart of the all_summaries file is produced from it — verify against create_pca_all_summaries
sp2.create_pca_all_summaries('/Users/isaacdaviet/Desktop/results/SPACE2_analysis/PCA_manual_clusters/Redo/mason_PCA_manual_clusters_SPACE2_agglomerative_1-25_REDO_min.xlsx')

## Violin Plots

Generates violin plots using the 'all_summaries.xlsx' file generated in the previous section. Setting 'iterate_through_recommended_plots' to 'y' will generate the following plot combinations:

For UMAP:

    - x, y, labels_filter = 'n_abs%', 'label', 'all'
    - x, y, labels_filter = 'n_abs%', 'metric', 'all'
    - x, y, labels_filter = 'n_abs%', 'label', 'Binder'
    - x, y, labels_filter = 'n_abs%', 'label', 'Non Binder'

    - x, y, labels_filter = 'avg_rmsd', 'label', 'all'
    - x, y, labels_filter = 'avg_rmsd', 'metric', 'all'
    - x, y, labels_filter = 'avg_rmsd', 'label', 'Binder'
    - x, y, labels_filter = 'avg_rmsd', 'label', 'Non Binder'

    - x, y, labels_filter = '1_vs_2', 'label', 'all'
    - x, y, labels_filter = '1_vs_2', 'metric', 'all'
    - x, y, labels_filter = '1_vs_2', 'label', 'Binder'
    - x, y, labels_filter = '1_vs_2', 'label', 'Non Binder'

For PCA:

    - x, y, labels_filter = 'n_abs%', 'label', 'all'
    - x, y, labels_filter = '1_vs_2', 'label', 'all'
    - x, y, labels_filter = 'avg_rmsd', 'label', 'all'
    - x, y, labels_filter = 'avg_rmsd', 'n_SPACE2_clusters', 'all'
    - x, y, labels_filter = 'avg_rmsd', 'n_SPACE2_clusters', 'Binder'
    - x, y, labels_filter = 'avg_rmsd', 'n_SPACE2_clusters', 'Non Binder'    

In [None]:
### Input File
all_summaries_file ='/Users/isaacdaviet/Desktop/results/SPACE2_analysis/PCA_manual_clusters/Redo/pca_reduced_all_summaries.xlsx'
save_path = '/Users/isaacdaviet/Desktop/results/SPACE2_analysis/PCA_manual_clusters/Redo/all_summaries_graphs'
# Folder to save graphs to

reduction_type = 'PCA'
# Set to 'UMAP' or 'PCA'

iterate_through_recommended_plots = 'y'
# 'n' plots only the single x / y / labels_filter combination set below;
# any other value iterates through the recommended plot list


### if not iterating through recommendations, input desired parameters to plot:
labels_filter = 'all'
x = 'label'
y = 'n_abs%'


### Plot Formatting
plt_title = f'Percentage of {reduction_type} Cluster Contained in SPACE2 Clusters - Sorted by {x} & {labels_filter}'
# NOTE(review): the title is built once from the manual settings above, so every
# plot produced by the loop below receives the same title — confirm this is intended.
inner_plot_format = 'box'
title_size = 9


### Build the list of [x, y, labels_filter] combinations to plot
if iterate_through_recommended_plots != 'n':
    # The two recommended lists are assigned in separate branches. Written as
    # `[...] if cond else [...], [...], ...` the conditional expression binds
    # tighter than the commas, so for 'UMAP' the whole list became the first
    # element of a tuple that also carried the PCA entries.
    if reduction_type == 'UMAP':
        plot_list = [
            ['n_abs%', 'label', 'all'],
            ['n_abs%', 'metric', 'all'],
            ['n_abs%', 'label', 'Binder'],
            ['n_abs%', 'label', 'Non Binder'],
            ['avg_rmsd', 'label', 'all'],
            ['avg_rmsd', 'metric', 'all'],
            ['avg_rmsd', 'label', 'Binder'],
            ['avg_rmsd', 'label', 'Non Binder'],
            ['1_vs_2', 'label', 'all'],
            ['1_vs_2', 'metric', 'all'],
            ['1_vs_2', 'label', 'Binder'],
            ['1_vs_2', 'label', 'Non Binder'],
        ]
    else:
        plot_list = [
            ['n_abs%', 'label', 'all'],
            ['1_vs_2', 'label', 'all'],
            ['avg_rmsd', 'label', 'all'],
            ['avg_rmsd', 'n_SPACE2_clusters', 'all'],
            ['avg_rmsd', 'n_SPACE2_clusters', 'Binder'],
            ['avg_rmsd', 'n_SPACE2_clusters', 'Non Binder'],
        ]
else:
    plot_list = [[x, y, labels_filter]]

input_df = pd.read_excel(all_summaries_file, sheet_name='all_summaries')

selected_columns = ['label', 'dimred_metric', 'n_abs%', 'total_antibodies'] if reduction_type == 'UMAP' else ['label', 'n_abs%', 'total_antibodies']
# NOTE(review): the recommended plots also name 'metric', 'avg_rmsd', '1_vs_2'
# and 'n_SPACE2_clusters', none of which are selected here. If
# summaries_violin_plot looks its x / y arguments up as columns of `data`,
# those plots would fail — confirm and extend this list if so.

all_df = input_df.loc[:, selected_columns]

# Loop-local names are used so that the builtin filter() is not shadowed and
# the manual x / y settings above are not overwritten by the last plot.
for plot_x, plot_y, label_filter in plot_list:
    # 'all' keeps every row; any other value keeps only rows with that label
    data = all_df if label_filter == 'all' else all_df[all_df['label'] == label_filter]

    sp2.summaries_violin_plot(plot_x, plot_y, data, plt_title, title_size, save_path, inner_plot=inner_plot_format)
