## File to Import

In [None]:
# Short project identifier, used for file name/title purposes.
project_name = 'mason'

# .npy file containing the integrated gradients data.
ig_data = r"/Users/isaacdaviet/Desktop/mason_igfold_models/masonIG.npy"

# .csv file containing the sequences with their binder labels.
labeled_seq_data_file = r"/Users/isaacdaviet/Desktop/mason_igfold_models/mason_sequences_label.csv"

## Setup

In [None]:
import sys
sys.path.append(r'/Users/isaacdaviet/Desktop/thesis/python_versions')
# replace with directory containing the .py calculation files below
import pca_calc as pca
from umap_calc import flatten_from_files
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



# Raw inputs: the integrated gradients array and the labelled sequence table.
pt = np.load(ig_data)
labeled_seq_data = pd.read_csv(labeled_seq_data_file)

# Columns are taken by position: first column -> sequ, second column -> labels.
sequ = np.array(labeled_seq_data.iloc[:, 0].to_numpy())
labels = np.array(labeled_seq_data.iloc[:, 1].to_numpy())

# Dataframe built by the helper from the same two files, then split on its
# 'Labels' column into binder and non-binder subsets.
labeled_df = flatten_from_files(ig_data, labeled_seq_data_file)

is_binder = labeled_df['Labels'] == 'Binder'
is_nonbinder = labeled_df['Labels'] == 'Non Binder'

binders_df = labeled_df[is_binder]
nonbinders_df = labeled_df[is_nonbinder]
all_dfs = [labeled_df, binders_df, nonbinders_df]

## Explained Variance Analysis

Generate bar plots of the explained variance for both binders and non-binders. Previous work explored up to an explained variance of 99%, which was achieved with 95 principal components. The first cell generates one bar plot per step across the given range. To generate a single bar plot for a specific number of components, use the next cell.

In [None]:
### Range step exploration
range_to_explore = [5, 100]  # [first, last] number of components to try; last is included if a step lands on it
step = 10

show_graphs = 'y'
save_graph = 'n'
save_path = '/Users/isaacdaviet/Desktop/results/PCA_analysis' # save folder only, file name will be automatically generated

### run code below
first_n, last_n = range_to_explore
announce = show_graphs != 'n'  # only print separators when graphs are displayed

for n in range(first_n, last_n + 1, step):
    if announce:
        print(f'# of components = {n}')
    pca.pca_explained_variance_bar_plot(
        pt,
        n_components=n,
        show_graph=show_graphs,
        save_graph=save_graph,
        save_path=save_path,
        project_name=project_name,
    )
    if announce:
        print('\n\n')

In [None]:
### Direct PC Bar Plot generator
n_components = 95  # number of principal components for the single bar plot

show_graphs = 'y'
save_graph = 'n'
save_path = '/Users/isaacdaviet/Desktop/results/PCA_analysis' # save folder only, file name will be automatically generated

# Collect the plotting options once, then hand them to the plotting helper.
bar_plot_options = {
    'n_components': n_components,
    'show_graph': show_graphs,
    'save_graph': save_graph,
    'save_path': save_path,
    'project_name': project_name,
}
pca.pca_explained_variance_bar_plot(pt, **bar_plot_options)


## Calculating PCA dataframe for Selected n_PCs

Calculates the PCA dataframe for the given number of components and saves it as a CSV file. It also generates the final explained-variance bar plot and a pair plot of all components against each other, for comparison and final component selection.

CAUTION: Depending on the range being explored, it is recommended to run this section, especially the pairplot function, in a server as the generation of large pairplots can be computationally expensive and take quite a while.

In [None]:
#### change variables below
pcs_to_explore = 95  # number of principal components to compute

save_csv_filepath = r'/Users/isaacdaviet/Desktop/results/PCA_analysis'  # output folder for the PCA csv and the explained-variance figure

save_pair_plot = 'y'
# NOTE(review): save_pair_plot is never passed to pca.pca_pair_plot below
# (that call uses save_graph=None, save_path=None) — confirm whether it should be wired in.


### run code below
# Fit a separate PCA on each class. The last two columns of each dataframe are
# excluded from the fit — presumably the label/sequence columns; verify against flatten_from_files.
pca_binders = pca.compute_pca(pcs_to_explore)
pca_tdf_binders = pca_binders.fit_transform(binders_df.iloc[:, :-2].values)

pca_nonbinders = pca.compute_pca(pcs_to_explore)
pca_tdf_nonbinders = pca_nonbinders.fit_transform(nonbinders_df.iloc[:, :-2].values)

# explained_variance_ratio_ holds fractions; scale by 100 so the values match
# the "(%)" unit shown on the y-axis label.
component_numbers = range(1, pcs_to_explore + 1)
binders_variance_pct = np.asarray(pca_binders.explained_variance_ratio_) * 100
nonbinders_variance_pct = np.asarray(pca_nonbinders.explained_variance_ratio_) * 100

fig, ax = plt.subplots(figsize=(10, 6))

# Explained variance for binders
ax.bar(component_numbers, binders_variance_pct, color='red', alpha=0.7, label='Binders')

# Explained variance for non-binders, overlaid on the same axes
ax.bar(component_numbers, nonbinders_variance_pct, color='blue', alpha=0.7, label='Non Binders')

ax.set_xlabel('Principal Components')
ax.set_ylabel('Explained Variance (%)')
ax.set_title('Explained Variance Comparison between Binders and Non Binders')
ax.legend()

# Save BEFORE showing: with the notebook inline backend plt.show() closes the
# figure, so a later plt.savefig() would write an empty image.
fig.savefig(f'{save_csv_filepath}/{project_name}_ExpVar_{pcs_to_explore}PCs.png', dpi=300)
plt.show()


pca_df = pca.pca_df(pt, labels, sequ, pcs_to_explore, save_csv_filepath = save_csv_filepath, project = project_name)

pca.pca_pair_plot(pt, labels, pcs_to_explore, show_pts='all', show_graph='y', save_graph=None, save_path=None, project_name=None, iterate_all_pt_types='y')

### saved image, shows that the variance of the binders is substantially higher at the first 7 PCs, while non_binder variance is substantially higher from PC 8-60

## Generate Plotly's for PC's of interest

Past work has only performed manual cluster extractions (compared to an automated option with UMAP), therefore this section generates the resulting PCA graphs as interactive 2D plotly graphs, which make identification of regions of interest far simpler.

In [None]:
# Longer stretches of components that have interesting point distributions.
# Plotly graphs are generated for every combination possible within the ranges.
ranges_of_interest = [[1, 11], [23, 29]]

# Individual component pairs of interest not covered by 'ranges_of_interest'.
individual_pcas_of_interest = [[35, 38], [52, 53]]

save_folder = r'/Users/isaacdaviet/Desktop/results/PCA_analysis/PCAs_of_interest'

pca.selected_2d_plotlys(
    pca_df,
    ranges_of_interest,
    individual_pcas_of_interest,
    save_path=save_folder,
    project=project_name,
)

## Manual cluster extraction

Extract all sequences that fall within a region whose vertices are listed in a CSV file.

**TODO (missing documentation):** describe the structure of the CSV file and the expected order of the vertices (bottom left, top left, top right, bottom right?).

In [None]:
# --- Inputs -----------------------------------------------------------------
# CSV with the manually chosen cluster vertices, and the PCA dataframe CSV.
manual_clusters_csv = r'/Users/isaacdaviet/Desktop/results/PCA_analysis/PCAs_of_interest/mason_PCA_manual_cluster_vectors_NB_NEW.csv'
pca_df_csv = r'/Users/isaacdaviet/Desktop/results/PCA_analysis/mason-PCAdf95.csv'

# IgFold file name templates; `replace` names the placeholder token that
# appears in both templates.
binders_igfold_filename_format = r'mHER_H3_AgPos_unique_fv_ISEQ_igfold.pdb'
nonbinders_igfold_filename_format = r'mHER_H3_AgNeg_unique_fv_ISEQ_igfold.pdb'
replace = 'ISEQ'

filepath = r'/Users/isaacdaviet/Desktop/results/PCA_analysis/PCAs_of_interest'

# Tip: to pick up edits to pca_calc without restarting the kernel, run
# `import importlib; importlib.reload(pca)` before this cell.

# --- Load -------------------------------------------------------------------
# pca_df is re-read from disk here, replacing any pca_df computed earlier.
manual_clusters_df = pd.read_csv(manual_clusters_csv)
pca_df = pd.read_csv(pca_df_csv)

# --- Extract ----------------------------------------------------------------
clusters_by_filename_df, clusters_by_sequence = pca.extract_manual_pca_clusters_for_space2(
    manual_clusters_df,
    pca_df,
    filepath,
    binders_igfold_filename_format,
    nonbinders_igfold_filename_format,
    replace=replace,
    check_clusters='y',
)

# To inspect the extracted sequences: clusters_by_sequence.values()