# Broad Parameter Search

Research code to broadly search UMAP minimum-distance and number-of-neighbors parameters in order to narrow the ideal parameters down to a specific range. Replace the variables below with the files containing the labeled sequences and their associated integrated gradient values.

## Imports and setups

In [None]:
# Imports & Setup
# Example of adding a directory to the Python path
import sys
sys.path.append('/Users/isaacdaviet/Desktop/thesis/python versions')

import pdb_extraction as extract
import onehot_encoded as onehot
import umap_calc as umap
from umap_calc import flatten_labeled_data
from umap_calc import flatten_from_files
import pca_calc as pca
import tSNE_calc as tsne

# import combined_analysis as combi

## Input Integrated Gradient and Labeled Sequence Files to Analyze

In [None]:
# NumPy file containing the one-hot-encoded integrated gradients data
ig_data = r'/Users/isaacdaviet/Desktop/mason_igfold_models/masonIG.npy'

# CSV file with the sequences in column 1 and their binder (1) / non-binder (0) status in column 2
labeled_seqs = r'/Users/isaacdaviet/Desktop/mason_igfold_models/mason_sequences_label.csv'

# Distance metrics of interest
metric_to_test = [
    'euclidean',
    'manhattan',
    'cosine',
    'correlation',
    'mahalanobis',
    'hamming',
    'hellinger',
    'bhattacharyya',
    'kl_div',
    'wasserstein',
]

# Merge sequence, label and IG values into a single data frame for analysis
df = flatten_from_files(ig_data, labeled_seqs)

# Optimization by Distance Metric

Iterate through number of neighbors and minimum distance separately for all distance metrics of interest. Set the start and end points as well as the step size between each graph. Graphs will not be automatically saved unless the save_graph, save_path, & project_name variables are set. Each step will generate graphs for the all-sequences, binders-only, and non-binders datasets for distribution comparisons (this can be deactivated by switching the iterate_all_pt_types variable to 'n'; setting the show_pts variable to 'binders' or 'non binders' will then show only the selected sequence types).

First subsection calls function to set distance metric of interest manually, with all following subsections set to a selection of distance metrics for simultaneous iteration and comparison.

Based on previous work, min_dist is set to 0.1 by default for the neighbors iteration and n_neighbors is set to 15 for the min_dist iteration, though these can be adjusted depending on the results obtained. Similarly, the recommended default start/stop/step values are preset to 2/200/20 for neighbors and 0/1/0.05 for min_dist, though these can also be changed depending on the dataset.

### Manual Distance Metric Selection

#### Iterate Neighbors

In [None]:
# Distance metric chosen by hand for this sweep
metric = 'euclidean'

# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric=metric,
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Distance metric chosen by hand for this sweep
metric = 'euclidean'

# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed
starting_dist, final_dist, step = 0, 1, 0.05
n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric=metric,
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Euclidean

Simplest reduction technique. Can often be a good starting point.
Well-suited for continuous numerical data. It assumes that straight-line distances accurately represent relationships between points.

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='euclidean',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed
starting_dist, final_dist, step = 0, 1, 0.05
n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='euclidean',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Manhattan

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='manhattan',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed
starting_dist, final_dist, step = 0, 1, 0.05
n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='manhattan',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Cosine

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='cosine',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed
starting_dist, final_dist, step = 0, 1, 0.05
n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='cosine',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Correlation

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors = 2
final_neighbors = 200
step = 20

min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,  # pass the value defined in this cell
    metric='correlation',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed
starting_dist, final_dist, step = 0, 1, 0.05
n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='correlation',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Hamming

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='hamming',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed
starting_dist, final_dist, step = 0, 1, 0.05
n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='hamming',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Hamming

#### Iterate Neighbors

In [None]:
# NOTE(review): this is the second 'hamming' neighbors cell in the notebook —
# confirm whether a different metric was intended here.
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='hamming',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# NOTE(review): this is the second 'hamming' minimum-distance cell in the
# notebook — confirm whether a different metric was intended here.
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed
starting_dist, final_dist, step = 0, 1, 0.05
n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='hamming',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Jaccard

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='jaccard',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed.
# Defined here so the cell does not rely on values left over from earlier cells.
starting_dist = 0
final_dist = 1
step = 0.05

n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='jaccard',  # this section covers the Jaccard metric
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Hellinger

Note: Cannot take any inputs with negative IG values

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='hellinger',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed.
# Defined here so the cell does not rely on values left over from earlier cells.
starting_dist = 0
final_dist = 1
step = 0.05

n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='hellinger',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Haversine

Note: this distance metric is incompatible with the previously used datasets

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='haversine',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed.
# Defined here so the cell does not rely on values left over from earlier cells.
starting_dist = 0
final_dist = 1
step = 0.05

n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='haversine',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Wasserstein

Note: metric incompatible with previously used datasets

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='wasserstein',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed.
# Defined here so the cell does not rely on values left over from earlier cells.
starting_dist = 0
final_dist = 1
step = 0.05

n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='wasserstein',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

### Mahalanobis

Note: metric incompatible with previously used data

#### Iterate Neighbors

In [None]:
# Neighbor sweep settings (start, stop, step); min_dist is held fixed
starting_neighbors, final_neighbors, step = 2, 200, 20
min_dist = 0.1

umap.umap_iterate_neighbors(
    df,
    starting_neighbors=starting_neighbors,
    final_neighbors=final_neighbors,
    step=step,
    min_dist=min_dist,
    metric='mahalanobis',
    show_pts='all',
    show_graph='y',
    save_graph=None,
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)

#### Iterate Minimum Distance

In [None]:
# Minimum-distance sweep settings (start, stop, step); n_neighbors is held fixed.
# Defined here so the cell does not rely on values left over from earlier cells.
starting_dist = 0
final_dist = 1
step = 0.05

n_neighbors = 15

umap.umap_iterate_min_dist(
    df,
    starting_dist=starting_dist,
    final_dist=final_dist,
    step=step,
    n_neighbors=n_neighbors,
    metric='mahalanobis',
    show_pts='all',
    show_graph='y',
    save_graph='n',
    save_path=None,
    project_name=None,
    iterate_all_pt_types='y',
)