## Imports, setups & Functions

Replace the variables below with the file paths and selections appropriate for your setup.

In [2]:
python_files_folder = '/Users/isaacdaviet/Desktop/thesis/python_versions'

import sys
sys.path.append(python_files_folder)
import numpy as np
import os
import pandas as pd
from SPACE2_replotting import get_df_dict_from_excel as get_dfs
import logomaker
import matplotlib as plt
import os
import ast
from tqdm import tqdm
import time
from IPython.display import clear_output


def convert_sequence_list_to_logo_df(sequence_list):
    """
    Generates logoplot dataframe from sequence list input.

    Parameters
    ----------
    sequence_list : iterable of str
        Amino-acid sequences written with the 20 standard one-letter codes.
        Sequences may have different lengths.

    Returns
    -------
    pandas.DataFrame
        n_rows = max sequence length & n_columns = 20 (1 for each AA, in
        'ACDEFGHIKLMNPQRSTVWY' order). Each cell holds the percentage of the
        input sequences that carry that residue at that position. An empty
        input returns a frame with 0 rows.

    Raises
    ------
    KeyError
        If a sequence contains a character outside the 20 one-letter codes.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    column_of = {aa: col for col, aa in enumerate(alphabet)}

    sequences = list(sequence_list)
    n_seqs = len(sequences)
    # default=0 keeps an empty input from raising
    seq_length = max((len(seq) for seq in sequences), default=0)

    # Tally raw counts in a plain array; per-cell df.loc updates are far slower
    counts = np.zeros((seq_length, len(alphabet)))
    for seq in sequences:
        for pos, aa in enumerate(seq):
            counts[pos, column_of[aa]] += 1

    # Turn counts into percentages in one step (skipped when there is no input)
    if n_seqs:
        counts = counts * 100.0 / n_seqs

    return pd.DataFrame(counts, columns=list(alphabet))


# logo_df = convert_sequence_list_to_logo_df(['WSAAAFYTYT', 'FVTSGFYAYA', 'FGGSGFYAYP'])
# print(logo_df)

def MAEditor_protein_color_dict():
    """
    Returns the MAEditor color scheme for logoplot generation as a dict
    mapping each one-letter amino-acid code to a color name.
    """
    # (residues sharing a color, color name), listed in key insertion order
    residue_groups = (
        ('AG', 'lightgreen'),
        ('C', 'green'),
        ('DENQ', 'darkgreen'),
        ('ILMV', 'blue'),
        ('FWY', 'purple'),
        ('H', 'darkblue'),
        ('KR', 'orange'),
        ('P', 'pink'),
        ('ST', 'red'),
    )

    colors = {}
    for residues, color_name in residue_groups:
        for aa in residues:
            colors[aa] = color_name
    return colors

def lesk_protein_color_dict():
    """
    Returns the Lesk color scheme for logoplot generation as a dict mapping
    each one-letter amino-acid code to a color name.
    """
    # (residues, color name) runs, listed in key insertion order
    residue_groups = [
        ('AG', 'orange'),        # Small nonpolar
        ('C', 'green'),          # Hydrophobic
        ('DE', 'red'),           # Negatively charged
        ('NQ', 'magenta'),       # Polar
        ('ILMVFWY', 'green'),    # Hydrophobic
        ('H', 'magenta'),        # Polar
        ('KR', 'blue'),          # Positively charged
        ('P', 'green'),          # Hydrophobic
        ('ST', 'orange'),        # Small nonpolar
    ]
    return {aa: color_name for residues, color_name in residue_groups for aa in residues}


def create_logoplot(logo_df, save_file = None, color_scheme = 'maeditor_protein', title="Sequence Logoplot", show_fig = 'n'):
    """
    Creates and saves logoplots png files from logoplot_df.
    Figure will not be displayed unless show_fig != 'n' or no save_file given

    Parameters
    ----------
    logo_df : pandas.DataFrame
        One row per sequence position, one column per residue (as produced by
        convert_sequence_list_to_logo_df).
    save_file : str or None
        Path the figure is saved to; nothing is saved when None.
    color_scheme : str or dict
        'maeditor_protein' or 'lesk_protein' select the schemes defined in
        this file; any other value is handed to logomaker unchanged.
    title : str
        Axes title.
    show_fig : str
        Any value other than 'n' displays the figure, also after saving.
    """
    # Local import: figure closing lives in pyplot, which this file does not
    # import at module level.
    from matplotlib import pyplot

    if color_scheme == 'maeditor_protein':
        color_scheme = MAEditor_protein_color_dict()
    elif color_scheme == 'lesk_protein':
        color_scheme = lesk_protein_color_dict()

    crp_logo = logomaker.Logo(logo_df,
                          shade_below=.5,
                          fade_below=.5,
                          font_name='Arial Rounded MT Bold',
                          color_scheme=color_scheme)

    # Customize the appearance of the logoplot
    crp_logo.style_spines(visible=False)
    crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
    crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)

    # style using Axes methods
    crp_logo.ax.set_ylabel("Frequency (%)", labelpad=-1)
    crp_logo.ax.set_xlabel("Position", labelpad=-1)

    # One tick per row, labelled 1..N. Sizing the labels from the dataframe
    # (instead of a fixed 10) keeps ticks and labels in step for any length.
    positions = list(logo_df.index)
    crp_logo.ax.set_xticks(positions)
    crp_logo.ax.set_xticklabels(['%d' % (i + 1) for i in range(len(positions))])
    crp_logo.ax.xaxis.set_ticks_position('none')
    crp_logo.ax.xaxis.set_tick_params(pad=-1)

    crp_logo.ax.set_title(title)

    if save_file is not None:
        crp_logo.fig.savefig(save_file)

    # Independent of saving, so show_fig is honoured after a save as well
    if show_fig != 'n' or save_file is None:
        crp_logo.fig.show()
    else:
        # Saved and not displayed: release the figure so batch runs do not
        # accumulate open figures.
        pyplot.close(crp_logo.fig)


# create_logoplot(logo_df)

def generate_logoplots_from_clusters_df(clusters_df,save_folder, color_scheme = 'maeditor_protein', project_name = 'ProjectNA', overwrite_previous_versions = 'n'):
    """
    Generates logoplots for all clusters/configurations in clusters_df by converting each entry into logoplot_df and generating png files in save folder
    Note: iterative function not designed to generate individual images in editor

    Parameters
    ----------
    clusters_df : pandas.DataFrame
        Needs 'sequences' (a list, or its string representation), 'cluster_name'
        and 'type' columns.
    save_folder : str
        Root output folder. Files go to <save_folder>/<type>/<reduction>/, with
        an extra <metric> level for type 'UMAP'.
    color_scheme : str or dict
        Passed through to create_logoplot.
    project_name : str
        Used in the plot title and file name.
    overwrite_previous_versions : str
        'n' skips clusters whose png already exists; any other value
        regenerates them.

    Returns
    -------
    dict
        Maps f"{type}_{cluster_name}" to the logoplot dataframe of every plot
        generated during this call (skipped clusters are absent).
    """
    seq_df_dict = {}

    progress_bar = tqdm(total=len(clusters_df), desc='Processing')

    for _, row in clusters_df.iterrows():

        clear_output(wait=True)
        # Short pause kept from the original; presumably gives the notebook
        # time to redraw after clear_output -- TODO confirm it is still needed.
        time.sleep(0.1)

        seqs = row['sequences']
        # Sequences reloaded from csv arrive as the string form of a list
        seq_list = ast.literal_eval(seqs) if isinstance(seqs, str) else seqs

        cluster_name = row['cluster_name'] if isinstance(row['cluster_name'], str) else 'unknown'
        cluster_type = row['type']
        reduction = cluster_name.split('_')[0]

        type_folder = os.path.join(save_folder, cluster_type)
        os.makedirs(type_folder, exist_ok=True)

        if cluster_type == 'UMAP':
            umap_metric = cluster_name.split('-')[0]
            metric_subfolder = os.path.join(type_folder, umap_metric)
            os.makedirs(metric_subfolder, exist_ok=True)
            final_save_folder = os.path.join(metric_subfolder, reduction)
        else:
            final_save_folder = os.path.join(type_folder, reduction)

        title = f"{project_name} Dataset - {cluster_name} Sequence Comparison"

        file_name = f"{project_name}_{cluster_type.replace('/SPACE2', '-SP2scl')}_{cluster_name}_logoplot.png"

        save_file = os.path.join(final_save_folder, file_name)

        # Generate when the file is missing, or always when overwriting is on
        if overwrite_previous_versions != 'n' or not os.path.exists(save_file):

            os.makedirs(final_save_folder, exist_ok=True)

            seq_df = convert_sequence_list_to_logo_df(seq_list)
            create_logoplot(seq_df, save_file = save_file, color_scheme=color_scheme, title=title, show_fig='n')

            seq_df_dict[f"{cluster_type}_{cluster_name}"] = seq_df

        # One step per row so the bar finishes exactly at its total
        progress_bar.update(1)

    progress_bar.close()

    return seq_df_dict

def full_dataset_logoplots(labels_df_or_csv, color_scheme = 'maeditor_protein', project_name = 'ProjectNA', save_folder = None):
    """
    Builds one logoplot each for the full dataset, the binders only and the
    non-binders only.

    labels_df_or_csv is either a dataframe or the path of a csv file to read.
    Lowercase 'sequence' / 'label' columns are renamed to 'Sequences' /
    'Labels'. Binders are rows labelled 1 or 'Binder'; non-binders are rows
    labelled 0 or 'Non Binder'. Figures are saved into save_folder when one is
    given. Returns a dict mapping each subset name to its logoplot dataframe.
    """
    if type(labels_df_or_csv) == str:
        full_df = pd.read_csv(labels_df_or_csv)
    else:
        full_df = labels_df_or_csv

    # Harmonise column names, judged against the columns as originally loaded
    loaded_columns = full_df.columns.to_list()
    for old_name, new_name in (('sequence', 'Sequences'), ('label', 'Labels')):
        if old_name in loaded_columns:
            full_df = full_df.rename(columns={old_name: new_name})

    label_column = full_df['Labels']
    is_binder = (label_column == 1) | (label_column == 'Binder')
    is_non_binder = (label_column == 0) | (label_column == 'Non Binder')

    subsets = (
        ('All sequences', full_df),
        ('Binders only', full_df[is_binder]),
        ('Non Binders only', full_df[is_non_binder]),
    )

    all_logo_dfs = {}
    for points, subset_df in subsets:
        title = f"{project_name} Sequence Distribution - {points}"

        if save_folder is None:
            save_file = None
        else:
            save_file = os.path.join(save_folder, f"{project_name}_logoplots_{points}.png")

        logo_df = convert_sequence_list_to_logo_df(subset_df['Sequences'].tolist())
        create_logoplot(logo_df, save_file = save_file, color_scheme = color_scheme, title = title, show_fig = 'n')

        all_logo_dfs[points] = logo_df

    return all_logo_dfs

## Generate All Clusters + Sequences Dataframe

#### Load dfs containing sequences + reduction clusters or SPACE2 superclusters
This section allows you to load the UMAP and PCA dictionaries together or separately, depending on what the goal is or how much data has been processed

Begin with defining files to analyze:

In [None]:
# Input files for the processing cell below. That cell only loads a
# reduction type (UMAP or PCA) when BOTH of its paths are not None.

# UMAP structural configuration (SPACE2 superclusters) csv file
umap_superclusters_csv = '/Users/isaacdaviet/Desktop/results/SPACE2_analysis/UMAP_reductions/SPACE2_cluster_replotting/mason_umap-space2_superclusters.csv'

# UMAP reduction clusters Excel (.xlsx) file
umap_clusters_by_reduction_xl = '/Users/isaacdaviet/Desktop/results/SPACE2_analysis/UMAP_reductions/SPACE2_cluster_replotting/UMAP_clusters_by_reduction.xlsx'


# PCA structural configuration (SPACE2 superclusters) csv file
pca_superclusters_csv = '/Users/isaacdaviet/Desktop/results/SPACE2_analysis/PCA_manual_clusters/SPACE2_cluster_replotting/mason_pca-space2_superclusters.csv'

# PCA reduction clusters Excel (.xlsx) file
pca_clusters_by_reduction_xl = '/Users/isaacdaviet/Desktop/results/SPACE2_analysis/PCA_manual_clusters/SPACE2_cluster_replotting/PCA_clusters_by_reduction.xlsx'



### Process csv & xl files into dict objects for further analysis

In [None]:

# A reduction type is loaded only when both of its input paths are given.
# The flags and df_dict_to_use are bound up front so that leaving one pair as
# None no longer raises NameError at the merge step.
pca = pca_superclusters_csv is not None and pca_clusters_by_reduction_xl is not None
umap = umap_superclusters_csv is not None and umap_clusters_by_reduction_xl is not None
df_dict_to_use = None

if pca:
    pca_df_dict = get_dfs(pca_clusters_by_reduction_xl)
    del pca_df_dict['all_clusters']  # remove the 'all_clusters' sheet from df_dict
    pca_spcl_df = pd.read_csv(pca_superclusters_csv)
    pca_df_dict['PCA superclusters'] = pca_spcl_df
    df_dict_to_use = pca_df_dict


if umap:
    umap_df_dict = get_dfs(umap_clusters_by_reduction_xl)
    del umap_df_dict['all_clusters']  # remove the 'all_clusters' sheet from df_dict
    umap_spcl_df = pd.read_csv(umap_superclusters_csv)
    umap_df_dict['UMAP superclusters'] = umap_spcl_df
    df_dict_to_use = umap_df_dict


if df_dict_to_use is None:
    raise ValueError("No input files defined: set both UMAP paths and/or both PCA paths before running this cell")

# Merge both dictionaries when both were loaded, otherwise use the one available
df_dict = {**pca_df_dict, **umap_df_dict} if umap and pca else df_dict_to_use

## Generate new DF containing all unique reduction and structural clusters

In [None]:
### Defining processing function 
def list_sequences_by_cluster(df_dict):
    """
    Uses df_dict input to generate pandas dataframe containing cluster/structural configurations + associated AA sequences and iSeq IDs

    Parameters
    ----------
    df_dict : dict
        Maps a name to a dataframe. Every dataframe needs 'Sequences', 'iseq'
        and 'Labels' columns, plus:
        - 'all_SPACE2_clusters' when the name contains 'superclusters',
        - 'PCA_cluster' when the name contains 'PCA' (and not 'superclusters'),
        - 'adjusted_clusters' and 'priority' otherwise (treated as UMAP).

    Returns
    -------
    pandas.DataFrame
        One row per cluster name, with columns 'cluster_name', 'label',
        'type', 'priority', 'sequences', 'iseqs' and 'n_sequences' (the number
        of unique sequences). 'label', 'type' and 'priority' hold the values
        of the last row seen for that cluster. An empty input returns an
        empty frame with these columns.
    """
    clusters_dict = {}

    for key, df in df_dict.items():
        is_supercluster = 'superclusters' in key
        is_pca = 'PCA' in key

        for _, row in df.iterrows():
            seq = row['Sequences']
            iseq = row['iseq']
            label = row['Labels']

            if is_supercluster:
                cluster_type = 'UMAP/SPACE2' if 'UMAP' in key else 'PCA/SPACE2'
                cluster_name = row['all_SPACE2_clusters']
                priority = None
            elif is_pca:
                cluster_type = 'PCA'
                cluster_name = row['PCA_cluster']
                priority = None
            else:
                cluster_type = 'UMAP'
                # Prefix with the reduction name so clusters from different
                # reductions get distinct names
                cluster_name = f"{key}_{row['adjusted_clusters']}"
                priority = row['priority']

            if cluster_name not in clusters_dict:
                clusters_dict[cluster_name] = {'label': None, 'type': None, 'priority': None, 'sequences': [], 'iseqs': []}

            entry = clusters_dict[cluster_name]
            entry['sequences'].append(seq)
            entry['iseqs'].append(iseq)

            entry['label'] = label
            entry['type'] = cluster_type
            entry['priority'] = priority

    if not clusters_dict:
        # Nothing to tabulate: return the expected schema rather than failing
        # on the missing 'sequences' column below
        return pd.DataFrame(columns=['cluster_name', 'label', 'type', 'priority', 'sequences', 'iseqs', 'n_sequences'])

    clusters_df = pd.DataFrame.from_dict(clusters_dict, orient='index')
    clusters_df.reset_index(inplace=True)
    clusters_df.rename(columns={'index': 'cluster_name'}, inplace=True)
    clusters_df['n_sequences'] = clusters_df['sequences'].apply(lambda seqs: len(set(seqs)))

    return clusters_df


# clusters_df = list_sequences_by_cluster(df_dict)

# print(clusters_df.head())


#### Save clusters_df to csv

Processing df_dict can be time-intensive. Saving the result to csv is recommended (though not required) so that the previous step does not have to be repeated.

In [None]:
# Build the cluster table from df_dict and write it to disk twice.
# The 'DO_NOT_OPEN_' copy is the file the next cell reads back with pd.read_csv.
# NOTE(review): the other copy is presumably for manual inspection, leaving the
# reload copy untouched -- confirm intent.
clusters_df = list_sequences_by_cluster(df_dict)
clusters_df.to_csv('/Users/isaacdaviet/Desktop/results/logoplots/all_clusters_with_seqs.csv', index = False)
clusters_df.to_csv('/Users/isaacdaviet/Desktop/results/logoplots/DO_NOT_OPEN_all_clusters_with_seqs.csv', index = False)

# Quick sanity check of the first rows
print(clusters_df.head())

In [18]:
# Reload the saved cluster table instead of rebuilding it from df_dict.
# generate_logoplots_from_clusters_df parses string-valued 'sequences' entries
# with ast.literal_eval, so the reloaded frame can be passed to it directly.
clusters_df = pd.read_csv('/Users/isaacdaviet/Desktop/results/logoplots/DO_NOT_OPEN_all_clusters_with_seqs.csv')
# print(clusters_df[clusters_df['type'] == 'UMAP'])

### Generate logoplots from cluster/structure labels

##### Filter clusters_df as appropriate

In [None]:
### Replace 'exclude' dictionary variables to filter df
exclude = {}
exclude['cluster_type']= ['PCA', 'PCA/SPACE2']
exclude['priorities'] = ['Low']

# Maps each 'exclude' key to the clusters_df column it filters on.
# An unsupported key raises KeyError instead of silently reusing a stale column.
exclude_columns = {'cluster_type': 'type', 'priorities': 'priority'}

df = clusters_df
# The loop variable is deliberately NOT named 'list': rebinding the builtin
# would break every later list(...) call in the notebook session.
for key, excluded_values in exclude.items():
    column = exclude_columns[key]
    df = df[~df[column].isin(excluded_values)]



### Equivalent explicit filters, kept for reference
# df = clusters_df[clusters_df['type'] != 'PCA']
# df = df[df['type'] != 'PCA/SPACE2']
# df  = df[df['priority'] != 'Low']

save_folder = '/Users/isaacdaviet/Desktop/results/logoplots/lesk_colors'

seq_df_dict = generate_logoplots_from_clusters_df(df, save_folder, color_scheme = 'lesk_protein', overwrite_previous_versions = 'n')


### Generate logoplots of full, non-binders & binders only dataset as control/comparison

In [None]:
# Inputs for the control plots: csv of labelled sequences, output folder and
# the project name used in plot titles and file names.
labels_csv = '/Users/isaacdaviet/Desktop/mason_igfold_models/mason_sequences_label.csv'
save_folder = '/Users/isaacdaviet/Desktop/'
project_name = 'Mason'


# Writes one png per subset ('All sequences', 'Binders only', 'Non Binders only')
# into save_folder and returns the matching logoplot dataframes keyed by subset name.
all_logo_dfs = full_dataset_logoplots(labels_csv, color_scheme = 'lesk_protein', project_name = project_name, save_folder = save_folder)

In [None]:

# def generate_logoplots_from_clusters_df(clusters_df, project_name = 'Mason'):

#     for index, row in clusters_df.iterrows():
#         seqs = row['sequences']
#         cluster_name = row['cluster_name']

#         title = f"{project_name} {cluster_name} Sequence Comparison"

#         create_logoplot(seqs, title=title)
        

# generate_logoplots_from_clusters_df(crp_df)


# labels_to_include_filter = []
# cluster_type_filter = []
# priority_filter = []
# max_n_sequences = []
# min_n_sequences = []
# metric_filter = []
# PCA_components_filter = []
# exclude_clusters = ['unclustered']
# specific_clusters_to_generate= []

# save_folder = '/Users/isaacdaviet/Desktop/results/logoplots'

# filters = {'label':exclude_label_filter, 
#            'type': cluster_type_filter, 
#            'priority':priority_filter, 
#            'metrics':metric_filter, 
#            'PC_components':PCA_components_filter, 
#            'exclude_clusters': exclude_clusters,
#            'specific_cluster': specific_clusters_to_generate,
#            'n_sequences_range':(max_n_sequences, min_n_sequences)}

