In [1]:
#%%appyter init
# Initialise appyter for this notebook; the `%%appyter ...` cells further down
# rely on this having run first.
from appyter import magic
# magic.init receives a callable; when invoked it returns this notebook's globals().
magic.init(lambda _=globals: _())

In [7]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from umap import UMAP
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from maayanlab_bioinformatics.dge import characteristic_direction
from maayanlab_bioinformatics.normalization import log2_normalize, filter_by_var, zscore_normalize
from maayanlab_bioinformatics.utils import merge
import plotly.express as px
import math
import config
import boto3
import logging
from botocore.exceptions import ClientError


In [8]:
def aws_download_file(object_name, location = None):
    """Download a file from the S3 bucket for this project.

    :param object_name: Name (key) under which the file is stored in the bucket
    :param location: Local path to which the file is downloaded; when omitted
        the file is written to "./<object_name>"
    :return: True if file is downloaded, else False (the reason is logged)
    """
    # Without an object name there is nothing to fetch. The previous fallback
    # referenced an undefined variable and crashed with NameError; report the
    # problem through the documented True/False contract instead.
    if not object_name:
        logging.error("aws_download_file: no object_name given, nothing to download")
        return False

    if location is None:
        location = "./" + object_name

    try:
        # Credentials and bucket name come from the local `config` module.
        s3 = boto3.client('s3', aws_access_key_id=config.ACCESS_KEY,
                          aws_secret_access_key=config.SECRET_KEY)
        s3.download_file(config.BUCKET, object_name, location)
    except ClientError as e:
        # Only S3 client errors are converted into a False return; anything
        # else (e.g. a missing local directory) still propagates to the caller.
        logging.error(e)
        return False
    return True

In [9]:
# Download the expression matrix and the clinical table from S3, then load them

download_dir = "AWS_downloads"
os.makedirs(download_dir, exist_ok=True)

# aws_download_file signals failure through its return value (it logs and
# returns False). Fail with a clear message when there is neither a fresh
# download nor a previously downloaded copy, instead of letting read_csv raise
# an unrelated-looking error.
data = aws_download_file("data.csv", f"./{download_dir}/data.csv")
if not data and not os.path.exists(f"./{download_dir}/data.csv"):
    raise RuntimeError(f"Could not download data.csv from S3 and no local copy exists in {download_dir}")
df_data_all = pd.read_csv(f"./{download_dir}/data.csv")
# rows are keyed by the "symbol" column
df_data_all = df_data_all.set_index("symbol")

clinical_data = aws_download_file("clinical_data.csv", f"./{download_dir}/clinical_data.csv")
if not clinical_data and not os.path.exists(f"./{download_dir}/clinical_data.csv"):
    raise RuntimeError(f"Could not download clinical_data.csv from S3 and no local copy exists in {download_dir}")
df_clinical_all = pd.read_csv(f"./{download_dir}/clinical_data.csv")
# rows are keyed by the "case_id" column
df_clinical_all = df_clinical_all.set_index("case_id")

# distinct primary diagnoses present in the clinical table (unordered)
cancer_types = list(set(df_clinical_all.primary_diagnosis))
columns = df_clinical_all.columns.values

print(cancer_types)

In [49]:
def get_data_by_cancer(cancer_type):
    '''
    Restrict the cohort to a single primary diagnosis.

    Reads the notebook-level frames `df_clinical_all` and `df_data_all`.
    Returns a tuple `(df_data, df_clinical)`: the clinical rows whose
    `primary_diagnosis` equals `cancer_type`, and the columns of the
    expression frame named after those rows' index values.
    '''
    is_selected = df_clinical_all.primary_diagnosis == cancer_type
    df_clinical = df_clinical_all[is_selected]

    # the clinical index values double as column labels of the expression frame
    df_data = df_data_all[df_clinical.index.values]
    return df_data, df_clinical

In [30]:
def get_library_size_df(df_data):
    """Summarise each column (sample) of an expression matrix.

    :param df_data: DataFrame with one row per gene and one column per sample
    :return: DataFrame indexed by sample, sorted by ``n_reads`` descending, with
        ``n_reads`` (sum of the column), ``log_n_reads`` (log2(n_reads + 1)) and
        ``n_expressed_genes`` (number of rows with a value > 0)
    """
    # Previously the two labels were swapped: `n_reads` held the count of
    # non-zero genes and `n_expressed_genes` held the column sum.
    n_reads = df_data.sum()
    n_expressed_genes = (df_data > 0).sum()
    return pd.DataFrame(
    {
        'n_reads': n_reads,
        # +1 keeps the logarithm finite for columns that sum to zero
        'log_n_reads': np.log2(n_reads + 1),
        'n_expressed_genes': n_expressed_genes,
    }).sort_values('n_reads', ascending=False)

In [31]:
def norm_and_zscore(df_data):
    """Run the three-step normalisation pipeline on an expression matrix.

    The steps, in order, are the maayanlab_bioinformatics helpers
    `filter_by_var` (variance filter; called with its defaults, which the
    original author describes as keeping the 2500 most variable rows),
    `log2_normalize` and `zscore_normalize`.

    :param df_data: expression DataFrame
    :return: the filtered, log2-transformed, z-scored DataFrame
    """
    most_variable = filter_by_var(df_data)
    log_scaled = log2_normalize(most_variable)
    return zscore_normalize(log_scaled)

In [32]:
def get_pca_df(df_data_norm):
    '''
    Project the samples of a normalised expression matrix onto their
    principal components (all components are kept; PCA is seeded with 42).

    The input is transposed before fitting, so each input column becomes one
    row of the result. Output columns are named 'PCA-0', 'PCA-1', ... and the
    index is named "case_id".
    '''
    # one row per input column after transposing
    samples_matrix = df_data_norm.values.T

    pca_model = PCA(random_state=42)
    pca_model.fit(samples_matrix)

    projection = pd.DataFrame(
        pca_model.transform(samples_matrix),
        index=df_data_norm.T.index,
    )
    # default integer column labels 0..k-1 become 'PCA-0'..'PCA-(k-1)'
    projection.columns = [f'PCA-{component}' for component in projection.columns]
    projection.index.name = "case_id"
    return projection

In [33]:
def get_umap_df(df_data_norm_pca):
    '''
    UMAP: Uniform Manifold Approximation and Projection (UMAP) dimensional reduction technique.

    Embeds the first 10 PCA components of every sample (row) into two
    dimensions. The hyper-parameters (30 neighbours, cosine metric,
    min_dist 0.3) are, per the original author, the Seurat defaults.

    Returns a DataFrame with columns 'UMAP-1' and 'UMAP-2' and the same index
    as the input.
    '''
    # The neighbourhood size has to be scaled down by the number of samples
    # (rows), not by the number of PCA components (columns) as before; the two
    # only coincide when there are fewer samples than genes.
    n_samples = df_data_norm_pca.shape[0]
    n_neighbors = 30
    if n_samples < 30: # in case the dataset is small
        n_neighbors = math.floor(n_samples / 10.) + 2 # must be > 1
    data_norm_umap = UMAP(
      random_state=42,
      n_components=2,
      n_neighbors=n_neighbors,
      metric='cosine',
      min_dist=0.3,
    )

    # use top 10 components of PCA
    top_components = df_data_norm_pca.iloc[:, :10].values
    data_norm_umap.fit(top_components)

    df_data_norm_umap = pd.DataFrame(
      data_norm_umap.transform(top_components),
      columns=['UMAP-1', 'UMAP-2'],
      index=df_data_norm_pca.index,
    )
    return df_data_norm_umap
    

In [34]:
def compute_silhouette_scores(df_data_norm_umap):
    '''
    Score KMeans clusterings of the embedding for a range of cluster counts.

    For every n from 2 up to (but excluding) an upper bound, the rows are
    clustered with KMeans (seed 42) and the cosine silhouette score of that
    labelling is recorded. The upper bound is 25, or max(3, number of rows)
    when there are fewer than 25 rows.

    Returns `(silhouette_scores, best)`: a DataFrame with columns
    'N Clusters' and 'Silhouette Score', and its highest-scoring row.
    '''
    embedding = df_data_norm_umap.values
    n_rows = df_data_norm_umap.shape[0]

    # cap the search for small cohorts, but always try at least n=2
    upper_bound = max(3, n_rows) if n_rows < 25 else 25

    records = []
    for n_clusters in range(2, upper_bound):
        labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(embedding)
        records.append({
            'N Clusters': n_clusters,
            'Silhouette Score': silhouette_score(embedding, labels, metric='cosine'),
        })

    silhouette_scores = pd.DataFrame(records)
    # ascending sort, so the last row carries the highest score
    best = silhouette_scores.sort_values('Silhouette Score').iloc[-1]
    return silhouette_scores, best

In [35]:
def get_data_norm_km_df(df_data_norm_umap, best_silhouette_score):
    '''
    Cluster the embedding with KMeans (seed 42), using the cluster count stored
    under 'N Clusters' in `best_silhouette_score`.

    Returns a one-column DataFrame ('Cluster') holding each row's cluster
    label as a string, indexed like the input.
    '''
    n_clusters = int(best_silhouette_score['N Clusters'])
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(df_data_norm_umap.values)

    # labels are stored as strings
    return pd.DataFrame(
        {'Cluster': list(map(str, labels))},
        index=df_data_norm_umap.index,
    )

In [36]:
def get_diff_expr_df(df_data_norm, df_data_norm_km):
    '''
    Perform differential expression for each cluster.

    For every cluster label in `df_data_norm_km['Cluster']`, the samples in the
    cluster are compared against all remaining samples with
    `characteristic_direction`, and the resulting 'CD-coefficient' column is kept.

    Returns a DataFrame with one column per cluster, named
    "Cluster <label> CD", whose index is named 'Symbol'.
    '''
    diff_expr = {}
    for cluster, samples in df_data_norm_km.groupby('Cluster'):
        diff_expr[f"Cluster {cluster} CD"] = characteristic_direction(
            # expression outside of this cluster
            df_data_norm.loc[:, df_data_norm.columns.difference(samples.index)],
            # expression in this cluster
            df_data_norm.loc[:, samples.index],
          )['CD-coefficient']

    df_diff_expr = pd.DataFrame(diff_expr)
    df_diff_expr.index.name = 'Symbol'
    # A leftover `df_diff_expr['Cluster 0 CD'].sort_values(...)` used to sit
    # here; its result was discarded and it could only raise KeyError when no
    # cluster was labelled '0', so it has been removed.
    return df_diff_expr

In [37]:
def compute_lr_aucs(df_data_norm_km, df_clinical):
    '''
    Fit a logistic regression on each clinical feature to see which most
    accurately predicts membership of each cluster.

    For every (cluster, feature) pair a one-vs-rest LogisticRegression is
    fitted on that single feature and scored with ROC AUC on the same rows it
    was fitted on (so the AUC is an in-sample figure). Rows whose value is
    missing or the literal "not reported" are dropped first. Non-numeric
    features are one-hot encoded. Pairs with no usable rows, or with only one
    class present, are skipped.

    :param df_data_norm_km: DataFrame with a 'Cluster' column, indexed by sample
    :param df_clinical: clinical DataFrame indexed by sample
    :return: DataFrame of AUCs with one row per feature and one column per cluster
    '''
    aucs = {}
    for cluster, _ in df_data_norm_km.groupby('Cluster'):
        aucs[cluster] = {}

        for feature in df_clinical.columns:
            # pair the feature with the cluster labels of the same samples
            X = pd.merge(df_clinical[feature], df_data_norm_km,
                         left_index=True, right_index=True)

            # Treat "not reported" as missing and drop it. The result of
            # replace() must be kept (it is not in-place), and the replacement
            # is an explicit NaN because value=None has special semantics.
            X = X.replace("not reported", np.nan).dropna()
            if (X.shape[0] == 0): continue

            cluster_data = X["Cluster"]
            X = X.drop(columns=["Cluster"])

            # One-hot encode non numerical data. The check is made on the
            # column dtype: inspecting `X[feature][0]` performed a label lookup
            # on the sample index and classified numpy integers as non-numeric.
            if not pd.api.types.is_numeric_dtype(X[feature]):
                X = pd.get_dummies(X[feature], prefix=feature)

            y_true = (cluster_data == cluster)

            if (y_true.nunique() < 2): continue # if we only have one class in the dataset
            lr = LogisticRegression()
            lr.fit(X, y_true)
            y_score = lr.predict_proba(X)[:, 1]
            aucs[cluster][feature] = roc_auc_score(y_true, y_score)

    df_aucs = pd.DataFrame(aucs)
    # kept so the calling cell still shows the table
    print(df_aucs)
    return df_aucs

In [38]:
def save_data(cancer_type,
              df_data_norm_km,
              df_data_norm_pca,
              df_data_norm_umap,
              df_diff_expr,
              df_aucs):
    """Write the analysis results for one cancer type as CSV files.

    Everything goes under ./processed_data/<cancer_type>/ (directories are
    created as needed):

      clustering/graphclust/clusters.csv             <- df_data_norm_km
      pca/10_components/projection.csv               <- df_data_norm_pca
      umap/2_components/projection.csv               <- df_data_norm_umap
      diffexp/graphclust/differential_expression.csv <- df_diff_expr
      cluster_aucs.csv                               <- df_aucs

    Each frame is written as-is with DataFrame.to_csv, so the files contain
    whatever columns the frames carry.
    """
    # create the root dir
    root_dir = f'./processed_data/{cancer_type}'
    os.makedirs(root_dir, exist_ok=True)

    # (sub-directory, file name, frame), written in this order
    nested_outputs = (
        ('clustering/graphclust', 'clusters.csv', df_data_norm_km),
        ('pca/10_components', 'projection.csv', df_data_norm_pca),
        ('umap/2_components', 'projection.csv', df_data_norm_umap),
        ('diffexp/graphclust', 'differential_expression.csv', df_diff_expr),
    )
    for sub_dir, file_name, frame in nested_outputs:
        os.makedirs(f'{root_dir}/{sub_dir}', exist_ok=True)
        frame.to_csv(f'{root_dir}/{sub_dir}/{file_name}')

    # the AUC table sits directly in the root dir
    df_aucs.to_csv(f'{root_dir}/cluster_aucs.csv')

In [93]:
%%appyter hide_code

# Form sections. The fields declared in the next cell attach themselves to
# these through their `section=` argument (DATASET, ENRICHR, CONFIG).

{% do SectionField(
    name="DATASET",
    title="Dataset selection"
) %}

{% do SectionField(
    name="ENRICHR",
    title="Enrichr search parameters"
) %}

# CONFIG now uses title/subtitle like the other sections; it previously passed
# label/description, which are the keyword names used by input fields.
{% do SectionField(
    name='CONFIG',
    title='Configuration',
    subtitle='Configure various parameters for the analysis',
) %}

In [92]:
%%appyter code_eval

# Cancer type to analyse; passed to get_data_by_cancer, which matches it
# against the `primary_diagnosis` column of the clinical table.
# NOTE(review): the default (and one of the two choices) is the empty string,
# which presumably matches no case and leaves the downstream frames empty — confirm.
cancer = "{{ChoiceField(
    name = "cancer",
    label = "Cancer type",
    description="The value provided as the primary diagnosis on cases in TCGA.",
    choices=["Serous cystadenocarcinoma, NOS", ""],
    section="DATASET",
    default="" ) }}"


# The number of 'top' genes to use for differential expression
top_n_genes = {{IntField(
    name='top_n_genes',
    label='Number of Genes',
    description='The number of \'top\' genes to use for differential expression',
    default=250,
    min=100,
    max=1000,
    section='CONFIG',
)}}

# The number of 'top' results to keep from enrichment analysis
top_n_results = {{ IntField(
    name='top_n_results',
    label='Number of Top Enrichment Results',
    description='The number of \'top\' results to keep from enrichment analysis',
    default=5,
    min=1,
    max=100,
    section='CONFIG',
)}}

# Enrichr libraries offered under the 'Transcription' label of the ENRICHR section;
# renders to a Python list of the selected names (empty by default).
transcription_libraries = {{ MultiChoiceField(name='transcription_libraries', 
                                              description='Select the Enrichr libraries you would like in your figure.', 
                                              label='Transcription', 
                                              default=[], 
                                              section = 'ENRICHR',
                                              choices=[
                                                    'ARCHS4_TFs_Coexp',
                                                    'ChEA_2016',
                                                    'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
                                                    'ENCODE_Histone_Modifications_2015',
                                                    'ENCODE_TF_ChIP-seq_2015',
                                                    'Epigenomics_Roadmap_HM_ChIP-seq',
                                                    'Enrichr_Submissions_TF-Gene_Coocurrence',
                                                    'Genome_Browser_PWMs',
                                                    'lncHUB_lncRNA_Co-Expression',
                                                    'miRTarBase_2017',
                                                    'TargetScan_microRNA_2017',
                                                    'TF-LOF_Expression_from_GEO',
                                                    'TF_Perturbations_Followed_by_Expression',
                                                    'Transcription_Factor_PPIs',
                                                    'TRANSFAC_and_JASPAR_PWMs',
                                                    'TRRUST_Transcription_Factors_2019']) }}

# Enrichr libraries offered under the 'Pathways' label of the ENRICHR section;
# renders to a Python list of the selected names (empty by default).
pathways_libraries = {{ MultiChoiceField(name='pathways_libraries', 
                                         description='Select the Enrichr libraries you would like in your figure.', 
                                         label='Pathways', 
                                         default=[], 
                                         section = 'ENRICHR',
                                         choices=[
                                                'ARCHS4_Kinases_Coexp',
                                                'BioCarta_2016',
                                                'BioPlanet_2019',
                                                'BioPlex_2017',
                                                'CORUM',
                                                'Elsevier_Pathway_Collection',
                                                'HMS_LINCS_KinomeScan',
                                                'HumanCyc_2016',
                                                'huMAP',
                                                'KEA_2015',
                                                'KEGG_2019_Human',
                                                'KEGG_2019_Mouse',
                                                'Kinase_Perturbations_from_GEO_down',
                                                'Kinase_Perturbations_from_GEO_up',
                                                'L1000_Kinase_and_GPCR_Perturbations_down',
                                                'L1000_Kinase_and_GPCR_Perturbations_up',
                                                'NCI-Nature_2016',
                                                'NURSA_Human_Endogenous_Complexome',
                                                'Panther_2016',
                                                'Phosphatase_Substrates_from_DEPOD',
                                                'PPI_Hub_Proteins',
                                                'Reactome_2016',
                                                'SILAC_Phosphoproteomics',
                                                'SubCell_BarCode',
                                                'Virus-Host_PPI_P-HIPSTer_2020',
                                                'WikiPathways_2019_Human',
                                                'WikiPathways_2019_Mouse']) }}    

  
# Enrichr libraries offered under the 'Ontologies' label of the ENRICHR section;
# renders to a Python list of the selected names (empty by default).
ontologies_libraries = {{ MultiChoiceField(name='ontologies_libraries', 
                                           description='Select the Enrichr libraries you would like in your figure.', 
                                           label='Ontologies', 
                                           default=[], 
                                           section = 'ENRICHR',
                                           choices=[
                                                'GO_Biological_Process_2018',
                                                'GO_Cellular_Component_2018',
                                                'GO_Molecular_Function_2018',
                                                'Human_Phenotype_Ontology',
                                                'Jensen_COMPARTMENTS',
                                                'Jensen_DISEASES',
                                                'Jensen_TISSUES',
                                                'MGI_Mammalian_Phenotype_Level_4_2019']) }} 

    
# Enrichr libraries offered under the 'Diseases/Drugs' label of the ENRICHR section;
# renders to a Python list of the selected names (empty by default).
diseases_drugs_libraries = {{ MultiChoiceField(name='diseases_drugs_libraries', 
                                               description='Select the Enrichr libraries you would like in your figure.', 
                                               label='Diseases/Drugs', 
                                               default=[], 
                                               section = 'ENRICHR',
                                               choices=[    
                                                    'Achilles_fitness_decrease',
                                                    'Achilles_fitness_increase',
                                                    'ARCHS4_IDG_Coexp',
                                                    'ClinVar_2019',
                                                    'dbGaP',
                                                    'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
                                                    'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
                                                    'DisGeNET',
                                                    'DrugMatrix',
                                                    'DSigDB',
                                                    'GeneSigDB',
                                                    'GWAS_Catalog_2019',
                                                    'LINCS_L1000_Chem_Pert_down',
                                                    'LINCS_L1000_Chem_Pert_up',
                                                    'LINCS_L1000_Ligand_Perturbations_down',
                                                    'LINCS_L1000_Ligand_Perturbations_up',
                                                    'MSigDB_Computational',
                                                    'MSigDB_Oncogenic_Signatures',
                                                    'Old_CMAP_down',
                                                    'Old_CMAP_up',
                                                    'OMIM_Disease',
                                                    'OMIM_Expanded',
                                                    'PheWeb_2019',
                                                    'Rare_Diseases_AutoRIF_ARCHS4_Predictions',
                                                    'Rare_Diseases_AutoRIF_Gene_Lists',
                                                    'Rare_Diseases_GeneRIF_ARCHS4_Predictions',
                                                    'Rare_Diseases_GeneRIF_Gene_Lists',
                                                    'UK_Biobank_GWAS_v1',
                                                    'Virus_Perturbations_from_GEO_down',
                                                    'Virus_Perturbations_from_GEO_up',
                                                    'VirusMINT']) }}
    
# Enrichr libraries offered under the 'Cell Types' label of the ENRICHR section;
# renders to a Python list of the selected names (empty by default).
cell_types_libraries = {{ MultiChoiceField(name='cell_types_libraries', 
                                           description='Select the Enrichr libraries you would like in your figure.', 
                                           label='Cell Types', 
                                           default=[], 
                                           section = 'ENRICHR',
                                           choices=[        
                                                'Allen_Brain_Atlas_down',
                                                'Allen_Brain_Atlas_up',
                                                'ARCHS4_Cell-lines',
                                                'ARCHS4_Tissues',
                                                'Cancer_Cell_Line_Encyclopedia',
                                                'CCLE_Proteomics_2020',
                                                'ESCAPE',
                                                'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
                                                'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
                                                'Human_Gene_Atlas',
                                                'Mouse_Gene_Atlas',
                                                'NCI-60_Cancer_Cell_Lines',
                                                'ProteomicsDB_2020',
                                                'Tissue_Protein_Expression_from_Human_Proteome_Map']) }}    
    
    
    
# Enrichr libraries offered under the 'Miscellaneous' label of the ENRICHR section;
# renders to a Python list of the selected names (empty by default).
miscellaneous_libraries = {{ MultiChoiceField(name='miscellaneous_libraries', 
                                              description='Select the Enrichr libraries you would like in your figure.', 
                                              label='Miscellaneous', 
                                              default=[], 
                                              section = 'ENRICHR',
                                              choices=[            
                                                    'Chromosome_Location_hg19',
                                                    'Data_Acquisition_Method_Most_Popular_Genes',
                                                    'Enrichr_Libraries_Most_Popular_Genes',
                                                    'Genes_Associated_with_NIH_Grants',
                                                    'HMDB_Metabolites',
                                                    'HomoloGene',
                                                    'InterPro_Domains_2019',
                                                    'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions',
                                                    'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions',
                                                    'NIH_Funded_PIs_2017_Human_AutoRIF',
                                                    'NIH_Funded_PIs_2017_Human_GeneRIF',
                                                    'Pfam_Domains_2019',
                                                    'Pfam_InterPro_Domains',
                                                    'Table_Mining_of_CRISPR_Studies']) }}    
    
    
# Enrichr libraries offered under the 'Legacy' label of the ENRICHR section;
# renders to a Python list of the selected names (empty by default).
legacy_libraries = {{ MultiChoiceField(name='legacy_libraries', 
                                       description='Select the Enrichr libraries you would like in your figure.', 
                                       label='Legacy', 
                                       default=[], 
                                       section = 'ENRICHR',
                                       choices=[                
                                            'BioCarta_2013',
                                            'BioCarta_2015',
                                            'ChEA_2013',
                                            'ChEA_2015',
                                            'Chromosome_Location',
                                            'Disease_Signatures_from_GEO_down_2014',
                                            'Disease_Signatures_from_GEO_up_2014',
                                            'Drug_Perturbations_from_GEO_2014',
                                            'ENCODE_Histone_Modifications_2013',
                                            'ENCODE_TF_ChIP-seq_2014',
                                            'GO_Biological_Process_2013',
                                            'GO_Biological_Process_2015',
                                            'GO_Biological_Process_2017',
                                            'GO_Biological_Process_2017b',
                                            'GO_Cellular_Component_2013',
                                            'GO_Cellular_Component_2015',
                                            'GO_Cellular_Component_2017',
                                            'GO_Cellular_Component_2017b',
                                            'GO_Molecular_Function_2013',
                                            'GO_Molecular_Function_2015',
                                            'GO_Molecular_Function_2017',
                                            'GO_Molecular_Function_2017b',
                                            'HumanCyc_2015',
                                            'KEA_2013',
                                            'KEGG_2013',
                                            'KEGG_2015',
                                            'KEGG_2016',
                                            'MGI_Mammalian_Phenotype_2013',
                                            'MGI_Mammalian_Phenotype_2017',
                                            'MGI_Mammalian_Phenotype_Level_3',
                                            'MGI_Mammalian_Phenotype_Level_4',
                                            'NCI-Nature_2015',
                                            'Panther_2015',
                                            'Reactome_2013',
                                            'Reactome_2015',
                                            'TargetScan_microRNA',
                                            'Tissue_Protein_Expression_from_ProteomicsDB',
                                            'WikiPathways_2013',
                                            'WikiPathways_2015',
                                            'WikiPathways_2016']) }} 

# Enrichr libraries offered under the 'Crowd' label of the ENRICHR section;
# renders to a Python list of the selected names (empty by default).
crowd_libraries = {{ MultiChoiceField(name='crowd_libraries', 
                                      description='Select the Enrichr libraries you would like in your figure.', 
                                      label='Crowd', 
                                      default=[],
                                      section = 'ENRICHR',
                                      choices=[                
                                            'Aging_Perturbations_from_GEO_down',
                                            'Aging_Perturbations_from_GEO_up',
                                            'Disease_Perturbations_from_GEO_down',
                                            'Disease_Perturbations_from_GEO_up',
                                            'Drug_Perturbations_from_GEO_down',
                                            'Drug_Perturbations_from_GEO_up',
                                            'Gene_Perturbations_from_GEO_down',
                                            'Gene_Perturbations_from_GEO_up',
                                            'Ligand_Perturbations_from_GEO_down',
                                            'Ligand_Perturbations_from_GEO_up',
                                            'MCF7_Perturbations_from_GEO_down',
                                            'MCF7_Perturbations_from_GEO_up',
                                            'Microbe_Perturbations_from_GEO_down',
                                            'Microbe_Perturbations_from_GEO_up',
                                            'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
                                            'SysMyo_Muscle_Gene_Sets']) }}

# Concatenate the per-category selections into one alphabetically sorted array.
# With nothing selected the concatenation is an empty list and np.sort returns
# an empty float64 array.
enrichr_libraries = np.sort(transcription_libraries +
                            pathways_libraries +
                            ontologies_libraries +
                            diseases_drugs_libraries +
                            cell_types_libraries +
                            miscellaneous_libraries +
                            legacy_libraries + crowd_libraries)

# last expression of the cell: displays the combined selection
enrichr_libraries

```python
cancer = ""
# The number of 'top' genes to use for differential expression
top_n_genes = 250
# The number of 'top' results to keep from enrichment analysis
top_n_results = 5
transcription_libraries = []
pathways_libraries = []
ontologies_libraries = []
diseases_drugs_libraries = []
cell_types_libraries = []
miscellaneous_libraries = []
legacy_libraries = []
crowd_libraries = []
enrichr_libraries = np.sort(transcription_libraries +
                            pathways_libraries +
                            ontologies_libraries +
                            diseases_drugs_libraries +
                            cell_types_libraries +
                            miscellaneous_libraries +
                            legacy_libraries + crowd_libraries)
enrichr_libraries
```

array([], dtype=float64)

In [None]:
# get data: expression and clinical frames restricted to the selected cancer type
df_data, df_clinical = get_data_by_cancer(cancer)

# initial statistics: per-sample library-size summary (first rows shown)
df_library_size = get_library_size_df(df_data)
display(df_library_size.head())   

In [None]:
# normalize and convert data to z-scores
df_data_norm = norm_and_zscore(df_data)

# plot 2 gene distributions (rows 0 and 200 of the normalized matrix)
# NOTE(review): row 200 is hard-coded, so this raises IndexError when fewer than
# 201 rows survive filtering. sns.distplot is also deprecated in recent seaborn
# releases (histplot/displot are the replacements) — confirm the pinned version.
sns.distplot(df_data_norm.iloc[0, :]); plt.show()
sns.distplot(df_data_norm.iloc[200, :]); plt.show()

# plot a single RNA seq profile distribution (first column of the normalized matrix)
sns.distplot(df_data_norm.iloc[:, 0]); plt.show()

In [None]:
# compute PCA
df_data_norm_pca = get_pca_df(df_data_norm)

# project data onto its first 2 principal components
# `feature` is the clinical column used for the point colour and hover title;
# point size comes from the `n_reads` column of df_library_size.
feature = "ethnicity"
fig = px.scatter(
  # join projection, library-size summary and clinical table into one frame
  merge(
    df_data_norm_pca,
    df_library_size,
    df_clinical
  ),
  x=df_data_norm_pca.columns[0],
  y=df_data_norm_pca.columns[1],
  size='n_reads',
  size_max=8,
  color=feature,
  hover_name=feature,
  # NOTE(review): hard-coded rather than [feature]; the two only agree while feature == "ethnicity"
  hover_data=["ethnicity"],
)
fig.show()

In [None]:
# compute UMAP for first 10 principal components
df_data_norm_umap = get_umap_df(df_data_norm_pca)

# project data onto its first 2 UMAP components
# `feature` is the clinical column used for the point colour and hover title;
# point size comes from the `n_reads` column of df_library_size.
feature = "ethnicity"
fig = px.scatter(
  # join embedding, library-size summary and clinical table into one frame
  merge(
    df_data_norm_umap,
    df_library_size,
    df_clinical
  ),
  x=df_data_norm_umap.columns[0],
  y=df_data_norm_umap.columns[1],
  size='n_reads',
  size_max=8,
  color=feature,
  hover_name=feature,
  # NOTE(review): hard-coded rather than [feature]; the two only agree while feature == "ethnicity"
  hover_data=["ethnicity"],
)
fig.show()

In [None]:
# compute silhouette scores for each candidate cluster count; `best` is the top-scoring row
silhouette_scores, best = compute_silhouette_scores(df_data_norm_umap)

# plot the scores as a function of # of clusters
plt.plot(silhouette_scores['N Clusters'], silhouette_scores['Silhouette Score'])
# mark the best-scoring cluster count with a single labelled point
plt.scatter([best['N Clusters']], [best['Silhouette Score']], label='Best: {} clusters'.format(int(best["N Clusters"])))
plt.legend()
plt.title('Cluster size selection')
plt.ylabel('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.show()

In [None]:
# compute clusters with Kmeans, using the ideal number of clusters
# (`best` is the top-scoring row returned by compute_silhouette_scores)
df_data_norm_km = get_data_norm_km_df(df_data_norm_umap, best)
print(f'Computed {len(df_data_norm_km["Cluster"].unique())} clusters')

In [None]:
# compute differential expression: one "Cluster <label> CD" column per cluster
df_diff_expr = get_diff_expr_df(df_data_norm, df_data_norm_km)

In [None]:
# LR performance: ROC AUC of a per-feature logistic regression for every cluster
# (compute_lr_aucs also prints the resulting table)
df_aucs = compute_lr_aucs(df_data_norm_km, df_clinical)

In [95]:
# save data: write every result frame under ./processed_data/<cancer>/
# NOTE: this needs the clustering, differential-expression and AUC cells above
# to have run in the current kernel; the saved output of this cell is a
# NameError for df_data_norm_km, i.e. it was last executed without them.
save_data(cancer,
          df_data_norm_km,
          df_data_norm_pca,
          df_data_norm_umap,
          df_diff_expr,
          df_aucs)

NameError: name 'df_data_norm_km' is not defined

In [94]:
# Rebuild the sorted array of all selected Enrichr libraries and print it.
# NOTE(review): repeats the computation that ends the appyter code_eval cell;
# presumably one of the two can go — confirm.
enrichr_libraries = np.sort(transcription_libraries +
                            pathways_libraries +
                            ontologies_libraries +
                            diseases_drugs_libraries +
                            cell_types_libraries +
                            miscellaneous_libraries +
                            legacy_libraries +
                            crowd_libraries)
print(enrichr_libraries)

[]
