# Benchmarking Test File

Use this notebook to compare a run of SPECTRA with the benchmark data. 

The data you're comparing is in the folder marked "benchmark_factors" on the google drive. There is one csv for each factor, and each file is named the same as the factor you should be comparing it against.

There are two main things we're looking at - pairwise correlation between each factor and the intersection of the genes that contribute to each factor.

In [2]:
# imports
import re
import scanpy as sc
import seaborn
import numpy as np
import csv
import pandas as pd

This is almost complete - we need to finalize where this data comes from to compare with in a new run. But once that is complete, we're good to go.

In [None]:
# Importing files
# file path for benchmark data - remember to CHANGE THE FILE PATH TO YOUR LOCAL PLACE YOU WANT THESE TO BE
benchmark_path = "C:\\Users\\phill\\Documents\\HMCFall22\\Clinic\\"
# file path for new data - remember to CHANGE THE FILE PATH TO YOUR LOCAL PLACE YOU WANT THESE TO BE
new_factor_path = 'C:/Users/phill/Documents/HMCFall22/Clinic/SPECTRA-GPU/new_factors/'

# new run - remember to CHANGE THE FILE PATH TO YOUR LOCAL PLACE YOU WANT THESE TO BE
adata = sc.read_h5ad(benchmark_path + 'data_for_clinic_2023.h5ad')

newMarkers = pd.DataFrame(adata.uns["SPECTRA_markers"]).to_csv(new_factor_path + f"markers.csv", header=False, index=False)
# this second one may throw an error, it's just a test
newGeneScalings = pd.DataFrame(adata.obs["SPECTRA_gene_scalings"]).to_csv(new_factor_path + f"gene_scalings.csv", header=False, index=False)

# get full list (in order of gene names)
gene_names = []
with open(benchmark_path + 'gene_names.csv', mode='r') as csv_file:
        csv_reader = csv.reader(csv_file)

        # get list of factors from csv
        gene_names = list(csv_reader)

        csv_file.close()
    

# get full list of factor names
factor_list = []
with open(benchmark_path + 'factor_names.csv', mode='r') as csv_file:
        csv_reader = csv.reader(csv_file)

        # get list of factors from csv
        factor_list = list(csv_reader)

        csv_file.close()
    


Needs to be done below
- format the new data so that it aligns with what's below
- AKA need to take the top N factors, map that to genes, and then perform the intersection
- the code in the block below should be a rough sketch on how to do that.
- then, either the gene_scalings or the factors ~should~ be enough to do the pairwise correlation

In [None]:
gene_loadings = [][] # this should be the markers
for factor in range(len(gene_loadings)):
    factor_name = gene_loadings[factor][0]
    factor_dict = {}
    for x in range(1, len(gene_loadings[factor])-1):
        factor_dict[gene_names[x]] = gene_loadings[factor][x]

    # sort dictionaries by gene score
    sorted_genes = sorted(factor_dict.items(), key=lambda gene: gene[1], reverse=True)

    # put sorted top N genes into csv for posterity - for now, all of them
    with open(f'{factor_list[factor]}.csv', 'w') as test_file:
        file_writer = csv.writer(test_file)
        for i in range(len(sorted_genes[0])):
            file_writer.writerow([x[i] for x in sorted_genes])

In [1]:
# Intersection between factors
# for now assuming the anndata looks identical to the benchmark set we recieved.

for l in factor_list:
    benchmark_factor = []
    new_factor = []
    
    # grab the benchmark data
    with open(benchmark_path + '{l}_genes.csv', mode='r') as csv_file:
        csv_reader = csv.reader(csv_file)

        # get list of factors from csv
        benchmark_factor = list(csv_reader)

        csv_file.close()

    # get the new run's data
    with open(new_factor_path + '{l}_genes.csv', mode='r') as csv_file:
        csv_reader = csv.reader(csv_file)

        # get list of factors from csv
        new_factor = list(csv_reader)

        csv_file.close()



    benchmark_factor_set = set(benchmark_factor)
    new_factor_set = set(new_factor)

    intersection_1 = benchmark_factor_set.intersection(new_factor_set) 
    average_unique_genes = (len(benchmark_factor_set) + len(new_factor_set))//2  

    print("Intersection between genes for factor {l[0]}: " + str(intersection_1))
    print("Percent error of intersection: " + str(round(abs(intersection_1 - average_unique_genes)/average_unique_genes, 4) * 100) + "%\n\n")


IndentationError: expected an indented block (3081531705.py, line 5)

In [None]:
# Pairwise correlation 

for l in factor_list:
    benchmark_factor = []
    new_factor = []
    
    # grab the benchmark data
    with open(benchmark_path + '{l}.csv', mode='r') as csv_file:
        csv_reader = csv.reader(csv_file)

        # get list of factors from csv
        benchmark_factor = list(csv_reader)

        csv_file.close()

    # convert benchmark_factor elements to floats
    num_benchmark_factor = []

    for r in range(len(benchmark_factor)):
        num_benchmark_factor.append(float(benchmark_factor[r][0]))

    # get the new run's data
    with open(new_factor_path + '{l}.csv', mode='r') as csv_file:
        csv_reader = csv.reader(csv_file)

        # get list of factors from csv
        new_factor = list(csv_reader)

        csv_file.close()
    
    # convert new_factor elements to floats
    num_new_factor = []

    for r in range(len(new_factor)):
        num_new_factor.append(float(new_factor[r][0]))

    #for r in range(len(new_factor)):
    #    row = []
    #    for c in range(len(new_factor[r])):
    #        row.append(float(new_factor[r][c]))
    #    num_new_factor.append(row)

    corr_mat = np.corrcoef(num_benchmark_factor, num_new_factor)

    seaborn.clustermap(corr_mat)