In [None]:
!python -m pip install pandas
!python -m pip install deseqpyodide
import pandas as pd
from time import time
from deseqpyodide.dds import DeseqDataSet   #from pydeseq2.dds import DeseqDataSet
from deseqpyodide.ds import DeseqStats  #from pydeseq2.ds import DeseqStats

In [164]:
def preprocess_for_deseq2(counts_fp: str, cohort_A_fp: str, cohort_B_fp: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Preprocesses the counts matrix for use with DESeq2. Returns a tuple of the preprocessed counts matrix and the cohort data.
    """

    start_time = time()

    # Load dataset
    print("Loading dataset")
    counts = pd.read_csv(counts_fp, index_col=0)
    print(f"Dataset loaded in {round(time() - start_time, 2)} seconds")
    print("Dataset size:", counts.shape)

    # Filter out all-0 genes
    counts = counts.loc[:, counts.sum() != 0]

    # Read cohort data
    cohort_A = pd.read_csv(cohort_A_fp)
    cohort_B = pd.read_csv(cohort_B_fp)

    def determine_cohort(sample_id):
        if sample_id.lower() in cohort_A['COHORT A'].str.lower().values:
            return 'A'
        elif sample_id.lower() in cohort_B['COHORT B'].str.lower().values:
            return 'B'
        else:
            return 'Unknown'

    # Apply determine_cohort function to create 'cohort' column
    sids = counts.columns
    cohorts = [determine_cohort(sample_id) for sample_id in sids]

    # Print cohort sizes
    print('Cohort A size:', cohorts.count('A'))
    print('Cohort B size:', cohorts.count('B'))
    print('No Cohort size:', cohorts.count('Unknown'))

    # Drop Unknown cohort
    cohort_data = pd.DataFrame({'Sample': sids, 'Condition': cohorts})
    cohort_data = cohort_data[cohort_data['Condition'] != 'Unknown']

    # Set index to sample ID
    cohort_data = cohort_data.set_index('Sample')

    # Subset the counts to only include the columns that are in the cohorts
    counts_matrix = counts[cohort_data.index]
    print('Transposing data for deseq consumption')

    # NOTE: This transpose is not needed in the R version, because the R deseq expects the counts matrix to be in the format of genes as rows and samples as columns. 
    counts_matrix = counts_matrix.transpose()
    

    # Make sure rownames and colnames match
    print('Validating preprocessed data. Valid = ', all(counts_matrix.index == cohort_data.index))

    return counts_matrix, cohort_data


def deseq(counts_matrix: pd.DataFrame, metadata: pd.DataFrame) -> pd.DataFrame:

    print('Running Deseq')
    # run dispersion and log fold-change (LFC) estimation.
    dds = DeseqDataSet(counts=counts_matrix, metadata=metadata, design_factors="Condition")
    dds.deseq2()

    print('Running stat summary')
    # summary of statistical tests
    stat_res = DeseqStats(dds, n_cpus=8, contrast = ('Condition','A','B'))
    stat_res.summary()
    res = stat_res.results_df

    return res
    


# counts_matrix, cohort_data = preprocess_for_deseq2(
#     counts_fp="C:/Users/gglatzer/Downloads/1298_combatseq_log2tpm_sampleIDnew.csv",
#     cohort_A_fp="C:/Users/gglatzer/OneDrive - Fred Hutchinson Cancer Center/Documents/Oncoscape/Cohort_A.csv",
#     cohort_B_fp="C:/Users/gglatzer/OneDrive - Fred Hutchinson Cancer Center/Documents/Oncoscape/Cohort_B.csv"
# )

start_time = time()

counts_matrix, cohort_data = preprocess_for_deseq2(
    counts_fp=r"C:\Users\gglatzer\GitHub\DifferentialExpression\count_table.csv",
    cohort_A_fp=r"C:\Users\gglatzer\GitHub\DifferentialExpression\cha.csv",
    cohort_B_fp=r"C:\Users\gglatzer\GitHub\DifferentialExpression\chb.csv"
)

print('[PREPROCESSING FINISHED]. Runtime (s):', time() - start_time)

start_time_deseq = time()

deseq_summary = deseq(counts_matrix, cohort_data)

print('[DESEQ FINISHED]. Runtime (s):', time() - start_time_deseq)
print(deseq_summary)

print('[TOTAL RUNTIME (s)]:', time() - start_time)


Loading dataset
Dataset loaded in 0.06 seconds
Dataset size: (60663, 8)
Cohort A size: 4
Cohort B size: 4
No Cohort size: 0
Transposing data for deseq consumption
Validating preprocessed data. Valid =  True
[PREPROCESSING FINISHED]. Runtime (s): 0.07342791557312012
Running Deseq


Fitting size factors...
... done in 0.02 seconds.

Fitting dispersions...
... done in 23.83 seconds.

Fitting dispersion trend curve...
... done in 4.23 seconds.

  results = super().__array_ufunc__(
Fitting MAP dispersions...
  log_alpha_hat = np.log(alpha_hat)
  x0=np.log(alpha_hat),
  sign, logdet = _umath_linalg.slogdet(a, signature=signature)
... done in 47.29 seconds.

  self.varm["_outlier_genes"] = np.log(self.varm["genewise_dispersions"]) > np.log(
Fitting LFCs...
... done in 19.14 seconds.

Refitting 0 outliers.

Running Wald tests...


Running stat summary
Log2 fold change & Wald test p-value: Condition A vs B
[DESEQ FINISHED]. Runtime (s): 113.58661532402039
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
ENSG00000284662    0.000000             NaN       NaN       NaN       NaN   
ENSG00000186827    1.107782       -2.119166  1.486697 -1.425419  0.154036   
ENSG00000186891    1.071147       -2.761974  1.566738 -1.762882  0.077920   
ENSG00000160072  320.481168        0.492175  0.147030  3.347457  0.000816   
ENSG00000041988  103.354977        0.282138  0.130485  2.162226  0.030601   
...                     ...             ...       ...       ...       ...   
ENSG00000271254   58.543851       -0.316375  0.333484 -0.948695  0.342776   
ENSG00000275987    0.108747        0.300576  3.869272  0.077683  0.938080   
ENSG00000268674    0.000000             NaN       NaN       NaN       NaN   
ENSG00000277475    0.000000             NaN       NaN       NaN       NaN   
ENSG00000275405    0.000000

... done in 6.20 seconds.



### How will this scale?

In [200]:
ivan = pd.read_csv(r'C:\Users\gglatzer\GitHub\DifferentialExpression\count_table.csv', index_col=0)
ivan_size = ivan.shape[0] * ivan.shape[1]
print('Ivan\'s dataset sample size x gene count:', ivan_size)
ivan.shape

Ivan's dataset sample size x gene count: 485304


(60663, 8)

In [201]:
real = pd.read_csv(r"C:/Users/gglatzer/Downloads/1298_combatseq_log2tpm_sampleIDnew.csv")
a = pd.read_csv(r"C:/Users/gglatzer/OneDrive - Fred Hutchinson Cancer Center/Documents/Oncoscape/Cohort_A.csv")
b = pd.read_csv(r"C:/Users/gglatzer/OneDrive - Fred Hutchinson Cancer Center/Documents/Oncoscape/Cohort_B.csv")

a_cols = real.columns.str.lower().isin(a['COHORT A'].str.lower())
b_cols = real.columns.str.lower().isin(b['COHORT B'].str.lower())

real_shape_in_cohorts = real[real.columns[a_cols | b_cols]].shape
real_size = real_shape_in_cohorts[0] * real_shape_in_cohorts[1]

print('Real dataset sample size x gene count:', real_size)
real_shape_in_cohorts

Real dataset sample size x gene count: 5634078


(19979, 282)

In [195]:
times_larger = round(real_size / ivan_size, 2)
print(f'The real dataset is {times_larger} times larger than Ivan\'s dataset')

The real dataset is 11.61 times larger than Ivan's dataset


In [199]:
python_runtime = 113.66207528114319
R_runtime = 8.31981992721558

estimated_real_python_runtime = python_runtime * times_larger
estimated_real_R_runtime = R_runtime * times_larger

print(f'Estimated real python runtime {round(estimated_real_python_runtime // 60)}m {round(estimated_real_python_runtime % 60, 2)}s')
print(f'Estimated real R runtime (s): {round(estimated_real_R_runtime // 60)}m {round(estimated_real_R_runtime % 60, 2)}s')

Estimated real python runtime 21m 59.62s
Estimated real R runtime (s): 1m 36.59s
