#### Generate CN-corrected RNA counts matrix

In [138]:
import random

def create_matrix(rows, cols, min_val, max_val):
    """Build a ``rows`` x ``cols`` nested list of random integers.

    Each entry is drawn with ``random.randint(min_val, max_val)``, so both
    bounds are inclusive. Values are drawn row by row, left to right.
    """
    matrix = []
    for _ in range(rows):
        row = []
        for _ in range(cols):
            row.append(random.randint(min_val, max_val))
        matrix.append(row)
    return matrix

In [139]:
def rna_cnv_count_matrix(rna_counts, cnv_tumor):
    """Scale RNA counts by the per-gene, per-sample copy-number ratio (CN / 2).

    Tumor samples use the copy numbers supplied in ``cnv_tumor``. Every
    remaining sample column of ``rna_counts`` is treated as a normal sample
    with a diploid copy number of 2, so its counts are left unchanged.

    Parameters
    ----------
    rna_counts : pandas.DataFrame
        Genes x samples count matrix. The first ``cnv_tumor.shape[1]``
        columns must be the tumor samples (same order as in ``cnv_tumor``);
        all remaining columns are treated as normal samples.
    cnv_tumor : 2-D array-like
        Genes x tumor-samples copy numbers, with rows in the same order as
        the rows of ``rna_counts``.

    Returns
    -------
    pandas.DataFrame
        Samples x genes matrix of scaled counts. The index holds the sample
        labels of ``rna_counts``; the columns hold its gene labels and are
        named ``geneID``.

    Raises
    ------
    ValueError
        If either input is not 2-D, if the gene counts differ, or if
        ``cnv_tumor`` has more sample columns than ``rna_counts``.
    """
    diploid_cn = 2  # copy number assumed for normal samples and used as baseline

    counts_mat = np.asarray(rna_counts)
    cnv_tumor_mat = np.asarray(cnv_tumor)
    if counts_mat.ndim != 2 or cnv_tumor_mat.ndim != 2:
        raise ValueError("rna_counts and cnv_tumor must both be 2-D (genes x samples)")

    n_genes, n_samples = counts_mat.shape
    if cnv_tumor_mat.shape[0] != n_genes:
        raise ValueError(
            f"cnv_tumor has {cnv_tumor_mat.shape[0]} genes but rna_counts has {n_genes}"
        )
    n_normal = n_samples - cnv_tumor_mat.shape[1]
    if n_normal < 0:
        raise ValueError(
            f"cnv_tumor has {cnv_tumor_mat.shape[1]} samples but rna_counts has only {n_samples}"
        )

    # The normal block is sized from the inputs so that any genes x samples
    # layout works, not just one fixed dataset size.
    cnv_normal_mat = np.full((n_genes, n_normal), diploid_cn)
    cnv_ratio = np.concatenate((cnv_tumor_mat, cnv_normal_mat), axis=1) / diploid_cn

    rna_counts_cnv = pd.DataFrame(
        np.multiply(counts_mat, cnv_ratio),
        # Carry the gene labels over directly; naming the axis here works
        # whether or not the input index already had a name.
        index=rna_counts.index.rename("geneID"),
        columns=rna_counts.columns,
    )

    # Transposed so that rows are samples and columns are genes.
    return rna_counts_cnv.T

In [140]:
# Scale the RNA counts by copy-number ratio; the function returns the result
# transposed. NOTE(review): `rna_counts` and `cnv_tumor` are not defined in
# the visible cells — confirm they are loaded before this cell runs.
rna_counts_cnv = rna_cnv_count_matrix(rna_counts, cnv_tumor)


In [2]:
import os
import pickle as pkl
import pandas as pd
import numpy as np

import pydeseq2
from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

#### Load data

In [3]:
# Directory holding the input CSV files. Set the CN_AWARE_DGE_DATA_PATH
# environment variable to point elsewhere without editing the notebook; the
# original location is kept as the default.
DATA_PATH = os.environ.get(
    "CN_AWARE_DGE_DATA_PATH",
    "/Users/katsiarynadavydzenka/Documents/PhD_AI/CN-aware-DGE/Python/datasets/",
)

# Transpose on load: the filtering below (and the per-gene results further
# down) treat the columns of `counts_df` as genes.
counts_df = pd.read_csv(os.path.join(DATA_PATH, "rna_counts_cnv.csv"), index_col=0).T
metadata = pd.read_csv(os.path.join(DATA_PATH, "metadata.csv"), index_col=0)

# Keep only genes whose counts summed over all rows reach at least 10.
genes_to_keep = counts_df.columns[counts_df.sum(axis=0) >= 10]
counts_df = counts_df[genes_to_keep]
counts_df.shape

(20, 1000)

#### Model fit

In [4]:
def test_pydeseq2CN(counts_df, metadata, n_cpus=8):
    """Run the PyDESeq2 pipeline for condition B vs A and return the results.

    Fits a ``DeseqDataSet`` with the single design factor ``condition``, runs
    the statistical test for the contrast B vs A, prints the summary table,
    and then applies LFC shrinkage on ``condition_B_vs_A``.

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Count matrix passed as ``counts`` to ``DeseqDataSet``.
    metadata : pandas.DataFrame
        Sample metadata passed as ``metadata`` to ``DeseqDataSet``; must
        contain a ``condition`` column with levels ``A`` and ``B``.
    n_cpus : int, optional
        Number of CPUs given to the inference backend (default 8).

    Returns
    -------
    pandas.DataFrame
        ``stat_res.results_df`` as it stands after ``lfc_shrink`` has run.
    """
    # One inference backend, shared by model fitting and testing, so the CPU
    # setting applies to both stages.
    inference = DefaultInference(n_cpus=n_cpus)

    # Create dds object
    dds = DeseqDataSet(
        counts=counts_df,
        metadata=metadata,
        design_factors="condition",
        refit_cooks=False,
        inference=inference,
    )
    dds.deseq2()

    # Statistical test
    stat_res = DeseqStats(
        dds,
        contrast=["condition", "B", "A"],
        alpha=0.05,
        cooks_filter=False,
        independent_filter=True,
        prior_LFC_var=None,
        lfc_null=0,
        alt_hypothesis=None,
        inference=inference,
        quiet=False,
    )
    stat_res.summary()

    # LFC shrinkage (apeGLM); called after summary(), so the returned table
    # is read once shrinkage has been applied.
    stat_res.lfc_shrink(coeff="condition_B_vs_A")
    return stat_res.results_df

In [5]:
# Run the DESeq2 pipeline on the loaded counts; `res` is the results table
# returned after LFC shrinkage.
res = test_pydeseq2CN(counts_df, metadata)

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.11 seconds.

Fitting dispersion trend curve...
... done in 0.02 seconds.

Fitting MAP dispersions...
... done in 0.12 seconds.

Fitting LFCs...
... done in 0.08 seconds.

Running Wald tests...
... done in 0.06 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: condition B vs A
          baseMean  log2FoldChange     lfcSE      stat        pvalue      padj
G1     5483.345668       -0.125966  0.407328 -0.309249  7.571323e-01  0.815875
G2       53.273471       -0.355742  0.492492 -0.722331  4.700911e-01  0.571192
G3       38.940902        0.150874  0.526234  0.286705  7.743380e-01  0.825460
G4       43.165640       -1.013502  0.425428 -2.382314  1.720424e-02  0.042639
G5       79.037687       -0.239740  0.395948 -0.605484  5.448575e-01  0.636729
...            ...             ...       ...       ...           ...       ...
G996    448.120920        1.160320  0.324928  3.571007  3.556116e-04  0.002075
G997   1593.890327        0.648919  0.292454  2.218874  2.649529e-02  0.059010
G998    641.953647        1.538796  0.311258  4.943799  7.661450e-07  0.000018
G999    523.254006        0.307257  0.285283  1.077025  2.814691e-01  0.384521
G1000   882.841110        0.654288  0.317034  2.063779  3.903864e-02  0.0811

... done in 0.12 seconds.



#### Save results

In [84]:
# Directory where results are saved. Set the CN_AWARE_DGE_OUTPUT_PATH
# environment variable to choose another location without editing the
# notebook; the original location is kept as the default.
OUTPUT_PATH = os.environ.get(
    "CN_AWARE_DGE_OUTPUT_PATH",
    "/Users/katsiarynadavydzenka/Documents/PhD_AI/CN-aware-DGE/Python/results",
)
os.makedirs(OUTPUT_PATH, exist_ok=True)  # Create path if it doesn't exist
res.to_csv(os.path.join(OUTPUT_PATH, "res_sim_cnv.csv"))