#### Generate CN-corrected RNA counts matrix

In [138]:
import random

def create_matrix(rows, cols, min_val, max_val):
    """Return a ``rows`` x ``cols`` nested list of random integers.

    Every entry is drawn with ``random.randint(min_val, max_val)``, so both
    bounds are inclusive. Entries are generated row by row, left to right.
    """
    matrix = []
    for _ in range(rows):
        row = []
        for _ in range(cols):
            row.append(random.randint(min_val, max_val))
        matrix.append(row)
    return matrix

In [139]:
def rna_cnv_count_matrix(rna_counts, cnv_tumor, cnv_normal=None):
    """Scale an RNA count matrix by relative copy number (CN / 2).

    The copy-number matrix is assembled as ``[tumor | normal]`` along the
    sample axis, divided by 2, and multiplied element-wise with the counts.

    Parameters
    ----------
    rna_counts : pandas.DataFrame
        Counts with genes as rows (index) and samples as columns. Because the
        CN matrix is concatenated tumor-first, the columns are expected to be
        ordered tumor samples first, then normal samples.
    cnv_tumor : array-like
        2-D copy numbers for the tumor samples, one row per row of
        ``rna_counts``.
    cnv_normal : array-like, optional
        2-D copy numbers for the normal samples. When omitted, every sample
        not covered by ``cnv_tumor`` is assigned a copy number of 2, so its
        counts are left unscaled.

    Returns
    -------
    pandas.DataFrame
        Scaled counts, transposed: rows are the columns of ``rna_counts``
        (samples) and columns are its index values (named ``geneID``).

    Raises
    ------
    ValueError
        If ``cnv_tumor`` is not 2-D, has more columns than ``rna_counts``, or
        the assembled CN matrix does not match the shape of ``rna_counts``.
    """
    counts_mat = np.asarray(rna_counts)
    cnv_tumor_mat = np.asarray(cnv_tumor)
    if cnv_tumor_mat.ndim != 2:
        raise ValueError("cnv_tumor must be 2-dimensional (genes x tumor samples)")

    if cnv_normal is None:
        # Size the diploid block from the inputs instead of hard-coding it,
        # so the function works for any number of genes/normal samples.
        n_genes, n_samples = counts_mat.shape
        n_normal = n_samples - cnv_tumor_mat.shape[1]
        if n_normal < 0:
            raise ValueError(
                "cnv_tumor has more sample columns than rna_counts "
                f"({cnv_tumor_mat.shape[1]} > {n_samples})"
            )
        cnv_normal_mat = np.full((n_genes, n_normal), 2)
    else:
        cnv_normal_mat = np.asarray(cnv_normal)

    # Relative copy number: CN 2 maps to a scaling factor of 1.
    cnv = np.concatenate((cnv_tumor_mat, cnv_normal_mat), axis=1) / 2
    if cnv.shape != counts_mat.shape:
        raise ValueError(
            f"copy-number matrix shape {cnv.shape} does not match "
            f"rna_counts shape {counts_mat.shape}"
        )

    # Re-attach the gene and sample labels directly, then return samples x genes.
    rna_counts_cnv = pd.DataFrame(
        counts_mat * cnv,
        index=rna_counts.index.rename('geneID'),
        columns=rna_counts.columns,
    )
    return rna_counts_cnv.T

In [140]:
# Build the copy-number-scaled count matrix from the raw counts and the tumor
# copy numbers. The function returns it transposed relative to `rna_counts`.
rna_counts_cnv = rna_cnv_count_matrix(rna_counts, cnv_tumor)


#### Model fit

In [85]:
def test_pydeseq2CN(rna_counts, metadata):
    """Run the PyDESeq2 pipeline for condition B vs A and return the results table.

    Builds a ``DeseqDataSet`` from ``rna_counts`` and ``metadata`` with
    ``condition`` as the design factor, fits it, runs the statistical test
    for the contrast B vs A (alpha 0.05), prints the summary, applies LFC
    shrinkage on the ``condition_B_vs_A`` coefficient, and returns
    ``results_df``.
    """
    # Dataset construction and model fitting
    dataset = DeseqDataSet(
        counts=rna_counts,
        metadata=metadata,
        design_factors="condition",
        refit_cooks=False,
        n_cpus=8,
    )
    dataset.deseq2()

    # Statistical test settings, kept together for readability
    test_options = dict(
        contrast=['condition', 'B', 'A'],
        alpha=0.05,
        cooks_filter=False,
        independent_filter=True,
        prior_LFC_var=None,
        lfc_null=0,
        alt_hypothesis=None,
        inference=None,
        quiet=False,
    )
    wald = DeseqStats(dataset, **test_options)
    wald.summary()

    # LFC shrinkage (apeGLM), applied after the summary has been printed
    wald.lfc_shrink(coeff="condition_B_vs_A")
    return wald.results_df

In [86]:
# Fit the model and keep the results table.
# NOTE(review): this passes the raw `rna_counts`, not the `rna_counts_cnv`
# matrix built in the previous section — confirm which input is intended.
res = test_pydeseq2CN(rna_counts, metadata)

Fitting size factors...
... done in 0.04 seconds.

Fitting dispersions...
... done in 1.10 seconds.

Fitting dispersion trend curve...
... done in 0.74 seconds.

Fitting MAP dispersions...
... done in 1.31 seconds.

Fitting LFCs...
... done in 0.56 seconds.

Running Wald tests...
... done in 0.42 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: condition B vs A
            baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
G 1        47.724375       -0.146760  0.293935 -0.499296  0.617571  0.999868
G 2       363.551597       -0.067415  0.211498 -0.318749  0.749917  0.999868
G 3       879.641536       -0.087083  0.171480 -0.507833  0.611570  0.999868
G 4        24.362647        0.233736  0.268687  0.869921  0.384343  0.999868
G 5      3707.907244       -0.062938  0.192512 -0.326931  0.743720  0.999868
...              ...             ...       ...       ...       ...       ...
G 14996   594.658205       -0.197746  0.184121 -1.074002  0.282822  0.999868
G 14997  1132.731293        0.076383  0.139005  0.549498  0.582664  0.999868
G 14998    14.009224        0.401684  0.229216  1.752428  0.079700  0.999868
G 14999   265.627439        0.098208  0.148001  0.663565  0.506969  0.999868
G 15000   438.623532       -0.003224  0.089394 -0.036066  0.971230  0.999868

[15000 rows x 6 colu

... done in 2.19 seconds.



#### Save results

In [84]:
# Directory where the results are written; change it to your preferred location.
OUTPUT_PATH = "data_simulation/results/sim_4/"

# Make sure the directory exists before writing into it.
os.makedirs(OUTPUT_PATH, exist_ok=True)

results_csv = os.path.join(OUTPUT_PATH, "res_sim_cnv.csv")
res.to_csv(results_csv)