In [1]:
import pandas as pd
import random
import data_load, constants 

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

In [2]:
# Pick the gene whose mutation status is used as the design factor below.
# A dedicated, seeded generator makes the pick reproducible across
# "Restart & Run All"; change TARGET_GENE_SEED to sample a different gene.
TARGET_GENE_SEED = 0
target_gene = random.Random(TARGET_GENE_SEED).choice(constants.genes)

In [56]:
# Load the raw expression counts and the mutation (MAF) table.
expression_counts = data_load.load_txt_file(file_path=constants.expression_file_path)
mutation = data_load.load_maf_data(file_path=constants.maf_file_path)

# Discard genes with too few counts: keep rows whose total count exceeds 10.
gene_totals = expression_counts.sum(axis=1)
expression_counts = expression_counts.loc[gene_totals > 10]

# Keep only expression values from samples that also have DNA sequencing
# (around 177 samples). After the transpose the layout is sample x gene,
# and the counts are rounded to whole numbers.
sequenced_samples = mutation['sample'].unique()
expression_counts = expression_counts.T
expression_counts = expression_counts[expression_counts.index.isin(sequenced_samples)].round(0)

# Mutation status of every retained sample for target_gene; samples with no
# matching row in the mutation table get 0.
sample_ids = expression_counts.reset_index().rename(columns={'index': 'sample'})[['sample']]
target_gene_mutations = mutation[mutation['gene'] == target_gene]
metadata = (
    sample_ids
    .merge(target_gene_mutations, how='left')
    [['sample', 'mutation']]
    .fillna(0)
    .drop_duplicates()
    .set_index('sample')
)

# Differential-expression statistics (log fold change, Wald-test p-value) for
# the `mutation` design factor. The same inference object is handed to both
# the model fit and the statistics step.
inference = DefaultInference(n_cpus=8)

deseq_settings = dict(
    counts=expression_counts,
    metadata=metadata,
    design_factors="mutation",
    refit_cooks=True,
    inference=inference,
)
dds = DeseqDataSet(**deseq_settings)
dds.deseq2()

stat_res = DeseqStats(dds, inference=inference)
stat_res.summary()

# Why some entries of the results table can be NA (note kept from the
# original cell; see the DESeq2 documentation for details):
#   * Every sample in a row has zero counts: baseMean is zero and the log2
#     fold change, p-value and adjusted p-value are all NA.
#   * A row contains a sample with an extreme count outlier (detected by
#     Cook's distance): the p-value and adjusted p-value are NA.
#   * A row is removed by automatic independent filtering for having a low
#     mean normalized count: only the adjusted p-value is NA.
stats = stat_res.results_df

# Display the 100 genes with the smallest p-values.
stats.nsmallest(100, 'pvalue')


## Do clustering analysis

### TODO: normalize the counts and log-transform them before clustering

## Prepare data for GSEA

In [109]:
import pandas as pd
import numpy as np

# NOTE(review): `normalized_counts` is not created by any cell visible in this
# notebook, so this cell fails on a fresh kernel unless the cell that defines
# it is restored above. It is transposed here, so it is presumably laid out as
# sample x gene - TODO confirm and add the defining cell.

# Build the GSEA expression table: one row per gene, with the two leading
# annotation columns the GCT layout expects (NAME, then description) followed
# by one column per sample.
# rename_axis() guarantees the gene column is called 'NAME' even when the
# index already carries a name (reset_index() would otherwise keep that name
# and the rename from 'index' would silently do nothing).
gsea_expression_df = normalized_counts.T.rename_axis('NAME').reset_index()
gsea_expression_df.insert(1, 'description', 'NA')

gsea_expression_path = '/Users/meltemtutar/Documents/Huang/Respond/data/Expression_GSEA_input.gct'

# A GCT file starts with two header lines before the table itself:
#   line 1: the format version, "#1.2"
#   line 2: <number of gene rows> TAB <number of sample columns>
# Without them the file is not a valid .gct.
n_genes = len(gsea_expression_df)
n_samples = gsea_expression_df.shape[1] - 2  # exclude NAME and description
with open(gsea_expression_path, 'w', newline='') as gct_file:
    gct_file.write('#1.2\n')
    gct_file.write(f'{n_genes}\t{n_samples}\n')
    gsea_expression_df.to_csv(gct_file, index=False, sep='\t')



In [None]:
# Preview the GSEA expression table (pandas truncates the display of large frames).
# NOTE(review): the stored output below shows an 'Unnamed: 0' column and an empty
# description column, neither of which the preparation cell produces - the output
# looks stale (possibly from a frame re-read from disk); re-run to refresh it.
gsea_expression_df

Unnamed: 0,NAME,description,RESPOND_10100291,RESPOND_10100412,RESPOND_10100478,RESPOND_10100801,RESPOND_10100884,RESPOND_10100899,RESPOND_10100952,RESPOND_10101096,...,RESPOND_70101433,RESPOND_70102184,RESPOND_80100170,RESPOND_80100242,RESPOND_80100345,RESPOND_80100411,RESPOND_80100556,RESPOND_80100590,RESPOND_40100842,RESPOND_80100259
0,A1BG,,0.922267,0.744255,2.904442,3.849974,2.630918,2.209470,0.222168,1.487147,...,0.000000,0.000000,3.062958,0.963742,0.171112,0.481771,0.000000,4.641921,2.688290,4.002183
1,A1CF,,1.461759,0.610870,0.560596,0.000000,3.359655,3.622954,0.657201,0.938285,...,0.000000,2.028393,0.472500,0.334940,0.059247,0.000000,0.000000,0.955525,0.352507,0.266014
2,A2M,,6.217436,8.080700,7.899957,7.542932,6.990621,6.832165,7.729413,7.178869,...,7.597364,6.678447,9.351151,5.645583,8.887803,6.188094,8.009205,7.254780,7.920655,5.669277
3,A2ML1,,1.844535,0.000000,1.534477,1.011194,0.563954,2.854701,0.834710,0.938285,...,0.490286,0.519028,1.531479,1.527495,0.901662,0.976063,0.415067,1.911049,1.517741,0.959773
4,A3GALT2,,0.000000,0.000000,0.000000,0.000000,0.000000,0.908231,0.000000,0.298643,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19994,ZXDC,,5.847037,5.224781,5.718740,6.020848,5.800806,5.980982,5.073435,5.914745,...,4.963555,4.308234,5.819147,6.057506,5.757914,6.288720,5.294890,6.057883,5.946099,6.167758
19995,ZYG11A,,2.384027,0.414401,0.968147,1.602705,0.463030,0.672965,0.973890,0.938285,...,0.775231,1.607464,2.243576,1.527495,0.789501,0.976063,2.048343,0.759243,1.915176,3.440752
19996,ZYG11B,,3.769734,3.665983,3.782445,3.033582,3.568070,3.717668,4.209083,2.634100,...,3.527511,5.468870,4.176088,3.201482,4.416694,3.242411,5.025498,3.822099,3.181039,3.551584
19997,ZYX,,6.166742,7.339035,7.880284,7.855141,6.700399,7.287302,7.571788,7.396168,...,8.077892,6.216298,6.696864,7.625984,6.775296,6.320725,7.291017,7.578015,6.468400,8.548597


In [102]:
# Per-sample class labels for GSEA. This is the only GSEA input that depends
# on target_gene.
# The sample IDs are the expression table's column labels minus the two
# annotation columns. Reading them from `.columns` keeps the labels in the
# same order as the expression columns without transposing the whole matrix.
gsea_sample_ids = gsea_expression_df.columns.drop(['NAME', 'description'])
gsea_mutation_df = (
    pd.DataFrame({'sample': gsea_sample_ids})
    .merge(mutation[mutation['gene'] == target_gene], on='sample', how='left')
    [['sample', 'mutation']]
    .fillna(0)  # samples with no row for target_gene get label 0
    .drop_duplicates()
)

# The class file needs exactly one label per expression column. A sample that
# still appears twice here has conflicting `mutation` values for target_gene
# and would shift every label after it.
assert gsea_mutation_df['sample'].is_unique, (
    f"samples with more than one distinct mutation label for {target_gene}: "
    f"{gsea_mutation_df.loc[gsea_mutation_df['sample'].duplicated(), 'sample'].tolist()}"
)

gsea_mutation_df.T

In [107]:
# Write the phenotype labels as a categorical CLS file. The format has three lines:
#   line 1: <number of samples> <number of classes> 1
#   line 2: "#" followed by the class names, in order of first appearance
#   line 3: one class label per sample, in expression-column order
# Writing only the label line (as a bare to_csv would) is not a valid .cls file.
gsea_cls_path = '/Users/meltemtutar/Documents/Huang/Respond/data/mutation_GSEA_input.cls'

gsea_class_labels = gsea_mutation_df['mutation'].astype(str).tolist()
# dict.fromkeys() de-duplicates while preserving first-appearance order.
gsea_class_names = list(dict.fromkeys(gsea_class_labels))

with open(gsea_cls_path, 'w') as cls_file:
    cls_file.write(f'{len(gsea_class_labels)} {len(gsea_class_names)} 1\n')
    cls_file.write('# ' + ' '.join(gsea_class_names) + '\n')
    cls_file.write('\t'.join(gsea_class_labels) + '\n')

In [110]:
# Show which gene was selected for this run, so the exported GSEA inputs can be
# traced back to it.
target_gene


'CSMD2'