# Imports and options

In [14]:
import os

import numpy, seaborn, pandas

import pybiomart
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

import matplotlib, matplotlib.pyplot as plt

In [2]:
# Figure export option: set the SVG font type to 'none'
# (presumably so text stays editable in exported SVGs — confirm against the matplotlib docs).
matplotlib.rcParams['svg.fonttype'] = 'none'
# Further rcParams that were tried and are currently disabled:
#'font.size':20, 'xtick.labelsize':20, 'ytick.labelsize':20,'axes.grid' : True, 'grid.alpha': 0.5, 'grid.linestyle' : ':',                            'figure.figsize':(8, 5), 

In [3]:
# Project root directory. Every input/output path in this notebook is built by plain
# string concatenation onto it, so the value must end with a path separator.
# Set the HYPOTHERMIA_PROJECT_DIR environment variable to run the notebook on another
# machine without editing this cell; the default is the original author's location.
file_directory = os.path.join(
    os.environ.get(
        "HYPOTHERMIA_PROJECT_DIR",
        "/Users/kja11/OneDrive - Menntaský/PostDoc_Hypothermia/in_silico/Python/",
    ),
    "",  # joining with "" appends a trailing separator only when one is missing
)

# Data Download

In [4]:
# Load the HEK293 temperature count table (tab-separated): one row per gene, with
# gene_id, gene_name and one count column per sample (three 37° and three 32° replicates).
df = pandas.read_csv(
    file_directory + '1) input/RNAseq/DESeq_HEK293_temp_v2.tsv',
    sep='\t',
)
# Quick sanity check of the table dimensions and column names.
print(df.shape, '\n', df.columns)
df

(22752, 8) 
 Index(['gene_id', 'gene_name', 's37rep1_sum', 's37rep2_sum', 's37rep3_sum',
       's32rep1_sum', 's32rep2_sum', 's32rep3_sum'],
      dtype='object')


Unnamed: 0,gene_id,gene_name,s37rep1_sum,s37rep2_sum,s37rep3_sum,s32rep1_sum,s32rep2_sum,s32rep3_sum
0,ENSG00000000003,TSPAN6,1669,1746,1154,1319,1492,1265
1,ENSG00000000005,TNMD,3,0,0,1,0,2
2,ENSG00000000419,DPM1,5097,5407,3552,4640,5216,4412
3,ENSG00000000457,SCYL3,158,173,136,113,97,145
4,ENSG00000000460,C1orf112,875,850,650,481,635,618
...,...,...,...,...,...,...,...,...
22747,ENSG00000292361,CD99P1,30,23,36,37,22,36
22748,ENSG00000292366,VAMP7,568,459,279,459,412,303
22749,ENSG00000292371,DDX11L16,1,0,0,0,0,0
22750,ENSG00000292372,WASH6P,486,555,515,528,464,305


In [5]:
# Query Ensembl BioMart (human gene dataset) for the gene symbol, Ensembl gene ID,
# biotype and description of every gene.
dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
biomart_attributes = ['external_gene_name', 'ensembl_gene_id', 'gene_biotype', 'description']

# BioMart returns display names as column headers; rename the two used later so they
# match the count table, and index the annotation by Ensembl gene ID.
annotation = (
    dataset.query(attributes=biomart_attributes)
    .rename(columns={'Gene stable ID': 'gene_id', 'Gene name': 'gene_name'})
    .set_index('gene_id')
)
annotation.head(2)

Unnamed: 0_level_0,gene_name,Gene type,Gene description
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000210049,MT-TF,Mt_tRNA,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...
ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...


## Data transformation

In [6]:
# Index the count table by Ensembl gene ID and discard the gene symbol column,
# leaving only the per-sample count columns.
df = df.set_index('gene_id', drop=True).drop(columns=['gene_name'])
df.head()

Unnamed: 0_level_0,s37rep1_sum,s37rep2_sum,s37rep3_sum,s32rep1_sum,s32rep2_sum,s32rep3_sum
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003,1669,1746,1154,1319,1492,1265
ENSG00000000005,3,0,0,1,0,2
ENSG00000000419,5097,5407,3552,4640,5216,4412
ENSG00000000457,158,173,136,113,97,145
ENSG00000000460,875,850,650,481,635,618


In [7]:
# Keep only genes whose highest count across the samples is at least 10;
# everything else is treated as not expressed.
# NOTE: this cell overwrites and then transposes `df`, so it must be run exactly once
# after the previous cell.
full_leng = df.shape[0]
is_expressed = df.max(axis=1).ge(10)
df = df.loc[is_expressed]

filtr_leng = df.shape[0]
dropped_transcripts = full_leng - filtr_leng
print(f'On {full_leng}, {dropped_transcripts} transcripts do not exceed 10 counts')

# Flip to samples (rows) x genes (columns), the orientation passed to DeseqDataSet below.
df = df.transpose()
df

On 22752, 7429 transcripts do not exceed 10 counts


gene_id,ENSG00000000003,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000292343,ENSG00000292344,ENSG00000292345,ENSG00000292348,ENSG00000292357,ENSG00000292358,ENSG00000292361,ENSG00000292366,ENSG00000292372,ENSG00000292373
s37rep1_sum,1669,5097,158,875,103,255,223,190,469,547,...,124,560,292,9385,1,178,30,568,486,6
s37rep2_sum,1746,5407,173,850,74,269,312,167,318,586,...,124,549,315,9491,11,205,23,459,555,0
s37rep3_sum,1154,3552,136,650,27,277,166,136,475,468,...,119,683,193,7060,6,132,36,279,515,0
s32rep1_sum,1319,4640,113,481,100,238,193,91,268,333,...,55,297,273,8885,0,165,37,459,528,13
s32rep2_sum,1492,5216,97,635,91,245,315,71,297,436,...,118,395,244,8652,8,290,22,412,464,10
s32rep3_sum,1265,4412,145,618,78,157,140,54,297,398,...,59,368,141,5871,3,74,36,303,305,0


### DESeq2

In [8]:
# Prepare the DESeq2 sample metadata: one row per sample, indexed by sample name, with
# its temperature condition. The condition labels are listed by hand in the row order
# of `df` (three 37° replicates followed by three 32° replicates).
sample_conditions = list(zip(df.index, ['37°','37°','37°','32°', '32°', '32°']))
metadata = pandas.DataFrame.from_records(
    sample_conditions,
    columns=['Sample', 'Condition'],
    index='Sample',
)
metadata

Unnamed: 0_level_0,Condition
Sample,Unnamed: 1_level_1
s37rep1_sum,37°
s37rep2_sum,37°
s37rep3_sum,37°
s32rep1_sum,32°
s32rep2_sum,32°
s32rep3_sum,32°


In [9]:
%%time
# Build the DESeq2 dataset from the samples-by-genes count table `df` and the
# per-sample `metadata`, with "Condition" as the single design factor.
dds = DeseqDataSet(counts = df,
                   metadata = metadata,
                   design_factors = "Condition")
print(dds)

# Run the DESeq2 fitting pipeline (the cell output logs size factors, dispersions and LFC fitting).
dds.deseq2()

# Set up the 32° vs 37° comparison. The contrast is (factor, tested level, reference level):
# the LAST element (37°) is the baseline, matching the "Condition 32° vs 37°" header
# printed by summary(). This only creates the DeseqStats object; no results are shown here.
stat_res = DeseqStats(dds, contrast = ('Condition', '32°', '37°'))

AnnData object with n_obs × n_vars = 6 × 15323
    obs: 'Condition'
    obsm: 'design_matrix'


Fitting size factors...
... done in 0.01 seconds.

Fitting dispersions...
... done in 4.51 seconds.

Fitting dispersion trend curve...
... done in 0.71 seconds.

Fitting MAP dispersions...
... done in 4.63 seconds.

Fitting LFCs...


CPU times: total: 6.86 s
Wall time: 21 s


... done in 2.52 seconds.

Refitting 0 outliers.



In [10]:
# Run the Wald tests and print the results table (see this cell's output).
stat_res.summary()

# Keep the per-gene results DataFrame (baseMean, log2FoldChange, lfcSE, stat, pvalue, padj)
# for annotation and filtering in the following cells.
res = stat_res.results_df
res

Running Wald tests...


Log2 fold change & Wald test p-value: Condition 32° vs 37°
                    baseMean  log2FoldChange     lfcSE      stat    pvalue  \
gene_id                                                                      
ENSG00000000003  1417.578539        0.236871  0.190655  1.242407  0.214086   
ENSG00000000419  4685.997108        0.422995  0.183466  2.305574  0.021134   
ENSG00000000457   135.788628        0.025567  0.324710  0.078738  0.937241   
ENSG00000000460   669.515399       -0.044322  0.230353 -0.192407  0.847423   
ENSG00000000971    78.489575        0.838610  0.413537  2.027894  0.042571   
...                      ...             ...       ...       ...       ...   
ENSG00000292358   168.216188        0.370319  0.374895  0.987792  0.323255   
ENSG00000292361    31.402552        0.477819  0.528571  0.903982  0.366005   
ENSG00000292366   402.038279        0.248861  0.244840  1.016422  0.309428   
ENSG00000292372   466.011302        0.087336  0.236827  0.368775  0.712295   
ENSG0

... done in 1.34 seconds.



Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003,1417.578539,0.236871,0.190655,1.242407,0.214086,0.729350
ENSG00000000419,4685.997108,0.422995,0.183466,2.305574,0.021134,0.357952
ENSG00000000457,135.788628,0.025567,0.324710,0.078738,0.937241,0.987078
ENSG00000000460,669.515399,-0.044322,0.230353,-0.192407,0.847423,0.966125
ENSG00000000971,78.489575,0.838610,0.413537,2.027894,0.042571,0.447272
...,...,...,...,...,...,...
ENSG00000292358,168.216188,0.370319,0.374895,0.987792,0.323255,0.806471
ENSG00000292361,31.402552,0.477819,0.528571,0.903982,0.366005,0.827400
ENSG00000292366,402.038279,0.248861,0.244840,1.016422,0.309428,0.797346
ENSG00000292372,466.011302,0.087336,0.236827,0.368775,0.712295,0.935807


In [11]:
# Attach the BioMart annotation to the DESeq2 results, matching on the gene_id index.
# how='right' keeps every result row, so genes missing from the annotation stay in
# the table with NaN annotation columns.
df_anno = annotation.merge(res, how='right', left_index=True, right_index=True)
print(df_anno.shape)
df_anno.head(3)

(15323, 9)


Unnamed: 0_level_0,gene_name,Gene type,Gene description,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSG00000000003,TSPAN6,protein_coding,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],1417.578539,0.236871,0.190655,1.242407,0.214086,0.72935
ENSG00000000419,DPM1,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,4685.997108,0.422995,0.183466,2.305574,0.021134,0.357952
ENSG00000000457,SCYL3,protein_coding,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,135.788628,0.025567,0.32471,0.078738,0.937241,0.987078


In [13]:
# Write the full annotated results table (all tested genes) to a comma-separated file.
annotated_results_path = file_directory + '1) input/RNAseq/from_output/DEGs_HEK293_temp_annotated.csv'
df_anno.to_csv(annotated_results_path, sep=',')

### Significant ones

In [None]:
# Select the significant genes: adjusted p-value strictly below 0.05
# (rows with a missing padj never pass the comparison and are excluded).
is_significant = df_anno['padj'].lt(0.05)
df_signif = df_anno.loc[is_significant]
print(df_signif.shape)
df_signif[['log2FoldChange','padj','gene_name']].head()

In [None]:
# Export gene-symbol lists (one symbol per line) for the significant genes.

# NOTE(review): `top_15` and `topneg_5` were not defined by any earlier cell and were
# indexed with a 'Symbol' column that df_anno does not have (its symbol column is
# 'gene_name'). They are derived here as the significant genes with the largest
# positive / most negative log2FoldChange — confirm this is the intended ranking.
top_15 = df_signif[df_signif['log2FoldChange'] > 0].nlargest(15, 'log2FoldChange')
topneg_5 = df_signif[df_signif['log2FoldChange'] < 0].nsmallest(5, 'log2FoldChange')

# Output file name -> table whose gene symbols are written to it.
gene_lists = {
    "allsignif_genes_HEK293.txt": df_signif,       # all significant genes
    "top15_genes_HEK293.txt": top_15,              # positive fold changes
    "top5_negative_genes_HEK293.txt": topneg_5,    # negative fold changes
}

for file_name, frame in gene_lists.items():
    # Genes absent from the annotation have NaN as gene_name (the merge keeps every
    # result row); drop them so the lists do not contain a literal "nan" entry.
    symbols = frame['gene_name'].dropna().unique()
    numpy.savetxt(file_directory + "3) output/" + file_name,
                  symbols, delimiter="\t", fmt="% s")

print(top_15[['log2FoldChange','padj','gene_name']].head(), '\n','\n',  topneg_5[['log2FoldChange','padj','gene_name']])