-
Notifications
You must be signed in to change notification settings - Fork 1
/
biojupies.py
86 lines (74 loc) · 2.42 KB
/
biojupies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#%%[markdown]
# Here we walk through building a biojupies notebook using methods in this library for the purpose of integration testing.
#%%
import sys; sys.path.insert(0, '..')
import os
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import plotly.express as px
from maayanlab_bioinformatics.normalization import filter_by_var, cpm_normalize, zscore_normalize, log10_normalize
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
from maayanlab_bioinformatics.utils import merge
os.environ['R_LIBS_USER'] = '/home/u8sand/.r_libs'
#%%
# biojupies settings
pca_top_genes = 2500
normalization = 'logCPM'
zscore = True
pval_thresh = 0.05
logFC_thresh = 1.5
enrichr_geneset_size = 500
sort_genes_by = 't'
#%%
# ## Load Dataset
df_data = pd.read_csv('../tests/test_example_matrix.txt', sep='\t', index_col=0)
display(df_data.head())
df_metadata = pd.read_csv('../tests/test_example_metadata.txt', sep='\t', index_col=0)
display(df_metadata.head())
#%%
# Biojupies selection
normal = df_data.loc[:, df_metadata['cell type'] == 'normal melanocytes']
perturbation = df_data.loc[:, df_metadata['cell type'] == 'melanoma cell line']
#%%
# ## PCA
# Principal Component Analysis was performed using the PCA function from in the sklearn Python module.
# Prior to performing PCA, the raw gene counts were normalized using the logCPM method
df_data_norm_for_pca = log10_normalize(cpm_normalize(df_data))
# filtered by selecting the 2500 genes with most variable expression
df_data_norm_for_pca = filter_by_var(df_data_norm_for_pca, top_n=2500)
# and finally transformed using the Z-score method.
df_data_norm_for_pca = zscore_normalize(df_data_norm_for_pca)
pca = PCA()
pca.fit(df_data_norm_for_pca.T.values)
df_pca = pd.DataFrame(
pca.transform(df_data_norm_for_pca.T.values),
index=df_data_norm_for_pca.columns,
columns=[
f"PC{n}({var*100:2.1f}% var. explained)"
for n, var in enumerate(pca.explained_variance_ratio_)
]
)
px.scatter_3d(
merge(df_pca, df_metadata),
x=df_pca.columns[0],
y=df_pca.columns[1],
z=df_pca.columns[2],
color='cell type',
hover_data=[df_pca.index],
)
#%%
# ## clustergrammer
#%%
# ## library size analysis
#%%
# ## Differential Expression Table
dge_table = limma_voom_differential_expression(
normal,
perturbation,
filter_genes=True,
voom_design=False,
)
dge_table.sort_values('P.Value')