In [None]:
## importing hail
import hail as hl

## make a toy matrix
def generate_gene_matrix(ngenes, nsamples, seed=None):
    """Build a toy genes-by-samples MatrixTable filled with random expression values.

    Parameters
    ----------
    ngenes : int
        Number of rows; each row gets a ``gene`` field ``"gene_<row_idx>"``.
    nsamples : int
        Number of columns; each column gets a ``sample`` field ``"sample_<col_idx>"``.
    seed : int, optional
        Seed forwarded to ``hl.rand_unif``. The default (``None``) keeps the
        original unseeded behaviour; pass an integer to make the toy data
        reproducible between runs.

    Returns
    -------
    hl.MatrixTable
        The range matrix table annotated with ``gene``, ``sample`` and an
        ``expression`` entry field drawn uniformly between 0 and 1.
    """
    mt = hl.utils.range_matrix_table(ngenes, nsamples)
    mt = mt.annotate_rows(gene=hl.literal("gene_") + hl.str(mt['row_idx']))
    mt = mt.annotate_cols(sample=hl.literal("sample_") + hl.str(mt['col_idx']))
    # Random entries: seeded only when the caller asks for reproducibility.
    mt = mt.annotate_entries(expression=hl.rand_unif(0, 1, seed=seed))
    return mt

# Dimensions of the toy matrix.
ngenes = 1000
nsamples = 50

# Build the matrix, then flatten it to one row per (gene, sample) entry for display.
toy_mt = generate_gene_matrix(ngenes, nsamples)
t = toy_mt.entries()
t.show()

In [None]:
## read in the gene list
# NOTE(review): absolute local path -- consider a configurable data directory.
scz_genelist_path = "/Users/andrea/Desktop/SCZ_genelist.txt"
# The file has no header row; column types are imputed from the data.
SCZ_genelist = hl.import_table(
    scz_genelist_path,
    impute=True,
    no_header=True,
)
SCZ_genelist.show()

In [None]:
## read in the meta data
# NOTE(review): absolute local path -- consider a configurable data directory.
sample_meta_path = "/Users/andrea/Desktop/GTEx_v7_tissues.txt"
# The first line is used as the header; column types are imputed from the data.
sample_meta = hl.import_table(
    sample_meta_path,
    impute=True,
)
sample_meta.show()

In [None]:
## read in the big data set
# NOTE(review): absolute local path -- consider a configurable data directory.
gtex_rpkm_path = "/Users/andrea/Desktop/GTEx_v7_rpkm.txt"
# `Name` and `Description` are read as string row fields (rows keyed by `Name`);
# the entry values are parsed as floats.
gtex_row_fields = {'Name': hl.tstr, 'Description': hl.tstr}
df = hl.import_matrix_table(
    gtex_rpkm_path,
    row_fields=gtex_row_fields,
    row_key="Name",
    entry_type=hl.tfloat,
)
df.count()

In [None]:
## save the big data table to speed things up
# Write the MatrixTable only when no on-disk copy exists yet. An unconditional
# `df.write(...)` fails once the destination is present, so the notebook could
# not be re-run from a fresh kernel; re-running this cell in a live kernel would
# also try to write `df` onto the very path it had just been read from.
# NOTE: remove the directory by hand to refresh the cache after the input file
# changes, or if an earlier write was interrupted.
gtex_mt_path = '/Users/andrea/Desktop/HAIL TSPEC/gtex_v7.mt'
if not hl.hadoop_exists(gtex_mt_path):
    df.write(gtex_mt_path)
# From here on `df` is backed by the native Hail format instead of the text file.
df = hl.read_matrix_table(gtex_mt_path)

In [None]:
## add a row annotation of mean expressions
# Per-row mean of the entry field `x` (aggregated over the columns of each row).
row_mean_expression = hl.agg.mean(df.x)
df_result = df.annotate_rows(mean_exp=row_mean_expression)
df_result.row.show()

In [None]:
## make a gene table
# Row-level view of the annotated matrix: one record per row of `df_result`,
# carrying the row fields including `mean_exp`.
gene_table = df_result.rows()
# Force evaluation of the table (private Hail API) and display the count.
gene_table._force_count()

In [None]:
## sort the gene table by mean expression, lowest first
ascending_mean = hl.asc(gene_table.mean_exp)
gene_table = gene_table.order_by(ascending_mean)
gene_table.show()

In [None]:
# Attach a running row number in field `idx`, following the current table order.
gene_table = gene_table.add_index(name='idx')
gene_table.count()

In [None]:
# Pull the single column of the gene list file into a local Python list.
genelist = SCZ_genelist.f0.collect()

# Keep only the genes whose `Description` is NOT in the list
# (presumably the background pool for permutations -- confirm).
listed_genes = hl.set(genelist)
is_listed = listed_genes.contains(gene_table.Description)
gene_table_perm = gene_table.filter(~is_listed)
gene_table_perm.count()

In [None]:
# Keep only the genes whose `Description` IS in the gene list.
listed_genes = hl.set(genelist)
is_listed = listed_genes.contains(gene_table.Description)
gene_table_list = gene_table.filter(is_listed)
gene_table_list.count()

In [None]:
# Display the first rows of the genes that matched the gene list.
gene_table_list.show()