In [1]:
import hail as hl
import hail.expr.aggregators as agg
hl.init()

Running on Apache Spark version 2.2.0
SparkUI available at http://10.1.6.190:4043
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version devel-a1d6ecc
NOTE: This is a beta version. Interfaces may change
  during the beta period. We also recommend pulling
  the latest changes weekly.


In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from math import log, isnan
from pprint import pprint
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.models import Span
output_notebook()

# Read Data

## SCZ Genelist

In [3]:
genelist = hl.import_table("HailData/SCZ_genelist.txt", no_header=True, impute=True)

2018-04-30 16:12:11 Hail: INFO: Reading table to impute column types
2018-04-30 16:12:11 Hail: INFO: Finished type imputation
  Loading column 'f0' as type 'str' (imputed)


In [4]:
genelist.show(5)
#genelist.describe()
print(genelist.count())

+-------+
| f0    |
+-------+
| str   |
+-------+
| CNTN3 |
| DLG2  |
| MARK4 |
| MYH7B |
| SCN2A |
+-------+
showing top 5 rows

205


In [5]:
genelist = genelist.f0.collect()

## Metadata on the Samples

In [6]:
sample_meta = hl.import_table("HailData/GTEx_v7_tissues.txt", impute=True)

2018-04-30 16:12:12 Hail: INFO: Reading table to impute column types
2018-04-30 16:12:13 Hail: INFO: Finished type imputation
  Loading column 'SAMPID' as type 'str' (imputed)
  Loading column 'SMATSSCR' as type 'str' (imputed)
  Loading column 'SMTS' as type 'str' (imputed)
  Loading column 'SMTSD' as type 'str' (imputed)


In [7]:
sample_meta.show()

+------------------------------+----------+----------------+
| SAMPID                       | SMATSSCR | SMTS           |
+------------------------------+----------+----------------+
| str                          | str      | str            |
+------------------------------+----------+----------------+
| GTEX-S4Q7-0003-SM-3NM8M      |          | Blood          |
| GTEX-1B98T-1026-SM-79OL8     | 1        | Salivary Gland |
| GTEX-17F98-1726-SM-7938O     | 1        | Esophagus      |
| GTEX-QV31-1626-SM-2S1QC      | 0        | Blood Vessel   |
| GTEX-13QIC-0011-R1a-SM-5O9CJ |          | Brain          |
| GTEX-15EU6-1726-SM-6M48H     | 1        | Adipose Tissue |
| GTEX-X5EB-0008-SM-46MU3      |          | Skin           |
| GTEX-Y3IK-0004-SM-6UH1D      |          | Blood          |
| GTEX-11ZU8-0001-SM-58Q8J     |          | Blood          |
| GTEX-P4PQ-0126-SM-2S1NM      | 1        | Skin           |
+------------------------------+----------+----------------+

+----------------------

In [8]:
sample_meta.count()

15750

## GTEx Data

In [9]:
df = hl.import_matrix_table("HailData/GTEx_Analysis_v7_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct",
                           row_fields={'Name':hl.tstr, 'Description':hl.tstr},
                           entry_type=hl.tfloat,
                           row_key='Description')

2018-04-30 16:12:57 Hail: INFO: Ordering unsorted dataset with network shuffle


In [10]:
df.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    'col_id': str 
----------------------------------------
Row fields:
    'Name': str 
    'Description': str 
----------------------------------------
Entry fields:
    'x': float64 
----------------------------------------
Column key:
    'col_id': str 
Row key:
    'Description': str 
Partition key:
    'Description': str 
----------------------------------------


In [11]:
#df = df.key_rows_by('Description')

In [12]:
df.count()

(56202, 11726)

In [13]:
#df.Name.show(5)

In [14]:
df.Description.show(5)

+-------------+
| Description |
+-------------+
| str         |
+-------------+
| 5S_rRNA     |
| 5S_rRNA     |
| 5S_rRNA     |
| 5S_rRNA     |
| 5S_rRNA     |
+-------------+
showing top 5 rows



In [15]:
df.x.show(5)

+-------------+--------------------------+-------------+
| Description | col_id                   |           x |
+-------------+--------------------------+-------------+
| str         | str                      |     float64 |
+-------------+--------------------------+-------------+
| 5S_rRNA     | GTEX-1117F-0226-SM-5GZZ7 | 0.00000e+00 |
| 5S_rRNA     | GTEX-111CU-1826-SM-5GZYN | 0.00000e+00 |
| 5S_rRNA     | GTEX-111FC-0226-SM-5N9B8 | 1.16945e-01 |
| 5S_rRNA     | GTEX-111VG-2326-SM-5N9BK | 0.00000e+00 |
| 5S_rRNA     | GTEX-111YS-2426-SM-5GZZQ | 3.32410e-01 |
+-------------+--------------------------+-------------+
showing top 5 rows



In [16]:
# genelist = genelist.f0.collect()
SCZdf = df.filter_rows(hl.set(genelist).contains(df.Description))

In [17]:
SCZdf.count()

(203, 11726)

In [18]:
new_genelist = SCZdf.Description.collect()

In [19]:
set(genelist)-set(new_genelist)

{'AKD1', 'FAM5C', 'MLL3'}

In [20]:
print(len(new_genelist))
print(len(set(new_genelist)))
# there is a duplicate gene in the GTEx dataset

203
202


In [21]:
import collections
print([item for item, count in collections.Counter(new_genelist).items() if count>1])

['CRYBG3']


In [22]:
SCZdf.filter_rows(SCZdf.Description=='CRYBG3').rows().show()

+-------------------+-------------+
| Name              | Description |
+-------------------+-------------+
| str               | str         |
+-------------------+-------------+
| ENSG00000233280.2 | CRYBG3      |
| ENSG00000080200.5 | CRYBG3      |
+-------------------+-------------+



In [23]:
# throw an error for the duplicate

In [24]:
SCZdf.write('SCZdf.mt', overwrite=True)
SCZdf = hl.read_matrix_table('SCZdf.mt')

2018-04-30 16:14:30 Hail: INFO: wrote 203 items in 198 partitions


## Toy Data

In [25]:
#def generate_gene_matrix(ngenes, nsamples):
#    mt = hl.range_matrix_table(ngenes, nsamples)
#    mt = mt.annotate_rows(gene=hl.literal("gene_") + hl.str(mt['row_idx']))
#    mt = mt.annotate_cols(sample=hl.literal("sample_") + hl.str(mt['row_idx']))
#    mt = mt.annotate_entries(expression=hl.rand_unif(0, 1))
#    return mt

In [26]:
#generate_gene_matrix(ngenes, nsamples).entries()

# Compute Statistic

# Permutation

## Find "neighborhoods"

In [27]:
SCZdf_nghbr = SCZdf.annotate_rows(neighborhood = ['TTN', 'DLG2', 'MARK4'])

## Draw Permutation Genes

In [28]:
def gene_perm(df=SCZdf_nghbr):
    df = df.annotate_rows(perm_gene = df.neighborhood[hl.int(hl.floor(hl.rand_unif(0,hl.len(df.neighborhood))))])
    return df

In [29]:
one_perm = gene_perm(SCZdf_nghbr)
#one_perm.show()

In [30]:
one_perm.count()

(203, 11726)

## Reformat the Data Matrix

In [31]:
P = 100

In [32]:
SCZdf_nghbr_perm = SCZdf_nghbr.annotate_rows(permutation = hl.range(0,P))

In [33]:
SCZdf_nghbr_perm_expl = SCZdf_nghbr_perm.explode_rows(SCZdf_nghbr_perm.permutation)

In [34]:
SCZdf_nghbr_perm_expl.row.show()

+--------------------+-------------+------------------------+-------------+
| Name               | Description | neighborhood           | permutation |
+--------------------+-------------+------------------------+-------------+
| str                | str         | array<str>             |       int32 |
+--------------------+-------------+------------------------+-------------+
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           0 |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           1 |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           2 |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           3 |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           4 |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           5 |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           6 |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           7 |
| ENSG000000

In [35]:
perm_expl = gene_perm(SCZdf_nghbr_perm_expl)

In [36]:
perm_expl.count()

(20300, 11726)

In [37]:
perm_expl.row.show()

+--------------------+-------------+------------------------+-------------+-----------+
| Name               | Description | neighborhood           | permutation | perm_gene |
+--------------------+-------------+------------------------+-------------+-----------+
| str                | str         | array<str>             |       int32 | str       |
+--------------------+-------------+------------------------+-------------+-----------+
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           0 | DLG2      |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           1 | MARK4     |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           2 | TTN       |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           3 | DLG2      |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           4 | TTN       |
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           5 | DLG2      |
| ENSG00000005471.11 | ABCB4    

# Compute Statistic on Permuted Matrix

In [38]:
perm_expl_sort = perm_expl.key_rows_by(perm_expl.permutation)

2018-04-30 16:14:40 Hail: INFO: Ordering unsorted dataset with network shuffle


In [39]:
perm_expl_sort.row.show()

+--------------------+-------------+------------------------+-------------+-----------+
| Name               | Description | neighborhood           | permutation | perm_gene |
+--------------------+-------------+------------------------+-------------+-----------+
| str                | str         | array<str>             |       int32 | str       |
+--------------------+-------------+------------------------+-------------+-----------+
| ENSG00000005471.11 | ABCB4       | ["TTN","DLG2","MARK4"] |           0 | TTN       |
| ENSG00000138107.7  | ACTR1A      | ["TTN","DLG2","MARK4"] |           0 | TTN       |
| ENSG00000166971.12 | AKTIP       | ["TTN","DLG2","MARK4"] |           0 | DLG2      |
| ENSG00000116127.13 | ALMS1       | ["TTN","DLG2","MARK4"] |           0 | TTN       |
| ENSG00000178038.12 | ALS2CL      | ["TTN","DLG2","MARK4"] |           0 | MARK4     |
| ENSG00000181754.6  | AMIGO1      | ["TTN","DLG2","MARK4"] |           0 | DLG2      |
| ENSG00000126016.9  | AMOT     

In [42]:
df.rows().show()

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/agoeva/Downloads/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1035, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/agoeva/Downloads/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 883, in send_command
    response = connection.send_command(command)
  File "/Users/agoeva/Downloads/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1040, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling o82.rowsTable

In [40]:
perm_for_test_stat = perm_expl_sort.annotate_entries(x=df[perm_expl_sort.perm_gene, perm_expl_sort.col_id].x)

2018-04-30 16:15:03 Hail: INFO: Ordering unsorted dataset with network shuffle


FatalError: OutOfMemoryError: Java heap space

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 44.0 failed 1 times, most recent failure: Lost task 1.0 in stage 44.0 (TID 2594, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
	at is.hail.annotations.Region.read(Region.scala:292)
	at com.esotericsoftware.kryo.serializers.DefaultSerializers$KryoSerializableSerializer.read(DefaultSerializers.java:511)
	at com.esotericsoftware.kryo.serializers.DefaultSerializers$KryoSerializableSerializer.read(DefaultSerializers.java:503)
	at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
	at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
	at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
	at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
	at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:246)
	at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:158)
	at org.apache.spark.serializer.DeserializationStream$$anon$2.getNext(Serializer.scala:188)
	at org.apache.spark.serializer.DeserializationStream$$anon$2.getNext(Serializer.scala:185)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:199)
	at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:102)
	at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:132)
	at is.hail.rvd.OrderedRVD$.getPartitionKeyInfo(OrderedRVD.scala:478)
	at is.hail.rvd.OrderedRVD$.getPartitionKeyInfo(OrderedRVD.scala:488)
	at is.hail.rvd.OrderedRVD$.coerce(OrderedRVD.scala:556)
	at is.hail.rvd.OrderedRVD$.coerce(OrderedRVD.scala:514)
	at is.hail.table.Table.toOrderedRVD(Table.scala:1152)
	at is.hail.variant.MatrixTable.annotateRowsTable(MatrixTable.scala:920)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)java.lang.OutOfMemoryError: Java heap space
	at is.hail.annotations.Region.read(Region.scala:292)
	at com.esotericsoftware.kryo.serializers.DefaultSerializers$KryoSerializableSerializer.read(DefaultSerializers.java:511)
	at com.esotericsoftware.kryo.serializers.DefaultSerializers$KryoSerializableSerializer.read(DefaultSerializers.java:503)
	at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
	at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
	at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
	at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
	at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:246)
	at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:158)
	at org.apache.spark.serializer.DeserializationStream$$anon$2.getNext(Serializer.scala:188)
	at org.apache.spark.serializer.DeserializationStream$$anon$2.getNext(Serializer.scala:185)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:199)
	at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:102)
	at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)


Hail version: devel-a1d6ecc
Error summary: OutOfMemoryError: Java heap space

# P-values