In [1]:
import sys
import argparse
import operator
import itertools
import traceback
import os.path
import multiprocessing
import pysam
import HTSeq
import time

In [2]:
features_file = "gencode.v33.primary_assembly.annotation.gff3"
bam_file = "Aligned.sortedByCoord.out.bam"

In [3]:
stranded = "yes"
minaqual = 10

# CIGAR match characters (including alignment match, sequence match, and sequence mismatch
com = ('M', '=', 'X')

n_workers = 32

In [4]:
%%time
alignment = pysam.AlignmentFile(bam_file)


CPU times: user 7.98 ms, sys: 4.99 ms, total: 13 ms
Wall time: 12.6 ms


In [5]:
%%time
#pysam.index(bam_file)

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 5.25 µs


In [6]:
# Pass if encountering an unknown chromosome
class UnknownChrom(Exception):
    pass

# Check read and mapping quality
def check_read(r, minaqual):
    start = time.time()
    return r.aligned and (not r.not_primary_alignment) and (not r.supplementary) and (r.aQual >= minaqual) and r.optional_field("NH") == 1


# Find genomic features that overlap with read
def get_overlapping_features(r, features):
    start = time.time()
    iv_seq = (co.ref_iv for co in r.cigar if co.type in com and co.size > 0)
    fs = set()
    for iv in iv_seq:
        if iv.chrom in features.chrom_vectors:
            for iv2, fs2 in features[iv].steps():
                fs = fs.union(fs2)
    return fs


# Write output in tsv form
def write_to_out(r, assignment, outfile):
    name = r.read.name
    outfile.write(name + "\t" + assignment + "\n")

In [7]:
# Get features from GFF
def GFFToFeatures(gff_filename, stranded):
    start = time.time()
    gff = HTSeq.GFF_Reader(gff_filename)
    feature_scan = HTSeq.make_feature_genomicarrayofsets(
        gff,
        id_attribute='gene_name',
        feature_type='exon',
        feature_query=None,
        stranded=stranded != 'no',
        verbose=True)
    features = feature_scan['features']
    ret = features
    print("Loaded features in: " + str(time.time() - start))
    return ret

In [8]:
refs = pysam.AlignmentFile(bam_file).references

In [9]:
def _process_file(features, filename, ref, minaqual):

    # Open input file
    print("Loading input file")
    read_seq_iter = iter(HTSeq.BAM_Reader(bam_file).fetch(ref))

    # Initialize counts
    i = 0

    # Open output file
    print("Opening output file handle")
    outfile = open("outs/" + ref + ".txt", 'w')

    
    print("Iterating over reads")
    for read in read_seq_iter:

        # Track number of reads
        if i > 0 and i % 1000000 == 0:
            sys.stderr.write("%d alignment records processed.\n" %i)
            sys.stderr.flush()
        i += 1

        # If read is good, get aligned genomic intervals
        if check_read(read, minaqual):            

            # Overlap the read-aligned genomic intervals with features.
            fs = get_overlapping_features(read, features)

#             # Write output if read overlaps with one feature only.
            if fs is not None and len(fs) == 1:
                write_to_out(read, list(fs)[0], outfile)

    outfile.close()


In [10]:
%%time
# Prepare features
print("Loading features")
features = GFFToFeatures(features_file, stranded)

Loading features


100000 GFF lines processed.
200000 GFF lines processed.
300000 GFF lines processed.
400000 GFF lines processed.
500000 GFF lines processed.
600000 GFF lines processed.
700000 GFF lines processed.
800000 GFF lines processed.
900000 GFF lines processed.
1000000 GFF lines processed.
1100000 GFF lines processed.
1200000 GFF lines processed.
1300000 GFF lines processed.
1400000 GFF lines processed.
1500000 GFF lines processed.
1600000 GFF lines processed.
1700000 GFF lines processed.
1800000 GFF lines processed.
1900000 GFF lines processed.
2000000 GFF lines processed.
2100000 GFF lines processed.
2200000 GFF lines processed.
2300000 GFF lines processed.
2400000 GFF lines processed.
2500000 GFF lines processed.
2600000 GFF lines processed.
2700000 GFF lines processed.
2800000 GFF lines processed.
2900000 GFF lines processed.
2907627 GFF lines processed.


Loaded features in: 181.62418413162231
CPU times: user 3min, sys: 1.16 s, total: 3min 1s
Wall time: 3min 1s


In [11]:
from dask.distributed import LocalCluster, Client

lc = LocalCluster(processes=True, n_workers=n_workers, threads_per_worker=1)
client = Client(lc)
client



0,1
Client  Scheduler: tcp://127.0.0.1:38928  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 32  Cores: 32  Memory: 50.39 GB


In [12]:
%%time
features_data = client.scatter(features, broadcast=True)

CPU times: user 11.8 s, sys: 3.07 s, total: 14.8 s
Wall time: 52.3 s


In [13]:
%%time

results = [client.submit(_process_file, features_data, bam_file, ref, minaqual, pure=False) for ref in refs]

from dask.distributed import wait
wait(results)



CPU times: user 5min 20s, sys: 1min 43s, total: 7min 4s
Wall time: 30min 16s


DoneAndNotDoneFutures(done={<Future: finished, type: builtins.NoneType, key: _process_file-2eb07490-42fd-4c20-85c5-51d6d0bbd7f3>, <Future: finished, type: builtins.NoneType, key: _process_file-47741bab-3d74-4c09-9e10-dbc932c3ef53>, <Future: finished, type: builtins.NoneType, key: _process_file-e3d89269-794d-42a7-959f-89bae56dc293>, <Future: finished, type: builtins.NoneType, key: _process_file-2e6fc37e-9363-4f73-88e6-6a2d047e7be8>, <Future: finished, type: builtins.NoneType, key: _process_file-8e7c9108-ac3b-457d-9756-2fa1faa71b0d>, <Future: finished, type: builtins.NoneType, key: _process_file-8b2026e3-067d-4114-a765-ee9f3946e275>, <Future: finished, type: builtins.NoneType, key: _process_file-e5cec328-8064-4122-94e1-3c23b768bf87>, <Future: finished, type: builtins.NoneType, key: _process_file-f1c247e5-1575-4742-a021-c7fa41b37ec8>, <Future: finished, type: builtins.NoneType, key: _process_file-7006196c-1cdd-4124-9c7e-0d4b52df8753>, <Future: finished, type: builtins.NoneType, key: _proc

In [19]:
%%time
import glob
import os
filelist=glob.glob("outs/*.txt")
for file in filelist:
    if os.stat(file).st_size == 0:
        os.remove(file)
