In [1]:
# Setup logging 
import logging
logger = logging.getLogger()
logger.handlers[0].setFormatter(
    logging.Formatter(
        fmt=(
            "%(asctime)s.%(msecs)03d %(levelname).1s "
            "[%(name)s] %(message)s"
        ),
        datefmt="%y-%m-%d %H:%M:%S",
    )
)
logger.setLevel(logging.INFO)

In [2]:
from pathlib import Path
import pandas as pd

In [3]:
import operator as op
import numpy as np

In [4]:
cd "../../src/"

/Users/liang/code/dj_carkinos/src


In [5]:
import django
import os

In [6]:
os.environ['DJANGO_SETTINGS_MODULE'] = 'carkinos.settings.local'

In [7]:
django.setup()

In [8]:
from django.conf import settings
from django.db.models import Q, F, Count

In [9]:
from datasets.models import (
    CellLine, DataSet, MicroarrayPlatform, 
    Sample, ProbeInfo, Gene
)

## Get basic Dataset info

In [10]:
dset = DataSet.objects.first()

16-01-14 10:26:29.396 D [django.db.backends] (0.001) SELECT "datasets_dataset"."name", "datasets_dataset"."data_path" FROM "datasets_dataset" ORDER BY "datasets_dataset"."name" ASC LIMIT 1; args=()


In [11]:
dset

<DataSet: Sanger Cell Line Project>

In [12]:
# We only have one platform for now
platform = MicroarrayPlatform.objects.first()

16-01-14 10:26:29.409 D [django.db.backends] (0.000) SELECT "datasets_generalplatform"."name", "datasets_generalplatform"."description", "datasets_microarrayplatform"."generalplatform_ptr_id", "datasets_microarrayplatform"."manufacturer" FROM "datasets_microarrayplatform" INNER JOIN "datasets_generalplatform" ON ("datasets_microarrayplatform"."generalplatform_ptr_id" = "datasets_generalplatform"."name") ORDER BY "datasets_microarrayplatform"."generalplatform_ptr_id" ASC LIMIT 1; args=()


In [13]:
platform

<MicroarrayPlatform: GPL3921>

In [14]:
platform.probes.all()[:5]

16-01-14 10:26:29.416 D [django.db.backends] (0.000) SELECT "datasets_probeinfo"."id", "datasets_probeinfo"."identifier", "datasets_probeinfo"."platform_id", "datasets_probeinfo"."platform_order" FROM "datasets_probeinfo" WHERE "datasets_probeinfo"."platform_id" = 'GPL3921' LIMIT 5; args=('GPL3921',)


[<ProbeInfo: 1007_s_at of GPL3921>, <ProbeInfo: 1053_at of GPL3921>, <ProbeInfo: 117_at of GPL3921>, <ProbeInfo: 121_at of GPL3921>, <ProbeInfo: 1255_g_at of GPL3921>]

In [15]:
dset.samples.all()[:5]

16-01-14 10:26:29.421 D [django.db.backends] (0.000) SELECT "datasets_sample"."id", "datasets_sample"."name", "datasets_sample"."cell_line_id", "datasets_sample"."dataset_id", "datasets_sample"."dataset_order", "datasets_sample"."platform_id" FROM "datasets_sample" WHERE "datasets_sample"."dataset_id" = 'Sanger Cell Line Project' LIMIT 5; args=('Sanger Cell Line Project',)
16-01-14 10:26:29.422 D [django.db.backends] (0.000) SELECT "datasets_cellline"."id", "datasets_cellline"."name", "datasets_cellline"."primary_site", "datasets_cellline"."primary_histology" FROM "datasets_cellline" WHERE "datasets_cellline"."id" = 1; args=(1,)
16-01-14 10:26:29.423 D [django.db.backends] (0.000) SELECT "datasets_cellline"."id", "datasets_cellline"."name", "datasets_cellline"."primary_site", "datasets_cellline"."primary_histology" FROM "datasets_cellline" WHERE "datasets_cellline"."id" = 2; args=(2,)
16-01-14 10:26:29.424 D [django.db.backends] (0.000) SELECT "datasets_cellline"."id", "datasets_cellli

[<Sample: 5500024035100021608461.G01 380 from Sanger Cell Line Project>, <Sample: 5500024034290101707049.A01 697 from Sanger Cell Line Project>, <Sample: 5500024052603032009483.A09 5637 from Sanger Cell Line Project>, <Sample: 5500024035100021608461.H01 22RV1 from Sanger Cell Line Project>, <Sample: 5500024032848101507998.D02 23132-87 from Sanger Cell Line Project>]

## Example

In [16]:
lung_cell_lines = CellLine.objects.filter(primary_site__contains='lung')

In [17]:
BRCA_probes = ProbeInfo.objects.filter(genes__gene_symbol__contains='BRCA')

In [18]:
p = BRCA_probes[0]

16-01-14 10:26:29.438 D [django.db.backends] (0.003) SELECT "datasets_probeinfo"."id", "datasets_probeinfo"."identifier", "datasets_probeinfo"."platform_id", "datasets_probeinfo"."platform_order" FROM "datasets_probeinfo" INNER JOIN "datasets_probeinfo_genes" ON ("datasets_probeinfo"."id" = "datasets_probeinfo_genes"."probeinfo_id") INNER JOIN "datasets_gene" ON ("datasets_probeinfo_genes"."gene_id" = "datasets_gene"."id") WHERE "datasets_gene"."gene_symbol" LIKE '%BRCA%' ESCAPE '\' LIMIT 1; args=('%BRCA%',)


In [19]:
for p in BRCA_probes:
    print(
        "Probe %s has %d genes: %s"
        %(p.identifier, p.genes.count(), p.genes.first().gene_symbol)
    )

16-01-14 10:26:29.445 D [django.db.backends] (0.003) SELECT "datasets_probeinfo"."id", "datasets_probeinfo"."identifier", "datasets_probeinfo"."platform_id", "datasets_probeinfo"."platform_order" FROM "datasets_probeinfo" INNER JOIN "datasets_probeinfo_genes" ON ("datasets_probeinfo"."id" = "datasets_probeinfo_genes"."probeinfo_id") INNER JOIN "datasets_gene" ON ("datasets_probeinfo_genes"."gene_id" = "datasets_gene"."id") WHERE "datasets_gene"."gene_symbol" LIKE '%BRCA%' ESCAPE '\'; args=('%BRCA%',)
16-01-14 10:26:29.462 D [django.db.backends] (0.000) SELECT COUNT(*) AS "__count" FROM "datasets_gene" INNER JOIN "datasets_probeinfo_genes" ON ("datasets_gene"."id" = "datasets_probeinfo_genes"."gene_id") WHERE "datasets_probeinfo_genes"."probeinfo_id" = 4058; args=(4058,)
16-01-14 10:26:29.464 D [django.db.backends] (0.000) SELECT "datasets_gene"."id", "datasets_gene"."entrez_id", "datasets_gene"."gene_symbol", "datasets_gene"."gene_name" FROM "datasets_gene" INNER JOIN "datasets_probein

Probe 204531_s_at has 1 genes: BRCA1
Probe 208368_s_at has 1 genes: BRCA2
Probe 211851_x_at has 1 genes: BRCA1
Probe 214727_at has 1 genes: BRCA2


In [20]:
probes_order = list(
    map(op.attrgetter('platform_order'), BRCA_probes)
)

In [21]:
probes_order

[4057, 7868, 11244, 14102]

In [22]:
lung_samples = dset.samples.filter(cell_line__primary_site__contains='lung')

In [23]:
samples_order = list(
        map(op.attrgetter('dataset_order'), lung_samples)
)

16-01-14 10:26:29.483 D [django.db.backends] (0.000) SELECT "datasets_sample"."id", "datasets_sample"."name", "datasets_sample"."cell_line_id", "datasets_sample"."dataset_id", "datasets_sample"."dataset_order", "datasets_sample"."platform_id" FROM "datasets_sample" INNER JOIN "datasets_cellline" ON ("datasets_sample"."cell_line_id" = "datasets_cellline"."id") WHERE ("datasets_sample"."dataset_id" = 'Sanger Cell Line Project' AND "datasets_cellline"."primary_site" LIKE '%lung%' ESCAPE '\'); args=('Sanger Cell Line Project', '%lung%')


In [24]:
samples_order

[20, 24, 28, 51, 79, 91, 92, 93]

In [25]:
dset_val_pth = Path(settings.DATA_ROOT, dset.data_path)

In [26]:
dset_val = np.load(dset_val_pth.as_posix(), mmap_mode='r')

From http://stackoverflow.com/a/30918530

In [27]:
dset_val[np.meshgrid(probes_order, samples_order, indexing='ij')]

memmap([[ 456.,  209.,  239.,  229.,  146.,  209.,  261.,  177.],
       [ 106.,   29.,   43.,   80.,   67.,   46.,   95.,   60.],
       [  79.,   41.,   44.,   31.,   31.,   34.,   43.,   55.],
       [  26.,   26.,   43.,   23.,   25.,   24.,   25.,   20.]])

In [28]:
dset_val[probes_order][:, samples_order]

memmap([[ 456.,  209.,  239.,  229.,  146.,  209.,  261.,  177.],
       [ 106.,   29.,   43.,   80.,   67.,   46.,   95.,   60.],
       [  79.,   41.,   44.,   31.,   31.,   34.,   43.,   55.],
       [  26.,   26.,   43.,   23.,   25.,   24.,   25.,   20.]])

In [29]:
dset_val[np.ix_(probes_order, samples_order)]

memmap([[ 456.,  209.,  239.,  229.,  146.,  209.,  261.,  177.],
       [ 106.,   29.,   43.,   80.,   67.,   46.,   95.,   60.],
       [  79.,   41.,   44.,   31.,   31.,   34.,   43.,   55.],
       [  26.,   26.,   43.,   23.,   25.,   24.,   25.,   20.]])

In [30]:
# require to be np.array first
dset_val[np.array(probes_order)[:, None], np.array(samples_order)[None, :]]

memmap([[ 456.,  209.,  239.,  229.,  146.,  209.,  261.,  177.],
       [ 106.,   29.,   43.,   80.,   67.,   46.,   95.,   60.],
       [  79.,   41.,   44.,   31.,   31.,   34.,   43.,   55.],
       [  26.,   26.,   43.,   23.,   25.,   24.,   25.,   20.]])

In [31]:
import pandas as pd

pd.DataFrame(dset_val).ix[probes_order, samples_order]

Unnamed: 0,20,24,28,51,79,91,92,93
4057,456,209,239,229,146,209,261,177
7868,106,29,43,80,67,46,95,60
11244,79,41,44,31,31,34,43,55
14102,26,26,43,23,25,24,25,20
