# Sanger cell line Affymetrix gene expression project

### Source

- From GEO: http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE68950

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Setup logging: timestamped, single-letter-level records on the root logger.
import logging

logger = logging.getLogger()
# The root logger is not guaranteed to have a handler (a plain interpreter or
# kernel leaves it empty until one is configured), in which case indexing
# `logger.handlers[0]` raises IndexError. Attach a stream handler if needed.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())
logger.handlers[0].setFormatter(
    logging.Formatter(
        fmt=(
            "%(asctime)s.%(msecs)03d %(levelname).1s "
            "[%(name)s] %(message)s"
        ),
        datefmt="%y-%m-%d %H:%M:%S",
    )
)
logger.setLevel(logging.INFO)

In [3]:
# Emit one INFO record to confirm that the logger and its format are working.
logger.info('Logger works!')

16-01-14 17:42:52.788 I [root] Logger works!


In [4]:
# Resolve the data directory (parent of the working directory) and the folder
# holding the raw Sanger download, then log whether both of them exist.
data_root = Path('../').resolve()
raw_data_root = (
    data_root
    / 'raw'
    / 'Sanger_Cell_Line_Project_Affymetrix_QCed_Data_n798/'
)
logger.info(
    'Required paths exist: %s',
    all(path.exists() for path in (data_root, raw_data_root)),
)

16-01-14 17:42:52.793 I [root] Required paths exist: True


In [5]:
def _first_match(directory, pattern):
    """Return the first path (in sorted order) in `directory` matching `pattern`.

    Sorting makes the choice deterministic when several files match, since
    `Path.glob` yields matches in no particular order.

    Raises:
        FileNotFoundError: if nothing in `directory` matches `pattern`
            (instead of the bare StopIteration raised by `next()` on an
            empty glob).
    """
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError(
            'No file matching %r found in %s' % (pattern, directory)
        )
    return matches[0]


# Locate the three input files: sample sheet, raw expression table and
# platform (probe) annotation.
sample_info_p = _first_match(raw_data_root, '*.xls')
sample_raw_data_p = _first_match(raw_data_root, '*.tsv.gz')
plat_info = _first_match(data_root, '*.csv')

print('Sample info:', sample_info_p)
print('Sample raw data:', sample_raw_data_p)
print('Platform info:', plat_info)

Sample info: /Users/liang/code/dj_carkinos/data/raw/Sanger_Cell_Line_Project_Affymetrix_QCed_Data_n798/Sanger_affy_n798_sample_info_published.xls
Sample raw data: /Users/liang/code/dj_carkinos/data/raw/Sanger_Cell_Line_Project_Affymetrix_QCed_Data_n798/Sanger_Cell_Line_Project_Affymetrix_QCed_Data_n798.gct.tsv.gz
Platform info: /Users/liang/code/dj_carkinos/data/Affy_U133A_probe_info.csv


## Read in Platform Annotation

The platform annotation is created by `Get_Affy_probes.R`

In [6]:
# Load the platform (probe) annotation CSV into a DataFrame.
plat_info_df = pd.read_csv(plat_info.as_posix())

In [7]:
# Preview the first rows of the platform annotation.
plat_info_df.head()

Unnamed: 0,PROBEID,SYMBOL,ENTREZID,GENENAME
0,1007_s_at,DDR1,780,discoidin domain receptor tyrosine kinase 1
1,1007_s_at,MIR4640,100616237,microRNA 4640
2,1053_at,RFC2,5982,"replication factor C (activator 1) 2, 40kDa"
3,117_at,HSPA6,3310,heat shock 70kDa protein 6 (HSP70B')
4,121_at,PAX8,7849,paired box 8


## Read in Sample Info

In [8]:
# Load the published sample sheet (an .xls workbook) into a DataFrame.
sample_info_df = pd.read_excel(sample_info_p.as_posix())

In [9]:
# Keep only the four columns needed downstream and give them the names used
# in the rest of the notebook (same order as the source columns).
source_columns = ['Scan', 'SampleName', 'PrimarySite', 'PrimaryHist']
sample_info_df = sample_info_df.loc[:, source_columns]
sample_info_df.columns = [
    'name',
    'cell_line',
    'primary_site',
    'primary_histology',
]

In [10]:
# sample_info_df['filename'] = sample_info_df.name + '.CEL'

In [11]:
# Keep only the first 100 samples: loading all 800 produced a 1.1GB SQLite
# database, which is too large.
sample_info_df = sample_info_df.iloc[:100]

In [12]:
# Preview the trimmed and renamed sample sheet.
sample_info_df.head()

Unnamed: 0,name,cell_line,primary_site,primary_histology
0,5500024035100021608461.G01,380,haematopoietic and lymphoid tissue,lymphoid neoplasm
1,5500024034290101707049.A01,697,haematopoietic and lymphoid tissue,haematopoietic neoplasm
2,5500024052603032009483.A09,5637,urinary tract,carcinoma
3,5500024035100021608461.H01,22RV1,prostate,carcinoma
4,5500024032848101507998.D02,23132-87,stomach,carcinoma


In [13]:
# Make sure each cell_line name maps to exactly one
# (primary_site, primary_histology) pair.
# The columns are selected by name so that primary_histology, the last column
# of the frame, is included in the comparison.
annotation_columns = ['cell_line', 'primary_site', 'primary_histology']
for cell_line_name, annotation_df in (
    sample_info_df[annotation_columns].groupby('cell_line')
):
    distinct_annotations = annotation_df.drop_duplicates()
    assert len(distinct_annotations) == 1, (
        'Cell line %r has conflicting site/histology annotations:\n%s'
        % (cell_line_name, distinct_annotations)
    )

## Read Sample Raw Data

In [14]:
# Read the gzipped, tab-separated expression table, skipping its first two
# lines (presumably the GCT version and dimension header lines — confirm).
sample_raw_data_df = pd.read_table(sample_raw_data_p.as_posix(), skiprows=2)

In [15]:
# Use the values of the NAME column as row labels. Assigning the raw values
# (rather than the Series itself) leaves the resulting index unnamed.
sample_raw_data_df.index = sample_raw_data_df['NAME'].values

In [16]:
# Drop the first two columns and keep the rest (presumably NAME, already used
# as the index, and the GCT description column — confirm against the file).
sample_raw_data_df = sample_raw_data_df.iloc[:, 2:]

In [17]:
# Strip the '.CEL' file extension from the sample column names. Only names
# that actually end with the suffix are shortened, so other columns are left
# intact and re-running this cell does not truncate the names any further.
cel_suffix = '.CEL'
sample_raw_data_df.columns = [
    col[:-len(cel_suffix)] if col.endswith(cel_suffix) else col
    for col in sample_raw_data_df.columns
]

In [18]:
# Select only the samples listed in sample_info_df, in that order.
sample_raw_data_df = sample_raw_data_df.loc[:, sample_info_df['name']]

In [19]:
# Reorder the rows to follow the unique probe IDs of the platform annotation
# (order of first appearance). `reindex` yields an all-NaN row for any probe
# ID that is absent from the expression table.
sample_raw_data_df = sample_raw_data_df.reindex(pd.unique(plat_info_df.PROBEID))

In [20]:
# Preview the expression matrix (probes as rows, samples as columns).
sample_raw_data_df.head()

Unnamed: 0,5500024035100021608461.G01,5500024034290101707049.A01,5500024052603032009483.A09,5500024035100021608461.H01,5500024032848101507998.D02,5500024030401071707289.D04,5500024030401071707289.C10,5500024052861011409506.D05,5500024032848101507998.E02,5500024052861011409506.E01,...,5500024052861011409506.F08,5500024032848101507000.H02,5500024032848101507000.A03,5500024032848101507000.B03,5500024034290101707049.B03,5500024032848101507998.D05,5500024034290101707049.C03,5500024034290101707049.D03,5500024034290101707049.E03,5500024035100021608461.E02
1007_s_at,520,109,777,696,2130,249,1725,1919,127,129,...,1982,368,3064,976,2386,1777,2104,1120,1042,1094
1053_at,269,159,340,151,129,85,183,235,107,311,...,222,145,50,134,151,124,115,65,170,372
117_at,20,49,20,23,24,18,18,24,24,35,...,24,39,23,28,26,29,22,24,23,20
121_at,71,76,110,95,466,66,53,66,2011,927,...,75,67,189,52,65,807,2464,137,76,77
1255_g_at,15,15,16,17,16,16,15,17,15,16,...,17,17,15,15,17,15,16,15,16,16


## Setup Django

Django has to import the project's modules (the settings module and the `datasets` app), so change the working directory to the project's `src/` directory before setting it up.

In [21]:
# Change to the project's `src` directory. The target is derived from the
# absolute `data_root` (whose parent is the project root) rather than from a
# path relative to the current directory, so re-running this cell is harmless.
%cd {data_root.parent / 'src'}

/Users/liang/code/dj_carkinos/src


In [22]:
import django
import os

In [23]:
# Select the settings module; this must be set before `django.setup()` runs.
os.environ['DJANGO_SETTINGS_MODULE'] = 'carkinos.settings.local'

In [24]:
# Initialise Django so the project's models can be imported and used.
django.setup()

In [25]:
from datasets.models import (
    CellLine, DataSet, MicroarrayPlatform, 
    Sample, ProbeInfo, Gene
)

In [26]:
# Limit the 'django.db.backends' logger to INFO and above (presumably to keep
# per-query DEBUG output out of the notebook during the inserts — confirm).
logging.getLogger('django.db.backends').setLevel(logging.INFO)

### Setup Microarray Platform

In [27]:
# Build the platform record (not yet written to the database).
platform_fields = dict(
    name='GPL3921',
    manufacturer='Affymetrix',
    description='Affymetrix HT Human Genome U133A Array',
)
platform = MicroarrayPlatform(**platform_fields)

In [28]:
# Persist the platform so that probes and samples can reference it.
platform.save()

In [29]:
# Display the saved platform instance.
platform

<MicroarrayPlatform: GPL3921>

### Platform Probe Info

In [30]:
# Replace missing gene symbols and names with empty strings. The result is
# assigned back to the column: calling `fillna(inplace=True)` on a column
# pulled out of the frame is chained assignment, which pandas does not
# guarantee will update `plat_info_df` (it has no effect under Copy-on-Write).
plat_info_df['SYMBOL'] = plat_info_df['SYMBOL'].fillna('')
plat_info_df['GENENAME'] = plat_info_df['GENENAME'].fillna('')

In [31]:
# Use 0 as the placeholder for a missing Entrez ID. The result is assigned
# back to the column because an in-place `fillna` on a column accessed from
# the frame is chained assignment and may leave `plat_info_df` unchanged.
plat_info_df['ENTREZID'] = plat_info_df['ENTREZID'].fillna(0)

In [32]:
# Check the last rows of the annotation after filling the missing values.
plat_info_df.tail()

Unnamed: 0,PROBEID,SYMBOL,ENTREZID,GENENAME
24544,AFFX-r2-Hs28SrRNA-3_at,,0,
24545,AFFX-r2-Hs28SrRNA-5_at,,0,
24546,AFFX-r2-Hs28SrRNA-M_at,,0,
24547,AFFX-r2-P1-cre-3_at,,0,
24548,AFFX-r2-P1-cre-5_at,,0,


In [33]:
# Spot-check one row by position (row 927); in the recorded output it is a
# probe without gene annotation, now holding the placeholder values.
plat_info_df.iloc[927]

PROBEID     201265_at
SYMBOL               
ENTREZID            0
GENENAME             
Name: 927, dtype: object

In [34]:
logger.info('Start populating probe info ...')
# One unsaved ProbeInfo per row of the expression matrix; `platform_order`
# records the row position of the probe.
probes = [
    ProbeInfo(
        identifier=identifier,
        platform=platform,
        platform_order=order,
    )
    for order, identifier in enumerate(sample_raw_data_df.index)
]

# Insert all probes with a single bulk_create call
ProbeInfo.objects.bulk_create(probes)
logger.info('Finished')

16-01-14 09:42:56.222 I [root] Start populating probe info ...
16-01-14 09:42:57.421 I [root] Finished


In [35]:
# Re-fetch the probes of this platform from the database, ordered by
# `platform_order`, replacing the list of unsaved instances built above.
probes = platform.probes.order_by('platform_order')

In [36]:
logger.info('Add gene info to probes ...')
# Group the annotation rows by probe once, instead of boolean-masking the
# whole annotation table for every probe (which is quadratic overall).
# `sort=False` keeps first-appearance order; rows inside each group keep
# their original order, so genes are created in the same sequence as before.
gene_info_by_probe = {
    probe_id: rows
    for probe_id, rows in plat_info_df.groupby('PROBEID', sort=False)
}
for probe in probes:
    gene_info = gene_info_by_probe.get(probe.identifier)
    if gene_info is None:
        # No annotation rows for this probe: nothing to attach.
        continue
    # NOTE(review): one Gene is created per annotation row, so a gene that is
    # annotated on several probes is stored more than once — confirm intended.
    for _, row in gene_info.iterrows():
        gene = Gene.objects.create(
            # 0 is the placeholder for a missing Entrez ID; store it as NULL.
            entrez_id=(row.ENTREZID if row.ENTREZID > 0 else None),
            gene_symbol=row.SYMBOL,
            gene_name=row.GENENAME,
        )
        probe.genes.add(gene)
logger.info('Finished')

16-01-14 09:42:57.439 I [root] Add gene info to probes ...
16-01-14 09:44:37.883 I [root] Finished


In [37]:
# Number of Gene rows now stored in the database.
Gene.objects.count()

24549

### Setup Dataset

In [38]:
# Build the dataset record (not yet written to the database); `data_path` is
# the file name of the saved expression matrix, relative to the data root.
dataset_fields = dict(
    name='Sanger Cell Line Project',
    data_path='sanger_cell_line_proj.npy',
)
dset = DataSet(**dataset_fields)

In [39]:
# Persist the dataset so that samples can reference it.
dset.save()

In [40]:
# Display the saved dataset instance.
dset

<DataSet: Sanger Cell Line Project>

### Setup Cell Lines, Samples and Their Values

In [41]:
# Reminder of the sample sheet layout used to create cell lines and samples.
sample_info_df.head()

Unnamed: 0,name,cell_line,primary_site,primary_histology
0,5500024035100021608461.G01,380,haematopoietic and lymphoid tissue,lymphoid neoplasm
1,5500024034290101707049.A01,697,haematopoietic and lymphoid tissue,haematopoietic neoplasm
2,5500024052603032009483.A09,5637,urinary tract,carcinoma
3,5500024035100021608461.H01,22RV1,prostate,carcinoma
4,5500024032848101507998.D02,23132-87,stomach,carcinoma


In [42]:
# Create one CellLine per distinct cell_line value, in order of first
# appearance, and remember it under the raw (unconverted) sheet value.
known_cell_lines = {}
for _, info in sample_info_df.iterrows():
    key = info.cell_line
    if key in known_cell_lines:
        # Already created from an earlier sample of the same cell line.
        continue
    known_cell_lines[key] = CellLine.objects.create(
        name=str(info.cell_line),
        primary_site=info.primary_site,
        primary_histology=info.primary_histology,
    )

In [43]:
# Create all samples.
# `dataset_order` is the position of the sample in `sample_info_df`, which is
# also the column order used when the samples were selected from the
# expression matrix. It is taken from `enumerate` rather than from the index
# label, so it stays correct even if the frame's index is not 0..n-1.
for sample_order, (_, row) in enumerate(sample_info_df.iterrows()):
    Sample.objects.create(
        # `row['name']` is used because `row.name` is the row's index label.
        name=row['name'],
        cell_line=known_cell_lines[row.cell_line],
        dataset=dset,
        dataset_order=sample_order,
        platform=platform,
    )

In [47]:
import numpy as np

In [48]:
# Save the expression values (rows = probes, columns = samples) in NumPy's
# .npy format at the dataset's `data_path` under the data root.
expression_matrix = sample_raw_data_df.values
output_path = data_root / dset.data_path
with output_path.open('wb') as npy_file:
    np.save(npy_file, expression_matrix)