# Pre-Process the data
Function: Throw data of each gene to corresponding folder based on their platform (e.g. GPL96, GPL97, GPL8300, etc.)

In [1]:
import GEOparse
import re
import os
import pandas as pd

## Extract Gene Info from GEO DataSets

In [79]:
gsename = "GSE2770"
softfile = "./{}_family.soft.gz".format(gsename)
if os.path.isfile(softfile):
    gse = GEOparse.get_GEO(filepath=softfile)
else:
    gse = GEOparse.get_GEO(geo=gsename, destdir="./")

06-Aug-2019 14:34:06 INFO GEOparse - Parsing ./GSE2770_family.soft.gz: 
06-Aug-2019 14:34:06 DEBUG GEOparse - DATABASE: GeoMiame
06-Aug-2019 14:34:06 DEBUG GEOparse - SERIES: GSE2770
06-Aug-2019 14:34:06 DEBUG GEOparse - PLATFORM: GPL96
06-Aug-2019 14:34:07 DEBUG GEOparse - PLATFORM: GPL97
06-Aug-2019 14:34:08 DEBUG GEOparse - PLATFORM: GPL8300
06-Aug-2019 14:34:08 DEBUG GEOparse - SAMPLE: GSM60348
06-Aug-2019 14:34:08 DEBUG GEOparse - SAMPLE: GSM60349
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60350
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60351
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60352
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60353
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60354
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60355
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60356
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60357
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GSM60358
06-Aug-2019 14:34:09 DEBUG GEOparse - SAMPLE: GS

## Extract Platform Information

In [3]:
keys = []
values = []
for gpl in gse.gpls:
    keys.append(gse.gpls[gpl].name)
    r1 = re.findall(r"\[.*?\]",gse.gpls[gpl].metadata['title'][0])
    values.append(r1[0][4:-1])
    #values.append(gse.gpls[gpl].metadata['title'][0])
    print(gse.gpls[gpl].name)
    print(gse.gpls[gpl].metadata['title'][0])
    
celformat = dict(zip(keys, values))

GPL96
[HG-U133A] Affymetrix Human Genome U133A Array
GPL97
[HG-U133B] Affymetrix Human Genome U133B Array
GPL8300
[HG_U95Av2] Affymetrix Human Genome U95 Version 2 Array


## Classify the Samples by Platform

In [4]:
keys = []
values = []
for gsm in gse.gsms:
    #print(gse.gsms[gsm].name)
    keys.append(gse.gsms[gsm].name)
    for key,val in celformat.items():
        r1 = re.findall(r"{}".format(val),gse.gsms[gsm].columns.loc['ID_REF','description'])
        if not r1:
            pass
        else:
            values.append(key)
    #r1 = re.findall(r"\(.*?\)",gse.gsms[gsm].columns.loc['ID_REF','description'])
    #values.append(r1[0][1:-1])
    #values.append(gse.gsms[gsm].columns.loc['ID_REF','description'])

gsm_platform = dict(zip(keys, values))
gsm_platform

{'GSM60348': 'GPL8300',
 'GSM60349': 'GPL8300',
 'GSM60350': 'GPL8300',
 'GSM60351': 'GPL8300',
 'GSM60352': 'GPL8300',
 'GSM60353': 'GPL8300',
 'GSM60354': 'GPL8300',
 'GSM60355': 'GPL8300',
 'GSM60356': 'GPL8300',
 'GSM60357': 'GPL8300',
 'GSM60358': 'GPL8300',
 'GSM60359': 'GPL8300',
 'GSM60360': 'GPL8300',
 'GSM60361': 'GPL8300',
 'GSM60362': 'GPL8300',
 'GSM60363': 'GPL8300',
 'GSM60364': 'GPL8300',
 'GSM60365': 'GPL8300',
 'GSM60366': 'GPL8300',
 'GSM60367': 'GPL8300',
 'GSM60368': 'GPL8300',
 'GSM60369': 'GPL8300',
 'GSM60370': 'GPL8300',
 'GSM60371': 'GPL8300',
 'GSM60372': 'GPL8300',
 'GSM60373': 'GPL8300',
 'GSM60374': 'GPL8300',
 'GSM60375': 'GPL8300',
 'GSM60376': 'GPL8300',
 'GSM60377': 'GPL8300',
 'GSM60378': 'GPL8300',
 'GSM60379': 'GPL8300',
 'GSM60380': 'GPL8300',
 'GSM60381': 'GPL8300',
 'GSM60699': 'GPL96',
 'GSM60700': 'GPL96',
 'GSM60701': 'GPL96',
 'GSM60702': 'GPL96',
 'GSM60703': 'GPL96',
 'GSM60704': 'GPL96',
 'GSM60705': 'GPL96',
 'GSM60706': 'GPL96',
 'GSM607

## Create Folders if not exist, Move files to corresponding folders

In [5]:
# Setup paths
currentdir = os.getcwd()
dirlist = currentdir.split('/')
projectdir = '/'.join(dirlist[0:-1])
datadir = os.path.join(projectdir,'data')
outputdir = os.path.join(projectdir,'output')
gene = 'GSE2770'
genedir = os.path.join(datadir,gene + '_RAW')

In [6]:
# create a folder for each platform
for key in celformat.keys():
    platformdir = os.path.join(genedir,key)
    if not os.path.exists(platformdir):
        os.makedirs(platformdir)
        print('Path created: {}'.format(platformdir))
    else:
        print('Path already exist: {}'.format(platformdir))


Path already exist: /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL96
Path already exist: /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97
Path already exist: /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL8300


In [91]:
# Move Corresponding Cel files to Folders
#onlyfiles = [f for f in os.listdir(genedir) if os.path.isfile(os.path.join(genedir, f))]
onlyfiles = [f for f in os.listdir(genedir) if f.endswith('.gz')]

for file in onlyfiles:
    filelist = file.split('.')
    prefix = filelist[0]
    if prefix in gsm_platform:
        platform = gsm_platform[prefix]
        platformdir = os.path.join(genedir,platform)
        src_path = os.path.join(genedir, file)
        dst_path = os.path.join(platformdir, file)
        os.rename(src_path,dst_path)
        print('Move {} to {}'.format(src_path,dst_path))
        


Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60739.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60739.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60744.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60744.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60727.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL96/GSM60727.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60756.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60756.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60735.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60735.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60748.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60748.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipel

## Download GPL (Gene ID map for Platforms)

In [11]:
platforms = ['GPL96','GPL97','GPL8300']

maps_list = []
gene_maps = pd.DataFrame([],columns=['GPL96','GPL97','GPL8300','ENTREZ_GENE_ID'])
gene_maps.set_index('ENTREZ_GENE_ID',inplace=True)
for platform in platforms:
    temp =gse.gpls[platform].table[['ID','ENTREZ_GENE_ID']]
    # Save to file
    filefullpath = os.path.join(datadir,'{}entrez.csv'.format(platform))
    print(filefullpath)
    temp.to_csv(filefullpath, index=False)
    # Single Table
    temp.dropna(axis=0,inplace=True)
    temp.set_index('ENTREZ_GENE_ID',inplace=True)
    maps_list.append(temp)
    #gene_maps[platform]=temp['ID']
    #gene_maps.merge(temp,right_index=True)

#gene_maps = pd.concat(maps_list, axis=1, sort=False)
#gene_maps

/Users/zhzhao/Dropbox/Helikar/pipelines/data/GPL96entrez.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


/Users/zhzhao/Dropbox/Helikar/pipelines/data/GPL97entrez.csv
/Users/zhzhao/Dropbox/Helikar/pipelines/data/GPL8300entrez.csv


In [7]:
gse.gpls['GPL96'].table

Unnamed: 0,ID,GB_ACC,SPOT_ID,Species Scientific Name,Annotation Date,Sequence Type,Sequence Source,Target Description,Representative Public ID,Gene Title,Gene Symbol,ENTREZ_GENE_ID,RefSeq Transcript ID,Gene Ontology Biological Process,Gene Ontology Cellular Component,Gene Ontology Molecular Function
0,1007_s_at,U48705,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Huma...,U48705,discoidin domain receptor tyrosine kinase 1 //...,DDR1 /// MIR4640,780 /// 100616237,NM_001202521 /// NM_001202522 /// NM_001202523...,0001558 // regulation of cell growth // inferr...,0005576 // extracellular region // inferred fr...,0000166 // nucleotide binding // inferred from...
1,1053_at,M87338,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,M87338 /FEATURE= /DEFINITION=HUMA1SBU Human re...,M87338,"replication factor C (activator 1) 2, 40kDa",RFC2,5982,NM_001278791 /// NM_001278792 /// NM_001278793...,0000278 // mitotic cell cycle // traceable aut...,0005634 // nucleus // inferred from electronic...,0000166 // nucleotide binding // inferred from...
2,117_at,X51757,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,X51757 /FEATURE=cds /DEFINITION=HSP70B Human h...,X51757,heat shock 70kDa protein 6 (HSP70B'),HSPA6,3310,NM_002155,0000902 // cell morphogenesis // inferred from...,0005737 // cytoplasm // inferred from direct a...,0000166 // nucleotide binding // inferred from...
3,121_at,X69699,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,X69699 /FEATURE= /DEFINITION=HSPAX8A H.sapiens...,X69699,paired box 8,PAX8,7849,NM_003466 /// NM_013951 /// NM_013952 /// NM_0...,0001655 // urogenital system development // in...,0005634 // nucleus // inferred from direct ass...,0000979 // RNA polymerase II core promoter seq...
4,1255_g_at,L36861,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,L36861 /FEATURE=expanded_cds /DEFINITION=HUMGC...,L36861,guanylate cyclase activator 1A (retina),GUCA1A,2978,NM_000409 /// XM_006715073,0007165 // signal transduction // non-traceabl...,0001750 // photoreceptor outer segment // infe...,0005509 // calcium ion binding // inferred fro...
5,1294_at,L13852,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,L13852 /FEATURE= /DEFINITION=HUME1URP Homo sap...,L13852,microRNA 5193 /// ubiquitin-like modifier acti...,MIR5193 /// UBA7,7318 /// 100847079,NM_003335 /// NR_049825 /// XM_005265430 /// X...,0006464 // cellular protein modification proce...,0005634 // nucleus // not recorded /// 0005829...,0000166 // nucleotide binding // inferred from...
6,1316_at,X55005,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,X55005 /FEATURE=mRNA /DEFINITION=HSCERBAR Homo...,X55005,"thyroid hormone receptor, alpha",THRA,7067,NM_001190918 /// NM_001190919 /// NM_003250 //...,0000122 // negative regulation of transcriptio...,0000790 // nuclear chromatin // inferred from ...,0000978 // RNA polymerase II core promoter pro...
7,1320_at,X79510,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,X79510 /FEATURE=cds /DEFINITION=HSPTPD1 H.sapi...,X79510,"protein tyrosine phosphatase, non-receptor typ...",PTPN21,11099,NM_007039 /// XM_005267287 /// XM_006720011,0006470 // protein dephosphorylation // tracea...,0005737 // cytoplasm // inferred from electron...,0004721 // phosphoprotein phosphatase activity...
8,1405_i_at,M21121,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,M21121 /FEATURE= /DEFINITION=HUMTCSM Human T c...,M21121,chemokine (C-C motif) ligand 5,CCL5,6352,NM_001278736 /// NM_002985,0000165 // MAPK cascade // inferred from mutan...,0005576 // extracellular region // traceable a...,0004435 // phosphatidylinositol phospholipase ...
9,1431_at,J02843,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,J02843 /FEATURE=cds /DEFINITION=HUMCYPIIE Huma...,J02843,"cytochrome P450, family 2, subfamily E, polype...",CYP2E1,1571,NM_000773,0006641 // triglyceride metabolic process // i...,0000139 // Golgi membrane // inferred from ele...,0004497 // monooxygenase activity // inferred ...


In [68]:
#dir(gse.gpls['GPL96'])
# dir(gse)
#dir(gse.gsms['GSM60749'])
#temp = gse.gsms['GSM60349'].table
#print(temp.VALUE[0])
#dir(gse.gsms['GSM60349'])
temp = gse.gpls['GPL96'].table
temp[['ID','ENTREZ_GENE_ID']]

Unnamed: 0,ID,ENTREZ_GENE_ID
0,1007_s_at,780 /// 100616237
1,1053_at,5982
2,117_at,3310
3,121_at,7849
4,1255_g_at,2978
5,1294_at,7318 /// 100847079
6,1316_at,7067
7,1320_at,11099
8,1405_i_at,6352
9,1431_at,1571


## Create the big table by combining gsms

In [66]:
# gsm_tables['GPL8300']
# temp.dropna(subset=['ID'],how='any',inplace=True)
# temp
# print(key)
# gse.gsms[key].table
maps_list[0]

Unnamed: 0_level_0,ID
ENTREZ_GENE_ID,Unnamed: 1_level_1
780 /// 100616237,1007_s_at
5982,1053_at
3310,117_at
7849,121_at
2978,1255_g_at
7318 /// 100847079,1294_at
7067,1316_at
11099,1320_at
6352,1405_i_at
1571,1431_at


In [82]:
# create 3 tables by platform

# initialize with platform
gsm_tables = {}
for key,val in celformat.items():
    #df = pd.DataFrame([],columns=['ID','ENTREZ_GENE_ID'])
    temp = gse.gpls[key].table
    df = temp[['ID','ENTREZ_GENE_ID']]
    df.set_index('ID',inplace=True)
    gsm_tables[key] = df

# fill each table
for key,val in gsm_platform.items():
    temp = gse.gsms[key].table.copy()
    temp.rename(columns={"ID_REF": "ID"},inplace=True)
    temp.dropna(subset=['ID'],how='any',inplace=True)
    temp.set_index('ID',inplace=True)
    #col1,col2,col3 = '{}.VALUE'.format(key), '{}.ABS_CALL'.format(key), '{}.DETECTION P-VALUE'.format(key)
    col1,col2,col3 = '{}.CEL.gz'.format(key), '{}.CEL.gz.1'.format(key), '{}.CEL.gz.2'.format(key)
    #gsm_tables[val].loc[:,[col1,col2,col3]] = temp[['VALUE','ABS_CALL','DETECTION P-VALUE']]
    gsm_tables[val][col1] = temp['VALUE']
    gsm_tables[val][col2] = temp['ABS_CALL']
    gsm_tables[val][col3] = temp['DETECTION P-VALUE']

# dropna
for key,val in celformat.items():
    #df = pd.DataFrame([],columns=['ID','ENTREZ_GENE_ID'])
    gsm_tables[key].dropna(subset=['ENTREZ_GENE_ID'],inplace=True)
    gsm_tables[key].set_index('ENTREZ_GENE_ID',inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [106]:
df_outer = None
for key,val in celformat.items():
    #df = pd.DataFrame([],columns=['ID','ENTREZ_GENE_ID'])
    print('{}: {}'.format(key,gsm_tables[key].shape))
    if df_outer is None:
        df_outer = gsm_tables[key]
    else:
        df_outer = pd.merge(df_outer,gsm_tables[key],on='ENTREZ_GENE_ID', how='outer')

df_outer.dropna(how='all',inplace=True)
print('{}: {}'.format(key,df_outer.shape))
df_outer.to_csv('{}_full_table.csv'.format(gsename))
df_outer

GPL96: (20973, 102)
GPL97: (16222, 102)
GPL8300: (12119, 102)
GPL8300: (45311, 306)


Unnamed: 0_level_0,GSM60699.CEL.gz,GSM60699.CEL.gz.1,GSM60699.CEL.gz.2,GSM60700.CEL.gz,GSM60700.CEL.gz.1,GSM60700.CEL.gz.2,GSM60701.CEL.gz,GSM60701.CEL.gz.1,GSM60701.CEL.gz.2,GSM60702.CEL.gz,...,GSM60378.CEL.gz.2,GSM60379.CEL.gz,GSM60379.CEL.gz.1,GSM60379.CEL.gz.2,GSM60380.CEL.gz,GSM60380.CEL.gz.1,GSM60380.CEL.gz.2,GSM60381.CEL.gz,GSM60381.CEL.gz.1,GSM60381.CEL.gz.2
ENTREZ_GENE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
780 /// 100616237,33.5,P,0.009985,55.9,P,0.014937,82.0,P,0.001892,50.1,...,0.003067,88.6,P,0.002617,77.0,P,0.001892,80.5,P,0.000959
780 /// 100616237,33.5,P,0.009985,55.9,P,0.014937,82.0,P,0.001892,50.1,...,0.004863,36.9,P,0.000959,48.3,P,0.002617,38.9,P,0.001141
780 /// 100616237,34.1,A,0.129639,32.9,A,0.111572,44.6,P,0.046143,29.1,...,0.003067,88.6,P,0.002617,77.0,P,0.001892,80.5,P,0.000959
780 /// 100616237,34.1,A,0.129639,32.9,A,0.111572,44.6,P,0.046143,29.1,...,0.004863,36.9,P,0.000959,48.3,P,0.002617,38.9,P,0.001141
780 /// 100616237,45.4,P,0.005859,37.6,P,0.010742,44.0,P,0.001953,32.0,...,0.003067,88.6,P,0.002617,77.0,P,0.001892,80.5,P,0.000959
780 /// 100616237,45.4,P,0.005859,37.6,P,0.010742,44.0,P,0.001953,32.0,...,0.004863,36.9,P,0.000959,48.3,P,0.002617,38.9,P,0.001141
780 /// 100616237,52.6,P,0.010742,55.1,P,0.023926,66.8,P,0.002930,63.8,...,0.003067,88.6,P,0.002617,77.0,P,0.001892,80.5,P,0.000959
780 /// 100616237,52.6,P,0.010742,55.1,P,0.023926,66.8,P,0.002930,63.8,...,0.004863,36.9,P,0.000959,48.3,P,0.002617,38.9,P,0.001141
5982,17.7,P,0.039365,14.2,A,0.073830,6.7,A,0.284747,14.6,...,0.013092,20.9,P,0.039365,17.9,M,0.043968,22.9,P,0.021866
5982,17.7,A,0.601074,11.8,A,0.633789,13.2,A,0.432373,10.7,...,0.013092,20.9,P,0.039365,17.9,M,0.043968,22.9,P,0.021866


In [113]:
df_outer.to_csv(os.path.join(genedir,'{}_full_table.csv'.format(gsename)))

In [112]:
datadir

'/Users/zhzhao/Dropbox/Helikar/pipelines/data'

In [103]:
emptydf = pd.DataFrame([],index=['ENTREZ_GENE_ID'])
result = pd.merge(emptydf,gsm_tables['GPL97'], on='ENTREZ_GENE_ID',how='outer')
result

KeyError: 'ENTREZ_GENE_ID'

In [90]:
# Combine 3 tables

# gsm_tables['GPL8300']
# gsm_tables['GPL96']
# gsm_tables['GPL97']
df_outer = pd.merge(gsm_tables['GPL8300'], gsm_tables['GPL96'], on='ENTREZ_GENE_ID', how='outer')
df_outer = pd.merge(df_outer,gsm_tables['GPL97'],on='ENTREZ_GENE_ID', how='outer')
df_outer.dropna(how='all',inplace=True)
df_outer

Unnamed: 0_level_0,GSM60348.CEL.gz,GSM60348.CEL.gz.1,GSM60348.CEL.gz.2,GSM60349.CEL.gz,GSM60349.CEL.gz.1,GSM60349.CEL.gz.2,GSM60350.CEL.gz,GSM60350.CEL.gz.1,GSM60350.CEL.gz.2,GSM60351.CEL.gz,...,GSM60763.CEL.gz.2,GSM60764.CEL.gz,GSM60764.CEL.gz.1,GSM60764.CEL.gz.2,GSM60765.CEL.gz,GSM60765.CEL.gz.1,GSM60765.CEL.gz.2,GSM60766.CEL.gz,GSM60766.CEL.gz.1,GSM60766.CEL.gz.2
ENTREZ_GENE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5595,118.5,P,0.003585,66.7,P,0.011447,37.7,P,0.035163,63.4,...,,,,,,,,,,
7075,8.9,A,0.189687,7.3,A,0.320830,2.8,A,0.284747,6.5,...,,,,,,,,,,
1557,1.6,A,0.697453,1.2,A,0.926170,5.3,A,0.358690,5.8,...,,,,,,,,,,
1557,5.8,A,0.581931,7.5,A,0.621816,3.1,A,0.621816,2.7,...,,,,,,,,,,
643,4.1,A,0.810313,4.7,A,0.765443,4.9,A,0.621816,10.8,...,,,,,,,,,,
643,4.1,A,0.810313,4.7,A,0.765443,4.9,A,0.621816,10.8,...,,,,,,,,,,
643,20.6,A,0.150527,27.7,A,0.218983,35.1,P,0.011447,25.3,...,,,,,,,,,,
643,20.6,A,0.150527,27.7,A,0.218983,35.1,P,0.011447,25.3,...,,,,,,,,,,
643,1.5,A,0.765443,1.7,A,0.926170,1.0,A,0.901946,2.6,...,,,,,,,,,,
643,1.5,A,0.765443,1.7,A,0.926170,1.0,A,0.901946,2.6,...,,,,,,,,,,


## End of Code

In [90]:
# onlyfiles = [f for f in os.listdir(genedir) if os.path.isfile(os.path.join(genedir, f))]
# onlyfiles
# os.listdir(genedir)
onlyfiles = [f for f in os.listdir(genedir) if f.endswith('.gz')]
onlyfiles

['GSM60739.CEL.gz',
 'GSM60744.CEL.gz',
 'GSM60727.CEL.gz',
 'GSM60756.CEL.gz',
 'GSM60735.CEL.gz',
 'GSM60748.CEL.gz',
 'GSM60760.CEL.gz',
 'GSM60703.CEL.gz',
 'GSM60711.CEL.gz',
 'GSM60351.cel.gz',
 'GSM60379.CEL.gz',
 'GSM60367.CEL.gz',
 'GSM60375.CEL.gz',
 'GSM60353.cel.gz',
 'GSM60377.CEL.gz',
 'GSM60369.CEL.gz',
 'GSM60365.CEL.gz',
 'GSM60737.CEL.gz',
 'GSM60729.CEL.gz',
 'GSM60754.CEL.gz',
 'GSM60725.CEL.gz',
 'GSM60758.CEL.gz',
 'GSM60746.CEL.gz',
 'GSM60713.CEL.gz',
 'GSM60701.CEL.gz',
 'GSM60762.CEL.gz',
 'GSM60361.CEL.gz',
 'GSM60699.CEL.gz',
 'GSM60373.CEL.gz',
 'GSM60357.CEL.gz',
 'GSM60349.CEL.gz',
 'GSM60705.CEL.gz',
 'GSM60766.CEL.gz',
 'GSM60380.CEL.gz',
 'GSM60717.CEL.gz',
 'GSM60709.CEL.gz',
 'GSM60721.CEL.gz',
 'GSM60742.CEL.gz',
 'GSM60733.CEL.gz',
 'GSM60750.CEL.gz',
 'GSM60715.CEL.gz',
 'GSM60764.CEL.gz',
 'GSM60719.CEL.gz',
 'GSM60707.CEL.gz',
 'GSM60752.CEL.gz',
 'GSM60731.CEL.gz',
 'GSM60740.CEL.gz',
 'GSM60723.CEL.gz',
 'GSM60371.CEL.gz',
 'GSM60363.CEL.gz',


In [55]:
# dir(gse.gsms[gsm])
# print(gse.gsms[gsm].metadata['title'])
print(gse.gsms[gsm].columns.loc['ID_REF','description'])
r1 = re.findall(r"U133B",gse.gsms[gsm].columns.loc['ID_REF','description'])
r1

Affymetrix U95Av2 probe ID


[]

In [68]:
currentdir = os.getcwd()
dirlist = currentdir.split('/')
projectdir = '/'.join(dirlist[0:-1])

In [71]:
gene = 'GSE2770'
genedir = os.path.join(projectdir,gene + '_RAW')
genedir

'/Users/zhzhao/Dropbox/Helikar/pipelines/GSE2770_RAW'

In [39]:
gse.gsms['GSM60349'].table

Unnamed: 0,ID_REF,VALUE,ABS_CALL,DETECTION P-VALUE
0,100_g_at,82.0,M,0.054470
1,1000_at,66.7,P,0.011447
2,1001_at,7.3,A,0.320830
3,1002_f_at,1.2,A,0.926170
4,1003_s_at,4.7,A,0.765443
5,1004_at,27.7,A,0.218983
6,1005_at,48.6,P,0.014937
7,1006_at,8.5,A,0.267463
8,1007_s_at,71.8,P,0.017001
9,1008_f_at,183.3,P,0.001602


In [30]:
r1 = re.findall(r"\[.*?\]",celformat['GPL96'])

In [35]:
r1[0][4:-1]

'U133A'

In [13]:
import rpy2
print(rpy2.__version__)

3.0.5


In [38]:
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
import rpy2.robjects as ro
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage

pandas2ri.activate()

Converting Rpy2 dataframe to pandas

https://rpy2.github.io/doc/latest/html/pandas.html

In [15]:
utils = importr("utils")
utils.data

R object with classes: ('function',) mapped to:

In [23]:
affy = importr("affy")
string = """
readaffydir <- function(addr){
    setwd(addr)
    mydata = ReadAffy()
    eset = mas5(mydata)
    eset_PMA <- mas5calls(mydata)
    y <- data.frame(exprs(eset), exprs(eset_PMA), assayDataElement(eset_PMA, "se.exprs"))
    y <- y[,sort(names(y))]
    return(y)
}
"""

affyio = SignatureTranslatedAnonymousPackage(string, "affyio")


In [24]:
# powerpack.square(25)
gpl8300rawdir = '/Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL8300/'
outputdf = affyio.readaffydir(gpl8300rawdir)

background correction: mas 
PM/MM correction : mas 
expression values: mas 
background correcting...

R[write to console]: 



done.
12625 ids to be processed
|                    |
|####################|
Getting probe level data...
Computing p-values
Making P/M/A Calls


In [32]:
outputdf

GSM60349.CEL.gz,GSM60349.CEL.gz.1,GSM60349.CEL.gz.2,...,GSM60381.CEL.gz,GSM60381.CEL.gz.1,GSM60381.CEL.gz.2
...,...,...,...,...,...,...


In [37]:
with localconverter(ro.default_converter + pandas2ri.converter):
    pd_from_r_df = ro.conversion.rpy2py(outputdf)

pd_from_r_df

Unnamed: 0,GSM60349.CEL.gz,GSM60349.CEL.gz.1,GSM60349.CEL.gz.2,GSM60350.CEL.gz,GSM60350.CEL.gz.1,GSM60350.CEL.gz.2,GSM60351.cel.gz,GSM60351.cel.gz.1,GSM60351.cel.gz.2,GSM60352.cel.gz,...,GSM60378.CEL.gz.2,GSM60379.CEL.gz,GSM60379.CEL.gz.1,GSM60379.CEL.gz.2,GSM60380.CEL.gz,GSM60380.CEL.gz.1,GSM60380.CEL.gz.2,GSM60381.CEL.gz,GSM60381.CEL.gz.1,GSM60381.CEL.gz.2
100_g_at,819.696849,M,0.054470,697.976614,P,0.035163,792.199601,M,0.054470,670.927981,...,0.073830,525.182524,M,0.048995,605.096506,P,0.035163,671.219124,P,0.039365
1000_at,666.601875,P,0.011447,377.096623,P,0.035163,633.796783,P,0.019303,369.024441,...,0.014937,502.625967,P,0.014937,575.696476,P,0.027860,476.986010,P,0.009985
1001_at,73.056571,A,0.320830,27.907207,A,0.284747,64.561944,A,0.175989,9.274875,...,0.189687,94.660663,A,0.175989,171.428355,M,0.054470,17.197764,A,0.602006
1002_f_at,11.787246,A,0.926170,53.378871,A,0.358691,57.671637,A,0.541184,14.085050,...,0.520620,19.654731,A,0.765443,22.265423,A,0.715253,5.054754,A,0.983000
1003_s_at,46.570660,A,0.765443,49.144225,A,0.621815,108.366510,A,0.500000,73.856262,...,0.581931,132.972994,A,0.438361,65.490578,A,0.602006,18.326015,A,0.810313
1004_at,276.871626,A,0.218983,350.451574,P,0.011447,252.800586,A,0.127645,212.181074,...,0.204022,157.720347,A,0.189687,215.081926,A,0.081337,156.242030,A,0.234557
1005_at,485.753658,P,0.014937,99.133354,A,0.541184,934.061710,P,0.000266,100.917653,...,0.001354,471.338044,P,0.004863,468.230344,P,0.000468,485.045190,P,0.000468
1006_at,85.230428,A,0.267463,48.180077,A,0.378185,6.745844,A,0.602006,3.240873,...,0.320830,7.598366,A,0.500000,15.661077,A,0.715253,23.027493,A,0.438361
1007_s_at,717.599098,P,0.017000,966.038612,P,0.003067,724.167413,P,0.005643,910.941864,...,0.003067,885.906655,P,0.002617,769.702265,P,0.001892,805.025121,P,0.000959
1008_f_at,1832.103394,P,0.001602,1141.965627,P,0.001141,828.595844,P,0.000805,615.969660,...,0.001602,2256.733954,P,0.000673,2059.737948,P,0.000388,1689.705825,P,0.000219
