# Pre-Process the data
Function: Throw data of each gene to corresponding folder based on their platform (e.g. GPL96, GPL97, GPL8300, etc.)

In [1]:
import GEOparse
import re
import os
import pandas as pd

## Extract Gene Info from GEO DataSets

In [2]:
gse = GEOparse.get_GEO(geo="GSE2770", destdir="./")

06-Aug-2019 12:00:26 DEBUG utils - Directory ./ already exists. Skipping.
06-Aug-2019 12:00:26 INFO GEOparse - File already exist: using local version.
06-Aug-2019 12:00:26 INFO GEOparse - Parsing ./GSE2770_family.soft.gz: 
06-Aug-2019 12:00:26 DEBUG GEOparse - DATABASE: GeoMiame
06-Aug-2019 12:00:26 DEBUG GEOparse - SERIES: GSE2770
06-Aug-2019 12:00:26 DEBUG GEOparse - PLATFORM: GPL96
06-Aug-2019 12:00:30 DEBUG GEOparse - PLATFORM: GPL97
06-Aug-2019 12:00:32 DEBUG GEOparse - PLATFORM: GPL8300
06-Aug-2019 12:00:33 DEBUG GEOparse - SAMPLE: GSM60348
06-Aug-2019 12:00:33 DEBUG GEOparse - SAMPLE: GSM60349
06-Aug-2019 12:00:33 DEBUG GEOparse - SAMPLE: GSM60350
06-Aug-2019 12:00:34 DEBUG GEOparse - SAMPLE: GSM60351
06-Aug-2019 12:00:34 DEBUG GEOparse - SAMPLE: GSM60352
06-Aug-2019 12:00:34 DEBUG GEOparse - SAMPLE: GSM60353
06-Aug-2019 12:00:34 DEBUG GEOparse - SAMPLE: GSM60354
06-Aug-2019 12:00:34 DEBUG GEOparse - SAMPLE: GSM60355
06-Aug-2019 12:00:34 DEBUG GEOparse - SAMPLE: GSM60356
06-Aug

## Extract Platform Information

In [3]:
keys = []
values = []
for gpl in gse.gpls:
    keys.append(gse.gpls[gpl].name)
    r1 = re.findall(r"\[.*?\]",gse.gpls[gpl].metadata['title'][0])
    values.append(r1[0][4:-1])
    #values.append(gse.gpls[gpl].metadata['title'][0])
    print(gse.gpls[gpl].name)
    print(gse.gpls[gpl].metadata['title'][0])
    
celformat = dict(zip(keys, values))

GPL96
[HG-U133A] Affymetrix Human Genome U133A Array
GPL97
[HG-U133B] Affymetrix Human Genome U133B Array
GPL8300
[HG_U95Av2] Affymetrix Human Genome U95 Version 2 Array


## Classify the Samples by Platform

In [4]:
keys = []
values = []
for gsm in gse.gsms:
    #print(gse.gsms[gsm].name)
    keys.append(gse.gsms[gsm].name)
    for key,val in celformat.items():
        r1 = re.findall(r"{}".format(val),gse.gsms[gsm].columns.loc['ID_REF','description'])
        if not r1:
            pass
        else:
            values.append(key)
    #r1 = re.findall(r"\(.*?\)",gse.gsms[gsm].columns.loc['ID_REF','description'])
    #values.append(r1[0][1:-1])
    #values.append(gse.gsms[gsm].columns.loc['ID_REF','description'])

gsm_platform = dict(zip(keys, values))
gsm_platform

{'GSM60348': 'GPL8300',
 'GSM60349': 'GPL8300',
 'GSM60350': 'GPL8300',
 'GSM60351': 'GPL8300',
 'GSM60352': 'GPL8300',
 'GSM60353': 'GPL8300',
 'GSM60354': 'GPL8300',
 'GSM60355': 'GPL8300',
 'GSM60356': 'GPL8300',
 'GSM60357': 'GPL8300',
 'GSM60358': 'GPL8300',
 'GSM60359': 'GPL8300',
 'GSM60360': 'GPL8300',
 'GSM60361': 'GPL8300',
 'GSM60362': 'GPL8300',
 'GSM60363': 'GPL8300',
 'GSM60364': 'GPL8300',
 'GSM60365': 'GPL8300',
 'GSM60366': 'GPL8300',
 'GSM60367': 'GPL8300',
 'GSM60368': 'GPL8300',
 'GSM60369': 'GPL8300',
 'GSM60370': 'GPL8300',
 'GSM60371': 'GPL8300',
 'GSM60372': 'GPL8300',
 'GSM60373': 'GPL8300',
 'GSM60374': 'GPL8300',
 'GSM60375': 'GPL8300',
 'GSM60376': 'GPL8300',
 'GSM60377': 'GPL8300',
 'GSM60378': 'GPL8300',
 'GSM60379': 'GPL8300',
 'GSM60380': 'GPL8300',
 'GSM60381': 'GPL8300',
 'GSM60699': 'GPL96',
 'GSM60700': 'GPL96',
 'GSM60701': 'GPL96',
 'GSM60702': 'GPL96',
 'GSM60703': 'GPL96',
 'GSM60704': 'GPL96',
 'GSM60705': 'GPL96',
 'GSM60706': 'GPL96',
 'GSM607

## Create Folders if not exist, Move files to corresponding folders

In [5]:
# Setup paths
currentdir = os.getcwd()
dirlist = currentdir.split('/')
projectdir = '/'.join(dirlist[0:-1])
datadir = os.path.join(projectdir,'data')
outputdir = os.path.join(projectdir,'output')
gene = 'GSE2770'
genedir = os.path.join(datadir,gene + '_RAW')

In [6]:
# create a folder for each platform
for key in celformat.keys():
    platformdir = os.path.join(genedir,key)
    if not os.path.exists(platformdir):
        os.makedirs(platformdir)
        print('Path created: {}'.format(platformdir))
    else:
        print('Path already exist: {}'.format(platformdir))


Path already exist: /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL96
Path already exist: /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97
Path already exist: /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL8300


In [91]:
# Move Corresponding Cel files to Folders
#onlyfiles = [f for f in os.listdir(genedir) if os.path.isfile(os.path.join(genedir, f))]
onlyfiles = [f for f in os.listdir(genedir) if f.endswith('.gz')]

for file in onlyfiles:
    filelist = file.split('.')
    prefix = filelist[0]
    if prefix in gsm_platform:
        platform = gsm_platform[prefix]
        platformdir = os.path.join(genedir,platform)
        src_path = os.path.join(genedir, file)
        dst_path = os.path.join(platformdir, file)
        os.rename(src_path,dst_path)
        print('Move {} to {}'.format(src_path,dst_path))
        


Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60739.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60739.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60744.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60744.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60727.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL96/GSM60727.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60756.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60756.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60735.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60735.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GSM60748.CEL.gz to /Users/zhzhao/Dropbox/Helikar/pipelines/data/GSE2770_RAW/GPL97/GSM60748.CEL.gz
Move /Users/zhzhao/Dropbox/Helikar/pipel

## Download GPL (Gene ID map for Platforms)

In [11]:
platforms = ['GPL96','GPL97','GPL8300']

maps_list = []
gene_maps = pd.DataFrame([],columns=['GPL96','GPL97','GPL8300','ENTREZ_GENE_ID'])
gene_maps.set_index('ENTREZ_GENE_ID',inplace=True)
for platform in platforms:
    temp =gse.gpls[platform].table[['ID','ENTREZ_GENE_ID']]
    # Save to file
    filefullpath = os.path.join(datadir,'{}entrez.csv'.format(platform))
    print(filefullpath)
    temp.to_csv(filefullpath, index=False)
    # Single Table
    temp.dropna(axis=0,inplace=True)
    temp.set_index('ENTREZ_GENE_ID',inplace=True)
    maps_list.append(temp)
    #gene_maps[platform]=temp['ID']
    #gene_maps.merge(temp,right_index=True)

#gene_maps = pd.concat(maps_list, axis=1, sort=False)
#gene_maps

/Users/zhzhao/Dropbox/Helikar/pipelines/data/GPL96entrez.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


/Users/zhzhao/Dropbox/Helikar/pipelines/data/GPL97entrez.csv
/Users/zhzhao/Dropbox/Helikar/pipelines/data/GPL8300entrez.csv


In [7]:
gse.gpls['GPL96'].table

Unnamed: 0,ID,GB_ACC,SPOT_ID,Species Scientific Name,Annotation Date,Sequence Type,Sequence Source,Target Description,Representative Public ID,Gene Title,Gene Symbol,ENTREZ_GENE_ID,RefSeq Transcript ID,Gene Ontology Biological Process,Gene Ontology Cellular Component,Gene Ontology Molecular Function
0,1007_s_at,U48705,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Huma...,U48705,discoidin domain receptor tyrosine kinase 1 //...,DDR1 /// MIR4640,780 /// 100616237,NM_001202521 /// NM_001202522 /// NM_001202523...,0001558 // regulation of cell growth // inferr...,0005576 // extracellular region // inferred fr...,0000166 // nucleotide binding // inferred from...
1,1053_at,M87338,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,M87338 /FEATURE= /DEFINITION=HUMA1SBU Human re...,M87338,"replication factor C (activator 1) 2, 40kDa",RFC2,5982,NM_001278791 /// NM_001278792 /// NM_001278793...,0000278 // mitotic cell cycle // traceable aut...,0005634 // nucleus // inferred from electronic...,0000166 // nucleotide binding // inferred from...
2,117_at,X51757,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,X51757 /FEATURE=cds /DEFINITION=HSP70B Human h...,X51757,heat shock 70kDa protein 6 (HSP70B'),HSPA6,3310,NM_002155,0000902 // cell morphogenesis // inferred from...,0005737 // cytoplasm // inferred from direct a...,0000166 // nucleotide binding // inferred from...
3,121_at,X69699,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,X69699 /FEATURE= /DEFINITION=HSPAX8A H.sapiens...,X69699,paired box 8,PAX8,7849,NM_003466 /// NM_013951 /// NM_013952 /// NM_0...,0001655 // urogenital system development // in...,0005634 // nucleus // inferred from direct ass...,0000979 // RNA polymerase II core promoter seq...
4,1255_g_at,L36861,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,L36861 /FEATURE=expanded_cds /DEFINITION=HUMGC...,L36861,guanylate cyclase activator 1A (retina),GUCA1A,2978,NM_000409 /// XM_006715073,0007165 // signal transduction // non-traceabl...,0001750 // photoreceptor outer segment // infe...,0005509 // calcium ion binding // inferred fro...
5,1294_at,L13852,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,L13852 /FEATURE= /DEFINITION=HUME1URP Homo sap...,L13852,microRNA 5193 /// ubiquitin-like modifier acti...,MIR5193 /// UBA7,7318 /// 100847079,NM_003335 /// NR_049825 /// XM_005265430 /// X...,0006464 // cellular protein modification proce...,0005634 // nucleus // not recorded /// 0005829...,0000166 // nucleotide binding // inferred from...
6,1316_at,X55005,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,X55005 /FEATURE=mRNA /DEFINITION=HSCERBAR Homo...,X55005,"thyroid hormone receptor, alpha",THRA,7067,NM_001190918 /// NM_001190919 /// NM_003250 //...,0000122 // negative regulation of transcriptio...,0000790 // nuclear chromatin // inferred from ...,0000978 // RNA polymerase II core promoter pro...
7,1320_at,X79510,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,X79510 /FEATURE=cds /DEFINITION=HSPTPD1 H.sapi...,X79510,"protein tyrosine phosphatase, non-receptor typ...",PTPN21,11099,NM_007039 /// XM_005267287 /// XM_006720011,0006470 // protein dephosphorylation // tracea...,0005737 // cytoplasm // inferred from electron...,0004721 // phosphoprotein phosphatase activity...
8,1405_i_at,M21121,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,M21121 /FEATURE= /DEFINITION=HUMTCSM Human T c...,M21121,chemokine (C-C motif) ligand 5,CCL5,6352,NM_001278736 /// NM_002985,0000165 // MAPK cascade // inferred from mutan...,0005576 // extracellular region // traceable a...,0004435 // phosphatidylinositol phospholipase ...
9,1431_at,J02843,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,J02843 /FEATURE=cds /DEFINITION=HUMCYPIIE Huma...,J02843,"cytochrome P450, family 2, subfamily E, polype...",CYP2E1,1571,NM_000773,0006641 // triglyceride metabolic process // i...,0000139 // Golgi membrane // inferred from ele...,0004497 // monooxygenase activity // inferred ...


In [32]:
#dir(gse.gpls['GPL96'])
# dir(gse)
#dir(gse.gsms['GSM60749'])
#temp = gse.gsms['GSM60349'].table
#print(temp.VALUE[0])
#dir(gse.gsms['GSM60349'])
temp =gse.gpls['GPL96'].table
temp[['ID','ENTREZ_GENE_ID']]

Unnamed: 0,ID,ENTREZ_GENE_ID
0,1007_s_at,780 /// 100616237
1,1053_at,5982
2,117_at,3310
3,121_at,7849
4,1255_g_at,2978
5,1294_at,7318 /// 100847079
6,1316_at,7067
7,1320_at,11099
8,1405_i_at,6352
9,1431_at,1571


In [60]:
#temp
#pd.concat(maps_list, axis=1, join='outer',sort=False)
maps_list

[                                              ID
 ENTREZ_GENE_ID                                  
 780 /// 100616237                      1007_s_at
 5982                                     1053_at
 3310                                      117_at
 7849                                      121_at
 2978                                   1255_g_at
 7318 /// 100847079                       1294_at
 7067                                     1316_at
 11099                                    1320_at
 6352                                   1405_i_at
 1571                                     1431_at
 2049                                     1438_at
 2101                                     1487_at
 1548                                   1494_f_at
 2621                                   1598_g_at
 4323                                   160020_at
 8717                                     1729_at
 2342 /// 100529261                       1773_at
 5337                                      177_at


## End of Code

In [90]:
# onlyfiles = [f for f in os.listdir(genedir) if os.path.isfile(os.path.join(genedir, f))]
# onlyfiles
# os.listdir(genedir)
onlyfiles = [f for f in os.listdir(genedir) if f.endswith('.gz')]
onlyfiles

['GSM60739.CEL.gz',
 'GSM60744.CEL.gz',
 'GSM60727.CEL.gz',
 'GSM60756.CEL.gz',
 'GSM60735.CEL.gz',
 'GSM60748.CEL.gz',
 'GSM60760.CEL.gz',
 'GSM60703.CEL.gz',
 'GSM60711.CEL.gz',
 'GSM60351.cel.gz',
 'GSM60379.CEL.gz',
 'GSM60367.CEL.gz',
 'GSM60375.CEL.gz',
 'GSM60353.cel.gz',
 'GSM60377.CEL.gz',
 'GSM60369.CEL.gz',
 'GSM60365.CEL.gz',
 'GSM60737.CEL.gz',
 'GSM60729.CEL.gz',
 'GSM60754.CEL.gz',
 'GSM60725.CEL.gz',
 'GSM60758.CEL.gz',
 'GSM60746.CEL.gz',
 'GSM60713.CEL.gz',
 'GSM60701.CEL.gz',
 'GSM60762.CEL.gz',
 'GSM60361.CEL.gz',
 'GSM60699.CEL.gz',
 'GSM60373.CEL.gz',
 'GSM60357.CEL.gz',
 'GSM60349.CEL.gz',
 'GSM60705.CEL.gz',
 'GSM60766.CEL.gz',
 'GSM60380.CEL.gz',
 'GSM60717.CEL.gz',
 'GSM60709.CEL.gz',
 'GSM60721.CEL.gz',
 'GSM60742.CEL.gz',
 'GSM60733.CEL.gz',
 'GSM60750.CEL.gz',
 'GSM60715.CEL.gz',
 'GSM60764.CEL.gz',
 'GSM60719.CEL.gz',
 'GSM60707.CEL.gz',
 'GSM60752.CEL.gz',
 'GSM60731.CEL.gz',
 'GSM60740.CEL.gz',
 'GSM60723.CEL.gz',
 'GSM60371.CEL.gz',
 'GSM60363.CEL.gz',


In [55]:
# dir(gse.gsms[gsm])
# print(gse.gsms[gsm].metadata['title'])
print(gse.gsms[gsm].columns.loc['ID_REF','description'])
r1 = re.findall(r"U133B",gse.gsms[gsm].columns.loc['ID_REF','description'])
r1

Affymetrix U95Av2 probe ID


[]

In [68]:
currentdir = os.getcwd()
dirlist = currentdir.split('/')
projectdir = '/'.join(dirlist[0:-1])

In [71]:
gene = 'GSE2770'
genedir = os.path.join(projectdir,gene + '_RAW')
genedir

'/Users/zhzhao/Dropbox/Helikar/pipelines/GSE2770_RAW'

In [37]:
celformat

{'GPL96': 'U133A', 'GPL97': 'U133B', 'GPL8300': 'U95Av2'}

In [30]:
r1 = re.findall(r"\[.*?\]",celformat['GPL96'])

In [35]:
r1[0][4:-1]

'U133A'