In [1]:
import os
import time
import urllib
import urllib.request
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

#might move output destination
#os.chdir('/home/ /Downloads/')
#os.mkdir('Info')

In [2]:
# getting a list of studies (GSEs) from the GEO datasets database 

# Inclusion criteria:
# > studies in mammals treated with valproic acid : valpro* AND animal -> animal is too broad??? species names tricky
# > not subjected to any other intervention : ?? (control vs valproate only) (only can know from each sample, so wait next step)
# > not cancer studies: NOT cancer -> bias
# > not epilepsy studies: NOT epilepsy -> used as drug

# > not isolated cells (cultivated cell lines or primary cells): ?? (only can know from each sample, so wait next step)
# > mRNA expression arrays
# > not non-coding RNA or genome arrays: exptype


# Query string for the GEO DataSets database (db=gds): series entries
# (GSE[ETYP]) matching valpro* and animal, excluding epilepsy and cancer.
term='term=GSE[ETYP]+AND+valpro*+AND+animal+NOT+epilepsy+NOT+cancer'

# eSearch: run the query. usehistory=y asks NCBI to keep the result set on its
# side so the eSummary call below can refer to it via query_key + WebEnv.
# The response is parsed inside `with` so the HTTP connection is always closed.
with urllib.request.urlopen('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&'+term+'&retmax=5000&usehistory=y') as response:
    ids=ET.parse(response).getroot()

env=ids[4].text  # sent below as the WebEnv parameter
k=ids[3].text    # sent below as the query_key parameter

# eSummary: fetch one summary record per search hit.
# The parsed root is deliberately NOT stored under the name `sum`, so the
# builtin sum() stays usable in the rest of the notebook.
with urllib.request.urlopen('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gds&version=2.0&query_key='+k+'&WebEnv='+env) as response:
    summaryRoot=ET.parse(response).getroot()

# DocumentSummarySet/DocumentSummary folder
sumL=summaryRoot.findall('DocumentSummarySet/DocumentSummary')

# Fields are read by child position within each DocumentSummary; the column
# names of gdsSearch below state what each position is expected to hold.
# NOTE(review): positional access silently picks the wrong field if NCBI ever
# reorders the record - consider switching to tag-name lookups once the tag
# names have been confirmed against a live response.
gse=[doc[0].text for doc in sumL]
nSamples=[doc[18].text for doc in sumL]
gpl=[doc[4].text for doc in sumL]
species=[doc[6].text for doc in sumL]
date=[doc[13].text for doc in sumL]
desc=[doc[8].text for doc in sumL]
# Download links are built from the per-series base link (position 25) plus
# the series accession (position 0).
ftp=[doc[25].text+'miniml/'+doc[0].text+'_family.xml.tgz' for doc in sumL]
ftpMartrix=[doc[25].text+'matrix/'+doc[0].text+'_series_matrix.txt.gz' for doc in sumL]

# UIDs come from the eSearch result, not from the summaries themselves.
uid=[node.text for node in ids.findall('IdList/Id')]

# The UIDs are paired with the summaries purely by position, so fail loudly
# (same exception type pandas would raise) if the two responses disagree.
# NOTE(review): this also assumes eSummary returns records in the same order
# as the eSearch IdList - confirm.
if len(uid)!=len(sumL):
    raise ValueError('eSearch returned '+str(len(uid))+' UIDs but eSummary returned '
                     +str(len(sumL))+' summaries; cannot align them by position')

gdsSearch=pd.DataFrame({'UID':uid,
            'GSE':gse,
            'N.samples':nSamples,
             'GPL':gpl,
             'Species':species,
             'Date':date,
             'ExpType':desc,
             'DataLink':ftp,
            'MatrixLink':ftpMartrix})


In [3]:
# getting PubMed cross-links
#
# For every GEO UID, ask eLink (dbfrom=gds, db=pubmed) for a linked PubMed
# record and collect the pairs into pubLinks. UIDs without a link get the
# placeholder 'No Pubmed entry'.

pubIds=list()
gdsIds=list()
for gdsUid in uid:
    # Parse inside `with` so each HTTP connection is closed before the next request.
    with urllib.request.urlopen('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=gds&db=pubmed&id='+gdsUid) as response:
        linkRoot=ET.parse(response).getroot()
    # The GEO UID as echoed back in the response (read by child position).
    gdsIds.append(linkRoot[0][1][0].text)
    try:
        # Only the first linked id is kept. When the response has no link
        # section the positional lookup raises IndexError.
        pubIds.append(linkRoot[0][2][2][0].text)
    except IndexError:
        # Catch only the "no link" case: a bare except would also swallow
        # KeyboardInterrupt and real errors, and could append twice per UID.
        pubIds.append('No Pubmed entry')
    # Pause after EVERY request, including the ones without a PubMed link -
    # presumably to stay under NCBI's request-rate limit; confirm the limit
    # that applies to this client.
    time.sleep(0.34)

pubLinks=pd.DataFrame({'UID':gdsIds,
             'Pubmed':pubIds})

In [4]:
# Attach the PubMed cross-links to the GEO series table.
# Both frames share the 'UID' column; the outer join keeps every UID that
# appears in either frame.
vpaData=gdsSearch.merge(pubLinks,how='outer',on='UID')

# Save the combined table as CSV in the current working directory.
# (Alternative destination that was considered: './Info/vpa_search.csv'.)
vpaData.to_csv(path_or_buf='vpa_search_first.csv')

In [5]:
# Overview of the retrieved series: for each experiment type (ExpType),
# the number of non-missing entries in every other column.

dataSummary=vpaData.groupby(by='ExpType').count()

Output: GSE meta-data
vpa_search_first.csv (written from vpaData)
dataSummary (counts per ExpType, computed from vpaData)
The CSV file is comma-delimited.

Task: sift through the metadata and find the IDs/names of the samples to extract for analysis.

Idea: filter on species name. Keep a list of common mammal genera; if a study's species does not match the list, either look the species up and identify it, or flag the row for manual review.

In [6]:
# Show the merged GEO + PubMed table as plain text (pandas truncates long frames).
print(vpaData)

           UID        GSE N.samples          GPL                    Species  \
0    200210642  GSE210642        49  17021;18413  Danio rerio; Mus musculus   
1    200204779  GSE204779         8        24676               Homo sapiens   
2    200191286  GSE191286        12  24676;18573               Homo sapiens   
3    200191285  GSE191285         1        24676               Homo sapiens   
4    200191282  GSE191282         1        18573               Homo sapiens   
..         ...        ...       ...          ...                        ...   
171  200005388    GSE5388        61           96               Homo sapiens   
172  200003210    GSE3210       540         1820          Rattus norvegicus   
173  200002303    GSE2303        94           85          Rattus norvegicus   
174  200002187    GSE2187       587         1820          Rattus norvegicus   
175  200000666     GSE666        19      8300;96               Homo sapiens   

           Date                                    

In [7]:
# Show the per-ExpType counts computed from vpaData as plain text.
print(dataSummary)

                                                    UID  GSE  N.samples  GPL  \
ExpType                                                                        
Expression profiling by array                        80   80         80   80   
Expression profiling by array; Methylation prof...    2    2          2    2   
Expression profiling by array; Methylation prof...    1    1          1    1   
Expression profiling by array; Non-coding RNA p...    1    1          1    1   
Expression profiling by genome tiling array           1    1          1    1   
Expression profiling by high throughput sequencing   63   63         63   63   
Expression profiling by high throughput sequenc...    7    7          7    7   
Expression profiling by high throughput sequenc...    1    1          1    1   
Expression profiling by high throughput sequenc...    2    2          2    2   
Genome binding/occupancy profiling by genome ti...    1    1          1    1   
Genome binding/occupancy profiling by hi

In [10]:
# Genera accepted as mammals. Sus = pig; Callithrix and Macaca = monkeys.
# Deliberately not listed: Drosophila (fly) and Danio (fish).
mammals=["Homo", "Mus", "Rattus", "Sus", "Callithrix", "Macaca"]

# Match each genus as a whole word only, so that e.g. "Mus" does not also
# match a longer genus name such as "Musca". The group is non-capturing to
# avoid pandas' warning about match groups in str.contains patterns.
mammalPattern=r'\b(?:'+'|'.join(mammals)+r')\b'

# Keep every row whose Species text contains at least one listed genus.
# Rows listing several species are kept when ANY of them matches
# (e.g. "Danio rerio; Mus musculus"). na=False treats a missing Species value
# as "no match"; without it the mask would contain NaN and the row selection
# would raise.
vpaDataFiltered1a = vpaData[vpaData.Species.str.contains(mammalPattern, na=False)]

print(vpaDataFiltered1a)
vpaDataFiltered1a.to_csv('vpa_search_f1a_animal.csv') # puts result in this folder



           UID        GSE N.samples          GPL                    Species  \
0    200210642  GSE210642        49  17021;18413  Danio rerio; Mus musculus   
1    200204779  GSE204779         8        24676               Homo sapiens   
2    200191286  GSE191286        12  24676;18573               Homo sapiens   
3    200191285  GSE191285         1        24676               Homo sapiens   
4    200191282  GSE191282         1        18573               Homo sapiens   
..         ...        ...       ...          ...                        ...   
171  200005388    GSE5388        61           96               Homo sapiens   
172  200003210    GSE3210       540         1820          Rattus norvegicus   
173  200002303    GSE2303        94           85          Rattus norvegicus   
174  200002187    GSE2187       587         1820          Rattus norvegicus   
175  200000666     GSE666        19      8300;96               Homo sapiens   

           Date                                    

In [9]:
# Experiment types to keep; a row is kept if its ExpType contains ANY of them.
expTypeToUse=["Expression profiling by array","Expression profiling by high throughput sequencing"]

# Substring match on ExpType, applied to the mammal-filtered table.
# na=False treats a missing ExpType as "no match"; without it the mask would
# contain NaN and the row selection would raise.
# NOTE(review): because this is a substring match, combined types such as
# "Expression profiling by array; Non-coding RNA p..." are kept as well.
# Confirm this is intended, since the inclusion criteria exclude non-coding
# RNA and genome arrays.
vpaDataFiltered1b1 = vpaDataFiltered1a[vpaDataFiltered1a.ExpType.str.contains('|'.join(expTypeToUse), na=False)]

print(vpaDataFiltered1b1)

vpaDataFiltered1b1.to_csv('vpa_search_f1.csv') # puts result in this folder
    

           UID        GSE N.samples          GPL                    Species  \
0    200210642  GSE210642        49  17021;18413  Danio rerio; Mus musculus   
1    200204779  GSE204779         8        24676               Homo sapiens   
2    200191286  GSE191286        12  24676;18573               Homo sapiens   
3    200191285  GSE191285         1        24676               Homo sapiens   
5    200182900  GSE182900         9        23159               Homo sapiens   
..         ...        ...       ...          ...                        ...   
171  200005388    GSE5388        61           96               Homo sapiens   
172  200003210    GSE3210       540         1820          Rattus norvegicus   
173  200002303    GSE2303        94           85          Rattus norvegicus   
174  200002187    GSE2187       587         1820          Rattus norvegicus   
175  200000666     GSE666        19      8300;96               Homo sapiens   

           Date                                    