In [1]:
import pandas as pd
from intermine.webservice import Service
service = Service("https://phytozome.jgi.doe.gov/phytomine/service")

def get_cds(name): # name should be transcript name
    template = service.get_template('Transcript_CDS_sequence')
    rows = template.rows(
        A = {"op": "=", "value": name}
        )
    for row in rows:
        return row["CDSs.sequence.residues"]

def get_pep(name): # name should be transcript name
    template = service.get_template('Transcript_peptide_sequence')
    rows = template.rows(
        A = {"op": "=", "value": name}
        )
    for row in rows:
        return row["sequence.residues"]

In [2]:
def get_info_phyto(genelist): # genelist should be consisted with genenames
    service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
    dic = {"organism.shortName" : [],
           "primaryIdentifier" : [],
           "briefDescription" : [],
           "symbol" : [],
           "description" : [],
           "Name" : []
           }
    # The view specifies the output columns
    for gene in genelist:
        query = service.new_query("Gene")
        query.add_view(
        "primaryIdentifier", "briefDescription", "symbol",
        "description", "organism.shortName","name"
        )
        query.add_constraint("primaryIdentifier", "=", gene, code = "A")
        for row in query.rows():
            dic["organism.shortName"].append(row["organism.shortName"])
            dic["primaryIdentifier"].append(row["primaryIdentifier"])
            dic["briefDescription"].append(row["briefDescription"])
            dic["symbol"].append(row["symbol"])
            dic["description"].append(row["description"])
            dic["Name"].append(row["name"])
                
    return dic

In [3]:
def get_info_thale(genelist): # genelist should be consisted with genenames
    service = Service("https://apps.araport.org/thalemine/service")
    dic = {"organism.shortName" : [],
           "primaryIdentifier" : [],
           "briefDescription" : [],
           "symbol" : [],
           "description" : [],
           "Name" : []
           }
    # The view specifies the output columns
    for gene in genelist:
        query = service.new_query("Gene")
        query.add_view(
        "primaryIdentifier", "briefDescription", "symbol",
        "description", "organism.shortName","name"
        )
        query.add_constraint("primaryIdentifier", "=", gene, code = "A")
        for row in query.rows():
            dic["organism.shortName"].append(row["organism.shortName"])
            dic["primaryIdentifier"].append(row["primaryIdentifier"])
            dic["briefDescription"].append(row["briefDescription"])
            dic["symbol"].append(row["symbol"])
            dic["description"].append(row["description"])
            dic["Name"].append(row["name"])
                
    return dic

In [4]:

# Get a new query on the class (table) you will be querying:
def gene2homologs(genename,organism1,organism2):
    service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
    query = service.new_query("Homolog")
    # The view specifies the output columns
    query.add_view(
        "gene1.primaryIdentifier", "gene2.primaryIdentifier", "organism2.shortName",
        "relationship"
    )
    # Uncomment and edit the line below (the default) to select a custom sort order:
    # query.add_sort_order("Homolog.gene1.primaryIdentifier", "ASC")
    # You can edit the constraint values below
    query.add_constraint("gene1.primaryIdentifier", "=", genename, code = "A")
    query.add_constraint("organism1.shortName", "=", organism1, code = "B")
    query.add_constraint("organism2.shortName", "=", organism2, code = "C")
    # Uncomment and edit the code below to specify your own custom logic:
    #query.set_logic("A and B and (C or D or E or F)")
    homolog = []
    for row in query.rows():
        homolog.append(row["gene2.primaryIdentifier"])
    return list(set(homolog))


In [5]:
def getClusterID(genename,organism):
    service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
    # Get a new query on the class (table) you will be querying:
    query = service.new_query("ProteinFamily")
    # The view specifies the output columns
    query.add_view(
        "clusterId", "gene.primaryIdentifier", "gene.briefDescription",
        "gene.organism.shortName", "gene.symbol"
    )
    # Uncomment and edit the line below (the default) to select a custom sort order:
    # query.add_sort_order("ProteinFamily.clusterId", "ASC")
    # You can edit the constraint values below
    query.add_constraint("gene.primaryIdentifier", "=", genename, code = "A")
    query.add_constraint("gene.organism.shortName", "=", organism, code = "B")
    # Uncomment and edit the code below to specify your own custom logic:
    #query.set_logic("A" and "B")
    clusterID = []
    for row in query.rows():
        #print row["clusterId"], row["gene.primaryIdentifier"], row["gene.briefDescription"], \
        #    row["gene.organism.shortName"], row["gene.symbol"]
        clusterID.append(row["clusterId"])
    return(clusterID)

In [6]:
def get_genelistfromClusterID(clusteridlist,organism):
    service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
    # Get a new query on the class (table) you will be querying:
    genelist = []
    for clusterid in clusteridlist:
        query = service.new_query("ProteinFamily")
        # The view specifies the output columns
        query.add_view(
            "clusterId", "gene.primaryIdentifier", "gene.briefDescription",
            "gene.organism.shortName", "gene.symbol"
        )
        # Uncomment and edit the line below (the default) to select a custom sort order:
        # query.add_sort_order("ProteinFamily.clusterId", "ASC")
        # You can edit the constraint values below
        query.add_constraint("clusterId", "=", clusterid, code = "A")
        query.add_constraint("gene.organism.shortName", "=", organism, code = "B")
        # Uncomment and edit the code below to specify your own custom logic:
        # query.set_logic("A")
        
        for row in query.rows():
            #print row["clusterId"], row["gene.primaryIdentifier"], row["gene.briefDescription"], \
            #    row["gene.organism.shortName"], row["gene.symbol"]
            genelist.append(row["gene.primaryIdentifier"])
    return(set(genelist))

In [7]:
def gene2genefamiliy(genename,organism):
    genelist = get_genelistfromClusterID(getClusterID(genename,organism),organism)
    return(list(genelist))

In [8]:
getClusterID('AT5G21960','A. thaliana')

[27860306,
 63455334,
 63601196,
 63847027,
 64102876,
 64211139,
 64529071,
 64602490,
 64927177,
 65167215,
 65392492]

In [10]:
get_genelistfromClusterID([63601196],'A. thaliana')

{u'AT1G19210',
 u'AT1G21910',
 u'AT1G44830',
 u'AT1G74930',
 u'AT1G77640',
 u'AT5G21960'}

In [11]:
gene2genefamiliy('AT5G21960','A. thaliana')

[u'AT1G44830',
 u'AT1G77640',
 u'AT1G74930',
 u'AT5G21960',
 u'AT1G19210',
 u'AT1G21910']

In [12]:
get_info_phyto(['AT1G01360'])

{'Name': [u'AT1G01360'],
 'briefDescription': [u'regulatory component of ABA receptor 1'],
 'description': [None],
 'organism.shortName': [u'A. thaliana'],
 'primaryIdentifier': [u'AT1G01360'],
 'symbol': [None]}

In [16]:
dic = get_info_phyto(Primary_DroughtDB)

WebserviceError: [Errno 502] Proxy Error: '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>502 Proxy Error</title>\n</head><body>\n<h1>Proxy Error</h1>\n<p>The proxy server received an invalid\r\nresponse from an upstream server.<br />\r\nThe proxy server could not handle the request <em><a href="/phytomine/service/query/results">POST&nbsp;/phytomine/service/query/results</a></em>.<p>\nReason: <strong>Error reading from remote server</strong></p></p>\n</body></html>\n'

In [58]:
df = pd.DataFrame(dic)
df

Unnamed: 0,briefDescription,description,organism.shortName,primaryIdentifier,symbol
0,regulatory component of ABA receptor 1,,A. thaliana,AT1G01360,
1,ascorbate peroxidase 1,,A. thaliana,AT1G07890,
2,myb domain protein 60,,A. thaliana,AT1G08810,
3,myb domain protein 61,,A. thaliana,AT1G09540,
4,dsRNA-binding domain-like superfamily protein,,A. thaliana,AT1G09700,
5,Glutathione S-transferase family protein,,A. thaliana,AT1G10370,
6,Undecaprenyl pyrophosphate synthetase family p...,,A. thaliana,AT1G11755,
7,C4-dicarboxylate transporter/malic acid transp...,,A. thaliana,AT1G12480,
8,Integrase-type DNA-binding superfamily protein,,A. thaliana,AT1G15360,
9,pleiotropic drug resistance 12,,A. thaliana,AT1G15520,


In [9]:
dic = {'DroughtDB':[],
       'A. thaliana':[],
       'A. halleri early-release':[],
       'A. lyrata':[],
       'E. salsugineum':[]
       }
for i in df.index:
    genename = df.loc[i]['primaryIdentifier']
    organism1 = df.loc[i]['organism.shortName']
    target_organisms = ['A. thaliana','A. halleri early-release','A. lyrata','E. salsugineum']
    orthgenes = []
    dic['DroughtDB'].append(genename)
    for organism2 in target_organisms:
        if organism2 == organism1:
            #orthgenes.append(genename)
            #dic[organism2].append(genename)
            dic[organism2].append([genename] + gene2genefamiliy(genename,organism1))
        else:
            #orthgenes.append(gene2homologs(genename,organism1,organism2))
            outgene = gene2homologs(genename,organism1,organism2)
            if outgene != []:
                homo_outgene = gene2genefamiliy(outgene[0],organism2)
            else: homo_outgene = []
            dic[organism2].append(list(set(outgene+homo_outgene)))
        #dic[organism2].append(gene2homologs(genename,organism1,organism2))
    #print genename, orthgenes


NameError: name 'df' is not defined

In [187]:
len(dic['A. halleri early-release'])

145

In [188]:
df_homo = pd.DataFrame(dic,columns=['DroughtDB','A. thaliana','A. lyrata','A. halleri early-release','E. salsugineum'])
df_homo.to_csv('DroughtNet.Primary.Orth.Homo.ver2.txt',sep='\t')

In [215]:
df_homo.head(100)

Unnamed: 0,DroughtDB,A. thaliana,A. lyrata,A. halleri early-release,E. salsugineum
0,AT1G01360,"[AT1G01360, AT4G18620, AT5G45860, AT1G73000, A...","[926669, 325175, 889350, 483087, 909840, 93281...",[Araha.11664s0007],"[Thhalv10026943m.g, Thhalv10001220m.g, Thhalv1..."
1,AT1G07890,"[AT1G07890, AT1G77490, AT1G07890, AT4G35000, A...","[930190, 933328, 478284, 491040, 931501, 93150...",[Araha.22931s0002],"[Thhalv10022797m.g, Thhalv10008402m.g, Thhalv1..."
2,AT1G08810,"[AT1G08810, AT5G62470, AT1G74650, AT3G47600, A...","[496423, 484660, 937505, 919629, 476602]",[Araha.5150s0013],"[Thhalv10004487m.g, Thhalv10004538m.g, Thhalv1..."
3,AT1G09540,"[AT1G09540, AT4G01680, AT1G09540, AT1G57560]","[919710, 490409, 314948]",[Araha.1407s0050],"[Thhalv10008000m.g, Thhalv10023601m.g, Thhalv1..."
4,AT1G09700,"[AT1G09700, AT5G41070, AT1G09700, AT5G27120, A...","[471071, 481667, 484479, 493870]",[Araha.3403s0032],"[Thhalv10020473m.g, Thhalv10014087m.g, Thhalv1..."
5,AT1G10370,"[AT1G10370, AT1G59700, AT1G10370, AT1G10360, A...","[935980, 919812, 907524, 907518, 896178, 471148]",[Araha.4736s0001],"[Thhalv10004880m.g, Thhalv10023666m.g, Thhalv1..."
6,AT1G11755,"[AT1G11755, AT1G11755]",[910935],[Araha.2699s0009],[Thhalv10008545m.g]
7,AT1G12480,"[AT1G12480, AT1G62262, AT1G12480, AT4G27970, A...","[924774, 489261, 354000, 471398]",[Araha.2779s0005],"[Thhalv10003801m.g, Thhalv10023540m.g, Thhalv1..."
8,AT1G15360,"[AT1G15360, AT5G11190, AT5G25390, AT5G25190, A...","[489371, 893032, 326806, 471733]",[Araha.4433s0016],"[Thhalv10004973m.g, Thhalv10005514m.g, Thhalv1..."
9,AT1G15520,"[AT1G15520, AT4G15236, AT1G15210, AT4G15233, A...","[481566, 488830, 329987, 357521, 925919, 48559...",[Araha.3835s0001],"[Thhalv10006564m.g, Thhalv10019897m.g, Thhalv1..."


In [214]:
# Successfully orthologous DroughtDB genes
len(df_homo['DroughtDB'])

145

In [218]:
# genefamilies in Ath
x = []
for y in df_homo['A. thaliana']:
    x += y
len(set(x))  

739

In [219]:
# genefamilies in Aly
x = []
for y in df_homo['A. lyrata']:
    x += y
len(set(x))

696

In [221]:
# genefamilies in Aha
x = []
for y in df_homo['A. halleri early-release']:
    x += y
len(set(x))

120

In [222]:
# genefamilies in Esa
x = []
for y in df_homo['E. salsugineum']:
    x += y
len(set(x))

781

In [190]:
dic = {'DroughtDB':[],
       'A.thaliana':[]
        }
for i in df_homo.index:
    ath_list = df_homo.loc[i]['A. thaliana']
    DDB_genename = df_homo.loc[i]['DroughtDB']
    dic['A.thaliana'] += ath_list
    dic['DroughtDB'] += [DDB_genename] * len(ath_list)

In [191]:
len(pd.DataFrame(dic)['A.thaliana'])

1129

In [192]:
dic2 = get_info_thale(dic['A.thaliana'])      

In [194]:
dic.update(dic2)

In [195]:
df_homo_ath = pd.DataFrame(dic)
df_homo_ath.to_csv('DroughtNet.Primary.Orth.Homo.ATH.ver2.txt',sep='\t')

In [197]:
len(df_homo_ath)

1129

In [193]:
get_info_thale(['AT4G36920'])

{'Name': [u'APETALA 2'],
 'briefDescription': [u'Integrase-type DNA-binding superfamily protein'],
 'description': [None],
 'organism.shortName': [u'A. thaliana'],
 'primaryIdentifier': [u'AT4G36920'],
 'symbol': [u'AP2']}

In [105]:
genelist_temp = []
for each in df_homo['A. lyrata']:
    genelist_temp += each
print len(genelist_temp)
print len(set(genelist_temp))

756
623


In [102]:
pd.DataFrame(get_info(df_homo['A. thaliana'][10]))

Unnamed: 0,briefDescription,description,organism.shortName,primaryIdentifier,symbol
0,Inorganic H pyrophosphatase family protein,,A. thaliana,AT1G15690,


In [81]:
gene2homologs('AT1G01360','A. thaliana','A. thaliana')

[u'AT4G01026',
 u'AT5G53160',
 u'AT5G05440',
 u'AT2G40330',
 u'AT2G38310',
 u'AT4G27920']

In [15]:
# 길어서 밑으로 내려놓음 ! 
Primary_DroughtDB = '''AEGTA00197
AEGTA05384
AEGTA07634
AEGTA28497
AEGTA28957
AT1G01360
AT1G07890
AT1G08810
AT1G09540
AT1G09700
AT1G10370
AT1G11755
AT1G12480
AT1G15360
AT1G15520
AT1G15690
AT1G15820
AT1G16060
AT1G16540
AT1G27730
AT1G28490
AT1G32640
AT1G33560
AT1G35720
AT1G45249
AT1G48270
AT1G48410
AT1G49230
AT1G52400
AT1G54160
AT1G56600
AT1G58440
AT1G64060
AT1G64990
AT1G69270
AT1G71960
AT1G72770
AT1G73360
AT1G75270
AT1G78290
AT1G78580
AT1G80460
AT2G04240
AT2G13540
AT2G17820
AT2G18960
AT2G22430
AT2G26300
AT2G27150
AT2G29980
AT2G31320
AT2G31470
AT2G33150
AT2G36450
AT2G38750
AT2G38880
AT2G39800
AT2G43350
AT2G47180
AT2G47190
AT2G47800
AT3G08510
AT3G14440
AT3G19290
AT3G22170
AT3G26090
AT3G51920
AT3G53420
AT3G54820
AT4G00430
AT4G02390
AT4G04720
AT4G04740
AT4G15090
AT4G17645
AT4G18290
AT4G19230
AT4G19395
AT4G23450
AT4G25480
AT4G26080
AT4G27630
AT4G29890
AT4G30950
AT4G31120
AT4G33950
AT4G34000
AT4G34240
AT4G37870
AT4G38130
AT5G03740
AT5G05410
AT5G06530
AT5G10450
AT5G11270
AT5G13680
AT5G16820
AT5G24030
AT5G37500
AT5G40280
AT5G44200
AT5G45340
AT5G47910
AT5G48870
AT5G49890
AT5G51070
AT5G51990
AT5G57050
AT5G61760
AT5G63110
AT5G67300
Contig19733
Glyma10g44160
GRMZM2G006745
GRMZM2G028386
GRMZM2G054361
GRMZM2G085019
GRMZM2G135341
GRMZM2G136910
GRMZM2G166035
GRMZM2G321239
GRMZM2G492252
GRMZM5G804893
LOC_Os01g01420
LOC_Os01g07120
LOC_Os01g42860
LOC_Os01g50110
LOC_Os01g55450
LOC_Os01g58420
LOC_Os01g62410
LOC_Os01g72530
LOC_Os02g08230
LOC_Os02g10760
LOC_Os02g41860
LOC_Os02g44630
LOC_Os02g52780
LOC_Os02g57720
LOC_Os03g03370
LOC_Os03g03660
LOC_Os03g60080
LOC_Os04g56130
LOC_Os05g25770
LOC_Os05g48300
LOC_Os05g51670
LOC_Os06g41010
LOC_Os07g07350
LOC_Os07g22600
LOC_Os07g48760
LOC_Os08g35240
LOC_Os09g05020
LOC_Os09g13570
LOC_Os09g28310
LOC_Os09g35030
LOC_Os11g02240
MLOC_13871
MLOC_54227
MLOC_62301
MLOC_77777
Sb02g025790
Sb04g038330
Sb07g006195
Solyc00g015750
Solyc01g005660
Solyc01g079480
Solyc01g079750
Solyc01g111510
Solyc02g084850
Solyc03g043950
Solyc03g080180
Solyc03g093560
Solyc04g072900
Solyc04g077980
Solyc04g077980
Solyc05g010420
Solyc05g051200
Solyc05g052410
Solyc05g053330
Solyc05g055990
Solyc06g063070
Solyc06g063070
Solyc06g068090
Solyc07g056570
Solyc08g043170
Solyc08g077980
Solyc08g081190
Solyc10g054440
Solyc11g018800
Solyc11g033280
Solyc12g009020'''.split('\n')

In [209]:
len(Primary_DroughtDB)

189