# RNAi Phenotypes Wormbase (WS266)

Author: Zachary Flamholz  
Date: 06-2018  
Database: https://wormbase.org/#012-34-5  
Data: ftp://ftp.wormbase.org/pub/wormbase/releases/WS266/ONTOLOGY/rnai_phenotypes.WS266.wb   
Companion files: http://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Invertebrates/Caenorhabditis_elegans.gene_info.gz, http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=WpaXref  

# Versions of Modules in Use

In [3]:
# Load the version_information IPython extension, then print a table with the
# Python/IPython/OS versions plus the versions of the modules listed (numpy, pandas).
%load_ext version_information
%version_information numpy, pandas

The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information


Software,Version
Python,2.7.15 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
IPython,5.7.0
OS,Darwin 17.5.0 x86_64 i386 64bit
numpy,1.14.3
pandas,0.23.0
Mon Jul 30 10:29:38 2018 EDT,Mon Jul 30 10:29:38 2018 EDT


## import libraries

In [2]:
import numpy as np
import pandas as pd
import sys
import datetime
import scipy.stats as stat

## read in data

In [4]:
# Load the tab-separated WormBase RNAi phenotype dump. The file has no header
# row, so the five column labels are supplied at read time.
df = pd.read_csv(
    "input/rnai_phenotypes.WS266_WormBase.txt",
    sep="\t",
    header=None,
    names=["WB gene ID", "Gene symbol", "phenotype", "WB phenotype ID", "WB source ID"],
)

In [5]:
# Preview the first rows of the phenotype table to confirm the column labels line up.
df.head()

Unnamed: 0,WB gene ID,Gene symbol,phenotype,WB phenotype ID,WB source ID
0,WBGene00001908,F17E9.9,locomotion variant,WBPhenotype:0000643,WBRNAi00025129|WBPaper00006395 WBRNAi00025631|...
1,WBGene00001908,F17E9.9,avoids bacterial lawn,WBPhenotype:0000402,WBRNAi00095640|WBPaper00040984
2,WBGene00001908,F17E9.9,RAB-11 recycling endosome localization variant,WBPhenotype:0002107,WBRNAi00090830|WBPaper00041129
3,WBGene00001908,F17E9.9,body elongation defective,WBPhenotype:0000242,WBRNAi00082054|WBPaper00005085
4,WBGene00001908,F17E9.9,chromosome segregation variant,WBPhenotype:0000773,WBRNAi00082059|WBPaper00005085


In [6]:
# (rows, columns) of the phenotype table; each row is one gene/phenotype annotation line.
df.shape

(54934, 5)

In [7]:
# Number of distinct phenotype terms in the table.
df["phenotype"].drop_duplicates().shape[0]

1223

In [8]:
# Number of distinct gene symbols in the table.
df["Gene symbol"].drop_duplicates().shape[0]

8106

In [9]:
# Pull out the rows annotated with the "locomotion variant" phenotype as an
# independent copy, so later column edits do not touch df.
is_locomotion_variant = df["phenotype"] == "locomotion variant"
df_location_variant = df[is_locomotion_variant].copy()

In [10]:
# Show the first "locomotion variant" row as a Series (one entry per column).
df_location_variant.iloc[0]

WB gene ID                                            WBGene00001908
Gene symbol                                                  F17E9.9
phenotype                                         locomotion variant
WB phenotype ID                                  WBPhenotype:0000643
WB source ID       WBRNAi00025129|WBPaper00006395 WBRNAi00025631|...
Name: 0, dtype: object

In [11]:
# WB gene ID of the first "locomotion variant" row.
df_location_variant["WB gene ID"].iloc[0]

'WBGene00001908'

## analyze term set

In [12]:
# Count how many rows of the phenotype table carry each phenotype term.
# A single grouped pass replaces re-filtering the whole frame once per term.
# sort=False keeps the terms in order of first appearance, i.e. the same order
# as df["phenotype"].unique().
# NOTE(review): this counts rows, which equals genes per term only if each
# gene/phenotype pair occurs on a single row - confirm against the input.
# NOTE(review): groupby skips missing phenotype values, whereas the previous
# per-term scan reported a count of 0 for them; the loaded table has none.
term_row_counts = df.groupby("phenotype", sort=False).size()
numGenes_perTerms = term_row_counts.values


In [13]:
# Average number of rows per phenotype term.
numGenes_perTerms.mean()

44.91741618969746

In [14]:
# Median number of rows per phenotype term; compare with the mean to see how skewed the term sizes are.
np.median(numGenes_perTerms)

6.0

## convert gene symbol column to general nomenclature

In [16]:
# Load the tab-separated NCBI gene_info table for C. elegans; its first line is used as the header.
cElegans_geneInfo = pd.read_csv("input/Caenorhabditis_elegans.gene_info", sep="\t")

In [17]:
# Preview the reference table; the WormBase gene ID sits in the dbXrefs column.
cElegans_geneInfo.head()

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,6239,171590,homt-1,CELE_Y74C9A.3,-,WormBase:WBGene00022277,I,-,Alpha N-terminal protein methyltransferase 1,protein-coding,-,-,-,Alpha N-terminal protein methyltransferase 1,20180606,-
1,6239,171591,nlp-40,CELE_Y74C9A.2,-,WormBase:WBGene00022276,I,-,Peptide P4,protein-coding,-,-,-,Peptide P4,20180606,-
2,6239,171592,rcor-1,CELE_Y74C9A.4,-,WormBase:WBGene00022278,I,-,RCOR (REST CO-Repressor) homolog,protein-coding,-,-,-,RCOR (REST CO-Repressor) homolog,20180606,-
3,6239,171593,sesn-1,CELE_Y74C9A.5,-,WormBase:WBGene00022279,I,-,Sestrin homolog,protein-coding,-,-,-,Sestrin homolog,20180606,-
4,6239,171594,pgs-1,CELE_Y48G1C.4,-,WormBase:WBGene00021677,I,-,CDP-diacylglycerol--glycerol-3-phosphate 3-pho...,protein-coding,-,-,-,CDP-diacylglycerol--glycerol-3-phosphate 3-pho...,20180406,-


In [18]:
# Try the "WormBase:<id>" split on the first row's dbXrefs value.
cElegans_geneInfo["dbXrefs"].iloc[0].split(":")[1]

'WBGene00022277'

In [19]:
def remove_WormBase(x):
    """Return the WormBase gene ID contained in a dbXrefs string.

    A dbXrefs value is one or more "<database>:<accession>" entries joined
    by "|". The accession of the first entry whose database is "WormBase"
    is returned, regardless of where that entry sits in the list. Values
    without a WormBase entry are returned unchanged.
    """
    for xref in x.split("|"):
        database, separator, accession = xref.partition(":")
        if separator and database == "WormBase":
            return accession
    return x
## there are two rows that have multiple dbXrefs
# Overwrite the dbXrefs column in place with the output of remove_WormBase, so
# the column can be compared directly with the "WB gene ID" column of df.
cElegans_geneInfo["dbXrefs"] = cElegans_geneInfo["dbXrefs"].apply(remove_WormBase)
cElegans_geneInfo.head()

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,6239,171590,homt-1,CELE_Y74C9A.3,-,WBGene00022277,I,-,Alpha N-terminal protein methyltransferase 1,protein-coding,-,-,-,Alpha N-terminal protein methyltransferase 1,20180606,-
1,6239,171591,nlp-40,CELE_Y74C9A.2,-,WBGene00022276,I,-,Peptide P4,protein-coding,-,-,-,Peptide P4,20180606,-
2,6239,171592,rcor-1,CELE_Y74C9A.4,-,WBGene00022278,I,-,RCOR (REST CO-Repressor) homolog,protein-coding,-,-,-,RCOR (REST CO-Repressor) homolog,20180606,-
3,6239,171593,sesn-1,CELE_Y74C9A.5,-,WBGene00022279,I,-,Sestrin homolog,protein-coding,-,-,-,Sestrin homolog,20180606,-
4,6239,171594,pgs-1,CELE_Y48G1C.4,-,WBGene00021677,I,-,CDP-diacylglycerol--glycerol-3-phosphate 3-pho...,protein-coding,-,-,-,CDP-diacylglycerol--glycerol-3-phosphate 3-pho...,20180406,-


## testing how to convert names

In [20]:
# Check the Python type of a reference-table ID before comparing it with the phenotype-table IDs.
type(cElegans_geneInfo["dbXrefs"][0])

str

In [21]:
# Same check for the phenotype table: row 0, column 0 is the "WB gene ID" value.
type(df.iloc[0,0])

str

In [22]:
# Look up the reference-table symbol for the first phenotype row's WB gene ID.
first_wb_gene_id = df["WB gene ID"].iloc[0]
cElegans_geneInfo.loc[cElegans_geneInfo["dbXrefs"] == first_wb_gene_id, "Symbol"]

17913    his-34
Name: Symbol, dtype: object

In [23]:
# Membership is tested against .values on purpose: `in` applied to the Series
# itself would look at the index labels, not the stored IDs.
'WBGene00022277' in cElegans_geneInfo["dbXrefs"].values

True

In [24]:
# Gene symbols of the "locomotion variant" rows before any renaming.
df_location_variant.head(10)

Unnamed: 0,WB gene ID,Gene symbol,phenotype,WB phenotype ID,WB source ID
0,WBGene00001908,F17E9.9,locomotion variant,WBPhenotype:0000643,WBRNAi00025129|WBPaper00006395 WBRNAi00025631|...
57,WBGene00007110,B0035.11,locomotion variant,WBPhenotype:0000643,WBRNAi00024350|WBPaper00006395
118,WBGene00006707,R09B3.4,locomotion variant,WBPhenotype:0000643,WBRNAi00026070|WBPaper00006395
168,WBGene00003808,T05A1.1,locomotion variant,WBPhenotype:0000643,WBRNAi00075428|WBPaper00006132
185,WBGene00016250,C30C11.4,locomotion variant,WBPhenotype:0000643,WBRNAi00024683|WBPaper00006395
227,WBGene00012735,Y40B1A.4,locomotion variant,WBPhenotype:0000643,WBRNAi00004607|WBPaper00004402 WBRNAi00026682|...
290,WBGene00004951,K10B3.10,locomotion variant,WBPhenotype:0000643,WBRNAi00008979|WBPaper00005654 WBRNAi00025934|...
356,WBGene00017678,F21F8.4,locomotion variant,WBPhenotype:0000643,WBRNAi00078339|WBPaper00027214
394,WBGene00019778,M57.2,locomotion variant,WBPhenotype:0000643,WBRNAi00009019|WBPaper00005654 WBRNAi00026001|...
536,WBGene00019881,R05D3.8,locomotion variant,WBPhenotype:0000643,WBRNAi00026026|WBPaper00006395 WBRNAi00074594|...


In [25]:
# Confirm that the first matching Symbol for a known WB gene ID comes back as a plain string.
type(cElegans_geneInfo.loc[cElegans_geneInfo["dbXrefs"] == 'WBGene00001908']["Symbol"].tolist()[0])

str

In [26]:
## if there is no WB gene ID in the reference table then the symbol becomes the WB gene ID from the phenotype list
# Build the WB gene ID -> Symbol lookup once, instead of scanning the whole
# reference table twice per row (membership test plus boolean filter).
# drop_duplicates keeps the first row per ID, so the first match still wins.
symbol_by_wb_id = (
    cElegans_geneInfo.drop_duplicates(subset="dbXrefs")
    .set_index("dbXrefs")["Symbol"]
    .to_dict()
)
change_name = lambda x: symbol_by_wb_id.get(x, x)
df_location_variant["Gene symbol"] = df_location_variant["WB gene ID"].apply(change_name)

In [27]:
# Inspect the renamed symbols; IDs missing from the reference table keep their WB gene ID as the symbol.
df_location_variant.head(100)

Unnamed: 0,WB gene ID,Gene symbol,phenotype,WB phenotype ID,WB source ID
0,WBGene00001908,his-34,locomotion variant,WBPhenotype:0000643,WBRNAi00025129|WBPaper00006395 WBRNAi00025631|...
57,WBGene00007110,leo-1,locomotion variant,WBPhenotype:0000643,WBRNAi00024350|WBPaper00006395
118,WBGene00006707,ubc-12,locomotion variant,WBPhenotype:0000643,WBRNAi00026070|WBPaper00006395
168,WBGene00003808,npr-2,locomotion variant,WBPhenotype:0000643,WBRNAi00075428|WBPaper00006132
185,WBGene00016250,hsp-110,locomotion variant,WBPhenotype:0000643,WBRNAi00024683|WBPaper00006395
227,WBGene00012735,sptf-3,locomotion variant,WBPhenotype:0000643,WBRNAi00004607|WBPaper00004402 WBRNAi00026682|...
290,WBGene00004951,spc-1,locomotion variant,WBPhenotype:0000643,WBRNAi00008979|WBPaper00005654 WBRNAi00025934|...
356,WBGene00017678,asp-12,locomotion variant,WBPhenotype:0000643,WBRNAi00078339|WBPaper00027214
394,WBGene00019778,M57.2,locomotion variant,WBPhenotype:0000643,WBRNAi00009019|WBPaper00005654 WBRNAi00026001|...
536,WBGene00019881,R05D3.8,locomotion variant,WBPhenotype:0000643,WBRNAi00026026|WBPaper00006395 WBRNAi00074594|...


## use reference list to extract only protein coding genes

In [28]:
# Restrict the reference table to rows whose type_of_gene is "protein-coding".
is_protein_coding = cElegans_geneInfo["type_of_gene"].eq("protein-coding")
cElegans_proteinCoding = cElegans_geneInfo.loc[is_protein_coding]

In [29]:
# Keep only the phenotype rows whose WB gene ID appears among the protein-coding reference IDs.
protein_coding_ids = cElegans_proteinCoding["dbXrefs"].unique()
has_protein_coding_id = df["WB gene ID"].isin(protein_coding_ids)
df_onlyProteinCoding = df[has_protein_coding_id]

In [30]:
# Rows remaining after the protein-coding filter (compare with df.shape).
df_onlyProteinCoding.shape

(53433, 5)

## load the table for PMID # reference

In [31]:
# Read the paper ID conversion table with one whole line per row in a single "info" column.
# NOTE(review): sep='~' is presumably a character that never occurs in the file, so that
# no line gets split into several columns - confirm against the input.
paper_lookup = pd.read_csv('input/WB_paper_id_conversion_table', sep='~', header=None)
paper_lookup.columns = ['info']

In [32]:
# Each row is "<WBPaper ID> <external ID>"; one paper can appear on several rows.
paper_lookup.head()

Unnamed: 0,info
0,WBPaper00000003 cgc3
1,WBPaper00000005 cgc5
2,WBPaper00000005 doi10.1163/187529275X00518
3,WBPaper00000006 cgc6
4,WBPaper00000007 cgc7


In [33]:
## need to split the file and then create a dictionary for each paper id
table = paper_lookup['info'].apply(lambda x: x.split(' '))
table_dict = {}
for i in range(len(table)):
    table_dict[table[i][0]] = table[i][1]

In [34]:
# Spot-check the lookup with a paper ID that occurs in the "WB source ID" column of df.
table_dict['WBPaper00006395']

'pmid14551910'

## create gmt by writing to file

In [35]:
# Write the gene-set library as a GMT file: one line per phenotype term, made of
# "<term>_<WB phenotype ID>", an empty description field and the unique gene
# symbols, all separated by tabs.

# Terms with fewer unique protein-coding genes than this are left out.
MIN_GENES_PER_TERM = 5

# The output name carries the current year and month, e.g. ..._2018_07.gmt
filename = 'wormbase_rnai_phenotype_%s.gmt' % datetime.date.today().strftime('%Y_%m')

# WB gene ID -> Symbol lookup, built once; the first reference row per ID wins.
symbol_by_wb_id = (
    cElegans_geneInfo.drop_duplicates(subset="dbXrefs")
    .set_index("dbXrefs")["Symbol"]
    .to_dict()
)


def change_name(x):
    """Return the reference-table symbol for WB gene ID x, or x itself if it is unknown."""
    return symbol_by_wb_id.get(x, x)


def split_splice(x):
    """Drop the trailing transcript number from names of the form X.#.# (-> X.#)."""
    parts = x.split('.')
    # Keep the dot between the first two parts so the gene name stays intact.
    return '.'.join(parts[:2]) if len(parts) > 2 else x


terms = df["phenotype"].unique()

# The with-block closes the file even if a term fails part-way through.
with open(filename, 'w') as gmt_file:
    for term in terms:
        # Build the mask from the filtered frame itself so mask and frame share one index.
        df_byTerm = df_onlyProteinCoding.loc[df_onlyProteinCoding["phenotype"] == term].copy()

        # Cheap pre-check: fewer rows than the threshold cannot hold enough unique genes.
        if df_byTerm.shape[0] < MIN_GENES_PER_TERM:
            continue

        # convert gene name using the cElegans gene info table, then strip transcript suffixes
        df_byTerm["Gene symbol"] = df_byTerm["WB gene ID"].apply(change_name).apply(split_splice)

        genes = df_byTerm["Gene symbol"].unique()
        if len(genes) < MIN_GENES_PER_TERM:
            continue

        set_name = "%s_%s" % (term, df_byTerm["WB phenotype ID"].iloc[0])
        # Joining avoids the trailing tab (an empty last gene) at the end of each line.
        gmt_file.write("\t".join([set_name, ""] + list(genes)) + "\n")