# RNAi Phenotypes Wormbase (WS264)

Author: Zachary Flamholz  
Date: 06-2018  
Database: https://wormbase.org/#012-34-5  
Data: ftp://ftp.wormbase.org/pub/wormbase/releases/WS264/ONTOLOGY/rnai_phenotypes.WS264.wb  
Companion file: http://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Invertebrates/Caenorhabditis_elegans.gene_info.gz,  http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=WpaXref  

# Versions of Modules in Use

In [1]:
%load_ext version_information
%version_information numpy, pandas, clustergrammer_widget 

Software,Version
Python,2.7.15 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
IPython,5.7.0
OS,Darwin 17.5.0 x86_64 i386 64bit
numpy,1.14.3
pandas,0.23.0
clustergrammer_widget,The 'clustergrammer_widget' distribution was not found and is required by the application
Tue Jul 03 10:18:42 2018 EDT,Tue Jul 03 10:18:42 2018 EDT


## import libraries

In [2]:
import numpy as np
import pandas as pd
import sys
import datetime
import scipy.stats as stat

## read in data

In [3]:
# Load the WormBase RNAi phenotype table. The file has no header row, so the
# column labels are supplied while reading.
rnai_columns = ["WB gene ID", "Gene symbol", "phenotype", "WB phenotype ID", "WB source ID"]
df = pd.read_csv(
    "input/rnai_phenotypes.WS264_WormBase.txt",
    sep="\t",
    header=None,
    names=rnai_columns,
)

In [4]:
df.head()

Unnamed: 0,WB gene ID,Gene symbol,phenotype,WB phenotype ID,WB source ID
0,WBGene00001908,F17E9.9,locomotion variant,WBPhenotype:0000643,WBRNAi00025129|WBPaper00006395 WBRNAi00025631|...
1,WBGene00001908,F17E9.9,avoids bacterial lawn,WBPhenotype:0000402,WBRNAi00095640|WBPaper00040984
2,WBGene00001908,F17E9.9,RAB-11 recycling endosome localization variant,WBPhenotype:0002107,WBRNAi00090830|WBPaper00041129
3,WBGene00001908,F17E9.9,body elongation defective,WBPhenotype:0000242,WBRNAi00082054|WBPaper00005085
4,WBGene00001908,F17E9.9,chromosome segregation variant,WBPhenotype:0000773,WBRNAi00082059|WBPaper00005085


In [5]:
df.shape

(54540, 5)

In [6]:
len(df["phenotype"].unique())

1219

In [7]:
len(df["Gene symbol"].unique())

8076

In [8]:
# Single-phenotype subset used below to try out the gene-name conversion.
# NOTE: despite the "location" in the variable name, these are the rows whose
# phenotype is "locomotion variant". .copy() makes the subset independent of df
# so that later column assignments do not touch the original table.
df_location_variant = df.loc[df["phenotype"] == "locomotion variant"].copy()

In [9]:
df_location_variant.iloc[0]

WB gene ID                                            WBGene00001908
Gene symbol                                                  F17E9.9
phenotype                                         locomotion variant
WB phenotype ID                                  WBPhenotype:0000643
WB source ID       WBRNAi00025129|WBPaper00006395 WBRNAi00025631|...
Name: 0, dtype: object

In [10]:
df_location_variant.iloc[0]["WB gene ID"]

'WBGene00001908'

## analyze term set

In [11]:
# Number of annotation rows per phenotype term, computed in one pass.
# sort=False keeps the terms in order of first appearance, i.e. the same order as
# df["phenotype"].unique(). The previous version filtered the whole table once
# per unique term through np.vectorize, which is quadratic in practice.
numGenes_perTerms = df.groupby("phenotype", sort=False).size().values


In [12]:
np.mean(numGenes_perTerms)

44.741591468416736

In [13]:
np.median(numGenes_perTerms)

6.0

## convert gene symbol column to general nomenclature

In [15]:
cElegans_geneInfo = pd.read_csv("input/Caenorhabditis_elegans.gene_info-1", sep="\t")

In [16]:
cElegans_geneInfo.head()

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,6239,171590,homt-1,CELE_Y74C9A.3,-,WormBase:WBGene00022277,I,-,Alpha N-terminal protein methyltransferase 1,protein-coding,-,-,-,Alpha N-terminal protein methyltransferase 1,20180606,-
1,6239,171591,nlp-40,CELE_Y74C9A.2,-,WormBase:WBGene00022276,I,-,Peptide P4,protein-coding,-,-,-,Peptide P4,20180606,-
2,6239,171592,rcor-1,CELE_Y74C9A.4,-,WormBase:WBGene00022278,I,-,RCOR (REST CO-Repressor) homolog,protein-coding,-,-,-,RCOR (REST CO-Repressor) homolog,20180606,-
3,6239,171593,sesn-1,CELE_Y74C9A.5,-,WormBase:WBGene00022279,I,-,Sestrin homolog,protein-coding,-,-,-,Sestrin homolog,20180606,-
4,6239,171594,pgs-1,CELE_Y48G1C.4,-,WormBase:WBGene00021677,I,-,CDP-diacylglycerol--glycerol-3-phosphate 3-pho...,protein-coding,-,-,-,CDP-diacylglycerol--glycerol-3-phosphate 3-pho...,20180406,-


In [17]:
cElegans_geneInfo.iloc[0,5].split(":")[1]

'WBGene00022277'

In [18]:
def remove_WormBase(x):
    """Return the WormBase gene ID contained in a dbXrefs string.

    A dbXrefs value is one or more "|"-separated "Database:identifier"
    cross-references, e.g. "WormBase:WBGene00022277". The identifier of the first
    entry whose database is WormBase is returned. Values without a WormBase
    entry (such as "-") are returned unchanged.

    The earlier one-liner always took the second ":"-separated token, which is
    only correct when the WormBase entry happens to be listed first.
    """
    for xref in x.split("|"):
        if xref.startswith("WormBase:"):
            # split only on the first ":" so the identifier is returned whole
            return xref.split(":", 1)[1]
    return x
## there are two rows that have multiple dbXrefs
cElegans_geneInfo["dbXrefs"] = cElegans_geneInfo["dbXrefs"].apply(remove_WormBase)
cElegans_geneInfo.head()

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,6239,171590,homt-1,CELE_Y74C9A.3,-,WBGene00022277,I,-,Alpha N-terminal protein methyltransferase 1,protein-coding,-,-,-,Alpha N-terminal protein methyltransferase 1,20180606,-
1,6239,171591,nlp-40,CELE_Y74C9A.2,-,WBGene00022276,I,-,Peptide P4,protein-coding,-,-,-,Peptide P4,20180606,-
2,6239,171592,rcor-1,CELE_Y74C9A.4,-,WBGene00022278,I,-,RCOR (REST CO-Repressor) homolog,protein-coding,-,-,-,RCOR (REST CO-Repressor) homolog,20180606,-
3,6239,171593,sesn-1,CELE_Y74C9A.5,-,WBGene00022279,I,-,Sestrin homolog,protein-coding,-,-,-,Sestrin homolog,20180606,-
4,6239,171594,pgs-1,CELE_Y48G1C.4,-,WBGene00021677,I,-,CDP-diacylglycerol--glycerol-3-phosphate 3-pho...,protein-coding,-,-,-,CDP-diacylglycerol--glycerol-3-phosphate 3-pho...,20180406,-


## testing how to convert names

In [19]:
type(cElegans_geneInfo["dbXrefs"][0])

str

In [20]:
type(df.iloc[0,0])

str

In [21]:
cElegans_geneInfo.loc[cElegans_geneInfo["dbXrefs"] == df.iloc[0,0]]["Symbol"]

17913    his-34
Name: Symbol, dtype: object

In [22]:
'WBGene00022277' in cElegans_geneInfo["dbXrefs"].values

True

In [23]:
df_location_variant.head(10)

Unnamed: 0,WB gene ID,Gene symbol,phenotype,WB phenotype ID,WB source ID
0,WBGene00001908,F17E9.9,locomotion variant,WBPhenotype:0000643,WBRNAi00025129|WBPaper00006395 WBRNAi00025631|...
57,WBGene00007110,B0035.11,locomotion variant,WBPhenotype:0000643,WBRNAi00024350|WBPaper00006395
118,WBGene00006707,R09B3.4,locomotion variant,WBPhenotype:0000643,WBRNAi00026070|WBPaper00006395
168,WBGene00003808,T05A1.1,locomotion variant,WBPhenotype:0000643,WBRNAi00075428|WBPaper00006132
185,WBGene00016250,C30C11.4,locomotion variant,WBPhenotype:0000643,WBRNAi00024683|WBPaper00006395
226,WBGene00012735,Y40B1A.4,locomotion variant,WBPhenotype:0000643,WBRNAi00004607|WBPaper00004402 WBRNAi00026682|...
289,WBGene00004951,K10B3.10,locomotion variant,WBPhenotype:0000643,WBRNAi00008979|WBPaper00005654 WBRNAi00025934|...
355,WBGene00017678,F21F8.4,locomotion variant,WBPhenotype:0000643,WBRNAi00078339|WBPaper00027214
393,WBGene00019778,M57.2,locomotion variant,WBPhenotype:0000643,WBRNAi00009019|WBPaper00005654 WBRNAi00026001|...
535,WBGene00019881,R05D3.8,locomotion variant,WBPhenotype:0000643,WBRNAi00026026|WBPaper00006395 WBRNAi00074594|...


In [24]:
type(cElegans_geneInfo.loc[cElegans_geneInfo["dbXrefs"] == 'WBGene00001908']["Symbol"].tolist()[0])

str

In [25]:
## if there is no WB gene ID in the reference table then the symbol becomes the WB gene ID from the phenotype list
# Build the WB gene ID -> symbol lookup once instead of scanning the whole
# reference table for every gene. drop_duplicates keeps the first row per ID,
# which is the row the previous per-gene filter (.tolist()[0]) selected.
wb_id_to_symbol = (
    cElegans_geneInfo.drop_duplicates(subset="dbXrefs")
    .set_index("dbXrefs")["Symbol"]
    .to_dict()
)
change_name = lambda x: wb_id_to_symbol.get(x, x)
df_location_variant["Gene symbol"] = df_location_variant["WB gene ID"].apply(change_name)

In [26]:
df_location_variant.head(100)

Unnamed: 0,WB gene ID,Gene symbol,phenotype,WB phenotype ID,WB source ID
0,WBGene00001908,his-34,locomotion variant,WBPhenotype:0000643,WBRNAi00025129|WBPaper00006395 WBRNAi00025631|...
57,WBGene00007110,leo-1,locomotion variant,WBPhenotype:0000643,WBRNAi00024350|WBPaper00006395
118,WBGene00006707,ubc-12,locomotion variant,WBPhenotype:0000643,WBRNAi00026070|WBPaper00006395
168,WBGene00003808,npr-2,locomotion variant,WBPhenotype:0000643,WBRNAi00075428|WBPaper00006132
185,WBGene00016250,hsp-110,locomotion variant,WBPhenotype:0000643,WBRNAi00024683|WBPaper00006395
226,WBGene00012735,sptf-3,locomotion variant,WBPhenotype:0000643,WBRNAi00004607|WBPaper00004402 WBRNAi00026682|...
289,WBGene00004951,spc-1,locomotion variant,WBPhenotype:0000643,WBRNAi00008979|WBPaper00005654 WBRNAi00025934|...
355,WBGene00017678,asp-12,locomotion variant,WBPhenotype:0000643,WBRNAi00078339|WBPaper00027214
393,WBGene00019778,M57.2,locomotion variant,WBPhenotype:0000643,WBRNAi00009019|WBPaper00005654 WBRNAi00026001|...
535,WBGene00019881,R05D3.8,locomotion variant,WBPhenotype:0000643,WBRNAi00026026|WBPaper00006395 WBRNAi00074594|...


## use reference list to extract only protein coding genes

In [27]:
# Keep only the reference rows whose type_of_gene is "protein-coding".
is_protein_coding_gene = cElegans_geneInfo["type_of_gene"] == "protein-coding"
cElegans_proteinCoding = cElegans_geneInfo[is_protein_coding_gene]

In [28]:
# Restrict the phenotype annotations to genes whose WB gene ID appears among the
# protein-coding rows of the reference table.
protein_coding_ids = cElegans_proteinCoding["dbXrefs"].unique()
has_protein_coding_id = df["WB gene ID"].isin(protein_coding_ids)
df_onlyProteinCoding = df.loc[has_protein_coding_id]

In [29]:
df_onlyProteinCoding.shape

(53073, 5)

## load the WormBase paper ID conversion table (used to look up PMIDs)

In [53]:
# Read every line of the conversion table into a single "info" column; "~" is
# used as the separator so that the space-separated lines are not split here.
paper_lookup = pd.read_csv(
    'input/WB_paper_id_conversion_table',
    sep='~',
    header=None,
    names=['info'],
)

In [54]:
paper_lookup.head()

Unnamed: 0,info
0,WBPaper00000003 cgc3
1,WBPaper00000005 cgc5
2,WBPaper00000005 doi10.1163/187529275X00518
3,WBPaper00000006 cgc6
4,WBPaper00000007 cgc7


In [72]:
## split each "<WBPaper ID> <external ID>" row and build a dictionary keyed by paper id
table = paper_lookup['info'].apply(lambda x: x.split(' '))
# A paper can be listed several times (e.g. once with a cgc id and once with a doi).
# Plain assignment kept whichever row came last. Since this lookup is meant for
# PMIDs, a non-pmid row is never allowed to overwrite a pmid already stored;
# papers without any pmid still map to their last listed identifier.
table_dict = {}
for fields in table:
    paper_id, external_id = fields[0], fields[1]
    if external_id.startswith('pmid') or not table_dict.get(paper_id, '').startswith('pmid'):
        table_dict[paper_id] = external_id

In [74]:
table_dict['WBPaper00006395']

'pmid14551910'

## create gmt by writing to file

In [76]:
# Write the gene-set library in GMT format: one line per phenotype term made of
# "<term>_<WB phenotype ID>", an empty description field, and the unique gene
# symbols, all tab-separated. Only terms with more than 4 annotation rows AND
# more than 4 unique gene symbols are written.
# NOTE: the file name embeds the current year_month, so re-running in a later
# month writes a different file than the one loaded by its hard-coded name below.
filename = 'wormbase_rnai_phenotype_greater4_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')

# WB gene ID -> symbol lookup, built once. drop_duplicates keeps the first row
# per ID, matching the previous per-gene table scan (.tolist()[0]); IDs missing
# from the reference table fall back to the WB gene ID itself.
wb_id_to_symbol = (
    cElegans_geneInfo.drop_duplicates(subset="dbXrefs")
    .set_index("dbXrefs")["Symbol"]
    .to_dict()
)
change_name = lambda x: wb_id_to_symbol.get(x, x)


def split_splice(x):
    """Strip the transcript suffix from names of the form X.#.#.

    "Y74C9A.3.1" becomes "Y74C9A.3". Names with fewer than two dots are
    returned unchanged. (The previous version concatenated the first two parts
    without the dot, producing "Y74C9A3".)
    """
    parts = x.split('.')
    return '.'.join(parts[:2]) if len(parts) > 2 else x


terms = df["phenotype"].unique()
# "with" guarantees the file is closed even if a term fails part-way through.
with open(filename, 'w') as gmt_file:
    for term in terms:
        # Build the mask from the frame that is being indexed, rather than from
        # df, so the selection does not depend on implicit index alignment.
        df_byTerm = df_onlyProteinCoding.loc[df_onlyProteinCoding["phenotype"] == term].copy()

        if df_byTerm.shape[0] > 4:
            df_byTerm["Gene symbol"] = df_byTerm["WB gene ID"].apply(change_name).apply(split_splice)
            genes = df_byTerm["Gene symbol"].unique()

            if len(genes) > 4:
                set_name = "%s" % term + "_" + df_byTerm.iloc[0, 3]
                # Joining the fields avoids the trailing tab the old writer left
                # on every line, which parsed back as an extra empty gene.
                gmt_file.write("\t".join([set_name, ""] + [str(gene) for gene in genes]) + "\n")

## get mean and standard deviation of the rank for each term

In [31]:
def getTermStats(loaded_gmt, geneInfoTable, num_samplings, num_genes_in_sampling, random_state=None):
    """Estimate how each GMT term ranks against randomly drawn gene sets.

    In each of ``num_samplings`` iterations a random set of gene symbols is drawn
    from ``geneInfoTable``, every term is scored against it with Fisher's exact
    test, and the terms are ranked by p-value (rank 1 = smallest p-value).

    Parameters
    ----------
    loaded_gmt : pandas.DataFrame
        One GMT line per row in the first column, fields separated by tabs:
        term name, description, then the gene symbols.
    geneInfoTable : pandas.DataFrame
        Must contain a "Symbol" column. Its number of unique values is used as
        the size of the gene universe and its rows are the pool sampled from.
    num_samplings : int
        Number of random gene sets to draw.
    num_genes_in_sampling : int
        Number of rows sampled from ``geneInfoTable`` per iteration.
    random_state : None, int or numpy.random.RandomState, optional
        Seed or generator for reproducible sampling. With the default ``None``
        the sampling is unseeded, as before.

    Returns
    -------
    dict
        Maps each term name to ``[mean rank, standard deviation of rank]``.
    """
    # Parse the GMT once: term -> set of genes. Empty fields (e.g. the one left
    # by a trailing tab) are dropped so they are not counted as a gene. If a term
    # name occurs on several lines, the last line wins.
    term_genes_dict = {}
    for gmt_line in loaded_gmt.iloc[:, 0]:
        fields = gmt_line.split("\t")
        term_genes_dict[fields[0]] = set(gene for gene in fields[2:] if gene)
    terms = list(term_genes_dict)
    n_terms = len(terms)

    # get the total number of genes in the organism
    n_genes = len(geneInfoTable["Symbol"].unique())

    # One generator shared by all iterations; passing the same int seed to every
    # sample() call would draw the identical set each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # term_ranks[i, j] is the rank of terms[j] in iteration i
    term_ranks = np.empty((num_samplings, n_terms), dtype=int)
    rank_values = np.arange(1, n_terms + 1)

    for i in range(num_samplings):

        # float arithmetic: under Python 2 integer division kept this at 0
        # until the final iteration
        progressPercent = 100.0 * (i + 1) / num_samplings

        sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), num_samplings))
        sys.stdout.flush()

        sampled_symbols = geneInfoTable["Symbol"].sample(num_genes_in_sampling, random_state=rng)
        randomSet = set(sampled_symbols.apply(lambda x: x.split('.')[0]))

        pvals = np.empty(n_terms)
        for j, term in enumerate(terms):
            termSet = term_genes_dict[term]
            # calculate p value using fisher exact test
            # implemented using the formula found in the GeneOverlap bioconductor package for R
            n_shared = len(termSet & randomSet)
            n_term_only = len(termSet) - n_shared
            n_random_only = len(randomSet) - n_shared
            n_neither = n_genes - (n_shared + n_term_only + n_random_only)
            pvals[j] = stat.fisher_exact([[n_neither, n_random_only], [n_term_only, n_shared]])[1]

        # argsort gives the term indices from smallest to largest p-value;
        # writing 1..n_terms at those positions yields each term's 1-based rank
        term_ranks[i, pvals.argsort()] = rank_values

    return {
        term: [np.mean(term_ranks[:, j]), np.std(term_ranks[:, j])]
        for j, term in enumerate(terms)
    }

In [37]:
## load gmt
# "~" is passed as the separator so that each tab-delimited GMT line is kept whole
# in a single column; getTermStats splits the lines on tabs itself.
# NOTE(review): this presumably relies on "~" never occurring in the file — confirm.
# The file name is hard-coded to the 2018_06 build of the GMT.
rnai_WB_gmt = pd.read_csv("wormbase_rnai_phenotype_greater4_2018_06.gmt", sep="~", header=None)

In [38]:
rnai_WB_gmt.head()

Unnamed: 0,0
0,locomotion variant\t\this-34\tleo-1\tubc-12\tn...
1,avoids bacterial lawn\t\this-34\tpign-1\tset-3...
2,RAB-11 recycling endosome localization variant...
3,body elongation defective\t\this-34\this-11\th...
4,chromosome segregation variant\t\this-34\tsmc-...


In [39]:
# 200 random draws of 300 rows each from the protein-coding reference table;
# the result maps every GMT term to [mean rank, standard deviation of rank].
iterations_200 = getTermStats(rnai_WB_gmt, cElegans_proteinCoding, num_samplings=200, num_genes_in_sampling=300)

Progress: 100%  200 Out of 200   

In [40]:
iterations_200

{'ABa ABp EMS synchronous division early emb': [446.065, 174.11938655704023],
 'ATP levels reduced': [515.425, 175.50610922415208],
 'Aba ABp division axis defective early emb': [391.655, 200.22456386517615],
 'Bacillus thuringiensis toxin hypersensitive': [189.135, 111.13917749830615],
 'C lineage variant': [248.825, 72.95638680060848],
 'DNA damage checkpoint variant': [554.725, 168.04359367437962],
 'DNA repair variant': [282.915, 84.69443768630853],
 'E lineage variant': [416.83, 173.67544760270522],
 'EMS anterior extension fails early emb': [395.25, 181.13560527958052],
 'Fluorouracil resistant': [479.1, 125.6230074468845],
 'L1 arrest': [327.24, 222.3097217847209],
 'L1 lethal': [284.46, 164.29253908805475],
 'L2 arrest': [289.65, 112.80397820999045],
 'L3 arrest': [598.365, 176.33273029985105],
 'L4 lethal': [281.915, 89.70589598794496],
 'M lineage variant': [501.695, 180.99768499900765],
 'P granule defective': [239.99, 110.26082667928806],
 'P granule localization defective'

In [41]:
# One row per term (the dict keys become the index); the two list entries become
# the "mean" and "sd" columns of the term's rank.
df_200_iterations = pd.DataFrame.from_dict(iterations_200, orient='index', columns = ['mean', 'sd'])

In [42]:
df_200_iterations.head()

Unnamed: 0,mean,sd
lysosome morphology variant,377.53,133.995892
body elongation defective,275.525,178.061785
hypodermis disorganized,438.78,126.4787
pore forming toxin hypersensitive,218.55,163.641644
transgene expression reduced,169.415,145.342467


In [43]:
df_200_iterations.sort_values(by=['mean']).head()

Unnamed: 0,mean,sd
embryonic lethal,39.1,54.056082
slow growth,63.86,80.597273
larval arrest,94.455,84.134642
maternal sterile,111.38,94.745003
sterile progeny,112.545,95.139361


In [44]:
# Tab-separated output without a header row: term, mean rank, sd of rank.
df_200_iterations.to_csv('output/wormbase_rnai_phenotype_greater4_2018_06_stats.tsv', sep='\t', header=False)