In [1]:
import pandas as pd
import numpy as np 

from scipy.cluster.hierarchy import dendrogram, linkage, to_tree, is_valid_linkage
from scipy.spatial.distance import pdist

In [2]:
def getNewick(node, newick, parentdist, leaf_names):
    """Serialize the subtree rooted at ``node`` into Newick text.

    The string is assembled back to front: ``newick`` holds the text that has
    already been produced and must come after this subtree, and every call
    prepends its own piece to it.

    Parameters
    ----------
    node
        Tree node exposing ``is_leaf()``, ``get_left()``, ``get_right()``,
        ``id`` and ``dist``.
    newick : str
        Text already built that follows this subtree; ``""`` for the first call.
    parentdist : float
        ``dist`` of the parent node. The branch length written for ``node``
        is ``parentdist - node.dist``.
    leaf_names : sequence
        Leaf labels, looked up with ``node.id``.

    Returns
    -------
    str
        The Newick text for this subtree followed by ``newick``.
    """
    branch_length = parentdist - node.dist

    # Leaves contribute "<name>:<length>" in front of whatever follows.
    if node.is_leaf():
        return "%s:%.2f%s" % (leaf_names[node.id], branch_length, newick)

    # Internal node: first build the text that closes this clade.
    if len(newick) == 0:
        # Outermost call: nothing follows, so just terminate the tree.
        closing = ");"
    else:
        closing = "):%.2f%s" % (branch_length, newick)

    # The left child is rendered first and the right child is prepended to
    # it, so the right child's text ends up before the left child's.
    left_text = getNewick(node.get_left(), closing, node.dist, leaf_names)
    right_text = getNewick(node.get_right(), "," + left_text, node.dist, leaf_names)
    return "(" + right_text
    
# getNewick was taken from Stack Overflow:
# http://stackoverflow.com/questions/28222179/save-dendrogram-to-newick-format
# Retrieved on: Tue 07/06/16

In [3]:
# 1. Get highly representative ViPhOGs

In [4]:
def readViPhOGsList(aPath):
    """Read a text file into a list with one entry per line.

    Only the trailing newline character is removed from each line; any other
    whitespace is kept as-is.

    Parameters
    ----------
    aPath : str
        Path of the file to read.

    Returns
    -------
    list of str
        The lines of the file, in file order, without their ``'\\n'``.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version left the file open.
    with open(aPath) as fr:
        return [v.strip('\n') for v in fr]

# Load the three ViPhOG lists (Order, Family and Genus classification
# folders), in that order, and bind each to its own name.
viphogsOrder, viphogsFamily, viphogsGenus = map(
    readViPhOGsList,
    (
        "../8_machineLearning_copy/1_Classification/Order300320/importantViphogs.minNumberBestScore.csv",
        "../8_machineLearning_copy/1_Classification/Family300320/importantViphogs.minNumberBestScore.csv",
        "../8_machineLearning_copy/1_Classification/Genus300320/importantViphogs.minNumberBestScore.csv",
    )
)

In [5]:
# Report the size of each per-rank list, then merge the three lists into a
# single collection without duplicates.
print("%d %d %d" % (len(viphogsOrder), len(viphogsFamily), len(viphogsGenus)))
# sorted() instead of list(): iterating a set yields an implementation-dependent
# order, which would make the column order of the matrix built from this list
# (and of any file written from it) vary between environments.
importantViphogs = sorted(set(viphogsOrder) | set(viphogsFamily) | set(viphogsGenus))
print(len(importantViphogs))

20 388 1392
1457


In [6]:
# 2. Read the genomes-vs-clusters matrix

In [7]:
# Tab-separated table: first row holds the column names, first column is
# used as the row index.
matrix = pd.read_csv(
    "../8_machineLearning/SP/Type/fooGenXCLS.Type.csv",
    sep='\t',
    header=0,
    index_col=0
)

In [8]:
# Sanity check: (rows, columns) of the full table as loaded.
matrix.shape

(10151, 31150)

In [9]:
# Keep only the columns listed in importantViphogs. .copy() makes the result
# independent of the full table. NOTE: this rebinds `matrix`, so the full
# table is no longer reachable under that name after this cell runs.
matrix = matrix[importantViphogs].copy()
# Sanity check: the column count should now equal len(importantViphogs).
matrix.shape

(10151, 1457)

In [10]:
# 3. Subset the data

In [11]:
# Load the taxonomic annotation of every genome into
#   accessions[<accession>] -> list of the remaining comma-separated fields.
# The file is opened in a `with` block so the handle is always closed; the
# previous version never closed it.
with open("../7_DBtests/iv_queries/taxonomicAnnotationGenomesWithVOGs.csv") as fr:
    # The first line is the header: show it and keep it out of the dictionary.
    print(fr.readline())
    accessions = {}
    for line in fr:
        fields = line.strip('\n').split(',')
        accession = fields[0]
        taxPath = fields[1:]  # taxPath means taxonomy path
        accessions[accession] = taxPath
print(len(accessions))
# Position of each taxonomic rank inside a taxPath list (i.e. the column
# order of the file once the leading accession column has been removed).
taxLabels = {"subspecies":0,"species":1,"subgenus":2,"genus":3,"subfamily":4,"family":5,"order":6,"type":7}

Accession,Subsepecies,Species,Subgenus,Genus,Subfamily,Family,Order,type

13098


In [12]:
# Group the genomes present in the matrix by their genus annotation:
#   labels[<genus>] -> list of accessions.
# Genomes whose genus is "NA" are left out.
genomes = matrix.index.tolist()
labels = {}
for accession in genomes:
    label = accessions[accession][taxLabels["genus"]]
    if label != "NA":
        labels.setdefault(label, []).append(accession)

# Accessions are randomized since a random selection will be done.
# A dedicated, seeded generator plus a sorted label order makes that random
# selection repeatable from run to run; the previous version used the
# unseeded global generator in dict order, so every run picked different
# genomes.
SHUFFLE_SEED = 0
shuffleRng = np.random.RandomState(SHUFFLE_SEED)
for label in sorted(labels):
    shuffleRng.shuffle(labels[label])

In [13]:
# Print "<label>:<number of genomes>" for every label and collect the counts.
numPerLabel = []
for label, members in labels.items():
    memberCount = len(members)
    print(label + ':' + str(memberCount))
    numPerLabel.append(memberCount)

# Median group size, followed by the number of distinct labels.
print(np.median(np.array(numPerLabel)))
print(len(labels))

# Give every label a consecutive integer code, in iteration order.
labelsTraining = {}
i = 0
for label in labels:
    if label not in labelsTraining:
        labelsTraining[label] = i
        i += 1

Spbetalikevirus:1
Cucumovirus:2
Norovirus:102
Rotavirus:3
Dependoparvovirus:22
Torovirus:3
Mammarenavirus:19
Tepovirus:4
Betacoronavirus:137
Betapapillomavirus:40
N15likevirus:1
Cuevavirus:1
Gammatorquevirus:3
Lolavirus:1
Maculavirus:6
Alphapapillomavirus:101
Betatetravirus:1
Ilarvirus:24
Betalipothrixvirus:6
Cytorhabdovirus:7
Cp8unalikevirus:3
Respirovirus:30
Sakobuvirus:1
Nepovirus:11
Phikmvlikevirus:23
Rhopapillomavirus:1
Kappatorquevirus:2
Marseillevirus:1
Dyopipapillomavirus:1
Yatapoxvirus:3
Lymphocryptovirus:31
Mycoflexivirus:1
Avulavirus:77
Arenavirus:5
Hk578likevirus:5
Reylikevirus:2
Parechovirus:28
Bornavirus:4
Piscihepevirus:1
Pbiunalikevirus:2
Avipoxvirus:5
Viunalikevirus:8
Skunalikevirus:16
Brambyvirus:1
Tibrovirus:3
Lambdatorquevirus:1
Mimivirus:5
Curtovirus:15
Iotapapillomavirus:1
Begomovirus:519
Phi29likevirus:5
Suipoxvirus:1
Emaravirus:2
Macluravirus:4
Luz24likevirus:6
Betanodavirus:5
Xipapillomavirus:10
Felixounalikevirus:6
Trichomonasvirus:16
Poacevirus:5
Zeavirus:1
S

In [14]:
# Upper bound on the number of genomes kept per label.
trainAndTest = 4

# training[<label>] -> at most `trainAndTest` accessions of that label.
# A slice already returns the whole list when it is shorter than the bound,
# so no length check is needed; it also yields a new list instead of sharing
# the one stored in `labels`.
training = {}
for label in labels:
    training[label] = labels[label][:trainAndTest]

# Flatten the per-label selections into one list of accessions. extend()
# appends in place, avoiding the list copy that `a = a + b` makes on every
# iteration.
genomesTraining = []
for label in training:
    genomesTraining.extend(training[label])
print(len(genomesTraining))

1245


In [15]:
# Rows of the selected genomes (all columns), as an independent copy.
matrixTree = matrix.loc[genomesTraining, :].copy()
print(matrixTree.shape)

(1245, 1457)


In [16]:
# Show the selected genomes whose row total is below 1, together with that
# total (the row is summed once to build the filter and once for display).
matrixTree[matrixTree.sum(axis=1)<1].sum(axis=1)

NC_000867    0
dtype: float64

In [17]:
# Remove every genome whose row total is below 1 (the same condition used to
# list them). Filtering by the condition replaces a hard-coded accession:
# which genomes are selected depends on a random shuffle, so the affected
# rows can differ between runs, and dropping a fixed label in place raises
# KeyError when the cell is executed a second time.
matrixTree = matrixTree.loc[~(matrixTree.sum(axis=1) < 1)].copy()

In [18]:
# Save the matrix used for the tree as a tab-separated file, keeping both
# the column names and the row index.
matrixTree.to_csv(
    "matrixForTree.using4genomesPerGenus.csv",
    sep='\t',
    header=True,
    index=True
)

In [148]:
# 4. Calculate distances and create a dendrogram.

In [19]:
# Pairwise Jaccard distances between the rows of matrixTree.
pdistMatrix = pdist(matrixTree, metric='jaccard')
# Disabled step kept for reference (clips negative distances to zero):
#pdistMatrix = np.clip(pdistMatrix,a_min=0,a_max=None)

# Hierarchical clustering of those distances with average linkage.
Z = linkage(pdistMatrix, method='average')
# Number of merge steps recorded in the linkage matrix.
len(Z)

1243

In [20]:
# Convert the linkage matrix into a tree object (root node only).
tree = to_tree(Z, False)

# Leaf ids are row positions in the data that was clustered, i.e. in
# matrixTree. The names must therefore come from matrixTree's index.
# genomesTraining still contains the row(s) removed from matrixTree before
# clustering, so using it shifts the labels of every leaf after a removed row.
leafNames = matrixTree.index.tolist()
newickTree = getNewick(tree, "", tree.dist, leafNames)

# `with` guarantees the file is flushed and closed even if the write fails.
with open("newickTree.using4genomesPerGenus.txt", "w") as fw:
    fw.write(newickTree)

In [74]:
# 5. Edit tree in FigTree

In [29]:
# Write one tab-separated line per genome in the tree matrix:
#   <accession> <order> <family> <genus>
# `with` closes the file even if a lookup below raises.
with open("testLabels.txt", "w") as fw:
    for genome in matrixTree.index.tolist():
        tax = accessions[genome]
        fields = [
            genome,
            tax[taxLabels["order"]],
            tax[taxLabels["family"]],
            tax[taxLabels["genus"]],
        ]
        fw.write('\t'.join(fields) + '\n')