In [1]:
import io

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from Bio import Phylo
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader

#### Given an array of form:  
$[[parent_k,child_k,branchlength_k\ for\ k\ pairs\ in\ tree_i]\ for\ i\ trees]$  
1. Transform branch lengths to proportions of tree depth

In [None]:
def tree_depth(tree):
    '''Return the overall depth of a tree given as a newick string.

    tree: newick-formatted string describing a single tree.

    The string is parsed with Bio.Phylo and the result is the largest value
    in the mapping returned by ``depths()`` on the parsed tree.
    '''

    # Phylo is a module, not a callable: parsing goes through Phylo.read,
    # which takes a file-like handle, hence the StringIO wrapper.
    parsed_tree = Phylo.read(io.StringIO(tree), 'newick')

    return max(parsed_tree.depths().values())

def branch_ratio(data, tree_data, index):
    '''Rescale the branch lengths of one tree to fractions of its depth.

    data: rows of the form [parent, child, branch_length] for one tree.
    tree_data: table indexed by column name, with a 'dreamID' column and a
        'ground' column holding the newick string of each tree
        (presumably a pandas DataFrame -- confirm against caller).
    index: dreamID of the tree that `data` describes; exactly one row of
        `tree_data` must match it, otherwise `.item()` raises.

    Returns a new list of rows in which element 2 (the branch length) has
    been divided by the depth of the tree; `data` itself is not modified.
    '''

    tree = tree_data[tree_data['dreamID'] == index]['ground'].item()
    max_depth = tree_depth(tree)

    # Build fresh rows instead of aliasing `data`: writing through the alias
    # would divide the caller's branch lengths again on every re-run.
    transformed_data = []
    for row in data:
        scaled_row = list(row)
        scaled_row[2] = row[2] / max_depth
        transformed_data.append(scaled_row)

    return transformed_data
    

2. Convert Parent,Child pairs to 1x20 mutation array

In [None]:
def trit_det(parent, child):
    '''Given two trits from parent node 1 and child node 2 joined by an edge,
    return list alpha,beta determining mutation

    [1, 0] : parent '1' -> child '0'
    [0, 1] : parent '1' -> child '2'
    [0, 0] : no mutation (any other pair, including an unchanged '1' -> '1')
    '''

    if parent == '1' and child == '0':#1->0
        return [1, 0]
    if parent == '1' and child == '2':#1->2
        return [0, 1]

    #no mutation: the trit is unchanged along the edge, or the parent was
    #already in a non-'1' state
    return [0, 0]

def barcode_det(parent, child):
    '''Compare the barcodes at the two ends of an edge, position by position.

    parent, child: indexable barcodes; positions 0..9 are read from each.

    Returns a 10x2 array whose row i is the [alpha, beta] pair that
    trit_det gives for position i of the two barcodes.
    '''

    site_codes = np.zeros((10, 2))
    # fill every row in one slice assignment
    site_codes[:] = [trit_det(parent[site], child[site]) for site in range(10)]

    return site_codes

def convert_pair(tree):
    '''Re-encode every edge of a tree for the model.

    tree: iterable of rows of form [parent, child, branch_length].

    Returns a list with one [1x20 mutations, branch_length] entry per row,
    where the mutations are the barcode_det output flattened to shape (1, 20).
    '''

    return [
        [barcode_det(edge[0], edge[1]).reshape((1, 20)), edge[2]]
        for edge in tree
    ]

3. Reformat data for the model

In [None]:
# Raw string keeps the backslashes literal. The un-prefixed literal only worked
# because '\d' and '\D' are invalid escape sequences, which newer Python
# versions warn about; the resulting path is unchanged.
DREAM_data = pd.read_csv(r'\data\Dream_data_intMemoir.csv', sep = '\t')
# rows from position 30 onward are used for training, the first 30 for testing
DREAM_train = DREAM_data[30:]
DREAM_test = DREAM_data[:30]

# TODO: 'mayas' / 'mayas_too' are placeholder strings. The code below reads
# `.shape` and indexes rows, so they must be replaced by the real per-tree
# arrays of [parent, child, branch_length] rows before this cell can run.
test_data = 'mayas'
train_data = 'mayas_too'

reformat_train = np.zeros(train_data.shape)
reformat_test = np.zeros(test_data.shape)

# the three lists are kept in the same order so one index selects a matching set
reformat_pair = [reformat_train, reformat_test]
DREAM_pair = [DREAM_train, DREAM_test]
data_pair = [train_data, test_data]


for k in range(2): #0 = train, 1 = test
    for i in range(data_pair[k].shape[0]):
        tree_row = data_pair[k][i]
        # NOTE(review): the row position i is passed as the dreamID to look up;
        # confirm that dreamID values really match positions in each split
        tree_row = branch_ratio(tree_row, DREAM_pair[k], i)
        reformat_row = convert_pair(tree_row)
        # NOTE(review): reformat_row is a list of [array, branch_length] pairs;
        # confirm it can be stored in a row of the zero-initialised array
        reformat_pair[k][i] = reformat_row

#split data from labels
train_input, train_label = reformat_train[:, 0], reformat_train[:, 1]
test_input, test_label = reformat_test[:, 0], reformat_test[:, 1]

#tbd: conversion to torch tensors

In [9]:
class model(nn.Module):
    '''1 layer (FC) Neural Network

    A single fully connected layer from 20 input features to 1 output,
    followed by a sigmoid.
    '''
    
    def __init__(self):
        '''Define model module'''
        super(model, self).__init__()
        # 20 input features -> 1 output value
        self.fc = nn.Linear(20, 1)
        
    def forward(self, x):
        '''Define model activation

        x: tensor whose last dimension has size 20.

        Returns the sigmoid of the linear layer's output; the last dimension
        of the result has size 1.
        '''
        # torch.sigmoid is the supported spelling of the deprecated
        # F.sigmoid alias and computes the same values
        return torch.sigmoid(self.fc(x))

# build the single-layer network and show its layout
BRANCH_MODEL = model()
print(BRANCH_MODEL)
# stochastic gradient descent over all of the network's parameters
optimizer = optim.SGD(params=BRANCH_MODEL.parameters(), lr=0.001)

model(
  (fc): Linear(in_features=20, out_features=1, bias=True)
)


In [10]:
EPOCHS = 30
train_epoch_loss = [] #mean training loss, one entry per epoch

#range(EPOCHS+1) yields epoch indices 0..EPOCHS, so index EPOCHS is logged too
for epoch in range(EPOCHS+1):
    train_loss = [] #per-batch losses of the current epoch

    #training mode only has to be set once per epoch, not once per batch
    BRANCH_MODEL.train()
    
    #here train_set is the dataset of trees
    #each index correlates with set of [alpha_beta_array,branchlength]
    #at this point branch length needs to be normalized
    #for given tree 
    #NOTE(review): train_set is not defined in the cells above; it is expected
    #to be an iterable of DataLoader-like objects -- confirm before running
    for train_loader in train_set: #this would correlate with a DataLoader object

        #batch_* names keep the module-level train_data / train_label
        #from the data-preparation cell from being overwritten
        for batch_input, batch_label in train_loader:
            
            batch_label_predicted = BRANCH_MODEL(batch_input)
            
            #compute the loss
            loss = F.smooth_l1_loss(batch_label_predicted, batch_label)
            train_loss.append(loss.item())
            
            #reset the gradient
            optimizer.zero_grad()
            #backpropagate the loss
            loss.backward()
            #update the parameters
            optimizer.step()
    
    #epoch-level bookkeeping runs once every loader has been consumed;
    #an epoch without any batch is recorded as nan
    train_epoch_loss.append(np.mean(train_loss) if train_loss else float('nan'))
    
    if epoch%5 == 0:
        print("Epoch: {} | train_loss: {}".format(epoch, train_epoch_loss[-1]))
        

TypeError: 'int' object is not iterable