In [22]:
import os
import torch
import torch.utils.data
import pandas as pd
import numpy as np
from torch import nn, optim
from torch.autograd import Variable
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image
from torch.utils.data.sampler import SubsetRandomSampler



In [24]:
CUDA = False
SEED = 1
BATCH_SIZE = 128
LOG_INTERVAL = 10
EPOCHS = 10
VALIDATION_SPLIT = .2
ZDIMS = 20

In [14]:
torch.manual_seed(SEED)

kwargs = {'num_workers': 1, 'pin_memory': True} if CUDA else {}

In [4]:
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()),
    batch_size=BATCH_SIZE, shuffle=True, **kwargs

)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
    batch_size=BATCH_SIZE, shuffle=True, **kwargs)               




In [6]:
import pandas as pd

kmer_path = "data\kmers-gzip\\output.txt.gz"

kmer_all = pd.read_csv(kmer_path ,delim_whitespace=True, compression="gzip", names = ["kmer", "count"])






In [7]:
kmer_culled = kmer_all.loc[kmer_all['count'] >= 700]
print(kmer_all.describe())

print(kmer_culled.describe())



              count
count  3.388926e+07
mean   5.418934e+01
std    9.312541e+01
min    2.000000e+00
25%    3.000000e+00
50%    1.000000e+01
75%    5.200000e+01
max    8.817000e+03
             count
count  8763.000000
mean    923.065617
std     320.572167
min     700.000000
25%     780.000000
50%     851.000000
75%     968.000000
max    8817.000000


In [9]:

kmer_path = "data\kmers-gzip\\upec-182.txt.gz"

kmer_culled.sort_values(by=['kmer'])

kmer_contig1 = pd.read_csv(kmer_path ,delim_whitespace=True, compression="gzip", names = ["kmer", "count"])
print(kmer_contig1.describe())
kmer_contig1.sort_values(by="kmer",)

all = kmer_culled.kmer

contig1_parsed = kmer_contig1[kmer_contig1.kmer.isin(all)]

print(contig1_parsed.describe())
kmer_temp = kmer_culled.copy()
kmer_temp['count'] = '0'

               count
count  388884.000000
mean        2.296068
std         0.511630
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max         6.000000
             count
count  1686.000000
mean      4.771056
std       1.080067
min       2.000000
25%       4.000000
50%       4.000000
75%       6.000000
max       6.000000


In [77]:
contig1_merged = contig1_parsed.merge(kmer_temp, how='right',on='kmer')

contig1_merged[['kmer', 'count_x']].fillna(value='0')

Unnamed: 0,kmer,count_x
0,AAAAACATGATCACCGGTGCTGCTCAGATGG,4
1,AAAACATGATCACCGGTGCTGCTCAGATGGA,4
2,AAAATGGTTGTTACCCTGATCCACCCGATCG,4
3,AAACATGATCACCGGTGCTGCTCAGATGGAC,4
4,AAATGCGACATGGTTGATGACGAAGAGCTGC,4
...,...,...
8758,GGTAAGGAGGTGATCCAACCGCAGGTTCCCC,0
8759,GGTCGGCGGTTCGATCCCGTCATCACCCACC,0
8760,GGTTTAGAACGTCGTGAGACAGTTCGGTCCC,0
8761,GGTGATTAGCTCAGCTGGGAGAGCACCTCCC,0


In [20]:


CULL_SIZE = 700

class KmerDataset(Dataset):
    
    def __init__(self, dirname):
        files = os.listdir(dirname)
        
        X, y = [],[]
        
        for line in files:
            if(line == 'output.txt.gz'):
                kmer_all = pd.read_csv('data\kmers-gzip\\output.txt.gz' ,delim_whitespace=True, compression="gzip", names = ["kmer", "count"])
            else:
                kmer_path = dirname + '\\' + line
                kmer_df = pd.read_csv(kmer_path ,delim_whitespace=True, compression="gzip", names = ["kmer", "count"])
                X.append(kmer_df)
        self.X = X
        kmer_culled = kmer_all.loc[kmer_all['count'] >= CULL_SIZE]
        kmer_temp = kmer_culled.copy()
        kmer_temp['count'] = '0'
        self.template = kmer_temp
    
    def preprocess(self, contig):
        contig_parsed = contig[contig.kmer.isin(self.template.kmer)]
        
        contig_merged = contig_parsed.merge(self.template, how='right',on='kmer')

        contig_merged[['kmer', 'count_x']].fillna(value='0')
        contig_merged.columns = ['kmer', 'count']
        return contig_merged
        
    def __len__(self):
        return len(self.X)
        
    def __getitem__(self, index):
        return self.preprocess(self, self.X[index])

In [25]:
#initialize dataset
dataset = KmerDataset('data\kmers-gzip')



In [None]:
#Separate into train/val

dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(VALIDATION_SPLIT * dataset_size))
np.random.seed(SEED)
np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

#initialize data loaders

train_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, 
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE,
                                                sampler=valid_sampler)

In [5]:
class VAE(nn.Module):
    def __init__(self):
        super(VAE, self).__init__()
        
        self.fc1 = nn.Linear(784, 400)
        
        self.relu = nn.ReLU()
        self.fc21 = nn.Linear(400, ZDIMS)
        self.fc22 = nn.Linear(400, ZDIMS)
        
        
        self.fc3 = nn.Linear(ZDIMS, 400)
        
        self.fc4 = nn.Linear(400, 784)
        self.sigmoid = nn.Sigmoid()
        
    
    
    def reparameterize(self, mu: Variable, logvar: Variable) -> Variable:
        
        if self.training:
            
            std = logvar.mul(.5).exp_()
            
            eps = Variable(std.data.new(std.size()).normal_())
            
            return eps.mul(std).add_(mu)
        else:
            return mu
    
    def encode(self, x: Variable) -> (Variable, Variable):
        
        h1 = self.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)
    
    def decode(self, z: Variable) -> Variable:
        h3 = self.relu(self.fc3(z))
        return self.sigmoid(self.fc4(h3))

    def forward(self, x: Variable) -> (Variable, Variable, Variable):
        mu, logvar = self.encode(x.view(-1, 784))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

        

In [24]:
model = VAE()
if CUDA:
    model.cuda()


def loss_function(recon_x, x, mu, logvar) -> Variable:
    
    BCE = F.binary_cross_entropy(recon_x, x.view(-1, 784))
    
    
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    
    KLD /= BATCH_SIZE * 784
    
    return BCE + KLD



In [25]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def train(epoch):
    
    model.train()
    train_loss = 0
    
    for batch_idx, (data, _) in enumerate(train_loader):
        data = Variable(data)
        if CUDA:
            data = data.cuda()
        optimizer.zero_grad()
        
        recon_batch, mu, logvar = model(data)
        
        loss = loss_function(recon_batch, data, mu, logvar)
        
        loss.backward()
        
        train_loss += loss.data.item()
        optimizer.step()
        
        if batch_idx % LOG_INTERVAL == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len (train_loader.dataset),
            100. * batch_idx / len(train_loader),
            loss.data.item() / len(data)))
    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, train_loss / len(train_loader.dataset)
    ))

In [26]:
def test(epoch):
    
    model.eval()
    test_loss = 0
    
    for i, (data, _) in enumerate(test_loader):
        if CUDA:
            data = data.cuda()
        
        data = Variable(data, volatile=True)
        recon_batch, mu, logvar = model(data)
        test_loss += loss_function(recon_batch, data, mu, logvar).data.item()
        
        if i == 0:
            
            n = min(data.size(0), 8)
            
            comparison = torch.cat([data[:n],
                                  recon_batch.view(BATCH_SIZE, 1, 28,28)[:n]])
            save_image(comparison.data.cpu(),
                      'results/reconstruction_' + str(epoch) + '.png', nrow=n)
    test_loss /= len(test_loader.dataset)
    print('====> Test set loss: {:.4f}'.format(test_loss))

In [27]:
for epoch in range(1, EPOCHS + 1):
    train(epoch)
    test(epoch)
    
    
    sample = Variable(torch.randn(64, ZDIMS))
    
    if CUDA:
        sample = sample.cuda()
    
    sample = model.decode(sample).cpu()
    
    
    save_image(sample.data.view(64, 1, 28, 28),
              'results/sample_' + str(epoch) + '.png')

====> Epoch: 1 Average loss: 0.0016


  # Remove the CWD from sys.path while we load stuff.


====> Test set loss: 0.0012
====> Epoch: 2 Average loss: 0.0012
====> Test set loss: 0.0011
====> Epoch: 3 Average loss: 0.0011
====> Test set loss: 0.0010
====> Epoch: 4 Average loss: 0.0011
====> Test set loss: 0.0010


KeyboardInterrupt: 