In [1]:
import os
import numpy as np 

import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

# Global GPU switch: the cells below test this flag before calling `.cuda()`
# on the model and on each training batch.
cuda = False

### Load Data

In [2]:
# Directory the raw covariate CSV files are read from.
RAW_DATA_DIR = "../Data/Raw/"
# Directory `CovariateDataset.save_data` writes to (same file name as the input).
PROCESSED_DATA_DIR = "../Data/Processed/"

class CovariateDataset(Dataset):
    """Covariate matrix loaded from one comma-delimited file under RAW_DATA_DIR.

    Args:
        file_name_pattern: file name containing a ``{}`` placeholder.
        file_number: value substituted into the placeholder.

    Attributes:
        file_name: the formatted file name; reused as the output name by ``save_data``.
        data: 2-D numpy array with one row per sample.
    """

    def __init__(self, file_name_pattern, file_number):
        self.file_name = file_name_pattern.format(file_number)
        # ndmin=2 keeps a one-row (or one-column) file two-dimensional. Without it
        # loadtxt squeezes the result to 1-D, so __len__ would report the number of
        # columns and __getitem__ would hand back scalars instead of rows.
        self.data = np.loadtxt(
            os.path.join(RAW_DATA_DIR, self.file_name), delimiter=",", ndmin=2)

    def __getitem__(self, index):
        """Return ``(row, 0)``: the sample at ``index`` as floats plus a constant 0."""
        return (self.data[index].astype(float), 0)

    def __len__(self):
        """Number of rows (samples) in the loaded file."""
        return self.data.shape[0]

    def save_data(self, data):
        """Write ``data`` as CSV to PROCESSED_DATA_DIR under this dataset's file name.

        The output directory is created when it does not exist yet, so a long
        training run does not fail at the very last step on a fresh checkout.
        """
        if not os.path.isdir(PROCESSED_DATA_DIR):
            os.makedirs(PROCESSED_DATA_DIR)
        np.savetxt(
            os.path.join(PROCESSED_DATA_DIR, self.file_name), data, delimiter=",")

### Params

### Define Model

In [5]:
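# Layer-width configurations tried so far (number in parentheses is presumably
# the resulting loss — TODO confirm):
# 64 32 16 4 (0.54)
# 16 8 (.06)
# 16 4 (.14)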

class autoencoder(nn.Module):
    """Fully connected autoencoder: 11 inputs -> 16 -> 8 -> 4 -> 8 -> 16 -> 11 outputs.

    ``encoder`` and ``decoder`` are exposed as separate ``nn.Sequential`` stacks so
    the 4-dimensional code can be obtained with ``model.encoder(x)`` alone.
    """

    def __init__(self):
        super(autoencoder, self).__init__()

        # Compression half: 11 -> 16 -> 8 -> 4, ReLU between the linear layers.
        encoder_layers = [
            nn.Linear(11, 16),
            nn.ReLU(inplace=True),
            nn.Linear(16, 8),
            nn.ReLU(inplace=True),
            nn.Linear(8, 4),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Reconstruction half mirrors the encoder: 4 -> 8 -> 16 -> 11.
        decoder_layers = [
            nn.Linear(4, 8),
            nn.ReLU(inplace=True),
            nn.Linear(8, 16),
            nn.ReLU(inplace=True),
            nn.Linear(16, 11),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the 4-dim code and decode it back to 11 features."""
        return self.decoder(self.encoder(x))

### Initial Training

In [7]:
# Original training config: pre-train a single autoencoder on dataset 0 and save
# its weights; the fine-tuning cell loads this checkpoint as its starting point.
model = autoencoder()
if cuda:
    model.cuda()

num_epochs = 10000
batch_size = 256
learning_rate = 1e-1
lr_sched = True

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Alternative schedule that was tried:
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, int(num_epochs/3), gamma=0.1)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [100, 1000, 5000, 7500], gamma=0.1)

# Train on the first dataset
dataset = CovariateDataset("n_1000_model_A_add_lin_v_{}_covar_data.csv", 0)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Log ten times per run; max(1, ...) avoids a modulo-by-zero when num_epochs < 10.
log_every = max(1, num_epochs // 10)

for epoch in range(num_epochs):
    for data in dataloader:
        # The dataset yields (row, 0); the second element is unused here.
        data_batch, _ = data
        data_batch = Variable(data_batch)
        data_batch = data_batch.float()

        if cuda:
            # .cuda() returns a copy on the GPU, it does not move the tensor in
            # place, so the result has to be re-bound.
            data_batch = data_batch.cuda()

        # Forward pass: reconstruct the batch and compare against the input itself.
        output = model(data_batch)
        loss = criterion(output, data_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Step the LR schedule once per epoch, after the optimizer updates, so the
    # milestones line up with the epoch index and no schedule value is skipped.
    if lr_sched:
        scheduler.step()

    # ===================log========================
    # Reports the loss of the last mini-batch of the epoch (not an epoch average).
    if epoch % log_every == log_every - 1:
        print('epoch [{}/{}], loss:{:.4f}'
              .format(epoch + 1, num_epochs, loss.item()))

# The path has no placeholder; the previous `.format(dataset_number)` call relied
# on a variable that is only defined in a later cell (NameError on a fresh kernel).
torch.save(model.state_dict(), "../Models/simple_autoencoder.pth")

### Fine Tune

In [14]:
# Fine tuning config: for each of the 50 datasets, start from the pre-trained
# checkpoint, train briefly at a low learning rate, then save the per-dataset
# weights and the 4-dimensional encoding of every row.
num_epochs = 500
batch_size = 256
learning_rate = 1e-3
lr_sched = False
# No LR schedule is used while fine-tuning. Resetting the name makes sure a
# scheduler left over from an earlier cell (bound to a different optimizer) can
# never be stepped here by accident.
scheduler = None

# Log ten times per run; max(1, ...) avoids a modulo-by-zero when num_epochs < 10.
log_every = max(1, num_epochs // 10)

criterion = nn.MSELoss()

# Read the checkpoint once; load_state_dict copies the values into each new model.
state_dict = torch.load("../Models/simple_autoencoder.pth")

for dataset_number in range(50):
    print("Starting run for Dataset {}".format(dataset_number))
    model = autoencoder()
    if cuda:
        model.cuda()

    model.load_state_dict(state_dict)

    # Fresh optimizer per dataset so Adam's running statistics do not carry over.
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5)

    dataset = CovariateDataset("n_1000_model_A_add_lin_v_{}_covar_data.csv", dataset_number)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        for data in dataloader:
            # The dataset yields (row, 0); the second element is unused here.
            data_batch, _ = data
            data_batch = Variable(data_batch)
            data_batch = data_batch.float()

            if cuda:
                # .cuda() returns a GPU copy; re-bind it or the batch stays on the CPU.
                data_batch = data_batch.cuda()

            # Forward pass
            output = model(data_batch)
            loss = criterion(output, data_batch)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if lr_sched and scheduler is not None:
            scheduler.step()

        # ===================log========================
        # Reports the loss of the last mini-batch of the epoch.
        if epoch % log_every == log_every - 1:
            print('epoch [{}/{}], loss:{:.4f}'
                  .format(epoch + 1, num_epochs, loss.item()))

    torch.save(model.state_dict(), "../Models/simple_autoencoder_{}.pth".format(dataset_number))

    # Encode the full dataset in one pass and persist the codes.
    all_data = torch.from_numpy(dataset.data)
    all_data = Variable(all_data)
    all_data = all_data.float()
    if cuda:
        # Input must live on the same device as the model.
        all_data = all_data.cuda()

    output = model.encoder(all_data)
    # .cpu() is a no-op for CPU tensors and is required before .numpy() on CUDA.
    dataset.save_data(output.data.cpu().numpy())
    print("================\n\n")

Starting run for Dataset 0
epoch [50/500], loss:0.1511
epoch [100/500], loss:0.1501
epoch [150/500], loss:0.1494
epoch [200/500], loss:0.1524
epoch [250/500], loss:0.1485
epoch [300/500], loss:0.1550
epoch [350/500], loss:0.1601
epoch [400/500], loss:0.1532
epoch [450/500], loss:0.1548
epoch [500/500], loss:0.1543


Starting run for Dataset 1
epoch [50/500], loss:0.1576
epoch [100/500], loss:0.1505
epoch [150/500], loss:0.1466
epoch [200/500], loss:0.1511
epoch [250/500], loss:0.1499
epoch [300/500], loss:0.1513
epoch [350/500], loss:0.1545
epoch [400/500], loss:0.1519
epoch [450/500], loss:0.1515
epoch [500/500], loss:0.1477


Starting run for Dataset 2
epoch [50/500], loss:0.1514
epoch [100/500], loss:0.1485
epoch [150/500], loss:0.1515
epoch [200/500], loss:0.1493
epoch [250/500], loss:0.1483
epoch [300/500], loss:0.1531
epoch [350/500], loss:0.1516
epoch [400/500], loss:0.1541
epoch [450/500], loss:0.1498
epoch [500/500], loss:0.1511


Starting run for Dataset 3
epoch [50/500], los

epoch [250/500], loss:0.1503
epoch [300/500], loss:0.1501
epoch [350/500], loss:0.1481
epoch [400/500], loss:0.1516
epoch [450/500], loss:0.1443
epoch [500/500], loss:0.1487


Starting run for Dataset 25
epoch [50/500], loss:0.1568
epoch [100/500], loss:0.1577
epoch [150/500], loss:0.1562
epoch [200/500], loss:0.1539
epoch [250/500], loss:0.1483
epoch [300/500], loss:0.1609
epoch [350/500], loss:0.1562
epoch [400/500], loss:0.1506
epoch [450/500], loss:0.1527
epoch [500/500], loss:0.1534


Starting run for Dataset 26
epoch [50/500], loss:0.1617
epoch [100/500], loss:0.1531
epoch [150/500], loss:0.1525
epoch [200/500], loss:0.1515
epoch [250/500], loss:0.1508
epoch [300/500], loss:0.1494
epoch [350/500], loss:0.1448
epoch [400/500], loss:0.1533
epoch [450/500], loss:0.1534
epoch [500/500], loss:0.1570


Starting run for Dataset 27
epoch [50/500], loss:0.1538
epoch [100/500], loss:0.1569
epoch [150/500], loss:0.1576
epoch [200/500], loss:0.1486
epoch [250/500], loss:0.1527
epoch [300/500

epoch [500/500], loss:0.1545


Starting run for Dataset 49
epoch [50/500], loss:0.1518
epoch [100/500], loss:0.1544
epoch [150/500], loss:0.1483
epoch [200/500], loss:0.1525
epoch [250/500], loss:0.1544
epoch [300/500], loss:0.1495
epoch [350/500], loss:0.1474
epoch [400/500], loss:0.1495
epoch [450/500], loss:0.1513
epoch [500/500], loss:0.1529




In [21]:
# From-scratch config: train a freshly initialised autoencoder on each listed
# dataset (no pre-trained checkpoint), then save the weights and the encodings.
# NOTE: this writes to the same model and processed-data paths as the fine-tuning
# cell, so its results for a dataset replace the fine-tuned ones.
num_epochs = 15000
batch_size = 256
learning_rate = 1e-1
lr_sched = True

# Log ten times per run; max(1, ...) avoids a modulo-by-zero when num_epochs < 10.
log_every = max(1, num_epochs // 10)

criterion = nn.MSELoss()

for dataset_number in [44]:  # use range(50) for the full sweep
    print("Starting run for Dataset {}".format(dataset_number))
    model = autoencoder()
    if cuda:
        model.cuda()

    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5)

    # Scheduler is created per dataset so it is bound to this run's optimizer.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [100, 1000, 5000, 7500], gamma=0.1)

    dataset = CovariateDataset("n_1000_model_A_add_lin_v_{}_covar_data.csv", dataset_number)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        for data in dataloader:
            # The dataset yields (row, 0); the second element is unused here.
            data_batch, _ = data
            data_batch = Variable(data_batch)
            data_batch = data_batch.float()

            if cuda:
                # .cuda() returns a GPU copy; re-bind it or the batch stays on the CPU.
                data_batch = data_batch.cuda()

            # Forward pass
            output = model(data_batch)
            loss = criterion(output, data_batch)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Step the LR schedule once per epoch, after the optimizer updates, so the
        # milestones line up with the epoch index and no schedule value is skipped.
        if lr_sched:
            scheduler.step()

        # ===================log========================
        # Reports the loss of the last mini-batch of the epoch.
        if epoch % log_every == log_every - 1:
            print('epoch [{}/{}], loss:{:.4f}'
                  .format(epoch + 1, num_epochs, loss.item()))

    torch.save(model.state_dict(), "../Models/simple_autoencoder_{}.pth".format(dataset_number))

    # Encode the full dataset in one pass and persist the codes.
    all_data = torch.from_numpy(dataset.data)
    all_data = Variable(all_data)
    all_data = all_data.float()
    if cuda:
        # Input must live on the same device as the model.
        all_data = all_data.cuda()

    output = model.encoder(all_data)
    # .cpu() is a no-op for CPU tensors and is required before .numpy() on CUDA.
    dataset.save_data(output.data.cpu().numpy())
    print("================\n\n")

Starting run for Dataset 44
epoch [1500/15000], loss:0.3786
epoch [3000/15000], loss:0.3728
epoch [4500/15000], loss:0.2782
epoch [6000/15000], loss:0.2716
epoch [7500/15000], loss:0.2547
epoch [9000/15000], loss:0.2267
epoch [10500/15000], loss:0.2201
epoch [12000/15000], loss:0.2220
epoch [13500/15000], loss:0.2240
epoch [15000/15000], loss:0.2281




- TODO: experiment with regularization / decay functions
