In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from scipy.signal import convolve2d
from tqdm import tqdm, notebook
import seaborn as sns

In [2]:
batch_size = 4096*2 # Batch size 16384
image_shape = (1,28,28)
image_1d_shape = np.prod(image_shape)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
train_dataset = datasets.MNIST("./data/",download=True, train=True, transform=transforms.ToTensor())
test_dataset = datasets.MNIST("./data/",download=True, train=False, transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size, shuffle=True )
test_loader = DataLoader(test_dataset, batch_size, shuffle=True)

def show_image(x, name=None):
    x = x.squeeze()
    if name:
        plt.title(name)
    plt.imshow(x, cmap="gray")
    plt.show()

# The method for generating masks for negative data mentioned by Geoffrey Hinton in the article
def mask_gen():
    random_iter = np.random.randint(5,10)
    random_image = np.random.randint(2, size=image_shape).squeeze().astype(np.float32)
    blur_filter = np.array([[1, 2, 1], [2, 4, 2], [1, 2, 1]]) / 16
    for i in range(random_iter):
        random_image = convolve2d(random_image, blur_filter, mode='same', boundary='symm')
    mask = (random_image > 0.50).astype(np.float32)
    return mask

# The method for creating masks for negative data that I tried for testing purposes.
def mask_gen1():
    n = image_1d_shape
    arr1 = np.random.normal(loc=0, scale=0.01, size=int(5*n/8))
    arr1 = arr1+ abs(0-arr1.min())
    arr2 = np.random.normal(loc=1, scale=0.01, size=int(3*n/8))
    arr2 = arr2 + abs(1-arr2.max())
    arr = np.concatenate([arr1,arr2])
    np.random.shuffle(arr)
    mask = arr.reshape(image_shape).astype(np.float32)
    return mask

In [4]:
def negative_data_gen(batch, mask_num=1):
    indexes = torch.randperm(batch.shape[0])
    x1 = batch
    x2 = batch[indexes]
    if mask_num == 1:
        mask = mask_gen1()
    else:
        mask = mask_gen()
    merged_x1 = x1*mask
    merged_x2 = x2*(1-mask)
    hybrid_image = merged_x1+merged_x2
    return hybrid_image

# take the data and create x_pos and x_neg in cuda
def prepocess_data(x, device, training=True):
    x_pos = x
    x_pos = x_pos.view(x_pos.shape[0], -1).to(device)
    if training:
        x_neg = negative_data_gen(x,0)
        x_neg = x_neg.view(x_neg.shape[0], -1).to(device)
        return x_pos, x_neg
    else:
        return x_pos
    
    # function who onehot encode the labels in dim (batch, labels)
def onehot(labels, num_classes=10):
    return torch.zeros(labels.shape[0], num_classes).scatter_(1, labels.unsqueeze(1), 1).to(device)

In [5]:
class RNN_Multilayer_Layer(nn.Module):
    def __init__(self, input_size, output_size, hidden_a_size, hidden_b_size, learning_rate=3e-4, threshold=2):
        super().__init__()
        self.input_size = input_size
        self.hidden_a_size = hidden_a_size
        self.hidden_b_size = hidden_b_size
        self.output_size = output_size

        self.Wx = nn.Parameter(torch.randn(output_size, self.input_size))

        self.Wh_above = nn.Parameter(torch.randn(output_size, self.hidden_a_size))
        self.Wh_below = nn.Parameter(torch.randn(output_size, self.hidden_b_size))
        self.Bh = nn.Parameter(torch.zeros(self.output_size))
        
        self.Wy = nn.Parameter(torch.randn(self.output_size, self.output_size))
        self.By = nn.Parameter(torch.zeros(self.output_size))

        self.loss = []
        self.update_rate = 0.7

        self.threshold = threshold
        self.opt = optim.Adam(self.parameters(), lr=learning_rate)

    def normalize(self,x):
        return x / (x.norm(2, 1, keepdim=True) + 1e-4)
        
    def forward(self,x, hidden_state, hidden_below_state, hidden_above_state):
        norm_hidden_above_state = self.normalize(hidden_above_state)
        norm_hidden_below_state = self.normalize(hidden_below_state)
        prev_hidden_state = norm_hidden_above_state  @ self.Wh_above.T + norm_hidden_below_state @ self.Wh_below.T
        computed_new_state = torch.tanh((x @ self.Wx.T) + (prev_hidden_state) + self.Bh)
        new_hidden_state = self.update_rate * computed_new_state +  (1 - self.update_rate) * hidden_state
        output = torch.tanh(new_hidden_state @ self.Wy.T + self.By)
        return output, new_hidden_state
    
    def train(self, x_pos, x_neg, pos_hidden_state, neg_hidden_state, num_epochs=10):
        pos_hidden_state, pos_hidden_below_state, pos_hidden_above_state = pos_hidden_state
        neg_hidden_state, neg_hidden_below_state, neg_hidden_above_state = neg_hidden_state
        for epoch in range(num_epochs):
            output_pos, _ = self.forward(x_pos, pos_hidden_state, pos_hidden_below_state, pos_hidden_above_state)
            output_neg, _ = self.forward(x_neg, neg_hidden_state, neg_hidden_below_state, neg_hidden_above_state)
            loss = torch.log(1 + torch.exp(torch.cat([
                    -output_pos + self.threshold,
                    output_neg - self.threshold]))).mean()
            self.opt.zero_grad()
            loss.backward(retain_graph=True)
            self.loss.append(loss.item())
            self.opt.step()

        return (self.forward(x_pos, pos_hidden_state, pos_hidden_below_state, pos_hidden_above_state), self.forward(x_neg, neg_hidden_state, neg_hidden_below_state, neg_hidden_above_state))

In [6]:
class RNN_Cell(nn.Module):
    def __init__(self, layers, learning_rate, threshold):
        super().__init__()
        self.flatten = nn.Flatten(start_dim=2, end_dim=3)
        self.layers = layers
        self.rnn_layer1 = RNN_Multilayer_Layer(input_size= layers[0], output_size= layers[1], hidden_a_size= layers[2], hidden_b_size= layers[0], learning_rate=learning_rate, threshold=threshold)
        self.rnn_layer2 = RNN_Multilayer_Layer(input_size= layers[1], output_size= layers[2], hidden_a_size= layers[3], hidden_b_size= layers[1], learning_rate=learning_rate, threshold=threshold)
        self.rnn_layer3 = RNN_Multilayer_Layer(input_size= layers[2], output_size= layers[3], hidden_a_size= layers[3], hidden_b_size= layers[2], learning_rate=learning_rate, threshold=threshold)
        self.softmax = nn.Softmax(dim=1)
        self.losses = [[],[],[]]
    
    def forward(self,x,y, hidden_layers):
        hidden_1_prev, hidden_2_prev, hidden_3_prev = hidden_layers
        flattened_x = self.flatten(x)
        x, y = flattened_x, y
        print("\t Layer1")
        out, hidden_1_new = self.rnn_layer1(x, hidden_1_prev, x, hidden_2_prev)
        print("\t Layer2")
        out, hidden_2_new = self.rnn_layer2(out, hidden_2_prev, hidden_1_prev, hidden_3_prev)
        print("\t Layer3")
        out, hidden_3_new = self.rnn_layer3(out, hidden_3_prev, hidden_2_prev, y)
        return out , [hidden_1_new, hidden_2_new, hidden_3_new]
    
    def train(self, x_pos, x_neg, y, pos_hidden_layers, neg_hidden_layers):
        pos_hidden_1_prev, pos_hidden_2_prev, pos_hidden_3_prev = pos_hidden_layers
        neg_hidden_1_prev, neg_hidden_2_prev, neg_hidden_3_prev = neg_hidden_layers
        flattened_x_pos, flattened_x_neg = self.flatten(x_pos),self.flatten(x_neg)
        x_pos, x_neg, y = flattened_x_pos, flattened_x_neg, y
        self.rnn_layer1.zero_grad()
        print("Layer 1")
        (out_pos, hidden_new_1_pos), (out_neg, hidden_new_1_neg) = self.rnn_layer1.train(x_pos, x_neg,
                                                                                         pos_hidden_state=(pos_hidden_1_prev, x_pos, pos_hidden_2_prev),
                                                                                          neg_hidden_state=(neg_hidden_1_prev, x_neg, neg_hidden_2_prev))
        self.rnn_layer2.zero_grad()
        print("Layer 2")
        (out_pos, hidden_new_2_pos), (out_neg, hidden_new_2_neg) = self.rnn_layer2.train(out_pos, out_neg,
                                                                                         pos_hidden_state=(pos_hidden_2_prev, pos_hidden_1_prev, pos_hidden_3_prev),
                                                                                          neg_hidden_state=(neg_hidden_2_prev, neg_hidden_1_prev, neg_hidden_3_prev))
        self.rnn_layer3.zero_grad()
        print("Layer 3")
        (out_pos, hidden_new_3_pos), (out_neg, hidden_new_3_neg) = self.rnn_layer3.train(out_pos, out_neg,
                                                                                            pos_hidden_state=(pos_hidden_3_prev,pos_hidden_2_prev, y),
                                                                                            neg_hidden_state=(neg_hidden_3_prev,neg_hidden_2_prev, y))
        pos_hidden_layers = [hidden_new_1_pos, hidden_new_2_pos, hidden_new_3_pos]
        neg_hidden_layers = [hidden_new_1_neg, hidden_new_2_neg, hidden_new_3_neg]
        self.losses[0].append(self.rnn_layer1.loss)
        self.losses[1].append(self.rnn_layer2.loss)
        self.losses[2].append(self.rnn_layer3.loss)
        return out_pos, out_neg, pos_hidden_layers, neg_hidden_layers

In [7]:
# x = torch.randn(16,1, 28, 28)
# y = torch.randn(16,10)
# layers = [784,2000,2000,10]
# hidden_states = [torch.zeros(1, layers[1]), torch.zeros(1, layers[2]),torch.zeros(1, layers[3])]
# out, hidden_states = RNN_Cell(layers, 1,1).train(x ,x, y, hidden_states, hidden_states)
# # out, hidden_states = RNN_Cell(layers, 1,1).train(x, x, y, hidden_states, hidden_states)

In [8]:
class RNN_FF_Net(nn.Module):
    def __init__(self, layers, learning_rate, threshold, time_steps):
        super().__init__()
        self.rnn_cell = RNN_Cell(layers, learning_rate, threshold)
        self.num_time_steps = time_steps
        self.layers = layers
        self.learning_rate = learning_rate
        self.threshold = threshold

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.hidden_state = [[[torch.zeros(1, layers[1]).to(device), torch.zeros(1, layers[2]).to(device),torch.zeros(1, layers[3]).to(device)],[torch.zeros(1, layers[1]).to(device), torch.zeros(1, layers[2]).to(device),torch.zeros(1, layers[3]).to(device)]]]
    def forward(self, x, y):
        out = 0
        for i in range(self.num_time_steps):
            print(i)
            out, hiddent_state_new = self.rnn_cell(x, y, self.hidden_state[i])
            self.hidden_state.append(hiddent_state_new)
        return out
    def train(self, x_pos, x_neg, y):
        out = 0
        for i in range(self.num_time_steps):
            print(i)
            _, _,pos_hidden_states, neg_hidden_states = self.rnn_cell.train(x_pos, x_neg, y, self.hidden_state[i][0],self.hidden_state[i][1])
            hiddent_state_new = [pos_hidden_states, neg_hidden_states]
            self.hidden_state.append(hiddent_state_new)
        return out


In [9]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

layers = [28*28, 100, 100, 10]
model = RNN_FF_Net(layers, 3e-4, 2, 8).to(device)


train_iterator = iter(train_loader)
for x,y in train_iterator:
    x_pos = x.to(device)
    x_neg = negative_data_gen(x, 0).to(device)
    y = onehot(y,10).to(device)
    model.train(x_pos, x_neg, y)

0
Layer 1
Layer 2
Layer 3


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.50 GiB (GPU 0; 3.82 GiB total capacity; 2.65 GiB already allocated; 306.00 MiB free; 2.73 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
x = next(train_iterator)[0]
print(x.shape)
print(negative_data_gen(x, 0).shape)