In [2]:
from support.datatools import *
from support.paths import PATH
from objectives.logist import objective
from tqdm import tqdm
from collections import deque
from coordinator import Coordinator

In [3]:
# --- Cross-validation folds (passed around as strings) ---
TEST_FOLD = '2'
VALID_FOLD = '3'

# --- Batch generation ---
SHIFT = 2             # `shift` argument of the training batch generator
BATCH_SIZE = 32
NB_EPOCH = 20
OVERSAMPLING = 0.7    # `oversampling` argument of the training batch generator
UNDERSAMPLING = 8     # `undersampling` argument of the training batch generator

# --- Intensity clipping window and patch shape ---
LOWER, UPPER = -1000, 400
IN_SHAPE = (1, 22, 22)

# --- Environment ---
# NOTE(review): absolute, user-specific path; consider deriving it from a configurable root.
WEIGHTS = '/home/a.dobrenkii/Projects/Kaggle/DataScienceBowl2K17/data/WEIGHTS/'
CPU = 6

In [None]:
# Split the sample paths by fold.
# NOTE(review): `valid` is not used by any cell visible here — confirm that is intended.
train, valid, test = extract_paths(VALID_FOLD, TEST_FOLD)

# Training batches: shifting plus under/over-sampling are enabled.
train_generator = batch_generator(
    train,
    batch_size=BATCH_SIZE, in_shape=IN_SHAPE,
    lower=LOWER, upper=UPPER,
    shift=SHIFT,
    undersampling=UNDERSAMPLING, oversampling=OVERSAMPLING,
    CPU=CPU,
)

# The test set is resampled once up front, so the generator itself is given
# shift=0 / undersampling=0 / oversampling=1.
new_test = manipulate_samples(test.tolist(), UNDERSAMPLING, 1)
test_generator = batch_generator(
    new_test,
    batch_size=BATCH_SIZE, in_shape=IN_SHAPE,
    lower=LOWER, upper=UPPER,
    shift=0,
    undersampling=0, oversampling=1,
    CPU=CPU,
)

Here $lr = 1/L$, where $L$ is the Lipschitz constant of $\nabla f$.  
Define the prox-function $d(\vec{x}) \equiv \lVert \vec{x} \rVert^{2}$.

L-BFGS two-loop recursion:  
Input: $\nabla f(x_i),\: s_k,\: y_k$ where $k = i - m,\: ...,\: i - 1$, with $s_k = x_{k+1} - x_k$ and $y_k = \nabla f(x_{k+1}) - \nabla f(x_k)$   
Output: new direction $p$  
$p = -\nabla f(x_i);$  
for $k \leftarrow i - 1$ down to $i - m$ do   
$\qquad \alpha_k \leftarrow \frac{s_k \cdot p}{s_k \cdot y_k};$   
$\qquad p = p - \alpha_k \cdot y_k;$   
end

$p = \frac{s_{i-1} \cdot y_{i-1}}{y_{i-1} \cdot y_{i-1}} \cdot p;$

for $k \leftarrow i - m$ to $i - 1$ do   
$\qquad \beta = \frac{y_k \cdot p}{y_k \cdot s_k};$   
$\qquad p = p + (\alpha_k - \beta) \cdot s_k;$   
end

In [None]:
class optimizer:
    """Mini-batch optimizer that updates ``objective.w`` one step per call.

    Supported methods (selected by name):

    * ``"SGD"``    - plain gradient step ``w -= lr * grad``.
    * ``"FGD"``    - fast gradient method mixing a gradient step with a
      weighted sum of all past gradients (``lr`` plays the role of ``1/L``).
    * ``"L_BFGS"`` - limited-memory BFGS via the two-loop recursion, keeping
      the last ``history_len`` curvature pairs.

    Parameters
    ----------
    method : str
        One of ``"SGD"``, ``"FGD"``, ``"L_BFGS"``.
    objective : object
        Must expose a weight array ``w`` and ``gradf(X, y)`` returning the
        gradient at the current ``w``.
    lr : float
        Step size applied to the search direction.
    history_len : int
        Number of curvature pairs kept by BFGS-like methods; must be positive
        for them and is ignored by the other methods.

    Raises
    ------
    AssertionError
        If ``method`` is unknown, or a BFGS-like method is requested with a
        non-positive ``history_len``.
    """

    def __init__(self, method, objective, lr=.01, history_len=0):
        self.methods = {
            "SGD": self.SGD,
            "FGD": self.FGD,
            "L_BFGS": self.L_BFGS
        }

        message = "The method param should match one of: " + ', '.join(self.methods.keys())
        assert method in self.methods, message
        message = "The param history_len should be positive while using BFGS-like methods"
        # Only BFGS-like methods need a history; SGD/FGD must be accepted with history_len == 0.
        assert "BFGS" not in method or history_len > 0, message

        self.i = 0
        self.lr = lr
        self.method = self.methods[method]
        self.objective = objective
        self.history_len = history_len
        self.grads_hist = array(0)   # weighted running sum of gradients (FGD)
        self.grad_i = array(0)       # gradient at the current iterate
        self.grad_prev = array(0)    # gradient at the previous iterate
        self.w = None                # weights at the start of the current step
        self.w_prev = None           # weights at the start of the previous step

        if history_len > 0:
            # Index 0 is the oldest entry, index -1 the newest.
            self.y_diffs = deque([], maxlen=self.history_len)
            self.s_diffs = deque([], maxlen=self.history_len)
            self.a_hist = deque([], maxlen=self.history_len)

    def iteration_prior(self, X, y):
        """Advance the step counter and snapshot weights/gradients for this step."""
        self.i += 1
        self.grad_prev = self.grad_i.copy()
        self.w_prev = self.w
        self.w = self.objective.w.copy()
        self.grad_i = self.objective.gradf(X, y)

    def __call__(self, X, y):
        """Perform one optimization step on the batch ``(X, y)``."""
        self.iteration_prior(X, y)
        self.method(X, y)

    def SGD(self, X, y):
        """Plain gradient step, applied to ``objective.w`` in place."""
        self.objective.w -= self.lr * self.grad_i

    def FGD(self, X, y):
        """Fast gradient step (``lr`` stands for ``1/L``)."""
        # Gradient step from the current point.
        y_i = self.objective.w - self.lr * self.grad_i
        # Weighted sum of past gradients. Not done in place: the accumulator
        # starts as a 0-d array and cannot receive a vector via ``+=``.
        self.grads_hist = self.grads_hist + (self.i + 1) / 2 * self.grad_i
        z_i = - self.lr * self.grads_hist
        self.objective.w = 2. / (self.i + 3.) * z_i + (self.i + 1.) / (self.i + 3.) * y_i

    def L_BFGS(self, X, y):
        """Large-scale BFGS step using the two-loop recursion.

        Each call first records the curvature pair produced by the previous
        step, ``s = w_i - w_{i-1}`` and ``y = grad_i - grad_{i-1}``, skipping
        pairs with non-positive curvature ``y . s``. The first ``history_len``
        iterations (or any iteration with no usable pair) fall back to SGD.
        """
        if self.i > 1 and self.w_prev is not None:
            s_new = self.w - self.w_prev
            y_new = self.grad_i - self.grad_prev
            # Keep only pairs with positive curvature so the implicit inverse
            # Hessian stays positive definite and the divisions are safe.
            if dot(y_new, s_new) > 1e-10 * dot(y_new, y_new):
                self.s_diffs.append(s_new)
                self.y_diffs.append(y_new)

        if self.i <= self.history_len or len(self.s_diffs) == 0:
            self.SGD(X, y)
            return

        pairs = list(zip(self.s_diffs, self.y_diffs))  # oldest -> newest
        p = - self.grad_i

        # First loop: newest pair to oldest.
        alphas = []
        for s_k, y_k in reversed(pairs):
            a_k = dot(s_k, p) / dot(y_k, s_k)
            alphas.append(a_k)
            p = p - a_k * y_k
        alphas = alphas[::-1]  # realign with ``pairs`` (oldest -> newest)

        # Initial inverse-Hessian scaling taken from the newest pair.
        s_last, y_last = pairs[-1]
        p = p * (dot(y_last, s_last) / dot(y_last, y_last))

        # Second loop: oldest pair to newest.
        for (s_k, y_k), a_k in zip(pairs, alphas):
            b = dot(y_k, p) / dot(y_k, s_k)
            p = p + (a_k - b) * s_k

        # Expose the coefficients of this step, aligned with s_diffs/y_diffs.
        self.a_hist.clear()
        self.a_hist.extend(alphas)

        self.objective.w += self.lr * p

In [None]:
# Objective with L1 and L2 penalties.
# NOTE(review): dim=484 presumably equals 22 * 22 from IN_SHAPE — confirm.
objf = objective(
    dim=484,
    w=None,
    l1=1e-4,
    l2=1e-4,
)
# L-BFGS optimizer keeping the 20 most recent history entries.
optf = optimizer(method='L_BFGS', objective=objf, history_len=20)
# The coordinator ties objective and optimizer together; 'logloss' is passed as its third argument.
clf = Coordinator(objf, optf, 'logloss')

In [None]:
# Run training; both iteration counts are derived from the size of the resampled test set.
# NOTE(review): NB_EPOCH is defined in the config cell, but a literal 10 is used here — confirm.
history = clf.fit_generator(
    train_generator,
    10 * len(new_test) // BATCH_SIZE,
    nb_epoch=10,
    validation_data=test_generator,
    nb_val_iterations=len(new_test) // BATCH_SIZE,
    verbose=False,
)