In [123]:
from support.datatools import *
from support.paths import PATH
from objectives.logist import objective
from IPython.display import clear_output
from tqdm import tqdm
from functools import partial
from cvxopt import matrix, solvers
from numpy import logspace

# %pylab inline
%load_ext autoreload
%autoreload 1
%aimport objectives.logist

%load_ext autoreload
%autoreload 1
%aimport support.datatools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [172]:
TEST_FOLD = str(2)
VALID_FOLD = str(3)
SHIFT = 2
BATCH_SIZE = 32
NB_EPOCH = 20
OVERSAMPLING = .7
UNDERSAMPLING = 8
LOWER = -1000
UPPER = 400
IN_SHAPE = (1, 22, 22)
WEIGHTS = '/home/a.dobrenkii/Projects/Kaggle/DataScienceBowl2K17/data/WEIGHTS/'
CPU = 6

In [174]:
# train, valid, test = extract_paths(VALID_FOLD, TEST_FOLD)
train_generator = batch_generator(train,
                                  batch_size=BATCH_SIZE, 
                                  in_shape=IN_SHAPE,
                                  lower=LOWER,
                                  upper=UPPER,
                                  shift=SHIFT,
                                  undersampling=UNDERSAMPLING,
                                  oversampling=OVERSAMPLING,
                                  CPU=CPU)

test_generator = batch_generator(new_test,
                                 batch_size=BATCH_SIZE, 
                                 in_shape=IN_SHAPE,
                                 lower=LOWER,
                                 upper=UPPER,
                                 shift=0, 
                                 undersampling=0,
                                 oversampling=1,
                                 CPU=CPU)

In [175]:
new_test = manipulate_samples(test.tolist(), UNDERSAMPLING, 1)

Here $lr == 1/L$  
Define prox-function $d(\vec{x}) \equiv ||\, \vec{x}\, ||^{2}$

In [181]:
class optimizer:
    def __init__(self, method, objective, lr=.01):
        self.methods = {
            "SGD": self.SGD,
            "FGD": self.FGD,
            "L_BFGS": self.L_BFGS,
            "VL_BFGS": self.VL_BFGS
        }

        message = "The method param should match one of: " + ', '.join(self.methods.keys())
        assert method in self.methods.keys(), message
        
        self.method =  self.methods[method]
        self.objective = objective
        self.grads_hist = 0
        self.i = 0
        self.lr = lr
        
        
    def iteration_prior(self):
        self.i += 1
        
        
    def __call__(self, X, y):
        self.iteration_prior()
        self.method(X, y)

        
    def SGD(self, X, y):
        self.objective.w -= self.lr * self.objective.gradf(X, y)
    
    
    def FGD(self, X, y):
        grad_i = self.objective.gradf(X, y)
        # Here lr == 1/L
        y_i = self.objective.w - self.lr * grad_i
        # Define prox-function d(\vec{x}) \equiv || \vec{x} ||^{2}
        self.grads_hist += (self.i + 1) / 2 * grad_i
        z_i = - self.lr * self.grads_hist
        self.objective.w = 2. /(self.i + 3.) * z_i + (self.i + 1.) / (self.i + 3.) * y_i
    
    
    def VL_BFGS(self, X, y):
        return 1
    
    
    def VL_BFGS(self, X, y):
        return 1

In [182]:
def logloss(X, y, objective=None):
    if objective is not None:
        return np.log(1 + np.exp(np.matmul(objective(X), y)))
    return np.log(1 + np.exp(np.matmul(X, y)))

In [183]:
class LR:
    def __init__(self, objective, optimizer, loss):
        self.losses = {
            "logloss": logloss
        }

        message = "The loss param should match one of: " + ', '.join(self.losses.keys())
        assert loss in self.losses.keys(), message
        self.loss =  self.losses[loss]
        self.loss_name = loss
        self.optimizer = optimizer
        self.objective = objective
        
    
    def predict_proba(self, X):
        """
        Предсказание вероятности принадлежности объекта к классу 1.
        Возвращает np.array размера (N,) чисел в отрезке от 0 до 1.
        :param X: numpy.array размера  (N, M), dtype = np.float
        :return: numpy.array размера  (N,), dtype = np.int
        """
        return special.expit(objective(X))
    
    
    def predict(self, X):
        """
        Предсказание класса для объекта.
        Возвращает np.array размера (N,) элементов 1 или -1.
        :param X: numpy.array размера  (N, M), dtype = np.float
        :return:  numpy.array размера  (N,), dtype = np.int
        """
        return self.predict_proba(X) > .5
    
    
    def evaluate_generator(self, data_generator, nb_iterations):
        objective_loss = list()
        addition_loss = list()
        for i in tqdm(range(nb_iterations)):
            X, y = next(data_generator)
            objective_loss.append(self.objective.lossf(X, y))
            addition_loss.append(self.loss(self.objective(X), y))
            clear_output()
        return objective_loss, addition_loss
    
    
    def predict_generator(self, data_generator, nb_iterations):
        predicted = list()
        for i in tqdm(range(nb_iterations)):
            X, y = next(data_generator)
            predicted += objective(X).tolist()
            clear_output()
        return predicted
        

    def fit_generator(self, train_data, 
                      nb_iterations, nb_epoch, 
                      validation_data=None, 
                      nb_val_iterations=None,
                      verbose=0
                     ):
        """
        Обучение логистической регрессии.
        Настраивает self.w коэффициенты модели.
        Если self.verbose == True, то выводите значение 
        функции потерь на итерациях метода оптимизации. 
        :param X: numpy.array размера  (N, M), dtype = np.float
        :param y: numpy.array размера  (N,), dtype = np.int
        :return: self
        """
        history = {
            'objective_loss': [],
            'addition_loss': [],
            'objective_val_loss': [-1],
            'addition_val_loss': [-1]
        }
        
        for epoch in tqdm(range(nb_epoch)):
            objective_loss = list()
            addition_loss = list()
            
            for i in tqdm(range(nb_iterations)):
                X, y = next(train_data)
                if verbose:
                    objective_loss.append(self.objective.lossf(X, y))
                    addition_loss.append(self.loss(self.objective(X), y))
                    history['objective_loss'].append(mean(objective_loss))
                    history['addition_loss'].append(mean(addition_loss))
                    clear_output()
                    print("Epoch " + str(epoch) + "/" + str(nb_epoch)) 
                    if validation_data is not None:
                        print("Current objective val loss is " + str(history['objective_val_loss'][-1]))
                        print("Current val " + self.loss_name + " is " + str(history['addition_val_loss'][-1]))
                    print("Iteration " + str(i) + ".")
                    print("Current objective loss is " + str(history['objective_loss'][-1]))
                    print("Current " + self.loss_name + " is " + str(history['objective_loss'][-1]))
                self.w = self.optimizer(X, y)  
            
            if validation_data is not None:
                objective_val_loss, addition_val_loss = \
                    self.evaluate_generator(validation_data, nb_val_iterations)
                history['objective_val_loss'].append(mean(objective_val_loss))
                history['addition_val_loss'].append(mean(addition_val_loss))
        return  history

In [184]:
objf = objective(dim=484, w=None, l1=1e-4, l2=1e-4)
optf = optimizer('FGD', objf)
clf = LR(objf, optf, 'logloss')

In [185]:
history = clf.fit_generator(train_generator, 
                            10 * len(new_test) // BATCH_SIZE, 
                            nb_epoch=10, 
                            validation_data=test_generator, 
                            nb_val_iterations=len(new_test) // BATCH_SIZE, 
                            verbose=1)


100%|██████████| 43/43 [00:23<00:00,  1.68it/s][A
100%|██████████| 10/10 [45:09<00:00, 270.59s/it]


In [194]:
print(history['addition_val_loss'])
print(history['objective_val_loss'])

[-1, 50.879387378644907, 43.106004618714493, 201.36140271658786, 304.84066715535374, 140.51239266775499, 59.535261077028537, 56.17323863503794, inf, inf, inf]
[-1, 50.88022443540077, 43.106718866102433, 201.3675230412604, 304.85181339078321, 140.51770488516124, 59.537895772182871, 56.176272972828023, inf, inf, inf]


In [121]:
import pickle
import pandas as pd
# test_labels = array(['nodule' in i.lower() for i in test])
# table = pickle.load(open(join(WEIGHTS, 'table_nodules'), 'rb'))
df = pd.DataFrame(columns=['seriesuid', 'coordX', 
                           'coordY', 'coordZ', 
                           'class', 'probability'])
for uid, val in tqdm(zip(test, predicted)):
    table[basename(uid)[:-4]][0]['probability'] = val
    df = df.append(table[basename(uid)[:-4]][0])

55800it [05:30, 169.03it/s]


In [132]:
path = '/home/a.dobrenkii/Projects/Kaggle/DataScienceBowl2K17/I/LungCancerDetection/support/evaluationScript/annotations/seriesuids_log.csv'
seriesuid = pd.Series(unique(df.seriesuid.values))
seriesuid.to_csv(path, index=False)
path = '/home/a.dobrenkii/Projects/Kaggle/DataScienceBowl2K17/I/LungCancerDetection/support/evaluationScript/exampleFiles/submission/sampleSubmission_log.csv'
df.to_csv(path, index=False)

In [133]:
path = '/home/a.dobrenkii/Projects/Kaggle/DataScienceBowl2K17/I/LungCancerDetection/support/evaluationScript/annotations/annotations.csv'
annotations = pd.read_csv(path)
annotations = annotations[annotations.seriesuid.isin(seriesuid)]
path = '/home/a.dobrenkii/Projects/Kaggle/DataScienceBowl2K17/I/LungCancerDetection/support/evaluationScript/annotations/annotations_log.csv'
annotations.to_csv(path, index=False)

In [50]:
clf.fit_generator(train_generator, len(train) // (8 * BATCH_SIZE), 10)


100%|██████████| 1934/1934 [09:24<00:00,  3.67it/s][A
100%|██████████| 10/10 [1:32:22<00:00, 558.22s/it]

Epoch 9/10
Iteration 1933. Current loss is 0.0612552275888





<__main__.LR at 0x7f6a30298438>