# Обучение представлений и глубокое обучение, Домашнее задание 2
## Верификация пар китайских иероглифов
### https://inclass.kaggle.com/c/rdl-verification-challenge

### Кашин Андрей, ШАД, Computer Science 

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
import sys

# Make sure that caffe is on the python path. The location is taken from the
# CAFFE_ROOT environment variable; a KeyError is raised here if it is not set.
caffe_root = os.environ["CAFFE_ROOT"]  # directory whose "python" subfolder is added to sys.path below
sys.path.insert(0, os.path.join(caffe_root, "python"))

import caffe
import pandas as pd
import matplotlib.pyplot as plt
import scipy

#### Функции для загрузки данных

In [3]:
def load_data(path, num_samples=None):
    """Load a sequence of arrays stored back to back in one binary file.

    Arrays are read one at a time with ``np.load`` until the file is
    exhausted or ``num_samples`` arrays have been read. Progress is printed
    every 10000 arrays.

    Args:
        path: Path of the file holding consecutive ``np.save`` records.
        num_samples: Optional cap on the number of arrays to read. ``None``
            (or any other falsy value) reads the whole file.

    Returns:
        A list with the loaded arrays, in file order.
    """
    samples = []
    # Binary mode: the records are raw bytes. Text mode breaks on Python 3 and
    # may corrupt the data on platforms that translate newlines.
    with open(path, "rb") as f:
        while True:
            try:
                image = np.load(f)
            except (EOFError, IOError, ValueError):
                # End of data. Which of these np.load raises when nothing is
                # left to read depends on the NumPy version.
                break
            samples.append(image)
            if len(samples) % 10000 == 0:
                print("Processed: {}".format(len(samples)))
            if num_samples and len(samples) >= num_samples:
                break
    return samples

def load_labels(path, num_samples=None):
    """Return the ``Prediction`` column of a CSV file as an array.

    Args:
        path: Path of the CSV file; it must contain a ``Prediction`` column.
        num_samples: Optional number of leading rows to keep. ``None`` (or
            any other falsy value) keeps every row.

    Returns:
        The column values, truncated to ``num_samples`` entries if requested.
    """
    predictions = pd.read_csv(path)["Prediction"].values
    if not num_samples:
        return predictions
    return predictions[:num_samples]

### Загружаем данные

In [4]:
# None makes load_data / load_labels read every record; set an integer to
# load only that many leading records.
num_samples = None

# NOTE(review): the test images are read from under ./verification/ while the
# training files are read from ./data/ — confirm both paths are intended.
X = load_data("./data/Train/data.bin", num_samples)
X_test = load_data("./verification/data/Test/data.bin", num_samples)
y_orig = load_labels("./data/Train/train.csv", num_samples)

# Sanity check: the number of images should match the number of labels.
print("Data size: {}".format(len(X)))
print("Labels size: {}".format(len(y_orig)))

Processed: 10000
Processed: 20000
Processed: 30000
Processed: 40000
Processed: 50000
Processed: 60000
Processed: 70000
Processed: 80000
Processed: 90000
Processed: 100000
Processed: 110000
Processed: 120000
Processed: 130000
Processed: 140000
Processed: 150000
Processed: 160000
Processed: 170000
Processed: 180000
Processed: 190000
Processed: 200000
Processed: 10000
Processed: 20000
Data size: 200000
Labels size: 200000


### Масштабируем все картинки до одинакового размера

In [5]:
# Target height and width (in pixels) every image is rescaled to.
h = 50
w = 50


def _resize_all(images):
    """Rescale every image to (h, w) and stack them as float64, shape (N, 1, h, w)."""
    # NOTE(review): scipy.misc.imresize was removed in SciPy 1.3; this cell
    # needs an older SciPy (with PIL installed) to run.
    return np.array([
        scipy.misc.imresize(image, (h, w)).reshape((1, h, w)).astype(np.float64)
        for image in images
    ])


Xr = _resize_all(X)
Xr_test = _resize_all(X_test)
# A real copy: basic slicing (y_orig[:]) of a NumPy array only creates a view,
# so shuffling y in place would silently reorder y_orig as well.
y = y_orig.copy()

### Нормализуем значения интенсивностей

In [6]:
# Divide both image sets by 256 in place
# (presumably maps 8-bit pixel values into [0, 1) — TODO confirm input range).
Xr /= 256.0
Xr_test /= 256.0

In [7]:
# Centre both sets on the global mean intensity of the training images.
# ndarray.mean() without an axis already reduces over every element.
mean = Xr.mean()
Xr -= mean
Xr_test -= mean

## Разбиваем данные на обучающую и тестовую выборки

In [8]:
import random
from copy import deepcopy

# Seed the stdlib RNG used by shuffle() so the permutation is reproducible.
random.seed(17)
    
# Number of class labels; labels >= n_classes are skipped when items are grouped by class.
n_classes = 2000

def shuffle(a, b):
    """Apply one and the same random permutation to two arrays, in place.

    The permutation is drawn with the stdlib ``random`` module, so seeding
    it with ``random.seed`` makes the result reproducible.

    Args:
        a: First array; reordered in place along its first axis.
        b: Second array of the same length; reordered with the same
            permutation so that ``a[i]`` and ``b[i]`` stay paired.

    Raises:
        ValueError: If ``a`` and ``b`` differ in length.
    """
    if len(a) != len(b):
        raise ValueError(
            "shuffle() needs arrays of equal length, got {} and {}".format(len(a), len(b)))
    # random.shuffle needs a mutable sequence; on Python 3 range() is not one.
    indexes = list(range(len(a)))
    random.shuffle(indexes)
    # Fancy indexing builds the permuted copy first, then writes it back.
    a[:] = a[indexes]
    b[:] = b[indexes]

print("Shuffle data randomly")
# Reorder images and labels together, in place.
shuffle(Xr, y)

Shuffle data randomly


### Для того чтобы сделать сбалансированное разбиение (stratified split), нам необходимо выделить объекты каждого класса

In [9]:
n_classes = 2000

print("Find class items")
# class_items[c] collects the positions in y whose label equals c;
# labels that are not below n_classes are skipped.
class_items = [[] for _ in range(n_classes)]

for position, label in enumerate(y):
    if label < n_classes:
        class_items[label].append(position)

class_items = np.array(class_items)

Find class items


### Делаем разбиение в пропорциях 90% train / 10% test

In [10]:
print("Making train test split")

# Fraction of every class that is held out for testing.
test_ratio = 0.1

Xr_subset_train = []
Xr_subset_test = []
y_subset_train = []
y_subset_test = []

for i in range(n_classes):
    # All samples of class i, in their (already shuffled) order.
    class_indices = class_items[i]
    class_Xr = Xr[class_indices]
    class_y = y[class_indices]

    # Slice bounds must be integers: a float bound is rejected by current
    # NumPy, and rounding guards against products such as 3.0000000000000004.
    test_size = int(round(test_ratio * len(class_Xr)))
    train_size = len(class_Xr) - test_size

    Xr_subset_train.append(class_Xr[:train_size])
    y_subset_train.append(class_y[:train_size])
    Xr_subset_test.append(class_Xr[train_size:])
    y_subset_test.append(class_y[train_size:])

# One leading axis per class; the following cell flattens it away.
Xr_subset_train = np.array(Xr_subset_train)
y_subset_train = np.array(y_subset_train)
Xr_subset_test = np.array(Xr_subset_test)
y_subset_test = np.array(y_subset_test)
    

Making train test split


### Превращаем картинки в четырехмерный тензор для входа в caffe

In [11]:
# Merge the leading (class, sample) axes into a single sample axis:
# images become (N, 1, h, w) and labels become a flat vector of length N.
Xr_subset_train = Xr_subset_train.reshape(-1, 1, h, w)
Xr_subset_test = Xr_subset_test.reshape(-1, 1, h, w)
y_subset_train = y_subset_train.reshape(-1)
y_subset_test = y_subset_test.reshape(-1)

#### На всякий случай еще раз перемешиваем

In [12]:
# After the per-class split the samples are grouped by class; mix them again,
# keeping every image aligned with its label.
shuffle(Xr_subset_train, y_subset_train)
shuffle(Xr_subset_test, y_subset_test)

In [13]:
# Sanity check of the resulting tensor shapes.
print(Xr_subset_train.shape)
print(Xr_subset_test.shape)

(180000, 1, 50, 50)
(20000, 1, 50, 50)


### Функция для генерации обучающих пар для сиамской сети

In [14]:
# Generate pairs data

def generate_pairs(X, y, n_pairs=2, seed=46, num_classes=None):
    """Sample index pairs for training or evaluating a siamese network.

    For every class, ``n_pairs`` pairs are drawn. The first item always comes
    from that class. With probability about 0.5 the second item comes from
    the same class, otherwise from a class drawn uniformly at random (which
    may turn out to be the same class; the label reflects what was drawn).

    Args:
        X: Samples the indices refer to. Not read here; kept for interface
            compatibility.
        y: Class label of every sample.
        n_pairs: Number of pairs generated per class.
        seed: Seed for NumPy's global RNG, which drives the sampling. Pass a
            different value on each call to obtain new pairs.
        num_classes: Number of classes to cover. Defaults to the module-level
            ``n_classes``.

    Returns:
        A list of ``(first_index, second_index, same_class)`` tuples in random
        order. The order comes from the stdlib ``random`` module, so it is
        governed by ``random.seed``, not by ``seed``.

    Raises:
        ValueError: If some class below ``num_classes`` has no samples in ``y``.
    """
    if num_classes is None:
        num_classes = n_classes
    np.random.seed(seed)

    # Kept as plain lists: classes may have different sizes, which
    # np.array() cannot represent as a regular array.
    class_items = [[] for _ in range(num_classes)]
    for index, label in enumerate(y):
        if label < num_classes:
            class_items[label].append(index)

    empty_classes = [c for c, items in enumerate(class_items) if not items]
    if empty_classes:
        raise ValueError(
            "no samples for classes: {}".format(empty_classes[:10]))

    X_pairs = []
    for first_class in range(num_classes):
        for _ in range(n_pairs):
            same_class = np.random.rand() > 0.5
            if same_class:
                second_class = first_class
            else:
                second_class = np.random.randint(num_classes)

            first_item = np.random.choice(class_items[first_class])
            second_item = np.random.choice(class_items[second_class])

            X_pairs.append((first_item, second_item, first_class == second_class))

    random.shuffle(X_pairs)
    return X_pairs

### Генерируем обучающие и тестовые пары

In [15]:
# 40 pairs per class from the training subset and 20 per class from the
# held-out subset.
pairs_train = generate_pairs(Xr_subset_train, y_subset_train, n_pairs=40)
pairs_test = generate_pairs(Xr_subset_test, y_subset_test, n_pairs=20)

In [16]:
# Sanity check: number of generated pairs in each set.
print(len(pairs_train))
print(len(pairs_test))

80000
40000


### Сохраняем пары на диск

#### Этот процесс можно повторить несколько раз с различными seed'ами, добавляя в обучающую выборку все новые пары

In [17]:
# Write them to disk

def create_image_pairs(X, y, pairs):
    """Materialise index pairs as stacked image pairs plus similarity labels.

    Args:
        X: Array of samples, indexed by the first two entries of each pair.
        y: Unused; kept for interface compatibility.
        pairs: Iterable of ``(first_index, second_index, same_class)`` tuples.

    Returns:
        ``(data, labels)``: ``data[k]`` is ``X[first]`` stacked on top of
        ``X[second]`` along the first axis for the k-th pair, and
        ``labels[k]`` is that pair's ``same_class`` flag.
    """
    # List comprehensions instead of map(): on Python 3 np.array(map(...))
    # wraps the lazy map object in a 0-d object array.
    labels = np.array([pair[2] for pair in pairs])
    data = np.array([np.vstack([X[pair[0]], X[pair[1]]]) for pair in pairs])
    return data, labels

def _write_pair_split(dirname, split, tag, X, y, pairs, append):
    """Write one split ('train' or 'test') to HDF5 and register it in <split>.txt."""
    import h5py

    filename = os.path.join(dirname, '{}_{}_{}.h5'.format(split, h, tag))

    print("Write {} data".format(split))
    # Build the arrays first so a failure here does not leave a truncated file.
    data, labels = create_image_pairs(X, y, pairs)
    with h5py.File(filename, 'w') as f:
        f.create_dataset('pair_data', data=data)
        f.create_dataset('sim', data=labels.astype(np.float32))

    listing = os.path.join(dirname, split + '.txt')
    already_listed = set()
    if append and os.path.exists(listing):
        with open(listing, 'r') as f:
            already_listed = set(line.strip() for line in f)
    # The HDF5 file is overwritten in place, so listing it again would only
    # add a duplicate entry to the list file.
    if filename not in already_listed:
        with open(listing, 'a' if append else 'w') as f:
            f.write(filename + '\n')


def write_pairs(X_train, y_train, pairs_train, X_test, y_test, pairs_test, folder, append=False, tag="4"):
    """Store train and test image pairs as HDF5 files and list them in text files.

    For each split a file ``<split>_<h>_<tag>.h5`` is written into ``folder``
    with two datasets: ``pair_data`` (stacked image pairs) and ``sim``
    (float32 similarity labels). The absolute path of that file is recorded
    in ``<split>.txt`` in the same folder.

    Args:
        X_train, y_train, pairs_train: Samples, labels and index pairs of the
            training split (see ``create_image_pairs``).
        X_test, y_test, pairs_test: The same for the test split.
        folder: Output directory; created if it does not exist.
        append: If true, keep the existing entries of ``train.txt`` /
            ``test.txt`` and add the new file only if it is not listed yet.
            If false, the list files are rewritten from scratch.
        tag: Suffix of the HDF5 file names. Use a new tag for every batch of
            pairs that should be added next to earlier ones.
    """
    # Write out the data to HDF5 files in the target directory.
    dirname = os.path.abspath(folder)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    _write_pair_split(dirname, 'train', tag, X_train, y_train, pairs_train, append)
    _write_pair_split(dirname, 'test', tag, X_test, y_test, pairs_test, append)
    
# append=True: train.txt / test.txt are opened in append mode, so entries written by earlier runs are kept.
write_pairs(Xr_subset_train, y_subset_train, pairs_train, Xr_subset_test, y_subset_test, pairs_test, "verification/data", append=True)

Write train data
Write test data


## Этап верификации

### Загружаем обученную модель с диска

In [18]:
# Paths of the network definition (.prototxt) and of the weights snapshot
# (.caffemodel) that are passed to caffe.Classifier below.
MODEL_FILE = 'classification/network/lenet/lenet.prototxt'
PRETRAINED = 'classification/network/lenet/data/train_iter_22000.caffemodel'

In [19]:
# Switch Caffe to GPU mode and build the network from MODEL_FILE / PRETRAINED.
caffe.set_mode_gpu()
net = caffe.Classifier(MODEL_FILE, PRETRAINED, image_dims=(50, 50))

### Прогоняем тестовые картинки через сеть

In [20]:
from copy import deepcopy

# Number of test images pushed through the network per forward pass.
batch_size = 25
data_size = len(Xr_test)

predictions = []

# Ceiling division. `//` keeps the batch count an integer on Python 3 as
# well, where `/` would produce a float that range() rejects.
n_batches = (data_size + batch_size - 1) // batch_size

# NOTE(review): if data_size is not a multiple of batch_size the last batch is
# shorter than the others — confirm the network accepts a partial batch.
for i in range(n_batches):
    if i % 400 == 0:
        print("Iteration: {}".format(i))
    start = i * batch_size
    stop = min(start + batch_size, data_size)
    caffe_data = np.array([x.reshape(1, h, w) for x in Xr_test[start:stop]])
    net.forward(data=caffe_data)
    prediction = net.blobs["prob"].data
    # Copy the blob contents; presumably the buffer is overwritten by the
    # next forward pass.
    predictions.append(deepcopy(prediction))

Iteration: 0
Iteration: 400
Iteration: 800


#### Разворачиваем батчи и выбираем класс с максимальной вероятностью

In [21]:
# Flatten the per-batch outputs and keep, for every image, the index of the
# highest-scoring class.
predicted_classes = []
for batch in predictions:
    predicted_classes.extend(x.argmax() for x in batch)
y_predicted = np.array(predicted_classes)

### Считываем пары, для которых необходимо построить предсказание

In [22]:
# Read the list of pairs to predict; the context manager closes the file.
with open("verification/data/Test/test_pairs.csv") as f:
    pairs = f.readlines()
# Skip the first line (the CSV header).
pairs = pairs[1:]

# A real list, not map(): `pairs` is iterated more than once below, and a
# Python 3 map object would be exhausted after the first pass.
pairs = [s.strip().split(",") for s in pairs]

### Строим предсказание наивным образом, просто сравнивая метки классов

In [24]:
# Naive verification: a pair is predicted "same" (1) when both images were
# assigned the same class by the classifier, otherwise "different" (0).
with open("verification/predictions/prediction_pairs.csv", "w") as f:
    f.write("Id,Prediction\n")

    for pair in pairs:
        i, n1, n2 = int(pair[0]), int(pair[1]), int(pair[2])
        res = 1 if y_predicted[n1] == y_predicted[n2] else 0
        f.write("{},{}\n".format(i, res))

### Загрузим сиамскую модель

In [27]:
# Paths of the siamese network definition (.prototxt) and of its weights
# snapshot (.caffemodel); these replace the classification model paths.
MODEL_FILE = 'verification/network/mnist_siamese/mnist_siamese.prototxt'
PRETRAINED = 'verification/network/mnist_siamese/data/mnist_siamese_iter_38000.caffemodel'

In [28]:
# Switch Caffe to GPU mode and rebuild `net` from the siamese model files.
caffe.set_mode_gpu()
net = caffe.Classifier(MODEL_FILE, PRETRAINED, image_dims=(50, 50))

### Строим представление для каждой картинки из тестовой выборки

In [29]:
from copy import deepcopy

# Number of test images pushed through the network per forward pass.
batch_size = 25
data_size = len(Xr_test)

predictions = []

# Ceiling division. `//` keeps the batch count an integer on Python 3 as
# well, where `/` would produce a float that range() rejects.
n_batches = (data_size + batch_size - 1) // batch_size

# NOTE(review): if data_size is not a multiple of batch_size the last batch is
# shorter than the others — confirm the network accepts a partial batch.
for i in range(n_batches):
    if i % 400 == 0:
        print("Iteration: {}".format(i))
    start = i * batch_size
    stop = min(start + batch_size, data_size)
    caffe_data = np.array([x.reshape(1, h, w) for x in Xr_test[start:stop]])
    net.forward(data=caffe_data)
    # The "feat" blob holds the representation computed for the current batch.
    prediction = net.blobs["feat"].data
    # Copy the blob contents; presumably the buffer is overwritten by the
    # next forward pass.
    predictions.append(deepcopy(prediction))

Iteration: 0
Iteration: 400
Iteration: 800


#### Вытаскиваем представления из батчей в массив

In [30]:
# Unroll the batches into one array with a row of features per test image.
feature_rows = []
for batch in predictions:
    feature_rows.extend(batch)
feats = np.array(feature_rows)
print(feats.shape)

(26325, 80)


### Строим предсказание

In [31]:
# Counters for the number of pairs predicted as different (0) and same (1).
zeros = 0
ones = 0

with open("verification/predictions/prediction_siamese.csv", "w") as f:
    f.write("Id,Prediction\n")

    for pair in pairs:
        i, n1, n2 = int(pair[0]), int(pair[1]), int(pair[2])
        # Euclidean distance between the two embeddings; pairs closer than
        # the fixed threshold are predicted to show the same character.
        distance = np.linalg.norm(feats[n1] - feats[n2])
        res = 1 if distance < 0.68 else 0
        ones += res
        zeros += 1 - res
        f.write("{},{}\n".format(i, res))
print("Zeros: {}, Ones: {}".format(zeros, ones))

Zeros: 183802, Ones: 184748


#### Расстояние между двумя объектами в новом пространстве

In [32]:
# Example: Euclidean distance between the feature vectors at indices 0 and 20.
np.linalg.norm(feats[0] - feats[20])

0.79592383