# Image features

    使用从原始像素计算得到的特征来训练线性分类器，以提高性能，
    而不是直接在原始像素上进行训练。

In [1]:
import random
import numpy as np
from data_utils import load_CIFAR10
import matplotlib.pyplot as plt

from __future__ import print_function # 把下一个新版本的特性导入到当前版本，就可以在当前版本中测试一些新版本的特性

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # 设置 size
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# notebook 会 reload外部python modules
%load_ext autoreload 
%autoreload 2

## Load data

In [2]:
def get_CIFAR_10_data(num_train=49000, num_val=1000, num_test=10000, num_dev=500):
    """Load CIFAR-10 and carve it into training, validation and test subsets.

    The validation subset is made of the ``num_val`` samples that directly
    follow the first ``num_train`` samples of the raw training data. The test
    subset is the first ``num_test`` samples of the raw test data.

    Args:
        num_train: number of leading raw training samples to keep for training.
        num_val: number of samples, taken right after the training ones, to
            use for validation.
        num_test: number of leading raw test samples to keep.
        num_dev: accepted for interface compatibility; not used here.

    Returns:
        The tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    # 1. Load the raw CIFAR-10 data.
    raw_X_train, raw_y_train, raw_X_test, raw_y_test = load_CIFAR10(
        'datasets/cifar-10-batches-py/')

    # 2. Subsample each split through an explicit array of sample indices.
    train_idx = np.arange(num_train)
    val_idx = np.arange(num_train, num_train + num_val)
    test_idx = np.arange(num_test)

    return (raw_X_train[train_idx], raw_y_train[train_idx],
            raw_X_train[val_idx], raw_y_train[val_idx],
            raw_X_test[test_idx], raw_y_test[test_idx])

In [3]:
# Fetch the data splits and check their shapes.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR_10_data()
for label, array in (
    ('X_train shape:  ', X_train),
    ('y_train shape:  ', y_train),
    ('X_val shape:    ', X_val),
    ('y_val shape:    ', y_val),
    ('X_test shape:   ', X_test),
    ('y_test shape:   ', y_test),
):
    print(label, array.shape)


X_train shape:   (49000, 32, 32, 3)
y_train shape:   (49000,)
X_val shape:     (1000, 32, 32, 3)
y_val shape:     (1000,)
X_test shape:    (10000, 32, 32, 3)
y_test shape:    (10000,)


## Extract Features
    对于每张图像，计算
        定向梯度直方图（HOG，Histogram of Oriented Gradients），
        以及基于 HSV 颜色空间色调（hue）通道的
        颜色直方图（color histogram）。

    通过  连接HOG和颜色直方图特征向量  来形成 每个图像的  最终特征向量。

    粗略地说，
    HOG  ：忽略颜色信息，捕获图像的纹理/texture，
    颜色直方图 ：代表输入图像的颜色，忽略纹理。

    因此，期望使用两者应该比单独使用更好。

    验证这个假设很适合放在附加题（the bonus section）中尝试。

    hog_feature和color_histogram_hsv函数都对单个图像进行操作，并返回该图像的特征向量。 

    extract_features函数 
        接收一组图像和一个特征函数列表，
        在每张图像上依次计算每个特征函数，
        并将结果存储在矩阵中，
        其中每行是单张图像的所有特征向量的拼接（concatenation）。



In [4]:
from features import *

num_color_bins = 10  # Number of bins in the color histogram

# Every image is described by its HOG feature followed by its HSV color
# histogram; extract_features evaluates each function on each image.
feature_fns = [hog_feature, lambda img: color_histogram_hsv(img, nbin=num_color_bins)]
X_train_feats = extract_features(X_train, feature_fns, verbose=True)
X_val_feats = extract_features(X_val, feature_fns)
X_test_feats = extract_features(X_test, feature_fns)

# Preprocessing: Subtract the mean feature. The statistics are computed on the
# training split only and then applied, in place, to all three splits.
mean_feat = np.mean(X_train_feats, axis=0, keepdims=True)
for feats in (X_train_feats, X_val_feats, X_test_feats):
    feats -= mean_feat

# Preprocessing: Divide by standard deviation. This ensures that each feature
# has roughly the same scale.
std_feat = np.std(X_train_feats, axis=0, keepdims=True)
# A feature that is constant over the training split has zero standard
# deviation; dividing by it would fill that column with NaN/inf. Such columns
# are left unscaled instead.
std_feat[std_feat == 0] = 1.0
for feats in (X_train_feats, X_val_feats, X_test_feats):
    feats /= std_feat

# Preprocessing: Add a bias dimension (a trailing column of ones).
X_train_feats, X_val_feats, X_test_feats = [
    np.hstack([feats, np.ones((feats.shape[0], 1))])
    for feats in (X_train_feats, X_val_feats, X_test_feats)
]

TypeError: slice indices must be integers or None or have an __index__ method

## 训练 SVM on image features


在 提取的特征上 训练 multiclass SVM 
- 在验证集上尝试不同的学习率（lr）和正则化强度（λ）--> 训练 SVM，得到最佳模型（best model）和最佳参数
- 在验证集上尝试颜色直方图不同的 bin 数量 --> 训练 SVM

In [7]:
from classifiers.linear_classifier import LinearSVM

# Hyperparameter grid for the SVM: candidate learning rates and candidate
# regularization strengths.
learning_rates = [1e-9, 1e-8, 1e-7]
regularization_strengths = [5e4, 5e5, 5e6]

# Search bookkeeping. `results` is read later as a mapping from
# (learning rate, regularization strength) to (train accuracy, val accuracy).
# No training loop appears in this cell, so `results` stays empty and
# `best_svm` stays None until one fills them in.
results = dict()
best_val, best_svm = -1, None






In [None]:
# Report the accuracies recorded for every (learning rate, regularization
# strength) pair, in sorted key order.
for (lr, reg), (train_accuracy, val_accuracy) in sorted(results.items()):
    summary = 'lr %e reg %e train accuracy: %f val accuracy: %f' % (
        lr, reg, train_accuracy, val_accuracy)
    print(summary)

print('best validation accuracy achieved during cross-validation: %f' % best_val)

In [None]:
# Evaluate the selected SVM on the test set: accuracy is the fraction of test
# labels that the predictions match.
y_test_pred = best_svm.predict(X_test_feats)
test_accuracy = (y_test == y_test_pred).mean()
print(test_accuracy)

In [None]:
# Visualize the mistakes made by the classifier.
#
# Each column shows test images that the current system assigned to that
# column's class although their true label is a different one; the first
# column therefore holds images wrongly classified as "plane".

examples_per_class = 8
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
for cls, cls_name in enumerate(classes):
    # Test samples predicted as `cls` whose true label is something else.
    idxs = np.where((y_test != cls) & (y_test_pred == cls))[0]
    # np.random.choice(..., replace=False) raises ValueError when asked for
    # more samples than are available. Sample only when there are enough
    # candidates; otherwise show every mistake found for this class.
    if len(idxs) >= examples_per_class:
        idxs = np.random.choice(idxs, examples_per_class, replace=False)
    for i, idx in enumerate(idxs):
        # Subplots are numbered row by row, so row i / column cls maps to
        # position i * len(classes) + cls + 1.
        plt.subplot(examples_per_class, len(classes), i * len(classes) + cls + 1)
        plt.imshow(X_test[idx].astype('uint8'))
        plt.axis('off')
        if i == 0:
            plt.title(cls_name)
plt.show()

## 训练 Neural Network on image features

In [None]:
# Show the shape of the training feature matrix that the network will consume.
print(X_train_feats.shape)

In [9]:
from classifiers.neural_net import TwoLayerNet

# Network dimensions: one input unit per column of the training feature
# matrix, a fixed-size hidden layer, and one output per class.
hidden_dim, num_classes = 500, 10
input_dim = X_train_feats.shape[1]

net = TwoLayerNet(input_dim, hidden_dim, num_classes)
# Placeholder for the best network; nothing in this cell assigns a trained
# model to it.
best_net = None






NameError: name 'X_train_feats' is not defined

In [None]:
# Evaluate the neural network on the test set: accuracy is the fraction of
# test labels that the predictions match.

test_acc = np.mean(net.predict(X_test_feats) == y_test)
print(test_acc)