# Logistic Regression
逻辑回归模型是一种常用的分类模型，它通过sigmoid或者softmax函数，将函数值映射到(0, 1)区间内，从而实现对样本的分类。在这个小作业中，你需要实现：
1. 二分类和多分类两种逻辑回归模型
2. 分别含有 L1 和 L2 两种正则项的损失函数，并计算对应的梯度
3. 权重参数W的更新
4. 比较不同的学习率对损失函数和分类器性能的影响
5. 比较不同的正则项参数对于分类器性能的影响

In [2]:
!pip install sklearn

## 一、二分类逻辑回归：
### 1.1数据集介绍
这个任务中使用的数据集是手写数字集MNIST，它有50000个训练样本和10000个测试样本，共10个类别。在二分类任务上，我们对MNIST数据集进行了一个采样，抽取了数据集中的‘5’和‘3’对应的样本作为二分类的正负样本，共得到10842个训练样本，1784个测试样本，其中正负样本数量均相同。为了让大家对于这个数据集有一个更直观的认识，我们从正负样本中各抽取了8个样例进行了可视化。

In [95]:
import os
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


def load_data1(path):
    # load all MNIST data
    fd = open(os.path.join(path, 'train-images-idx3-ubyte'))
    loaded = np.fromfile(file=fd, dtype=np.uint8)
    train_X_all = loaded[16:].reshape((60000, 28, 28, 1)).astype(np.float)
    fd = open(os.path.join(path, 'train-labels-idx1-ubyte'))
    loaded = np.fromfile(file=fd, dtype=np.uint8)
    train_Y_all = loaded[8:].reshape(60000).astype(np.float)
    fd = open(os.path.join(path, 't10k-images-idx3-ubyte'))
    loaded = np.fromfile(file=fd, dtype=np.uint8)
    test_X_all = loaded[16:].reshape((10000, 28, 28, 1)).astype(np.float)
    fd = open(os.path.join(path, 't10k-labels-idx1-ubyte'))
    loaded = np.fromfile(file=fd, dtype=np.uint8)
    test_Y_all = loaded[8:].reshape(10000).astype(np.float)

    #subsample data
    train_idxs_df = pd.read_csv(os.path.join(path, 'train_indices.csv'))
    test_idxs_df = pd.read_csv(os.path.join(path, 'test_indices.csv'))
    pos_train_indices = train_idxs_df['pos_train_indices'].tolist()
    neg_train_indices = train_idxs_df['neg_train_indices'].tolist()
    pos_test_indices = test_idxs_df['pos_test_indices'].tolist()
    neg_test_indices = test_idxs_df['neg_test_indices'].tolist()
    train_Y_all[pos_train_indices] = 1
    train_Y_all[neg_train_indices] = 0
    test_Y_all[pos_test_indices] = 1
    test_Y_all[neg_test_indices] = 0
    train_indices = np.append(pos_train_indices, neg_train_indices)
    test_indices = np.append(pos_test_indices, neg_test_indices)
    train_X = train_X_all[train_indices]
    train_Y = train_Y_all[train_indices]
    test_X = test_X_all[test_indices]
    test_Y = test_Y_all[test_indices]

    #visualiza data
    sample_num = 8
    pos_sample_indices = np.random.choice(pos_train_indices, sample_num, replace=False)
    neg_sample_indices = np.random.choice(neg_train_indices, sample_num, replace=False)
    for i, idx in enumerate(pos_sample_indices):
        plt_idx = i + 1
        plt.subplot(2, sample_num, plt_idx)
        plt.imshow(train_X_all[idx, :, :, :].reshape((28, 28)), cmap=plt.cm.gray)
        plt.axis('off')
        if i == 0:
            plt.title('Positive')

    for i, idx in enumerate(neg_sample_indices):
        plt_idx = sample_num + i + 1
        plt.subplot(2, sample_num, plt_idx)
        plt.imshow(train_X_all[idx, :, :, :].reshape((28, 28)), cmap=plt.cm.gray)
        plt.axis('off')
        if i == 0:
            plt.title('Negative')
    
    # reshaple into rows and normaliza
    train_X = train_X.reshape((train_X.shape[0], -1))
    test_X = test_X.reshape((test_X.shape[0], -1))
    mean_image = np.mean(train_X, axis=0)
    train_X = train_X - mean_image
    test_X = test_X - mean_image

    # add a bias columu into X
    train_X = np.hstack([train_X, np.ones((train_X.shape[0], 1))])
    test_X = np.hstack([test_X, np.ones((test_X.shape[0], 1))])
    return train_X, train_Y, test_X, test_Y


X_train, Y_train, X_test, Y_test = load_data1('/home/kesci/input/MNIST_dataset4284')

### 1.2逻辑回归模型
在这一部分中你需要完成以下内容：
1. train函数中权重的更新
2. L1和L2两种正则化的损失函数及对应梯度的计算
3. predict函数中的预测类别的计算

In [96]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


class LinearRegression1(object):
    def __init__(self):
        self.W = None
    
    
    def train(self, X, Y, display, learning_rate=1e-3, reg=1e-5, reg_type='L2', num_iters=2000, batch_size=128):
        num_train, feat_dim = X.shape
        self.W = 0.001 * np.random.randn(feat_dim)
        loss_history = []
        for i in range(num_iters):
            batch_indices = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[batch_indices]
            Y_batch = Y[batch_indices]
            if reg_type == 'L1':
                loss, grad = self.l1_loss(X_batch, Y_batch, reg)
            else:
                loss, grad = self.l2_loss(X_batch, Y_batch, reg)
            loss_history.append(loss)
            
            # Todo 1
            #lr = learning_rate / (i // 800 + 1)
            #self.W -= lr * grad
            self.W -= learning_rate * grad
            
            if display and i % 100 == 0:
                print("In iteration {}/{} , the loss is {}".format(i, num_iters, loss))
        return loss_history


    def loss_grad(self, X, Y, reg):
        num, feat_dim = X.shape
        
        loss = 0
        grad = np.zeros(feat_dim)
        for i in range(num):
            z = sigmoid(np.sum(self.W * X[i]))
            loss += -Y[i] * math.log(z) - (1 - Y[i]) * math.log(1 - z)
            grad += (z - Y[i]) * X[i]
        
        return loss / num, grad / num
    
    
    def l1_loss(self, X, Y, reg):
        # Todo 2
        loss, grad = self.loss_grad(X, Y, reg)
        
        loss = loss + reg * np.sum(abs(self.W))
        grad = grad + reg
        
        return loss, grad


    def l2_loss(self, X, Y, reg):
        # Todo 3
        loss, grad = self.loss_grad(X, Y, reg)
        
        loss = loss + reg * np.sum(self.W * self.W)
        grad = grad + 2 * reg * self.W
        
        return loss, grad


    def predict(self, X, threshold=0.5):
        # Todo 4
        num, feat_dim = X.shape
        Y_pred = np.zeros(num)

        for i in range(num):
            val = sigmoid(np.sum(self.W * X[i]))
            if val >= threshold:
                Y_pred[i] = 1
            else:
                Y_pred[i] = 0
        
        return Y_pred


### 1.3 训练模型实例
在这一部分，你不需要完成任何代码，你可以通过这一部分验证你上面实现的LogisticRegression1的代码是否实现正确。

***Answer:
无论是采用 L1 正则化还是 L2 正则化，最终的 accuracy 均保持在 94%~96%。***

In [97]:
lr_param = 1.5e-6
reg_param = 0.01

model = LinearRegression1()
loss_history = model.train(X_train, Y_train, True, lr_param, reg_param, 'L2')
pred = model.predict(X_test)
acc = np.mean(pred == Y_test)
print("The Accuracy is {}\n".format(acc))
x = range(len(loss_history))
plt.plot(x, loss_history, label='Loss')
plt.legend()
plt.xlabel('Iteration Num')
plt.ylabel('Loss')
plt.show()
W = model.W

In iteration 0/2000 , the loss is 1.0085522003106717
In iteration 100/2000 , the loss is 0.16307978926765507
In iteration 200/2000 , the loss is 0.15338794582094234
In iteration 300/2000 , the loss is 0.17525618256538808
In iteration 400/2000 , the loss is 0.17715761989785897
In iteration 500/2000 , the loss is 0.10806885608443047
In iteration 600/2000 , the loss is 0.10650489191878605
In iteration 700/2000 , the loss is 0.12226732465605153
In iteration 800/2000 , the loss is 0.15981368249994374
In iteration 900/2000 , the loss is 0.1339793853593213
In iteration 1000/2000 , the loss is 0.131951880483782
In iteration 1100/2000 , the loss is 0.09361016842325245
In iteration 1200/2000 , the loss is 0.14187617762424212
In iteration 1300/2000 , the loss is 0.18638983126306052
In iteration 1400/2000 , the loss is 0.09144666113489389
In iteration 1500/2000 , the loss is 0.08538753642875357
In iteration 1600/2000 , the loss is 0.1170663814536412
In iteration 1700/2000 , the loss is 0.149586179

### 1.4 学习率和Loss函数、模型性能的关系
因为学习率和正则化参数都是超参数，在一般的训练过程中，我们没办法直接优化，所以我们一般会将训练集细分成训练集和验证集，然后通过模型在验证集上的表现选择一个最优的超参数，再将它对应的最优的模型应用到测试集中。
在这一部分你需要完成以下内容：
1. 尝试多种不同的学习率
2. 储存学习率对应的损失函数值到L1_loss和L2_loss中（我们对损失函数值进行了20步平均化处理）。
3. 储存学习率对应的**在验证集上**的正确率到L1_lr_val_acc和L2_lr_val_acc中

#### 注意：
因为已有代码中L1_loss，L1_lr_val_acc都是数组，在可视化的过程中我们需要学习率和它们相对应，比如learning_rates[0]对应的loss和validation accuracy应该储存在数组index为0的位置

#### 拓展：
在这个部分中采取的损失函数都是定值，如果你有时间的话，可以尝试根据迭代轮数改变学习率，并比较不变的学习率和变化的学习率对于模型性能的影响。

In [81]:
reg = 0.01
reg_types = ['L1', 'L2']
L1_loss = []
L2_loss = []
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2)
L1_lr_val_acc = []
L2_lr_val_acc = []

# Todo 5:
learning_rates = [1e-5, 7e-6, 4e-6, 1e-6, 7e-7, 3e-7, 1e-7]
for type in range(2):
    for i in range(len(learning_rates)):
        model = LinearRegression1()
        learning_rate = learning_rates[i]
        loss_history = model.train(X_train, Y_train, False, learning_rate, reg, reg_types[type])
        accuracy = np.mean(model.predict(X_val) == Y_val)
        if type == 0:
            L1_loss.append(loss_history)
            L1_lr_val_acc.append(accuracy)
        else:
            L2_loss.append(loss_history)
            L2_lr_val_acc.append(accuracy)
        print(reg_types[type], learning_rate, accuracy)

#visulize the relationship between lr and loss
for i, lr in enumerate(learning_rates):
    L1_loss_label = str(lr) + 'L1'
    L2_loss_label = str(lr) + 'L2'
    L1_loss_i = L1_loss[i]
    L2_loss_i = L2_loss[i]
    ave_L1_loss = np.zeros_like(L1_loss_i)
    ave_L2_loss = np.zeros_like(L2_loss_i)
    ave_step = 20
    for j in range(len(L1_loss_i)):
        if j < ave_step:
            ave_L1_loss[j] = np.mean(L1_loss_i[0: j + 1])
            ave_L2_loss[j] = np.mean(L2_loss_i[0: j + 1])
        else:
            ave_L1_loss[j] = np.mean(L1_loss_i[j - ave_step + 1: j + 1])    
            ave_L2_loss[j] = np.mean(L2_loss_i[j - ave_step + 1: j + 1])
    x = range(len(L1_loss_i))
    plt.plot(x, ave_L1_loss, label=L1_loss_label)
    plt.plot(x, ave_L2_loss, label=L2_loss_label)
    
plt.legend()
plt.xlabel('high-parameter lr')
plt.ylabel('Loss')
plt.show()

#visulize the relationship between lr and accuracy
x = range(len(learning_rates))
plt.plot(x, L1_lr_val_acc, label='L1_val_acc')
plt.plot(x, L2_lr_val_acc, label='L2_val_acc')
plt.xticks(x, learning_rates)
plt.margins(0.08)
plt.legend()
plt.xlabel('high-parameter lr')
plt.ylabel('Validation Accuracy')
plt.show()

L1 1e-05 0.9594283079760259
L1 7e-06 0.9557399723374828
L1 4e-06 0.9543568464730291
L1 1e-06 0.9474412171507607
L1 7e-07 0.9432918395573997
L1 3e-07 0.9377593360995851
L1 1e-07 0.9294605809128631
L2 1e-05 0.9571230982019364
L2 7e-06 0.954817888427847
L2 4e-06 0.9580451821115722
L2 1e-06 0.9474412171507607
L2 7e-07 0.9483633010603965
L2 3e-07 0.9409866297833103
L2 1e-07 0.9266943291839558


#### Question1: 学习率和损失函数的变化、模型性能之间分别有什么关系？

Answer1:
当学习率比较大时(`learning_rate >= 5e-5`)，在学习的迭代过程中会出现 `self.W * X[i]` 很大的情况，导致 `Sigmoid` 得到的函数值 `z` 趋近于 `1`，使得 `log(1-z)` 出现错误，因此学习率不能设置太大。

同时可以看到学习率在 `7e-6 ~ 7e-7` 之间，模型准确率均能保持在 `95%` 左右。当学习率继续增加时，准确率开始下降，此时增大迭代次数可在一定程度上提高准确率。

可以看到，在 `learning_rate >= 4e-6` 时，损失函数下降情况基本一致，当学习率逐渐减小时，学习率下降稍微减慢，且最后不收敛到同一值。

此外，似乎在学习率大一些的时候，L1 正则化与 L2 正则化差别不大。当学习率比较小的时候，L2 正则化准确率比 L1 正则化准确率更低。

**尝试了一下让学习率随迭代次数而减小……不过没什么效果，所以觉得还是固定学习率好。**

### 1.5 正则项与模型性能
在这一部分中，你需要完成以下内容：
1. 尝试多个正则化参数的值
2. 储存对应的在**验证集上**的正确率到L1_reg_val_acc和L2_reg_val_acc中
3. 通过验证集X_val和Y_val选择最优的正则化超参数，并储存最优正则化参数和对应模型

已有的代码会画出正则化参数和验证集上正确率的关系图，并计算最优的模型在测试集上的正确率。

#### 注意：
和上面学习率一样，L1_reg_val_acc的存储也需要和正则化参数值对应。

In [83]:
learning_rate = 1.5e-6
reg_types = ['L1', 'L2']
L1_reg_val_acc = []
L2_reg_val_acc = []
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2)
best_L1_model = None
best_L2_model = None
best_L1_reg = 0
best_L2_reg = 0

# Todo 6:
regs = [0.001, 0.005, 0.010, 0.015, 0.020, 0.025, 0.030, 0.035, 0.040]
for type in range(2):
    best_accuracy = 0
    for i in range(len(regs)):
        model = LinearRegression1()
        reg = regs[i]
        loss_history = model.train(X_train, Y_train, False, learning_rate, reg, reg_types[type])
        accuracy = np.mean(model.predict(X_val) == Y_val)
        if type == 0:
            L1_reg_val_acc.append(accuracy)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_L1_reg = reg
                best_L1_model = model
        else:
            L2_reg_val_acc.append(accuracy)
            if (accuracy > best_accuracy):
                best_accuracy = accuracy
                best_L2_reg = reg
                best_L2_model = model
        print(reg_types[type], reg, accuracy)

#visuliza the relation of regularization parameter and validation accuracy
x = range(len(regs))
plt.plot(x, L1_reg_val_acc, label='L1_val_acc')
plt.plot(x, L2_reg_val_acc, label='L2_val_acc')
plt.xticks(x, regs)
plt.margins(0.08)
plt.legend()
plt.xlabel('high-parameter reg')
plt.ylabel('Validation Accuracy')
plt.show()

#Compute the performance of best model on the test set
L1_pred = best_L1_model.predict(X_test)
L1_acc = np.mean(L1_pred == Y_test)
print("The Accuracy with L1 regularization parameter {} is {}\n".format(best_L1_reg, L1_acc))
L2_pred = best_L2_model.predict(X_test)
L2_acc = np.mean(L2_pred == Y_test)
print("The Accuracy with L2 regularization parameter {} is {}\n".format(best_L2_reg, L2_acc))

L1 0.001 0.9632564841498559
L1 0.005 0.9567723342939481
L1 0.01 0.9567723342939481
L1 0.015 0.9553314121037464
L1 0.02 0.9596541786743515
L1 0.025 0.9596541786743515
L1 0.03 0.9618155619596542
L1 0.035 0.9582132564841499
L1 0.04 0.9582132564841499
L2 0.001 0.9531700288184438
L2 0.005 0.957492795389049
L2 0.01 0.9589337175792507
L2 0.015 0.9589337175792507
L2 0.02 0.9582132564841499
L2 0.025 0.9582132564841499
L2 0.03 0.9589337175792507
L2 0.035 0.9589337175792507
L2 0.04 0.9589337175792507


The Accuracy with L1 regularization parameter 0.001 is 0.9602017937219731

The Accuracy with L2 regularization parameter 0.01 is 0.952914798206278



## 二、多分类逻辑回归

### 2.1 加载数据集

In [34]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


def load_data2(path):
    # load all MNIST data
    fd = open(os.path.join(path, 'train-images-idx3-ubyte'))
    loaded = np.fromfile(file=fd, dtype=np.uint8)
    train_X = loaded[16:].reshape((60000, 28, 28, 1)).astype(np.float)
    fd = open(os.path.join(path, 'train-labels-idx1-ubyte'))
    loaded = np.fromfile(file=fd, dtype=np.uint8)
    train_Y = loaded[8:].reshape(60000).astype(np.float)
    fd = open(os.path.join(path, 't10k-images-idx3-ubyte'))
    loaded = np.fromfile(file=fd, dtype=np.uint8)
    test_X = loaded[16:].reshape((10000, 28, 28, 1)).astype(np.float)
    fd = open(os.path.join(path, 't10k-labels-idx1-ubyte'))
    loaded = np.fromfile(file=fd, dtype=np.uint8)
    test_Y = loaded[8:].reshape(10000).astype(np.float)

    #visualiza data
    sample_num = 8
    num_classes = 10
    for y in range(num_classes):
        idxs = np.flatnonzero(train_Y == y)
        idxs = np.random.choice(idxs, sample_num, replace=False)
        for i, idx in enumerate(idxs):
            plt_idx = i * num_classes + y + 1
            plt.subplot(sample_num, num_classes, plt_idx)
            plt.imshow(train_X[idx, :, :, :].reshape((28,28)),cmap=plt.cm.gray)
            plt.axis('off')
            if i == 0:
                plt.title(y)
    plt.show()

    # reshaple into rows and normaliza
    train_X = train_X.reshape((train_X.shape[0], -1))
    test_X = test_X.reshape((test_X.shape[0], -1))
    mean_image = np.mean(train_X, axis=0)
    train_X = train_X - mean_image
    test_X = test_X - mean_image

    # add a bias columu into X
    train_X = np.hstack([train_X, np.ones((train_X.shape[0], 1))])
    test_X = np.hstack([test_X, np.ones((test_X.shape[0], 1))])
    train_Y = train_Y.astype(np.int32)
    test_Y = test_Y.astype(np.int32)
    return train_X, train_Y, test_X, test_Y


X_train, Y_train, X_test, Y_test = load_data2('/home/kesci/input/MNIST_dataset4284')

### 2.2逻辑回归模型
在这一部分中你需要完成与二分类逻辑回归相同的任务。

In [35]:
class LinearRegression2(object):
    def __init__(self):
        self.W = None
    
    def train(self, X, Y, display, learning_rate=1e-3, reg=1e-5, reg_type='L2', num_iters=2000,
              batch_size=128):
        num_train, feat_dim = X.shape
        num_classes = 10
        self.W = 0.001 * np.random.randn(feat_dim, num_classes).transpose()
        loss_history = []
        for i in range(num_iters):
            batch_indices = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[batch_indices]
            Y_batch = Y[batch_indices]
            if reg_type == 'L1':
                loss, grad = self.l1_loss(X_batch, Y_batch, reg)
            else:
                loss, grad = self.l2_loss(X_batch, Y_batch, reg)
            loss_history.append(loss)
            
            # Todo 1
            self.W -= learning_rate * grad
       
            if display and i % 100 == 0:
                print("In iteration {}/{} , the loss is {}".format(i, num_iters, loss))
        return loss_history

    def loss_grad(self, X, Y, reg):
        data_num, feat_dim = X.shape
        class_num = self.W.shape[0]
        
        loss = 0
        grad = np.zeros([class_num, feat_dim])
        for data in range(data_num):
            sum = 0
            
            exp_sum = np.exp(np.sum(X[data] * self.W, axis=1))
            sum = exp_sum.sum()
            loss += -np.log(exp_sum[Y[data]] / sum)
            
            for i in range(class_num):
                p = exp_sum[i] / sum
                if i == Y[data]:
                    p -= 1
                grad[i] += X[data] * p
        
        return loss / data_num, grad / data_num
    
    
    def l1_loss(self,X, Y, reg):
        # Todo 2
        loss, grad = self.loss_grad(X, Y, reg)
        
        loss = loss + reg * abs(self.W).sum()
        grad = grad + reg
        
        return loss, grad
    
    def l2_loss(self, X, Y, reg):
        # Todo 3
        loss, grad = self.loss_grad(X, Y, reg)
        
        loss = loss + reg * np.sum(self.W * self.W)
        grad = grad + 2 * reg * self.W
        
        return loss, grad

    def predict(self, X):
        data_num, feat_dim = X.shape
        class_num = self.W.shape[0]
        Y_pred = np.zeros(data_num)
        
        for data in range(data_num):
            max_p = 0
            pos = 0
            
            exp_sum = np.exp(np.sum(self.W * X[data], axis=1))
            Y_pred[data] = np.argmax(exp_sum)
        
        return Y_pred


### 2.3 训练模型样例
在这一部分，你不需要完成任何代码，你可以通过这一部分验证你上面实现的LogisticRegression1的代码是否实现正确。

In [36]:
lr_param = 7e-6
reg_param = 500 # for visualize
# reg_param = 0.01
model = LinearRegression2()
loss_history = model.train(X_train, Y_train, True, lr_param, reg_param, 'L2')
pred = model.predict(X_test)
acc = np.mean(pred == Y_test)
print("The Accuracy is {}\n".format(acc))
x = range(len(loss_history))
plt.plot(x, loss_history, label='Loss')
plt.legend()
plt.xlabel('Iteration Num')
plt.ylabel('Loss')
plt.show()

print("reg =", reg)
W = model.W
for digit in range(10):
    w = np.reshape(np.delete(W[digit], -1), (28, -1))
    w_min = np.min(w)
    w_max = np.max(w)
    w = (w - w_min) / (w_max - w_min) * 255.0
    print(digit)
    plt.imshow(w, cmap=plt.cm.gray)
    # plt.imshow(w)
    plt.axis('off')
    plt.show()


In iteration 0/2000 , the loss is 7.74026280634094
In iteration 100/2000 , the loss is 1.6200678095793086
In iteration 200/2000 , the loss is 0.841270304352612
In iteration 300/2000 , the loss is 0.7669157691315339
In iteration 400/2000 , the loss is 0.7022173791691593
In iteration 500/2000 , the loss is 0.7651952244146731
In iteration 600/2000 , the loss is 0.7167420677259189
In iteration 700/2000 , the loss is 0.526422186100991
In iteration 800/2000 , the loss is 0.6015692372041385
In iteration 900/2000 , the loss is 0.6709375233420287
In iteration 1000/2000 , the loss is 0.5904602781961209
In iteration 1100/2000 , the loss is 0.575234514885548
In iteration 1200/2000 , the loss is 0.7388522504798021
In iteration 1300/2000 , the loss is 0.6164285600924444
In iteration 1400/2000 , the loss is 0.6561681887744997
In iteration 1500/2000 , the loss is 0.6192726692834813
In iteration 1600/2000 , the loss is 0.5933476173942047
In iteration 1700/2000 , the loss is 0.6515525313829451
In iterat

reg = 1000
0


1


2


3


4


5


6


7


8


9


### 2.4 学习率与损失函数、模型性能的关系
因为学习率和正则化参数都是超参数，在一般的训练过程中，我们没办法直接优化，所以我们一般会将训练集细分成训练集和验证集，然后通过模型在验证集上的表现选择一个最优的超参数，再将它对应的最优的模型应用到测试集中。
在这一部分你需要完成以下内容：
1. 尝试多种不同的学习率
2. 储存学习率对应的损失函数值到L1_loss和L2_loss中（我们对损失函数值进行了20步平均化处理）。
3. 储存学习率对应的**在验证集上**的正确率到L1_lr_val_acc和L2_lr_val_acc中

#### 注意：
因为已有代码中L1_loss，L1_lr_val_acc都是数组，在可视化的过程中我们需要学习率和它们相对应，比如learning_rates[0]对应的loss和validation accuracy应该储存在数组index为0的位置

#### 拓展：
在这个部分中采取的损失函数都是定值，如果你有时间的话，可以尝试根据迭代轮数改变学习率，并比较不变的学习率和变化的学习率对于模型性能的影响。

In [100]:
reg = 0.01
reg_types = ['L1', 'L2']
L1_loss = []
L2_loss = []
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2)
L1_lr_val_acc = []
L2_lr_val_acc = []

# Todo 5:
learning_rates = [7e-5, 4e-5, 1e-5, 7e-6, 4e-6, 1e-6, 7e-7]
for type in range(2):
    for i in range(len(learning_rates)):
        model = LinearRegression2()
        learning_rate = learning_rates[i]
        loss_history = model.train(X_train, Y_train, False, learning_rate, reg, reg_types[type])
        accuracy = np.mean(model.predict(X_val) == Y_val)
        if type == 0:
            L1_loss.append(loss_history)
            L1_lr_val_acc.append(accuracy)
        else:
            L2_loss.append(loss_history)
            L2_lr_val_acc.append(accuracy)
        print(reg_types[type], learning_rate, accuracy)

#visulize the relationship between lr and loss
for i, lr in enumerate(learning_rates):
    L1_loss_label = str(lr) + 'L1'
    L2_loss_label = str(lr) + 'L2'
    L1_loss_i = L1_loss[i]
    L2_loss_i = L2_loss[i]
    ave_L1_loss = np.zeros_like(L1_loss_i)
    ave_L2_loss = np.zeros_like(L2_loss_i)
    ave_step = 20
    for j in range(len(L1_loss_i)):
        if j < ave_step:
            ave_L1_loss[j] = np.mean(L1_loss_i[0: j + 1])
            ave_L2_loss[j] = np.mean(L2_loss_i[0: j + 1])
        else:
            ave_L1_loss[j] = np.mean(L1_loss_i[j - ave_step + 1: j + 1])    
            ave_L2_loss[j] = np.mean(L2_loss_i[j - ave_step + 1: j + 1])
    x = range(len(L1_loss_i))
    plt.plot(x, ave_L1_loss, label=L1_loss_label)
    plt.plot(x, ave_L2_loss, label=L2_loss_label)
    
plt.legend()
plt.xlabel('high-parameter lr')
plt.ylabel('Loss')
plt.show()

#visulize the relationship between lr and accuracy
x = range(len(learning_rates))
plt.plot(x, L1_lr_val_acc, label='L1_val_acc')
plt.plot(x, L2_lr_val_acc, label='L2_val_acc')
plt.xticks(x, learning_rates)
plt.margins(0.08)
plt.legend()
plt.xlabel('high-parameter lr')
plt.ylabel('Validation Accuracy')
plt.show()

L1 7e-05 0.9028125
L1 4e-05 0.9083333333333333
L1 1e-05 0.909375
L1 7e-06 0.9117708333333333
L1 4e-06 0.9066666666666666
L1 1e-06 0.8851041666666667
L1 7e-07 0.879375
L2 7e-05 0.9
L2 4e-05 0.9104166666666667
L2 1e-05 0.911875
L2 7e-06 0.9102083333333333
L2 4e-06 0.9057291666666667
L2 1e-06 0.8870833333333333
L2 7e-07 0.8760416666666667


### 2.5 正则项与模型性能
在这一部分中，你需要完成以下内容：
1. 尝试多个正则化参数的值
2. 储存对应的在**验证集上**的正确率到L1_reg_val_acc和L2_reg_val_acc中
3. 通过验证集X_val和Y_val选择最优的正则化超参数，并储存最优正则化参数和对应模型

已有的代码会画出正则化参数和验证集上正确率的关系图，并计算最优的模型在测试集上的正确率。

#### 注意：
和上面学习率一样，L1_reg_val_acc的存储也需要和正则化参数值对应。

In [23]:
learning_rate = 7e-6
reg_types = ['L1', 'L2']
L1_reg_val_acc = []
L2_reg_val_acc = []
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2)
best_L1_model = None
best_L2_model = None
best_L1_reg = 0
best_L2_reg = 0

# Todo 6:
regs = [0.001, 0.005, 0.010, 0.015, 0.020, 0.025, 0.030, 0.035, 0.040]
regs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
for type in range(2):
    best_accuracy = 0
    for i in range(len(regs)):
        model = LinearRegression2()
        reg = regs[i]
        loss_history = model.train(X_train, Y_train, False, learning_rate, reg, reg_types[type])
        accuracy = np.mean(model.predict(X_val) == Y_val)
        if type == 0:
            L1_reg_val_acc.append(accuracy)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_L1_reg = reg
                best_L1_model = model
        else:
            L2_reg_val_acc.append(accuracy)
            if (accuracy > best_accuracy):
                best_accuracy = accuracy
                best_L2_reg = reg
                best_L2_model = model
        print(reg_types[type], reg, accuracy)

#visuliza the relation of regularization parameter and validation accuracy
x = range(len(regs))
plt.plot(x, L1_reg_val_acc, label='L1_val_acc')
plt.plot(x, L2_reg_val_acc, label='L2_val_acc')
plt.xticks(x, regs)
plt.margins(0.08)
plt.legend()
plt.xlabel('high-parameter reg')
plt.ylabel('Validation Accuracy')
plt.show()

#Compute the performance of best model on the test set
L1_pred = best_L1_model.predict(X_test)
L1_acc = np.mean(L1_pred == Y_test)
print("The Accuracy with L1 regularization parameter {} is {}\n".format(best_L1_reg, L1_acc))
L2_pred = best_L2_model.predict(X_test)
L2_acc = np.mean(L2_pred == Y_test)
print("The Accuracy with L1 regularization parameter {} is {}\n".format(best_L2_reg, L2_acc))

L1 0.001 0.9072916666666667
L1 0.01 0.9061197916666667
L1 0.1 0.9088541666666666
L1 1 0.91015625




L1 10 0.10013020833333333
L1 100 0.10013020833333333
L1 1000 0.10013020833333333
L2 0.001 0.910546875
L2 0.01 0.908984375
L2 0.1 0.9100260416666667
L2 1 0.9098958333333333
L2 10 0.90859375
L2 100 0.9015625
L2 1000 0.875


The Accuracy with L1 regularization parameter 1 is 0.911

The Accuracy with L1 regularization parameter 0.001 is 0.9123



#### Question2: 对于上面的多分类逻辑回归模型，你觉得它的权重矩阵数值上会呈现出什么样子？你可以通过可视化的方法观察权重矩阵。
Answer2:

可视化的结果在 “2.3 训练模型样例”里，为了使可视化结果明显，需要将正则化项调大到 `100~1000`。观察到权重矩阵可视化的结果分别是数字 0-9，在数字对应的区域值较大，在其他地方值比较小