In [59]:
import numpy as np
import pandas as pd
import scipy.io
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [60]:
# Split a per-class dataset into training and validation parts (currently unused)
def data_split(data, test_size):
    """Split every class's samples into a training part and a validation part.

    Args:
        data: Mapping from label to a sliceable sequence of samples (list,
            NumPy array, ...).
        test_size: Fraction of each class's samples, between 0 and 1
            inclusive, that goes to the validation part.

    Returns:
        A ``(trn_data, val_data)`` tuple of dicts keyed by the same labels as
        ``data``. For each label the leading samples go to ``trn_data`` and
        the trailing ``round(len(samples) * test_size)`` samples go to
        ``val_data``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size!r}")

    trn_data = {}
    val_data = {}

    # Use a distinct loop variable: reusing the name ``data`` would shadow the
    # argument that is being iterated.
    for label, samples in data.items():
        num_samples = len(samples)
        # Slice bounds must be integers, so derive an integer validation count
        # first and split relative to the end of the sequence.
        num_val = int(round(num_samples * test_size))
        split_at = num_samples - num_val
        trn_data[label] = samples[:split_at]
        val_data[label] = samples[split_at:]

    return trn_data, val_data

In [61]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [62]:
# Load the training set from the .mat file
trn_mat = scipy.io.loadmat("data/train_data.mat")
trn_data = trn_mat["train"]
x_len = len(trn_data)

# Labels are 1-based: entry i of trn_data gets label i + 1
trn_label = list(range(1, x_len + 1))

# Pair every label with its entry of trn_data
trn_dict = dict(zip(trn_label, trn_data))
print(type(trn_dict[1]))

# Load the test set
val_mat = scipy.io.loadmat('data/test_data.mat')
val_data = val_mat['test']
print(type(val_data[1]))
print(len(val_data))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
1000


In [63]:
# Define the SVM
class LinearSVM:
    """Linear multi-class classifier trained with full-batch gradient descent.

    The objective is a multi-class hinge loss with margin 1, averaged over the
    samples, plus an L2 penalty ``0.5 * reg * sum(W ** 2)``. The weight matrix
    ``W`` has shape ``(dim, num_cls)`` and is created by ``fit``.
    """

    def __init__(self, lr=1e-3, reg=1e-5, num_iter=1000):
        """Store the hyper-parameters.

        Args:
            lr: Learning rate (gradient-descent step size).
            reg: L2 regularisation strength.
            num_iter: Number of gradient-descent iterations run by ``fit``.
        """
        self.lr = lr
        self.reg = reg
        self.num_iter = num_iter

    def compute_loss_and_grad(self, x, y):
        """Compute the regularised hinge loss and its gradient w.r.t. ``W``.

        Args:
            x: Array of shape ``(num_trn, dim)``.
            y: Integer class indices, one per row of ``x``, each a valid
                column index of ``self.W``.

        Returns:
            ``(loss, dW)`` where ``loss`` is a scalar and ``dW`` has the same
            shape as ``self.W``.
        """
        num_trn = x.shape[0]
        rows = np.arange(num_trn)
        scores = x.dot(self.W)  # score of every class for every sample
        correct_cls_scores = scores[rows, y]  # score of each sample's true class

        # Hinge margins; the true class never contributes to the loss.
        margins = np.maximum(0, scores - correct_cls_scores[:, np.newaxis] + 1)
        margins[rows, y] = 0
        loss = np.sum(margins) / num_trn
        loss += 0.5 * self.reg * np.sum(self.W * self.W)  # L2 penalty

        # Turn the margins into gradient coefficients: +1 for every class that
        # violates the margin, and minus the number of violations for the true
        # class of the sample.
        margins[margins > 0] = 1
        margins[rows, y] = -np.sum(margins, axis=1)
        dW = x.T.dot(margins) / num_trn  # average over the samples
        dW += self.reg * self.W  # gradient of the L2 penalty

        return loss, dW

    def fit(self, x, y):
        """Initialise ``W`` randomly and run ``num_iter`` descent steps.

        Prints the loss every 100 iterations.

        Args:
            x: Array of shape ``(num_trn, dim)``.
            y: One-dimensional sequence of ``num_trn`` non-negative integer
                class indices; the number of classes is ``max(y) + 1``.

        Raises:
            ValueError: If the training set is empty, if ``y`` does not hold
                exactly one label per row of ``x``, or if the labels are not
                non-negative integers.
        """
        y = np.asarray(y)
        num_trn, dim = x.shape
        if num_trn == 0:
            raise ValueError("cannot fit on an empty training set")
        if y.shape != (num_trn,):
            raise ValueError(
                f"y must have shape ({num_trn},), got {y.shape}"
            )
        # Negative labels would silently wrap around when used as indices.
        if not np.issubdtype(y.dtype, np.integer) or y.min() < 0:
            raise ValueError("y must contain non-negative integer class indices")

        num_cls = int(y.max()) + 1
        self.W = 0.001 * np.random.randn(dim, num_cls)  # small random initial weights

        for i in range(self.num_iter):
            loss, grad = self.compute_loss_and_grad(x, y)
            self.W -= self.lr * grad  # gradient-descent update
            if i % 100 == 0:
                print(f"Iteration {i}/{self.num_iter}, Loss: {loss}")

    def predict(self, x):
        """Return the index of the highest-scoring class for every row of ``x``.

        Args:
            x: Array of shape ``(n, dim)``.

        Returns:
            One-dimensional integer array of length ``n``; an empty integer
            array when there are no scores to rank.
        """
        scores = x.dot(self.W)
        if scores.size == 0:
            # Keep the return type an ndarray even when there is nothing to predict.
            return np.empty(0, dtype=np.intp)
        return np.argmax(scores, axis=1)

In [64]:
# Stack the per-class sample arrays; trn_dict's insertion order gives the class order
x_trn = np.array(list(trn_dict.values()))
print(x_trn.shape)

# Flatten every sample into one feature row. The class count and the number of
# samples per class are read from the data instead of being hard-coded.
num_cls, num_per_cls = x_trn.shape[:2]
x_trn = x_trn.reshape(num_cls * num_per_cls, -1).astype(np.float64)
# 0-based class index for every row (note: the keys of trn_dict are 1-based)
y_trn = np.repeat(np.arange(num_cls), num_per_cls)
x_val = val_data.reshape(val_data.shape[0], -1).astype(np.float64)

# Centre both sets on the mean training image
mean_img = np.mean(x_trn, axis=0)
x_trn -= mean_img
x_val -= mean_img

# Train the SVM; seed NumPy's global RNG first so the random weight
# initialisation (and therefore the result) is reproducible
np.random.seed(0)
svm = LinearSVM()
svm.fit(x_trn, y_trn)

(200, 15, 28, 28)
Iteration 0/1000, Loss: 266.8600838540189
Iteration 100/1000, Loss: 0.36589494565902514
Iteration 200/1000, Loss: 0.01214613232435776
Iteration 300/1000, Loss: 0.00020144803915678674
Iteration 400/1000, Loss: 0.0002014476362611093
Iteration 500/1000, Loss: 0.00020144723336623765
Iteration 600/1000, Loss: 0.00020144683047217183
Iteration 700/1000, Loss: 0.00020144642757891174
Iteration 800/1000, Loss: 0.00020144602468645745
Iteration 900/1000, Loss: 0.00020144562179480894


In [75]:
# Predict a class index for every row of x_val
predictions = svm.predict(x_val)

print(len(x_val), predictions, sep="\n")

1000
[  6 172 194  12  71 162 194 134  13  28  91 153 187 147 121  94 116  12
 183  21  71 180  39  13 177  91 102   1 199  25  94 155  36 118   5  76
 176  18 176 187  57   9  43   3 111  36  79  84 189 141  75 125 163  19
 123  85   9 145  56  31  62  24 137 167  81 119 107  97 191 170  36 160
 166 116  14 119 182  55 134 137  33 102 147  64   8 163  79 177  90  21
  44 119 104  85 101 163 138  71 127   2 155  56 155  71  27 171  45  85
   0  16  31   9  30  68 130 130 176   3  40 190 193  70 131  27  90  97
  77 123 100 112  82 194  26 184 130 197 104  79 165 188  92  95 177  68
 184  32 127 184  72 154 187 166  50 112 157 104 110  53 127  79 171  21
 154 152  77 127  84  18  45 142  87 138   9  44  32  31 156 179 160 173
 171 102 100 144  45  72  33  43 107 166 154  46 186  90  94  92  40  14
 124  92  26  81 156 162  73 149 189  61 117  87 171 115 128  41  94  93
 122 193   3  84 158 107  91 166  19 187 162  92 156  71  92 168 165  54
 161 166   5 177 147  86  17 109 142 173 165  

In [82]:
# Write the predictions into the submission CSV (pandas is imported in the
# notebook's import cell)
# NOTE(review): predictions hold 0-based class indices (argmax over the score
# columns); confirm that this is the label convention the submission expects.
submission = pd.read_csv('data/submission.csv', encoding='utf8')
submission['预测结果'] = predictions
# Write with the same encoding the file was read with
submission.to_csv('data/submission.csv', index=False, encoding='utf8')