In [12]:
# 导入必要的库
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils.extmath import randomized_svd

mnist = fetch_openml('mnist_784', data_home="data/")

# 将数据集分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target, test_size=0.2, random_state=42)

In [13]:
# 加载mnist数据集
# 使用PCA进行降维
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# 使用逻辑回归进行分类
logistic_regression = LogisticRegression(random_state=42, solver='lbfgs')
logistic_regression.fit(X_train_pca, y_train)

# 预测测试集
y_pred = logistic_regression.predict(X_test_pca)

# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print("准确率为：", accuracy)

准确率为： 0.9152142857142858


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# 使用RPCA算法进行特征提取和降维
rank = 100  # 低秩矩阵的秩
lmbda = 1.0 / np.sqrt(max(X_train.shape))
tolerance = 1e-7
max_iter = 1000

# 数据矩阵拆分为低秩和稀疏矩阵
U, Sigma, VT = randomized_svd(X_train.values, n_components=rank)
low_rank = U[:, :rank] @ np.diag(Sigma[:rank]) @ VT[:rank, :]
sparse = X_train - low_rank

# 使用PCA进行降维
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(low_rank)
X_test_pca = pca.transform(X_test - np.mean(X_train, axis=0))

# 使用逻辑回归进行分类
logistic_regression = LogisticRegression(random_state=42, solver='lbfgs')
logistic_regression.fit(X_train_pca, y_train)

# 预测测试集
y_pred = logistic_regression.predict(X_test_pca)

# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print("准确率为：", accuracy)


准确率为： 0.8254285714285714


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
