In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from pathlib import Path
import gzip

In [2]:
train_path = "train-images-idx3-ubyte.gz"
train_labels_path = "train-labels-idx1-ubyte.gz"

In [3]:
def read_mnist_data(images_path, labels_path, num_images, shuffle=False, _is=True,image_size=28, pca=False, n_components=None):
    f_images = gzip.open(images_path, 'r')
    # skip 16 first bytes because these are not data, only header infor
    f_images.read(16)
    # general: read num_images data samples if this parameter is set;
    # if not, read all (60000 training or 10000 test)
    real_num = num_images if not shuffle else (60000 if _is else 10000)
    # read all data to buf_images (28x28xreal_num)
    buf_images = f_images.read(image_size * image_size * real_num)
    # images
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(real_num, image_size, image_size,)
    # Read labels
    f_labels = gzip.open(labels_path, 'r')
    f_labels.read(8)
    labels = np.zeros((real_num)).astype(np.int64)

    for i in range(0, real_num):
        buf_labels = f_labels.read(1)
        labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)

    if shuffle is True:
        rand_id = np.random.randint(real_num, size=num_images)
        images = images[rand_id, :]
        labels = labels[rand_id,]

    images = images.reshape(num_images, image_size * image_size)

    if (pca == True):
        pca = PCA(n_components=n_components)
        images = pca.fit_transform(images)

    return images, labels

In [4]:
images0, labels0 = read_mnist_data(train_path, train_labels_path, 5000, shuffle=True)
train_images0, test_images0, train_labels0, test_labels0 = train_test_split(images0, labels0, test_size=0.3, random_state=1)

print(train_images0.shape)
print(test_labels0.shape)

(3500, 784)
(1500,)


  labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)


Thực hiện phân loại nhiều lớp khi chưa PCA

In [5]:
model0 = LogisticRegression(multi_class="multinomial", max_iter=1000, solver="sag")
model0.fit(train_images0, train_labels0)



In [6]:
predict0 = model0.predict(test_images0)

accuracy = accuracy_score(test_labels0, predict0)
confusion = confusion_matrix(test_labels0, predict0)

print("Độ chính xác:", accuracy)
print("Ma trận nhầm lẫn:")
print(confusion)

Độ chính xác: 0.8866666666666667
Ma trận nhầm lẫn:
[[119   0   3   1   0   1   1   0   1   0]
 [  0 170   0   0   0   2   0   2   3   0]
 [  2   1 141   2   3   1   2   0   2   0]
 [  2   0   4 140   0   3   1   2   6   1]
 [  1   1   3   0 131   0   2   1   2   6]
 [  3   2   4   8   3  98   3   1  11   1]
 [  1   0   3   0   1   4 133   0   0   1]
 [  0   1   6   0   0   1   1 152   0  11]
 [  3   2   2   2   0   9   2   1 107   2]
 [  0   0   5   2   1   2   0   8   1 139]]


Phân loại nhiều lớp khi PCA

In [7]:
# Giảm số chiều xuống 100
pca_images, labels1 = read_mnist_data(train_path, train_labels_path, 5000, shuffle=True, pca=True, n_components=100)

train_images1, test_images1, train_labels1, test_labels1 = train_test_split(pca_images, labels1, test_size=0.3, random_state=1)

print(train_images1.shape)
print(test_labels1.shape)

  labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)


(3500, 100)
(1500,)


In [8]:
model1 = LogisticRegression(multi_class="multinomial", max_iter=1000, solver="sag")
model1.fit(train_images1, train_labels1)



In [9]:
predict1 = model1.predict(test_images1)

accuracy = accuracy_score(test_labels1, predict1)
confusion = confusion_matrix(test_labels1, predict1)

print("Độ chính xác:", accuracy)
print("Ma trận nhầm lẫn:")
print(confusion)

Độ chính xác: 0.8586666666666667
Ma trận nhầm lẫn:
[[123   2   1   1   2   7   2   0   2   3]
 [  0 175   1   2   0   0   0   0   2   0]
 [  0   8 121   5   1   1   5   4   6   1]
 [  1   1   5 101   0   3   1   4   3   7]
 [  3   0   0   1 123   0   5   2   4   6]
 [  5   3   6   3   2  98   3   0  14   0]
 [  4   0   2   0   0   2 150   1   1   0]
 [  1   0   5   1   0   0   0 166   1   7]
 [  0   5   1   5   4   3   4   0 116   4]
 [  0   0   2   2  11   0   1   4   3 115]]


* Khi chưa PCA: độ chính xác là 0.8813333333333333
* Khi giảm số chiều xuống 100, độ chính xác là 0.854