In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

In [2]:
# 1. sklearn으로 구현
from sklearn import linear_model

# raw data loading
df = pd.read_csv('./data/mnist/train.csv')

# 결측치 -> 없음
# 이상치 -> 없음
# 정규화 -> sklearn에 위임

# taining data set
x_data = df.drop('label', axis=1, inplace=False).values
t_data = df['label'].values.reshape(-1,1)

# 데이터 분리
x_train, x_test, t_train, t_test = \
train_test_split(x_data,
                 t_data,
                 test_size=0.3,
                 random_state=0)

# 모델 생성
sklearn_model = linear_model.LogisticRegression(random_state=42, max_iter=1000)

# 모델 학습
sklearn_model.fit(x_train,
                  t_train.ravel())

# 모델 평가
# y_true,
# y_pred,
print(classification_report(t_test,
                      sklearn_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1242
           1       0.95      0.96      0.96      1429
           2       0.88      0.88      0.88      1276
           3       0.87      0.87      0.87      1298
           4       0.90      0.90      0.90      1236
           5       0.87      0.84      0.85      1119
           6       0.92      0.94      0.93      1243
           7       0.92      0.90      0.91      1334
           8       0.85      0.85      0.85      1204
           9       0.85      0.87      0.86      1219

    accuracy                           0.90     12600
   macro avg       0.89      0.89      0.89     12600
weighted avg       0.90      0.90      0.90     12600



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
# 2. tensorflow로 구현
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import Adam

# raw data loading
df = pd.read_csv('./data/mnist/train.csv')

# 독립변수와 종속변수 분리
x_data = df.drop('label', axis=1, inplace=False).values
t_data = df['label'].values.reshape(-1,1)

# 정규화
scaler = MinMaxScaler()
scaler.fit(x_data)
x_data_norm = scaler.transform(x_data)

# 데이터 분리
x_train_norm, x_test_norm, t_train, t_test = \
train_test_split(x_data_norm,
                 t_data,
                 test_size=0.3,
                 random_state=0)

# 모델 생성
keras_model = Sequential()

# layer 추가
keras_model.add(Flatten(input_shape=(784,)))
keras_model.add(Dense(10, activation='softmax'))

# model 설정
keras_model.compile(optimizer=Adam(learning_rate=1e-2),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

# model 학습
keras_model_result = keras_model.fit(x_train_norm,
                                     t_train,
                                     epochs=100,
                                     batch_size=100,
                                     verbose=1,
                                     validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [4]:
# 모델 평가
keras_model.evaluate(x_test_norm, t_test)



[0.5405497550964355, 0.8956349492073059]

In [5]:
import tensorflow as tf

print(classification_report(t_test,
                      tf.argmax(keras_model.predict(x_test_norm), axis=1).numpy()))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1242
           1       0.94      0.96      0.95      1429
           2       0.90      0.86      0.88      1276
           3       0.90      0.84      0.87      1298
           4       0.95      0.84      0.90      1236
           5       0.87      0.83      0.85      1119
           6       0.91      0.94      0.93      1243
           7       0.90      0.92      0.91      1334
           8       0.78      0.91      0.84      1204
           9       0.85      0.88      0.87      1219

    accuracy                           0.90     12600
   macro avg       0.90      0.89      0.89     12600
weighted avg       0.90      0.90      0.90     12600



In [6]:
# test 데이터를 이용해서 예측하고 결과를 csv 파일로 만들어 kaggle에 제출
ml_df = pd.read_csv('./data/mnist/sample_submission.csv')
ml_df

Unnamed: 0,ImageId,Label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
27995,27996,0
27996,27997,0
27997,27998,0
27998,27999,0


In [7]:
test_df = pd.read_csv('./data/mnist/test.csv')
x_data = test_df.values
scaler = MinMaxScaler()
scaler.fit(x_data)
x_data_norm = scaler.transform(x_data)
result = tf.argmax(keras_model.predict(x_data_norm), axis=1).numpy()
result



array([2, 0, 9, ..., 3, 9, 2], dtype=int64)

In [8]:
ml_df['Label'] = result
ml_df

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,7
4,5,3
...,...,...
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9


In [10]:
ml_df.to_csv('./data/mnist/ml_result.csv', index=None)

In [None]:
# 비정형 데이터의 학습
# 1. layer 추가