# Training a classifier for hand-written digits

Data Source:
* MNIST (`mnist_784`, version 1) from OpenML, loaded below with `sklearn.datasets.fetch_openml`
* https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html

Useful reading:
* https://stackoverflow.com/questions/42471523/how-can-i-generate-a-proper-mnist-image
* https://stackoverflow.com/questions/45539289/convert-image-from-28-28-4-to-2d-flat-array-and-write-to-csv
* https://stackoverflow.com/questions/61552402/if-image-has-28-28-3-shape-how-do-i-convert-it-to-28-28-1
* https://stackoverflow.com/questions/51205502/convert-a-black-and-white-image-to-array-of-numbers

In [None]:
# One-off environment setup: uncomment to install xgboost if it is missing.
# ! conda install xgboost -y

In [32]:
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, Normalizer, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [33]:
# import the mnist dataset from OpenML (downloads on first use, then served from the local cache)
# as_frame=True is requested explicitly because later cells rely on pandas objects
# (X.iloc[...], y.astype(...)); the 'auto' default depends on the installed scikit-learn version.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [34]:
# Pull the feature matrix and the label vector out of the fetched bunch and report their shapes
X = mnist["data"]
y = mnist["target"]
for part in (X, y):
    print(part.shape)

(70000, 784)
(70000,)


In [35]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Exploratory Analysis

In [None]:
# Look at the raw values of the first sample: its length and a slice from the middle
first_row = X.iloc[0]
some_digit = np.array(first_row)
print(some_digit.shape[0])
some_digit[400:500]

In [None]:
# View the same sample as a 28x28 grid and wrap it in a DataFrame for tabular display
some_digit_image = np.reshape(some_digit, (28, 28))
n_rows, n_cols = some_digit_image.shape
print(n_rows)
print(n_cols)
some_digit_df = pd.DataFrame(data=some_digit_image)

In [None]:
# Lift the column cap so every column of the grid is rendered, then display it
pd.options.display.max_columns = None
some_digit_df

In [None]:
# Inspect the first label and its type, then preview the same label after a numeric cast
first_label = y[0]
print(first_label)
print(type(first_label))
y_int = y.astype(np.uint8)
print(y_int[0])

In [None]:
# Render the 28x28 grid as an image using the explicit Axes interface
import matplotlib as mpl
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.imshow(some_digit_image, cmap=mpl.cm.binary, interpolation="nearest")
ax.axis("off")  # hide ticks and frame so only the digit is shown
#plt.savefig('model_outputs/foo_five.png')
plt.show()

## Preprocessing

In [None]:
# Standardize: learn the scaling statistics on the training split only, then apply to both splits
# NOTE(review): later cells in this section rebind `scaler`, `X_train_scaled` and `X_test_scaled`.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Alternative preprocessing experiment: min-max scaling fitted on the training split only
# NOTE(review): later cells in this section rebind `scaler`, `X_train_scaled` and `X_test_scaled`.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Alternative preprocessing experiment: robust scaling fitted on the training split only
# NOTE(review): the next cell overwrites `X_train_scaled` and `X_test_scaled` with the raw data.
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [36]:
# Train on the raw pixel values: the models below are fitted on X_train_scaled, which
# this cell points at the unscaled data.
# `scaler` is rebound to a pass-through transformer (it only converts its input to an
# ndarray) so that the object pickled further down and applied to new user input matches
# what the models are actually trained on. Leaving a fitted scaler from an earlier cell
# bound to this name would rescale inference inputs for models that never saw scaled data.
# NOTE(review): rf_model is restored from a pickle of an earlier run - confirm it was
# also trained on unscaled pixels.
scaler = FunctionTransformer(np.asarray).fit(X_train)
X_train_scaled = X_train
X_test_scaled = X_test

In [None]:
# pickle the scaler so it can be reloaded later
# (`with` guarantees the file handle is closed even if pickling raises)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## Single Decision Tree

In [None]:
# instantiate with arbitrary hyperparameters; the fixed seed makes the fitted tree
# (and therefore the metrics reported below) reproducible from run to run
tree_model = DecisionTreeClassifier(max_depth=7,
                               criterion='entropy',
                               min_samples_leaf=10,
                               class_weight='balanced',
                               random_state=42)

In [None]:
# Fit the decision tree on the training split
tree_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict the test split and compare the first ten predictions with the true labels
y_preds = tree_model.predict(X_test_scaled)
for labels in (y_preds, y_test):
    print(list(labels[:10]))

In [None]:
# Score the predictions; precision, recall and F1 are macro-averaged over the classes
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
macro_scorers = (
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1 Score:', metrics.f1_score),
)
for label, scorer in macro_scorers:
    print(label, scorer(y_test, y_preds, average='macro'))

## Random Forest

In [None]:
# modeling: random forest (arbitrary hyperparameters)
# random_state pins the forest's internal randomness so a retrain reproduces the same model
rf_model = RandomForestClassifier(max_depth=8, min_samples_leaf=10, n_estimators=100,
                                  random_state=42)


In [None]:
# Training is slow, so reuse the model pickled by a previous run when it is available
# and only fit from scratch when the file is missing (e.g. on a fresh checkout).
# NOTE: pickle.load can execute arbitrary code - only load pickles you produced yourself.
try:
    with open('model_outputs/rf_model.pkl', 'rb') as model_file:
        rf_model = pickle.load(model_file)
except FileNotFoundError:
    rf_model.fit(X_train_scaled, y_train)

In [None]:
# Predict the test split with the random forest; show ten predictions above ten true labels
y_preds = rf_model.predict(X_test_scaled)
print(list(y_preds[:10]), list(y_test[:10]), sep='\n')

In [None]:
# Score the random-forest predictions; all but accuracy are macro-averaged
for label, scorer, extra in (
    ('Accuracy:', metrics.accuracy_score, {}),
    ('Precision:', metrics.precision_score, {'average': 'macro'}),
    ('Recall:', metrics.recall_score, {'average': 'macro'}),
    ('F1 Score:', metrics.f1_score, {'average': 'macro'}),
):
    print(label, scorer(y_test, y_preds, **extra))

## XG Boost

There are in general two ways that you can control overfitting in XGBoost:

- The first way is to directly control model complexity.

    - This includes max_depth, min_child_weight and gamma.

- The second way is to add randomness to make training robust to noise.

    - This includes subsample and colsample_bytree.

    - You can also reduce stepsize eta. Remember to increase num_round when you do so.

[source](https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html#:~:text=There%20are%20in,you%20do%20so.)

In [37]:
# modeling: XGBoost (arbitrary hyperparameters), gathered in one dict so they are easy to tweak
xgb_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 1,
    'learning_rate': 0.3,
}
xgb_model = XGBClassifier(**xgb_params)

In [38]:
# Cast the training labels to integers before fitting XGBoost.
# Note: this rebinds y_train, so every cell run after this one sees integer labels.
y_train = y_train.astype(int)


In [39]:
# Cast the test labels the same way so they are comparable with the XGBoost predictions.
# Note: this rebinds y_test, so every cell run after this one sees integer labels.
y_test = y_test.astype(int)

In [40]:
# Fit the XGBoost classifier on the training split (labels were cast to int above)
xgb_model.fit(X=X_train_scaled, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [28]:
# This cell is inert: every statement below is commented out.
# Option 1 - train the model (this is very time-consuming!!): uncomment next line
# xgb_model.fit(X_train_scaled, y_train)

# Option 2 - as a workaround, unpickle the trained model from a previous run
#filename = open('model_outputs/xgb_model.pkl', 'rb')
#xgb_model = pickle.load(filename)
#filename.close()

In [41]:
# Predict the test split with XGBoost and preview the first ten predictions and true labels
y_preds = xgb_model.predict(X_test_scaled)
preview = slice(None, 10)
print(list(y_preds[preview]))
print(list(y_test[preview]))

[8, 4, 8, 7, 7, 0, 6, 2, 7, 4]
[8, 4, 8, 7, 7, 0, 6, 2, 7, 4]


In [42]:
# Score the XGBoost predictions; precision, recall and F1 share the same macro averaging
macro = {'average': 'macro'}
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
print('Precision:', metrics.precision_score(y_test, y_preds, **macro))
print('Recall:', metrics.recall_score(y_test, y_preds, **macro))
print('F1 Score:', metrics.f1_score(y_test, y_preds, **macro))

Accuracy: 0.9763333333333334
Precision: 0.9763135823515506
Recall: 0.9762052321723518
F1 Score: 0.9762474693631734


Note: using a 30% test split together with a min-max scaler / robust scaler (interquartile range), accuracy improved from 0.9759 to 0.9763:
Accuracy: 0.9763333333333334
Precision: 0.9763135823515506
Recall: 0.9762052321723518
F1 Score: 0.9762474693631734

## Evaluate on new data

In [None]:
## read in our pickle file
# `with` closes the handle even if unpickling fails.
# NOTE: only unpickle files from a trusted source - pickle.load can execute arbitrary code.
with open('analysis/example-user-input.pkl', 'rb') as input_file:
    array_to_data_output = pickle.load(input_file)

In [None]:
# Report the container type and the shape of the loaded user input
for detail in (type(array_to_data_output), array_to_data_output.shape):
    print(detail)

In [None]:
# Lift the column cap so the whole input is rendered, then display it
pd.options.display.max_columns = None
array_to_data_output

In [None]:
# Flatten the user input into a single 1-D vector and preview its first 150 values
raw_values = array_to_data_output.values
some_digit_array = np.reshape(raw_values, -1)
print(some_digit_array[:150])

In [None]:
# Apply the preprocessing transform to the single sample, wrapped as a one-row batch
# NOTE(review): `scaler` is whatever the Preprocessing section last bound to that name;
# confirm it matches the transform the models were trained with.
single_sample_batch = [some_digit_array]
some_digit_scaled = scaler.transform(single_sample_batch)
print(some_digit_scaled[0][:50])

In [None]:
# Random Forest: predicted digit plus the highest class probability, shown as a percentage
rf_pred = rf_model.predict(some_digit_scaled)
rf_prob_array = rf_model.predict_proba(some_digit_scaled)
rf_prob = round(max(rf_prob_array[0]) * 100, 2)
print(f'Digit: {rf_pred[0]}', f'Probability: {rf_prob}%')

In [None]:
# make a prediction: XG Boost
# Query xgb_model itself (not the single decision tree) for the predicted digit and the
# highest class probability, shown as a percentage.
xgb_pred = xgb_model.predict(some_digit_scaled)
xgb_prob_array = xgb_model.predict_proba(some_digit_scaled)
# float() turns the NumPy scalar into a plain Python float so the rounded value prints cleanly
xgb_prob = round(float(max(xgb_prob_array[0])) * 100, 2)
print(f'Digit: {xgb_pred[0]}', f'Probability: {xgb_prob}%')

## Pickle the trained models

In [None]:
# random forest: persist the model (`with` closes the file even if pickling raises)
# NOTE(review): when rf_model was itself restored from this path earlier in the notebook,
# this simply rewrites the same model.
with open('model_outputs/rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [None]:
# XG Boost: persist the fitted model (`with` closes the file even if pickling raises)
with open('model_outputs/xgb_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)