In [2]:
%run ../DataClass.ipynb

In [3]:
X_train = np.load('./0/data/X_train.npy')
X_test = np.load('./0/data/X_test.npy')
X_val = np.load('./0/data/X_val.npy')
y_train = np.load('./0/data/y_train.npy')
y_test = np.load('./0/data/y_test.npy')
y_val = np.load('./0/data/y_val.npy')

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Tuning RF

In [4]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 45, num = 3)]
# Minimum number of samples required to split a node
min_samples_split = [5, 10]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}

print(random_grid)

{'n_estimators': [20, 65, 110, 155, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 23, 45], 'min_samples_split': [5, 10]}


In [5]:
%%time
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=1, max_features=auto, min_samples_split=5, n_estimators=20; total time=   0.4s
[CV] END max_depth=1, max_features=auto, min_samples_split=5, n_estimators=20; total time=   0.4s
[CV] END max_depth=1, max_features=auto, min_samples_split=5, n_estimators=20; total time=   0.4s
[CV] END max_depth=1, max_features=auto, min_samples_split=5, n_estimators=20; total time=   0.4s
[CV] END max_depth=1, max_features=auto, min_samples_split=5, n_estimators=20; total time=   0.4s
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=155; total time=  24.2s
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=155; total time=  24.2s
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=155; total time=  24.1s
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=155; total time=  24.1s
[CV] END max_depth=23, max_features=sqrt, min_sam

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': [1, 23, 45],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_split': [5, 10],
                                        'n_estimators': [20, 65, 110, 155,
                                                         200]},
                   verbose=2)

In [6]:
rf_random.best_estimator_ , rf_random.best_score_

(RandomForestClassifier(max_depth=45, max_features='sqrt', min_samples_split=5,
                        n_estimators=155),
 0.9634761904761906)

In [7]:
type(rf_random.best_estimator_)

sklearn.ensemble._forest.RandomForestClassifier

In [8]:
RF = rf_random.best_estimator_

In [9]:
RF.fit(X_train,y_train)
y_pred_val = RF.predict(X_val)
y_pred_test = RF.predict(X_test)

print('val accuracy:',accuracy_score(y_val, y_pred_val))
print('test accuracy:',accuracy_score(y_test, y_pred_test))

val accuracy: 0.9697142857142858
test accuracy: 0.9647142857142857


# GB tuning

In [10]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 70, num = 4)]
learning_rate = [x for x in np.linspace(start = 0.05, stop = 0.2, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 5, stop = 16, num = 4)]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate' : learning_rate,
               'max_features': max_features,
               'max_depth': max_depth}

pprint(random_grid)

{'learning_rate': [0.05, 0.1, 0.15000000000000002, 0.2],
 'max_depth': [5, 8, 12, 16],
 'max_features': ['auto', 'sqrt'],
 'n_estimators': [20, 36, 53, 70]}


In [11]:
%%time
GBDT_random = RandomizedSearchCV(estimator = GradientBoostingClassifier(), param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2)
# Fit the random search model
GBDT_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END learning_rate=0.2, max_depth=12, max_features=sqrt, n_estimators=36; total time= 1.7min
[CV] END learning_rate=0.2, max_depth=12, max_features=sqrt, n_estimators=36; total time= 1.7min
[CV] END learning_rate=0.2, max_depth=12, max_features=sqrt, n_estimators=36; total time= 1.7min
[CV] END learning_rate=0.2, max_depth=12, max_features=sqrt, n_estimators=36; total time= 1.7min
[CV] END learning_rate=0.2, max_depth=12, max_features=sqrt, n_estimators=36; total time= 1.7min
[CV] END learning_rate=0.15000000000000002, max_depth=8, max_features=auto, n_estimators=70; total time=42.4min
[CV] END learning_rate=0.15000000000000002, max_depth=8, max_features=auto, n_estimators=70; total time=42.4min
[CV] END learning_rate=0.15000000000000002, max_depth=8, max_features=auto, n_estimators=70; total time=42.1min
[CV] END learning_rate=0.15000000000000002, max_depth=8, max_features=auto, n_estimators=70; total time=42.1min
[CV] E

RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(),
                   param_distributions={'learning_rate': [0.05, 0.1,
                                                          0.15000000000000002,
                                                          0.2],
                                        'max_depth': [5, 8, 12, 16],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [20, 36, 53, 70]},
                   verbose=2)

In [12]:
GBDT_random.best_estimator_ , GBDT_random.best_score_

(GradientBoostingClassifier(learning_rate=0.15000000000000002, max_depth=12,
                            max_features='sqrt', n_estimators=70),
 0.967047619047619)

In [13]:
GB = rf_random.best_estimator_
GB.fit(X_train,y_train)
y_pred_val = GB.predict(X_val)
y_pred_test = GB.predict(X_test)

print('val accuracy:',accuracy_score(y_val, y_pred_val))
print('test accuracy:',accuracy_score(y_test, y_pred_test))

val accuracy: 0.9704285714285714
test accuracy: 0.9634285714285714


In [57]:
if not os.path.exists('./model_params/'):
    os.makedirs('./model_params')

with open('./model_params/GB_params_tuning.pkl', 'wb') as f:
        pickle.dump(GBDT_random.best_estimator_, f)
with open('./model_params/RF_params_tuning.pkl', 'wb') as f:
        pickle.dump(rf_random.best_estimator_, f)

# 1-NN

In [5]:
OneNN_model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

In [7]:
y_pred_val = OneNN_model.predict(X_val)
y_pred_test = OneNN_model.predict(X_test)
print('val accuracy:',accuracy_score(y_val, y_pred_val))
print('test accuracy:',accuracy_score(y_test, y_pred_test))

val accuracy: 0.9695714285714285
test accuracy: 0.9677142857142857


In [8]:
if not os.path.exists('./model_params/'):
    os.makedirs('./model_params')
with open('./model_params/OneNN_params_tuning.pkl', 'wb') as f:
        pickle.dump(OneNN_model, f)

# CNN

In [54]:
from tensorflow.keras.optimizers import SGD
pixels  = int(sqrt(X_train.shape[1]))
classes = len(np.unique(y_test))

model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', input_shape=(28, 28, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(100, activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(10, activation='softmax'))
# compile model
opt = SGD(learning_rate=0.01, momentum=0.9)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.save(f'./model_params/CNN_params_tuning')

In [61]:
#load model
num_labels = classes
pixels = int(sqrt(X_train.shape[1]))
# Reshaping to format which CNN expects (batch, height, width, channels)
trainX_cnn = X_train.reshape(X_train.shape[0], pixels, pixels, 1).astype('float32')
valX_cnn = X_val.reshape(X_val.shape[0], pixels, pixels, 1).astype('float32')
testX_cnn= X_test.reshape(X_test.shape[0], pixels, pixels, 1).astype('float32')

# Normalize images from 0-255 to 0-1
trainX_cnn /= 255
valX_cnn /= 255

train_y_cnn = utils.to_categorical(y_train, num_labels)
val_y_cnn = utils.to_categorical(y_val, num_labels)
test_y_cnn = utils.to_categorical(y_test, num_labels)
# Learn model
model.fit(trainX_cnn, train_y_cnn, validation_data=(valX_cnn, val_y_cnn), epochs=10, batch_size=32)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2643091b0a0>

In [62]:
y_pred_val=np.argmax(model.predict(valX_cnn),axis=1)
y_pred_test=np.argmax(model.predict(testX_cnn),axis=1)

In [63]:
print('val accuracy:',accuracy_score(y_val, y_pred_val))
print('test accuracy:',accuracy_score(y_test, y_pred_test))

val accuracy: 0.9860714285714286
test accuracy: 0.9707857142857143
