In [3]:
import sys  
sys.path.insert(0, '../')
from utils import *

In [5]:
X_train = np.load('./0/data/X_train.npy')
X_test = np.load('./0/data/X_test.npy')
X_val = np.load('./0/data/X_val.npy')
y_train = np.load('./0/data/y_train.npy')
y_test = np.load('./0/data/y_test.npy')
y_val = np.load('./0/data/y_val.npy')

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Tuning RF

In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 45, num = 3)]
# Minimum number of samples required to split a node
min_samples_split = [5, 10]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
              'random_state': [0]}
print(random_grid)

{'n_estimators': [20, 65, 110, 155, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 23, 45], 'min_samples_split': [5, 10], 'random_state': [0]}


In [8]:
%%time
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=110, random_state=0; total time= 1.5min
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=110, random_state=0; total time= 1.5min
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=110, random_state=0; total time= 1.5min
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=110, random_state=0; total time= 1.4min
[CV] END max_depth=23, max_features=sqrt, min_samples_split=5, n_estimators=110, random_state=0; total time= 1.5min
[CV] END max_depth=23, max_features=sqrt, min_samples_split=10, n_estimators=20, random_state=0; total time=  15.6s
[CV] END max_depth=23, max_features=sqrt, min_samples_split=10, n_estimators=20, random_state=0; total time=  15.7s
[CV] END max_depth=23, max_features=sqrt, min_samples_split=10, n_estimators=20, random_state=0; total time=  15.6s
[CV] END ma

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': [1, 23, 45],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_split': [5, 10],
                                        'n_estimators': [20, 65, 110, 155, 200],
                                        'random_state': [0]},
                   verbose=2)

In [9]:
rf_random.best_estimator_ , rf_random.best_score_

(RandomForestClassifier(max_depth=45, min_samples_split=5, n_estimators=200,
                        random_state=0),
 0.9667876981347794)

In [10]:
type(rf_random.best_estimator_)

sklearn.ensemble._forest.RandomForestClassifier

In [11]:
RF = rf_random.best_estimator_

In [12]:
RF.fit(X_train,y_train)
y_pred_val = RF.predict(X_val)
y_pred_test = RF.predict(X_test)

print('val accuracy:',accuracy_score(y_val, y_pred_val))
print('test accuracy:',accuracy_score(y_test, y_pred_test))

val accuracy: 0.9756944444444444
test accuracy: 0.9782986111111112


In [13]:
if not os.path.exists('./model_params/'):
    os.makedirs('./model_params')
with open('./model_params/RF_params_tuning.pkl', 'wb') as f:
        pickle.dump(rf_random.best_estimator_, f)

# GB tuning

In [19]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 6, stop = 12, num = 4)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 2, stop = 8, num = 4)]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate' : [0.05, 0.1, 0.15, 0.2],
               'max_features': ['auto', 'sqrt'],
               'max_depth': max_depth,
               'random_state' : [0]}

pprint(random_grid)

{'learning_rate': [0.05, 0.1, 0.15, 0.2],
 'max_depth': [2, 4, 6, 8],
 'max_features': ['auto', 'sqrt'],
 'n_estimators': [6, 8, 10, 12],
 'random_state': [0]}


In [20]:
%%time
GBDT_random = RandomizedSearchCV(estimator = GradientBoostingClassifier(), param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2)
# Fit the random search model
GBDT_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END learning_rate=0.1, max_depth=8, max_features=sqrt, n_estimators=10, random_state=0; total time= 3.6min
[CV] END learning_rate=0.1, max_depth=8, max_features=sqrt, n_estimators=10, random_state=0; total time= 3.6min
[CV] END learning_rate=0.1, max_depth=8, max_features=sqrt, n_estimators=10, random_state=0; total time= 3.6min
[CV] END learning_rate=0.1, max_depth=8, max_features=sqrt, n_estimators=10, random_state=0; total time= 3.6min
[CV] END learning_rate=0.1, max_depth=8, max_features=sqrt, n_estimators=10, random_state=0; total time= 3.6min
[CV] END learning_rate=0.1, max_depth=2, max_features=auto, n_estimators=6, random_state=0; total time=26.5min
[CV] END learning_rate=0.1, max_depth=2, max_features=auto, n_estimators=6, random_state=0; total time=26.5min
[CV] END learning_rate=0.1, max_depth=2, max_features=auto, n_estimators=6, random_state=0; total time=26.5min
[CV] END learning_rate=0.1, max_depth=2, max_f

RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(),
                   param_distributions={'learning_rate': [0.05, 0.1, 0.15, 0.2],
                                        'max_depth': [2, 4, 6, 8],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [6, 8, 10, 12],
                                        'random_state': [0]},
                   verbose=2)

In [21]:
GBDT_random.best_estimator_ , GBDT_random.best_score_

(GradientBoostingClassifier(learning_rate=0.2, max_depth=8, max_features='auto',
                            n_estimators=10, random_state=0),
 0.7844899523499649)

In [24]:
GB = rf_random.best_estimator_
GB.fit(X_train,y_train)
y_pred_val = GB.predict(X_val)
y_pred_test = GB.predict(X_test)

print('val accuracy:',accuracy_score(y_val, y_pred_val))
print('test accuracy:',accuracy_score(y_test, y_pred_test))

val accuracy: 0.9756944444444444
test accuracy: 0.9782986111111112


In [25]:
if not os.path.exists('./model_params/'):
    os.makedirs('./model_params')

with open('./model_params/GB_params_tuning.pkl', 'wb') as f:
        pickle.dump(GBDT_random.best_estimator_, f)


In [27]:
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=(5,5), activation='relu', input_shape=(30,30,3)))
model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(rate=0.25))
model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(rate=0.25))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(rate=0.5))
model.add(Dense(43, activation='softmax'))

#Compilation of the model
model.compile(
    loss='categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
)

In [28]:
model.save(f'./model_params/CNN_params_tuning')

INFO:tensorflow:Assets written to: ./model_params/CNN_params_tuning\assets


In [30]:
num_labels = 43

pixels = int(sqrt(X_train.shape[1]/3))
# Reshaping to format which CNN expects (batch, height, width, channels)
trainX_cnn = X_train.reshape(X_train.shape[0], pixels, pixels, 3).astype('float32')
valX_cnn = X_val.reshape(X_val.shape[0], pixels, pixels, 3).astype('float32')
testX_cnn= X_test.reshape(X_test.shape[0], pixels, pixels, 3).astype('float32')

# Normalize images from 0-255 to 0-1
trainX_cnn /= 255
valX_cnn /= 255
testX_cnn /=255

train_y_cnn = utils.to_categorical(y_train, num_labels)
val_y_cnn = utils.to_categorical(y_val, num_labels)
test_y_cnn = utils.to_categorical(y_test, num_labels)



In [31]:
model.fit(trainX_cnn, train_y_cnn, epochs=30, batch_size=32, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x29a039a8dc0>

In [34]:
y_pred_val=np.argmax(model.predict(valX_cnn),axis=1)
y_pred_test=np.argmax(model.predict(testX_cnn),axis=1)

In [35]:
print('val accuracy:',accuracy_score(y_val, y_pred_val))
print('test accuracy:',accuracy_score(y_test, y_pred_test))

val accuracy: 0.9965277777777778
test accuracy: 0.9968171296296297


# 1-NN

In [5]:
OneNN_model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

In [6]:
y_pred_val = OneNN_model.predict(X_val)
y_pred_test = OneNN_model.predict(X_test)
print('val accuracy:',accuracy_score(y_val, y_pred_val))
print('test accuracy:',accuracy_score(y_test, y_pred_test))

val accuracy: 0.8733741392501913
test accuracy: 0.8727365467992859


In [7]:
if not os.path.exists('./model_params/'):
    os.makedirs('./model_params')
with open('./model_params/OneNN_params_tuning.pkl', 'wb') as f:
        pickle.dump(OneNN_model, f)