In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
import itertools
import numpy as np
import concurrent.futures
from threading import Lock

import tensorflow as tf
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Mount Google Drive (Colab-only module) so files under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Global matplotlib defaults: a single font size everywhere, wide figures, thicker lines.
SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = 30

plt.rcParams.update({
    'font.size': SMALL_SIZE,           # default text size
    'axes.titlesize': SMALL_SIZE,      # axes title
    'axes.labelsize': MEDIUM_SIZE,     # x and y labels
    'xtick.labelsize': SMALL_SIZE,     # x tick labels
    'ytick.labelsize': SMALL_SIZE,     # y tick labels
    'legend.fontsize': SMALL_SIZE,     # legend entries
    'figure.titlesize': BIGGER_SIZE,   # figure title
    'figure.figsize': (15, 8),
    'lines.linewidth': 2,
})


## Preprocessing

Load the MONK dataset

In [None]:
# Column names for the headerless, space-separated MONK files (target first, ID string last).
header = ['class', 'head_shape', 'body_shape', 'is_smiling',
          'holding', 'jacket_color', 'has_tie', 'ID']

# Parsing options shared by the training and the test file.
read_opts = dict(header=None, delimiter=' ', skipinitialspace=True, names=header)

m2_train = pd.read_csv("/content/drive/Shareddrives/ML/MONK/monks-2.train", **read_opts)
m2_test = pd.read_csv("/content/drive/Shareddrives/ML/MONK/monks-2.test", **read_opts)

In [None]:
# Number of rows and a preview of the training split.
print('n. record:', m2_train.shape[0])
m2_train.head()

In [None]:
# Number of rows and a preview of the test split.
print('n. record:', m2_test.shape[0])
m2_test.head()

Convert the ID column into the index

In [None]:
# Each ID is a string whose second '_'-separated field is parsed as an integer.
# Comprehensions keep the loop variable out of the notebook namespace; a
# top-level `for id in ...` loop would leave the builtin `id` shadowed afterwards.
index_train = [int(record_id.split('_')[1]) for record_id in m2_train['ID']]
index_test = [int(record_id.split('_')[1]) for record_id in m2_test['ID']]

In [None]:
# Use the parsed integers as row index and remove the ID column.
# drop() returns a new frame and errors='ignore' turns a second execution of
# this cell into a no-op instead of a KeyError on the already-removed column.
m2_train.index = index_train
m2_train = m2_train.drop(columns='ID', errors='ignore')
m2_test.index = index_test
m2_test = m2_test.drop(columns='ID', errors='ignore')

In [None]:
# Preview the training frame after re-indexing and removing the ID column.
m2_train.head()

One-hot encoding of the categorical variables

In [None]:
# Every attribute is one-hot encoded; 'class' is the target and is left as is.
todummy = [c for c in m2_train.columns if c not in ['class']]

m2_train = pd.get_dummies(m2_train, columns=todummy)
# The two splits are encoded independently, so a category value missing from
# one of them would produce different dummy columns.  Align the test frame to
# the training schema: same columns, same order, absent dummies filled with 0.
m2_test = pd.get_dummies(m2_test, columns=todummy).reindex(columns=m2_train.columns, fill_value=0)

In [None]:
# Preview the training frame after one-hot encoding.
m2_train.head()

In [None]:
# Feature columns are all columns except the target.
col = [name for name in m2_train.columns if name != 'class']

# NumPy views of features and labels for both splits (test uses the training column list).
x_train, y_train = m2_train[col].values, m2_train['class'].values
x_test, y_test = m2_test[col].values, m2_test['class'].values

In [None]:
def report(results, n_top=5):
    """Print the candidates whose test rank is between 1 and ``n_top``.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score' (array comparable with ``==``),
        'mean_test_score', 'std_test_score' and 'params', all indexable by
        candidate position (the layout of ``GridSearchCV.cv_results_``).
    n_top : int
        Highest rank to print. Tied candidates are all printed; ranks that no
        candidate holds print nothing.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Positions of every candidate sharing this rank.
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][idx]}")

## KNN

Hyperparameter tuning using GridSearchCV from scikit-learn

In [None]:
# Exhaustive KNN search over neighbourhood size, vote weighting and the `p`
# distance parameter, scored by accuracy on 5 stratified folds.
param_grid = dict(
    n_neighbors=range(2, 80),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

Grid_KNN = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    scoring='accuracy',
    cv=StratifiedKFold(5),
)

Grid_fit = Grid_KNN.fit(x_train, y_train)
report(Grid_fit.cv_results_, n_top=5)
 

In [None]:
# Print only the rank-1 configuration(s) of the KNN search.
report(Grid_fit.cv_results_, n_top=1)

Model fitting and TR/TS prediction

In [None]:
# Fit KNN on the whole training set with fixed hyperparameters
# (presumably the winner of the search above - verify against its report).
model = KNeighborsClassifier(n_neighbors=23, weights='uniform', p=2).fit(x_train, y_train)

In [None]:
# Accuracy of the fitted KNN on the training data.
print('Train accuracy:')
y_train_pred = model.predict(x_train)
accuracy_score(y_train, y_train_pred)

In [None]:
# Accuracy of the fitted KNN on the held-out test data.
print('Test accuracy:')
y_test_pred = model.predict(x_test)
accuracy_score(y_test, y_test_pred)

## LBE

Hyperparameter tuning using GridSearchCV from scikit-learn

In [None]:
# Polynomial feature expansion followed by a ridge classifier.
pipe = Pipeline(steps=[
    ('lbe', PolynomialFeatures()),
    ('ridge', RidgeClassifier(random_state=0)),
])

# Grid over expansion degree / interaction-only terms and the ridge penalty.
param_grid = dict(
    lbe__degree=[2, 3, 4, 5, 6],
    ridge__solver=['saga'],
    ridge__alpha=[100, 10, 1, 0, 0.1, 0.01, 0.001],
    lbe__interaction_only=[True, False],
)

grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    verbose=4,
)
grid_search.fit(x_train, y_train)
report(grid_search.cv_results_, n_top=5)

Model fitting and TR/TS prediction

In [None]:
# Refit the expansion + ridge pipeline on the whole training set with fixed hyperparameters.
pipe = Pipeline(steps=[
    ('lbe', PolynomialFeatures(degree=2)),
    ('ridge', RidgeClassifier(alpha=100, solver='saga', random_state=0)),
])

pipe.fit(x_train, y_train)

In [None]:
# Accuracy of the fitted pipeline on the training data.
print('Train accuracy:')
y_pred_train = pipe.predict(x_train)
accuracy_score(y_train, y_pred_train)

In [None]:
# Accuracy of the fitted pipeline on the held-out test data.
print('Test accuracy:')
y_pred = pipe.predict(x_test)
accuracy_score(y_test, y_pred)

## SVM

Hyperparameter tuning using GridSearchCV from scikit-learn

In [None]:
# One sub-grid per kernel so that only the parameters each kernel uses are searched.
param_grid = [
    dict(kernel=['rbf'],
         C=[1e-3, 1e-2, 1e-1, 1, 1e2, 1e3],
         gamma=['scale', 'auto']),
    dict(kernel=['poly'],
         C=[1e-3, 1e-2, 1e-1, 1, 1e2, 1e3],
         gamma=['scale', 'auto'],
         degree=[2, 3, 4, 5, 6]),
    dict(kernel=['linear'],
         C=[1e-3, 1e-2, 1e-1, 1, 1e2, 1e3]),
]

grid_search = GridSearchCV(
    SVC(random_state=0),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    verbose=4,
)
grid_search.fit(x_train, y_train)
report(grid_search.cv_results_, n_top=5)

Model fitting and TR/TS prediction

In [None]:
# Fit a degree-2 polynomial-kernel SVM on the whole training set with fixed hyperparameters.
clf = SVC(kernel='poly', degree=2, C=100, gamma='scale', random_state=0)
clf.fit(x_train, y_train)

In [None]:
# Accuracy of the fitted classifier on the training data.
print('Train accuracy:')
y_pred_train = clf.predict(x_train)
accuracy_score(y_train, y_pred_train)

In [None]:
# Accuracy of the fitted classifier on the held-out test data.
print('Test accuracy:')
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

## Random Forest

Hyperparameter tuning using GridSearchCV from scikit-learn

In [None]:
# Random-forest search space; the number of trees is held fixed at 100.
param_grid = dict(
    max_depth=range(5, 30, 5),
    n_estimators=[100],
    criterion=['gini', 'entropy'],
    min_samples_split=range(5, 40, 5),
    min_samples_leaf=range(5, 40, 5),
    max_features=range(2, 18, 3),
    bootstrap=[True, False],
)

In [None]:
# Accuracy-scored search over the random-forest grid on 5 stratified folds.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
    cv=StratifiedKFold(5),
)

grid_search.fit(x_train, y_train)
report(grid_search.cv_results_, n_top=1)

Model fitting and TR/TS prediction

In [None]:
# Fit the random forest on the whole training set with fixed hyperparameters.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=15,
    min_samples_split=35,
    min_samples_leaf=35,
    max_features=17,
    bootstrap=True,
    random_state=0,
)
clf.fit(x_train, y_train)

In [None]:
# Accuracy of the fitted forest on the training data.
print('Training accuracy')
y_train_pred = clf.predict(x_train)
accuracy_score(y_train, y_train_pred)

In [None]:

# Accuracy of the fitted forest on the held-out test data.
print('Test accuracy')
y_test_pred = clf.predict(x_test)
accuracy_score(y_test, y_test_pred)

## NN

Next we define the **build_model** function, which builds our neural network. In this function we use:

*   A seed to get reproducible results. 
*   An L2 regularization term added to the loss function
*   SGD optimizer to train our NN
*   MSE as loss
*   Accuracy as metric

For this task a single-hidden-layer architecture is sufficient: 17 input units and 1 output unit with a sigmoid activation function, since this is a binary classification problem.

In [None]:
def build_model(weight_init=0.2, weight_distr=0, activ='tanh', unit=4, eta=0.2, alpha=0.5, lambd=0, input_dim=17):
  """Build and compile a one-hidden-layer binary classifier.

  Parameters
  ----------
  weight_init : float
      Half-width of the uniform range (weight_distr == 0) or standard
      deviation of the normal distribution (weight_distr == 1) used for
      kernel and bias initialization. Ignored for any other weight_distr.
  weight_distr : int
      0 -> RandomUniform(-weight_init, weight_init); 1 -> RandomNormal(0,
      weight_init); anything else -> GlorotNormal.
  activ : str
      Activation of the hidden layer.
  unit : int
      Number of hidden units.
  eta : float
      SGD learning rate.
  alpha : float
      SGD momentum (Nesterov disabled).
  lambd : float
      L2 factor applied to the kernels of both Dense layers.
  input_dim : int
      Number of input features. Defaults to 17, the previously hard-coded width.

  Returns
  -------
  The compiled ``tf.keras`` Sequential model (MSE loss, binary accuracy metric).

  Side effects: resets the global TensorFlow seed to 0 and prints the
  initial weights.
  """
  # Same seed on every call so that every model starts from the same weights.
  tf.random.set_seed(0)

  if weight_distr==0:
    init= tf.keras.initializers.RandomUniform(minval=-weight_init, maxval=weight_init)
  elif weight_distr==1:
    init= tf.keras.initializers.RandomNormal(mean=0., stddev=weight_init)
  else:
    init= tf.keras.initializers.GlorotNormal()

  reg= tf.keras.regularizers.l2(l2=lambd)

  model= tf.keras.models.Sequential()
  # The shape is passed as an explicit one-element tuple: `Input(17,)` is just
  # `Input(17)` (a bare int), because a trailing comma inside call parentheses
  # does not build a tuple.
  model.add(tf.keras.layers.Input(shape=(input_dim,)))
  model.add(tf.keras.layers.Dense(unit, activation=activ, kernel_initializer=init, bias_initializer=init, kernel_regularizer=reg))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=init, bias_initializer=init, kernel_regularizer=reg))

  loss= tf.keras.losses.MeanSquaredError()
  opt= tf.keras.optimizers.SGD(learning_rate=eta, momentum=alpha, nesterov=False)
  metric= tf.keras.metrics.BinaryAccuracy()
  model.compile(loss=loss,
                optimizer=opt,
                metrics=[metric])

  # Initial weights are printed so identical initialization across calls can be checked by eye.
  print(model.get_weights())
  return model

Next we define our grid search, which fits the individual CV splits in parallel using the `concurrent.futures` module. Model building is done sequentially under a lock, which is necessary to guarantee the same weight initialization.


In [None]:
def parallel_cv(list_split, iter, d, x_train, y_train, lock):
  """Train and validate one cross-validation fold for one configuration.

  list_split is indexed as ``list_split[iter, 0]`` (training indices) and
  ``list_split[iter, 1]`` (validation indices); ``d`` holds the keyword
  arguments forwarded to ``build_model``. Returns the tuple
  (last-epoch validation loss, last-epoch validation binary accuracy).
  """
  # Only one thread at a time may construct a model.
  with lock:
    model = build_model(**d)

  fit_idx, val_idx = list_split[iter, 0], list_split[iter, 1]
  x_train_cv, y_train_cv = x_train[fit_idx], y_train[fit_idx]
  x_val_cv, y_val_cv = x_train[val_idx], y_train[val_idx]

  # Full-batch training: the batch size equals the fold's training-set size.
  history = model.fit(
      x_train_cv,
      y_train_cv,
      epochs=400,
      batch_size=len(x_train_cv),
      validation_data=(x_val_cv, y_val_cv),
  ).history

  return (history['val_loss'][-1], history['val_binary_accuracy'][-1])

  
def grid_search(x_train, y_train, param_grid, fold, max_workers=4):
  """Grid search with the folds of each configuration fitted in parallel threads.

  Parameters
  ----------
  x_train, y_train :
      Training data, passed to ``fold.split`` and on to ``parallel_cv``.
  param_grid : dict
      Maps each ``build_model`` keyword to the list of values to try; the
      cartesian product of all lists is evaluated, in dict order.
  fold :
      Splitter exposing ``split(x, y)`` that yields (train_idx, val_idx) pairs.
  max_workers : int
      Size of the single thread pool shared by the whole search.

  Returns
  -------
  list of dict
      One dict per configuration holding its parameters plus 'mean_val_acc',
      'std_val_acc', 'mean_val_loss' and 'std_val_loss' over the folds.
  """
  # Total number of configurations, used only for the progress message.
  totale_iter = 1
  for values in param_grid.values():
    totale_iter *= len(values)

  # Store the folds in an (n_folds, 2) object array filled cell by cell.
  # np.array(list_of_pairs, dtype=object) would instead build a 3-D array
  # whenever train and validation index arrays have equal length, which
  # breaks the [fold, 0] / [fold, 1] lookups done by parallel_cv.
  splits = list(fold.split(x_train, y_train))
  list_split = np.empty((len(splits), 2), dtype=object)
  for k, (train_index, test_index) in enumerate(splits):
    list_split[k, 0] = train_index
    list_split[k, 1] = test_index

  lock = Lock()
  cv_results = []

  # One pool for the whole search, shut down on exit (also on error), rather
  # than a fresh executor per submitted fold that is never released.
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    for n_done, params in enumerate(itertools.product(*param_grid.values()), start=1):
      d = dict(zip(param_grid.keys(), params))

      futures = [
          executor.submit(parallel_cv, list_split, k, d, x_train, y_train, lock)
          for k in range(len(list_split))
      ]

      # result() blocks until the fold is done, so `d` is only extended with
      # summary statistics after every worker has finished reading it.
      losses = []
      accuracy = []
      for f in futures:
        (loss, acc) = f.result()
        losses.append(loss)
        accuracy.append(acc)

      d['mean_val_acc'] = np.mean(accuracy)
      d['std_val_acc'] = np.std(accuracy)
      d['mean_val_loss'] = np.mean(losses)
      d['std_val_loss'] = np.std(losses)

      cv_results.append(d)
      print('ITERAZIONE NUMERO ' + str(n_done)+ '   su '+ str(totale_iter)+ ' totali')

  return cv_results

In [None]:
# Search space for the neural network; keys match build_model's keyword arguments.
param_grid = dict(
    weight_init=[0.2, 0.3, 0.4],
    weight_distr=[0, 1],
    unit=[2, 3, 4],
    eta=[0.1, 0.3, 0.5, 0.7],
    alpha=[0.3, 0.5, 0.7],
    lambd=[0, 0.01],
    activ=['relu'],
)

In [None]:
%%time
cv_results= grid_search(x_train, y_train, param_grid, fold=StratifiedKFold(n_splits=4, shuffle=True, random_state=0))

In [None]:
# Display the full list of per-configuration result dicts.
cv_results

In [None]:
# Order configurations by mean validation loss, ties broken by its standard
# deviation, and keep the five best.
sorted_result = list(cv_results)
sorted_result.sort(key=lambda res: (res['mean_val_loss'], res['std_val_loss']))
best_5_result = sorted_result[:5]
best_5_result

In [None]:
# The first entry of the loss-sorted list is the configuration with the lowest mean validation loss.
best_model_par=best_5_result[0]
best_model_par

In [None]:
# Hard-coded configuration that replaces whatever the previous cell selected
# (presumably pasted from an earlier run so the retraining below does not
# require repeating the search - TODO confirm).
best_model_par = dict(
    weight_init=0.2,
    weight_distr=1,
    unit=4,
    eta=0.7,
    alpha=0.7,
    lambd=0,
    activ='relu',
    mean_val_acc=1.0,
    std_val_acc=0.0,
    mean_val_loss=0.0007847996748751029,
    std_val_loss=0.00016908766496319332,
)

Final retraining on the whole training set

In [None]:
# Rebuild the network from the selected configuration (only the keys that
# build_model accepts) and train it full-batch on the entire training set;
# the test set is supplied as validation data so its curve is recorded per epoch.
d = best_model_par
model_keys = ('weight_init', 'weight_distr', 'activ', 'unit', 'eta', 'alpha', 'lambd')
model_best_final = build_model(**{key: d[key] for key in model_keys})

result_best = model_best_final.fit(
    x=x_train,
    y=y_train,
    epochs=400,
    batch_size=len(x_train),
    validation_data=(x_test, y_test),
    shuffle=True,
)

In [None]:
# Learning curves of the final model; the dashed line is the val_* history series.
history = result_best.history

# Accuracy per epoch.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(history['binary_accuracy'])
ax_acc.plot(history['val_binary_accuracy'], linestyle='--')
ax_acc.set_title('model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['training', 'test'], loc='lower right')
plt.show()

# Loss (MSE) per epoch.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(history['loss'])
ax_loss.plot(history['val_loss'], linestyle='--')
ax_loss.set_title('model MSE')
ax_loss.set_ylabel('MSE')
ax_loss.set_xlabel('epoch')
ax_loss.legend(['training', 'test'], loc='upper right')
plt.show()


In [None]:
# Loss and binary accuracy on the training set, evaluated in a single batch
# sized by the training set itself (not by the test set).
model_best_final.evaluate(x_train, y_train, batch_size=len(x_train))

In [None]:
# Loss and binary accuracy on the test set, evaluated in a single batch.
model_best_final.evaluate(x=x_test, y=y_test, batch_size=len(x_test))