In [None]:
#import the necessary libraries 
import pandas as pd
import scipy 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
import tensorflow 
import json
import random
import sys
import matplotlib.pyplot as plt 
%matplotlib inline

from numpy.random import seed
# Fix the NumPy and TensorFlow global random seeds before anything stochastic runs.
np.random.seed(1)
tensorflow.random.set_seed(2)

# Load the feature table with pandas.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
csv_path = "/home/keerat/Desktop/Latest_features.csv"
data = pd.read_csv(csv_path)

# Preview the first rows to confirm the file was read correctly.
data.head()

In [None]:
# Importing the Keras libraries and packages
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dropout 
from tensorflow.keras.constraints import max_norm

# Exploratory Data Analysis and Preprocessing

In [None]:
# Overall size, shape and dimensionality of the loaded feature table.
size = data.size
shape = data.shape
df_ndim = data.ndim
print("Size:{}\nShape:{}\nNumber of Dimensions:{}".format(size, shape, df_ndim))

# Separate the target column from the features; the first remaining column is dropped too.
# NOTE(review): presumably that first column is a row index / identifier — confirm against the CSV.
y = data["Label"]
X = data.drop(columns='Label')
X = X.drop(columns=X.columns[0])

# Confirm the target variable is no longer part of the features.
print(X.head())
print(y.head())

# Size, shape and dimensionality of the feature matrix ...
shape_X = X.shape
m, n = shape_X
size_X = X.size
X_ndim = X.ndim

# ... and of the label vector.
shape_y = y.shape
size_y = y.size
y_ndim = y.ndim

print("For X\nSize:{}\nShape:{}\nNumber of Dimensions:{}".format(size_X, shape_X, X_ndim))
print("For y\nSize:{}\nShape:{}\nNumber of Dimensions:{}".format(size_y, shape_y, y_ndim))

# m = number of rows (examples), n = number of columns (features) of X.
print("Number of features=", n, "\nNumber of training examples=", m)

# Label encoding: map the class labels in `y` to integers.
# Note that `le` holds the encoded label array, not the encoder object.
print(data.Label.value_counts())
le = LabelEncoder().fit_transform(y)

print(le)
#output suggests N=1 and F=0

# Train / validation / test split.
# 20% is held out for testing, then 25% of the remaining 80% (i.e. 20% of the
# total) for validation, giving a 60/20/20 split.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, le, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

# Feature scaling.
# The scaler is fitted on the training split only and then applied unchanged to
# the validation and test splits, so all three share the same mean/variance and
# no statistics leak from held-out data. (Previously each split was re-fitted
# independently with fit_transform.)
# The builtin `float` replaces `np.float`, an alias removed in NumPy 1.24.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(float))
X_val = sc.transform(X_val.astype(float))
X_test = sc.transform(X_test.astype(float))

# Neural Network Architectures

In [None]:
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
# Seeded Glorot-normal initialiser used for the kernels of every Dense layer.
my_init = tensorflow.keras.initializers.glorot_normal(seed=1)

# Take the input width from the training matrix instead of hard-coding 88, so the
# network keeps working if the number of feature columns changes.
n_features = X_train.shape[1]

# Assemble the model layer by layer.
model = Sequential()
# Dropout applied directly to the inputs; this first layer declares the input shape.
model.add(Dropout(rate=0.2, input_shape=(n_features,)))
# First hidden layer. The input shape is already fixed by the layer above, so it is
# not repeated here.
model.add(Dense(units=20, kernel_initializer=my_init, activation='relu', kernel_constraint=max_norm(4.)))
model.add(Dropout(rate=0.4))
# Second hidden layer.
model.add(Dense(units=20, kernel_initializer=my_init, activation='relu', kernel_constraint=max_norm(4.)))
model.add(Dropout(rate=0.4))
# Output layer: a single sigmoid unit for the binary label.
model.add(Dense(units=1, kernel_initializer=my_init, activation='sigmoid'))

In [None]:
# Compile the model: plain SGD, binary cross-entropy loss, accuracy as the metric.
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
# Validate on the dedicated (X_val, y_val) split created during preprocessing.
# Previously `validation_split=0.25` carved a second validation set out of
# X_train, which shrank the training data while X_val/y_val sat unused.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    batch_size=3, epochs=1000, verbose=0)

In [None]:
# Model.summary() prints the layer table itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Training vs. validation accuracy per epoch, read from the Keras History object.
for history_key, curve_label in (('accuracy', 'Training accuracy'),
                                 ('val_accuracy', 'CV accuracy')):
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Evaluate the trained network on the test split, then on the training split.
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Accuracy on test set:", test_acc)
train_loss, train_acc = model.evaluate(X_train, y_train)
print("\nAccuracy on training set:", train_acc)

In [None]:
#Use EarlyStopping and ModelCheckpoint to reduce overfitting 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# Simple early stopping: no `patience` is given, so training halts at the first
# epoch in which validation accuracy fails to improve.
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1)
# Fit the model, monitoring the dedicated (X_val, y_val) split rather than
# slicing another validation set off X_train with `validation_split`.
# NOTE(review): `model` is not re-created here, so this continues training
# whatever weights the model already has — confirm that is intended.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=1000, verbose=0, callbacks=[es])

In [None]:
# Patient early stopping: tolerate up to 200 epochs without improvement in
# validation accuracy, and checkpoint the best model seen so far to disk.
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=200)
mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
# Fit the model, monitoring the dedicated (X_val, y_val) split instead of
# re-splitting X_train with `validation_split`.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=1000, verbose=0, callbacks=[es, mc])
# Load the checkpointed (best validation accuracy) model.
saved_model = load_model('best_model.h5')
# Evaluate it on the test and training splits. The training loss now goes into
# `train_loss`; it used to overwrite `test_loss`.
test_loss, test_acc = saved_model.evaluate(X_test, y_test)
print("Accuracy on test set:", test_acc)
train_loss, train_acc = saved_model.evaluate(X_train, y_train)
print("\nAccuracy on training set:", train_acc)
# Plot the training history.
plt.plot(history.history['accuracy'], label='Training accuracy')
plt.plot(history.history['val_accuracy'], label='CV accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
# Single legend call: a second bare plt.legend() used to discard this placement.
plt.legend(loc='lower right')
plt.show()

Bayesian Optimisation

In [None]:
from keras import initializers
from keras import regularizers
from keras import backend as K
from hyperas.distributions import uniform, choice

from random import randint
from hyperopt import Trials, STATUS_OK, tpe
from keras.utils import np_utils
from hyperas import optim

In [None]:
def data():
    """Load the feature CSV and produce scaled train / validation / test splits.

    The function reads the CSV, drops the 'Label' column and the first remaining
    column from the features, integer-encodes the labels, performs a 60/20/20
    split with a fixed random_state, and standardises the features.

    The local names X_train, y_train, X_val, y_val, X_test, y_test and the
    function-local imports are kept on purpose.
    NOTE(review): hyperas is understood to re-use this function's source text
    when it builds its temporary script, matching these names against the
    parameters of the model function — confirm before renaming anything.

    Returns:
        The tuple (X_train, y_train, X_val, y_val, X_test, y_test).
    """
    # Read in the data using pandas.
    # NOTE(review): absolute, machine-specific path.
    frame = pd.read_csv("/home/keerat/Desktop/Latest_features.csv")

    # Report the size, shape and dimensionality of the loaded table.
    size = frame.size
    shape = frame.shape
    df_ndim = frame.ndim
    print("Size:{}\nShape:{}\nNumber of Dimensions:{}".format(size, shape, df_ndim))

    # Segregate features and labels.
    X = frame.drop('Label', axis=1)
    X = X.drop(X.columns[0], axis=1)
    y = frame["Label"]

    # Label encoding; `le` is the encoded label array.
    print(frame.Label.value_counts())
    le = LabelEncoder().fit_transform(y)

    print(le)
    #output suggests N==1 and F=0

    # Train / test / validation split: 20% test, then 25% of the rest for validation.
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X, le, test_size=0.2, random_state=1)

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

    # Feature scaling: fit on the training split only and re-use those statistics
    # for the validation and test splits (previously each split was re-fitted).
    # The builtin `float` replaces `np.float`, an alias removed in NumPy 1.24.
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train.astype(float))
    X_val = sc.transform(X_val.astype(float))
    X_test = sc.transform(X_test.astype(float))

    return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
#unregularised #without changing the lr
def model(X_train, y_train, X_val, y_val, X_test, y_test):
    """Hyperas objective: build, train and score one candidate network.

    The double-curly-brace expressions below are hyperas search-space templates
    (dropout rates, layer widths, optimiser, batch size); hyperas substitutes a
    sampled value for each of them on every trial.

    The candidate is trained on (X_train, y_train) and validated on
    (X_val, y_val). The quantity that is minimised is the negative of the best
    per-epoch validation accuracy. X_test and y_test are accepted so that the
    signature matches the values returned by the data function, but they are
    deliberately not used here: tuning against the test split would bias the
    final test estimate.

    Returns:
        A dict with the keys 'loss', 'status' and 'model', as expected by hyperopt.
    """
    from keras import initializers
    from keras import backend as K

    my_init=keras.initializers.glorot_normal(seed=1)
    #add layers to the model 
    model=Sequential()
    #dropout on the inputs
    # NOTE(review): a dropout rate sampled close to 1 removes almost every unit;
    # consider narrowing the upper bound of the uniform ranges.
    model.add(Dropout({{uniform(0,1)}}, input_shape=(88,)))
    #first hidden layer; widths are drawn from 1..20 (0 units is not a valid layer)
    model.add(Dense(units={{choice(list(range(1,21)))}}, kernel_initializer=my_init, activation='relu', 
                    kernel_constraint=max_norm(4.)))
    model.add(Dropout({{uniform(0,1)}}))
    #second hidden layer 
    model.add(Dense(units={{choice(list(range(1,21)))}}, kernel_initializer=my_init, activation='relu', 
                           kernel_constraint=max_norm(4.)))
    model.add(Dropout({{uniform(0,1)}}))
    #output layer 
    model.add(Dense(units=1, kernel_initializer=my_init, activation='sigmoid'))
    
    #compile the model
    model.compile(optimizer={{choice(['rmsprop', 'adam', 'sgd'])}}, 
                  loss='binary_crossentropy', metrics=['accuracy'])
    #validate on the dedicated validation split instead of re-splitting X_train
    result= model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                       batch_size={{choice([8, 16, 32, 64, 128, 256])}}, epochs=1000, verbose=0)
    #get the highest validation accuracy of the training epochs;
    #tf.keras 2.x records it as 'val_accuracy', older standalone Keras as 'val_acc'
    history = result.history
    val_key = 'val_accuracy' if 'val_accuracy' in history else 'val_acc'
    validation_acc = np.amax(history[val_key]) 
    print('Best validation acc of epoch:', validation_acc)
    
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}   


In [None]:
from hyperas import optim
from hyperopt import hp
# SMBO - TPE in action: run the hyperas search over the templated model function.
best_run, best_model = optim.minimize(model=model,
                                      data=data,
                                      algo=tpe.suggest,
                                      max_evals=2,
                                      trials=Trials(),
                                      notebook_name='Run 2_1d-Automated.Hyp.Opt.',
                                      keep_temp=False)

# Show the results.
# data() returns six values (train, validation and test features/labels).
# Unpacking only four raised "ValueError: too many values to unpack".
X_train, y_train, X_val, y_val, X_test, y_test = data()
print("Evaluation of best performing model:")
print(best_model.evaluate(X_test, y_test))
print("Best performing model chosen hyper-parameters:")
print(best_run)

In [None]:
# Environment setup cell: installs packages with pip and authenticates against
# Google Drive. It imports google.colab, so it can only run inside Colab.
# NOTE(review): hyperopt is already imported by earlier cells, so on a fresh
# kernel this install presumably has to run before them — confirm the cell order.
!pip install hyperopt
# os.listdir('.')
# Install the PyDrive wrapper & import libraries.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Copy/download the file
# Looks up the first Drive file titled 'notebook.ipynb' and saves its content
# locally under the same name; [0] raises IndexError if the query returns nothing.
fid = drive.ListFile({'q':"title='notebook.ipynb'"}).GetList()[0]['id']
f = drive.CreateFile({'id': fid})
f.GetContentFile('notebook.ipynb')

# Support Vector Machines

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Support vector classifier with a Gaussian (RBF) kernel, fitted on the training split.
svclassifier = SVC(kernel='rbf').fit(X_train, y_train)

# Predict the test split and report the confusion matrix followed by the accuracy.
y_pred = svclassifier.predict(X_test)
print(confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred), sep="\n")

In [None]:
# Support vector classifier with a sigmoid kernel, fitted on the training split.
svclassifier = SVC(kernel='sigmoid').fit(X_train, y_train)

# Predict the test split and report the confusion matrix followed by the accuracy.
y_pred = svclassifier.predict(X_test)
print(confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred), sep="\n")

In [None]:
# Polynomial-kernel SVMs of degree 1 through 19; each one is fitted on the training
# split and scored on the test split (confusion matrix, then accuracy).
for degree in range(1, 20):
    svclassifier = SVC(kernel='poly', degree=degree).fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    print(confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred), sep="\n")

# Ensemble Methods

Computing Best Parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Fit one random forest per ensemble size from 600 to 699 trees and record the
# test-split accuracy of each.
values = []
for n_trees in range(600, 700):
    model = RandomForestClassifier(n_estimators=n_trees,
                                   bootstrap=True,
                                   max_features='sqrt').fit(X_train, y_train)
    # Actual class predictions
    y_pred = model.predict(X_test)
    values.append(accuracy_score(y_test, y_pred))

# Best accuracy reached by any of the fitted forests.
print("Maximum Accuracy:", max(values), sep="")

# print(model.summary())
# #evaluation on test set
# test_loss, test_acc=model.evaluate(X_test, y_test)
# print("Accuracy on test set:", test_acc)
# train_loss, train_acc=model.evaluate(X_train, y_train)
# print("\nAccuracy on training set:", train_acc)

In [None]:
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

# Instantiate a forest with default settings (fixed random_state) and list its
# parameters as a reference point for the search that follows.
rf = RandomForestRegressor(random_state=42)
default_params = rf.get_params()
print('Parameters currently in use:\n')
pprint(default_params)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. It meant 'sqrt'
# for classifiers and "all features" for regressors, so both meanings are
# listed explicitly here ('sqrt' and None).
max_features = ['sqrt', None]
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# The base model is a classifier because the target is the encoded class label.
# Tuning a RandomForestRegressor here scored candidates with R^2 and produced
# parameters for a different estimator than the one evaluated below.
rf = RandomForestClassifier()
# Random search of parameters: 5 sampled combinations (n_iter), 3-fold cross
# validation, all available cores.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=5, cv=3,
                               verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)

print(rf_random.best_params_)

# Performance with the recommended params: take them straight from the search
# instead of hand-copying the values of an earlier run.
from sklearn import model_selection
model = RandomForestClassifier(**rf_random.best_params_)
model.fit(X_train, y_train)
# Actual class predictions
y_pred = model.predict(X_test)
# 10-fold cross validation. The original line passed `cv=kfold` to KFold (not a
# KFold argument, and `kfold` was not defined yet) and used `seed`, which is a
# NumPy function at this point in the notebook. The splitter is now built first
# with an integer seed and handed to cross_val_score. shuffle=True is required
# for random_state to be accepted by recent scikit-learn releases.
cv_seed = 7  # same value the later ensemble cells assign to `seed`
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=cv_seed)
# NOTE(review): the folds are taken from the test split only — confirm intended.
results = model_selection.cross_val_score(model, X_test, y_test, cv=kfold)
print(results.mean())
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))

Using the recommended params for different classifiers

In [None]:
#Random Forest Classification
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

seed = 7
num_trees = 700
max_features = 3
# shuffle=True is needed for random_state to take effect; scikit-learn >= 0.24
# raises ValueError when random_state is set on an unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# Seed the forest as well, as the AdaBoost / gradient-boosting cells do.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
# NOTE(review): the folds are taken from the test split only — confirm intended.
results = model_selection.cross_val_score(model, X_test, y_test, cv=kfold)
# Mean fold accuracy, then mean +/- two standard deviations.
print(results.mean())
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))

In [None]:
#Extra Trees Classifier
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesClassifier

seed = 7
num_trees = 600
max_features = 3
# shuffle=True is needed for random_state to take effect; scikit-learn >= 0.24
# raises ValueError when random_state is set on an unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# Seed the ensemble as well, as the AdaBoost / gradient-boosting cells do.
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
# NOTE(review): the folds are taken from the test split only — confirm intended.
results = model_selection.cross_val_score(model, X_test, y_test, cv=kfold)
# Mean fold accuracy, then mean +/- two standard deviations.
print(results.mean())
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))

In [None]:
#Bagged Decision Tree
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

seed = 7
num_trees = 100
max_features = 3
# shuffle=True is needed for random_state to take effect; scikit-learn >= 0.24
# raises ValueError when random_state is set on an unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cart = DecisionTreeClassifier()
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional slot is the base learner under both names.
model = BaggingClassifier(cart, n_estimators=num_trees, max_features=max_features, random_state=seed)
# NOTE(review): the folds are taken from the test split only — confirm intended.
results = model_selection.cross_val_score(model, X_test, y_test, cv=kfold)
# Mean fold accuracy, then mean +/- two standard deviations.
print(results.mean())
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))

In [None]:
from sklearn.ensemble import AdaBoostClassifier

#AdaBoost Classifier
from sklearn import model_selection

seed = 7
num_trees = 600
# (`max_features` was assigned here but never passed to the estimator, so it is dropped.)
# shuffle=True is needed for random_state to take effect; scikit-learn >= 0.24
# raises ValueError when random_state is set on an unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
# NOTE(review): the folds are taken from the test split only — confirm intended.
results = model_selection.cross_val_score(model, X_test, y_test, cv=kfold)
# Mean fold accuracy, then mean +/- two standard deviations.
print(results.mean())
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))

In [None]:
#Stochastic Gradient Boosting Classification

from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier

seed = 7
num_trees = 200
# (`max_features` was assigned here but never passed to the estimator, so it is dropped.)
# shuffle=True is needed for random_state to take effect; scikit-learn >= 0.24
# raises ValueError when random_state is set on an unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# NOTE(review): no `subsample` is set, so every stage is fitted on all samples;
# pass subsample < 1.0 if stochastic gradient boosting is actually wanted.
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
# NOTE(review): the folds are taken from the test split only — confirm intended.
results = model_selection.cross_val_score(model, X_test, y_test, cv=kfold)
# Mean fold accuracy, then mean +/- two standard deviations.
print(results.mean())
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))

In [None]:
#Hard Voting Ensemble 

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB 

seed = 7
# shuffle=True is needed for random_state to take effect; scikit-learn >= 0.24
# raises ValueError when random_state is set on an unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# Create the sub models: logistic regression, a decision tree and an SVM.
estimators = []
model1 = LogisticRegression(solver='liblinear')
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC(gamma='scale')
estimators.append(('svm', model3))
# Create the ensemble model: hard voting takes the majority of predicted labels.
ensemble = VotingClassifier(estimators, voting='hard')
# NOTE(review): the folds are taken from the test split only — confirm intended.
results = model_selection.cross_val_score(ensemble, X_test, y_test, cv=kfold)
# Mean fold accuracy, then mean +/- two standard deviations.
print(results.mean())
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))

In [None]:
#Soft Voting Ensemble 

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB 

seed = 7
# shuffle=True is needed for random_state to take effect; scikit-learn >= 0.24
# raises ValueError when random_state is set on an unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# Create the sub models: logistic regression, a decision tree and an SVM.
estimators = []
model1 = LogisticRegression(solver='liblinear')
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
# probability=True so the SVM can supply the class probabilities that soft voting averages.
model3 = SVC(gamma='scale', probability=True)
estimators.append(('svm', model3))
# Create the ensemble model: soft voting averages the predicted class probabilities.
ensemble = VotingClassifier(estimators, voting='soft')

# error_score='raise' surfaces a failing fold as an exception instead of a NaN score.
# NOTE(review): the folds are taken from the test split only — confirm intended.
results = model_selection.cross_val_score(ensemble, X_test, y_test, error_score='raise', cv=kfold)
# Mean fold accuracy, then mean +/- two standard deviations.
print(results.mean())
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))