In [1]:
# Importing relevant modules
import numpy
import pandas as pd
import matplotlib
%matplotlib inline
import itertools
import IPython.display
from tqdm import tqdm_notebook as tqdm
import sklearn.model_selection
import sklearn.neural_network
import sklearn.model_selection

# Ignoring warnings: silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by later cells (for example
# during model fitting) - consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the concrete classification dataset straight from the project repository:
data_url = 'https://raw.githubusercontent.com/Moataz-AbdElKhalek/Concrete_Compressive_Strength_Prediction/main/dataset/Concrete_Dataset_Classification.csv'
dataset = pd.read_csv(data_url)

# Preview the first rows, then report the overall size (shape is (rows, columns)).
print(dataset.head(4))
print("\nDataset has {} rows and {} columns".format(*dataset.shape))

# Target vector: the 'y' column.
print()
y = dataset['y']
print(y.head(4))
print(y.shape)
print()

# Feature matrix: every column except the target.
X = dataset.drop(columns=['y'])
print(X.head(4))
print(X.shape)

      X1     X2   X3     X4   X5      X6     X7     X8    y
0  540.0    0.0  0.0  162.0  2.5  1040.0  676.0   28.0  1.0
1  540.0    0.0  0.0  162.0  2.5  1055.0  676.0   28.0  1.0
2  332.5  142.5  0.0  228.0  0.0   932.0  594.0  270.0  1.0
3  332.5  142.5  0.0  228.0  0.0   932.0  594.0  365.0  1.0

Dataset has 1030 rows and 9 columns

0    1.0
1    1.0
2    1.0
3    1.0
Name: y, dtype: float64
(1030,)

      X1     X2   X3     X4   X5      X6     X7     X8
0  540.0    0.0  0.0  162.0  2.5  1040.0  676.0   28.0
1  540.0    0.0  0.0  162.0  2.5  1055.0  676.0   28.0
2  332.5  142.5  0.0  228.0  0.0   932.0  594.0  270.0
3  332.5  142.5  0.0  228.0  0.0   932.0  594.0  365.0
(1030, 8)


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column,
# including the target 'y':
dataset.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,y
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,-0.048544
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,0.999306
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,-1.0
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,-1.0
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,-1.0
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,1.0
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,1.0


In [4]:
# Dividing samples dataset into training and test datasets:
def dataset_divide(X, y, train_size=0.90, random_state=1):
  """Split the samples into a training part and a held-out test part.

  Parameters
  ----------
  X : feature table, one row per sample.
  y : target values aligned with the rows of ``X``.
  train_size : forwarded to ``train_test_split``; the default of 0.90 keeps
      90% of the rows for training (the value this notebook always used).
  random_state : seed forwarded to ``train_test_split`` so the shuffle is
      repeatable; defaults to 1 (the value this notebook always used).

  Returns
  -------
  tuple
      ``(X_train, X_test, y_train, y_test)``.
  """
  X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
      X, y, train_size=train_size, random_state=random_state)
  return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = dataset_divide(X, y)

# Show the size of each part: training features/targets first, then test.
for subset in (X_train, y_train, X_test, y_test):
  print(subset.shape)

(927, 8)
(927,)
(103, 8)
(103,)


# **Artificial Neural Network (ANN) Model**
# 10-fold Cross-Validation

In [5]:
def ANN_CV(max_iterations, alpha_range, hidden_layers, features=None, targets=None, cv=10):
  """Grid-search an MLP classifier with k-fold cross-validation.

  Every combination of ``max_iterations`` x ``alpha_range`` x ``hidden_layers``
  is fitted and scored by accuracy.

  Parameters
  ----------
  max_iterations : candidate values for the classifier's ``max_iter``.
  alpha_range : candidate values for the classifier's ``alpha``.
  hidden_layers : candidate values for ``hidden_layer_sizes``.
  features, targets : data to search on. When omitted, the notebook-level
      ``X`` and ``y`` are used, which is what this function always did.
      NOTE(review): ``X``/``y`` are the full dataset, i.e. they contain the
      rows that were split off as ``X_test``/``y_test``; pass
      ``X_train``/``y_train`` to keep the hold-out rows out of the search.
  cv : number of cross-validation folds (default 10).

  Returns
  -------
  tuple
      ``(best_max_iter, best_alpha, best_hidden_layer_sizes, best_score,
      cv_results)`` taken from the fitted ``GridSearchCV``.
  """
  # Fall back to the notebook globals so existing three-argument calls
  # behave exactly as before.
  if features is None:
    features = X
  if targets is None:
    targets = y

  # Preparing the Model:
  # NOTE(review): inputs are passed to the network unscaled; scikit-learn's
  # MLP guidance recommends standardising features - consider a Pipeline.
  model = sklearn.neural_network.MLPClassifier(activation='relu', random_state=1)

  # Determining Model Hyperparameters to be tested and optimized:
  paras = {'max_iter': max_iterations, 'alpha': alpha_range, 'hidden_layer_sizes': hidden_layers}

  # Preparing Cross-Validation to be used to fit the Model and the Hyperparameters:
  gridCV = sklearn.model_selection.GridSearchCV(model, paras, cv=cv, scoring='accuracy', verbose=10)
  gridCV.fit(features, targets)

  best_params = gridCV.best_params_
  best_max_iterations = best_params['max_iter']
  best_alpha = best_params['alpha']
  best_hidden_layers = best_params['hidden_layer_sizes']

  return best_max_iterations, best_alpha, best_hidden_layers, gridCV.best_score_, gridCV.cv_results_

In [6]:
# Iteration caps to try: 50, 250, 450, 650, 850.
test_max_iter_range = numpy.arange(50, 1000, 200)

# The alphas with max score in the case of fixed split
test_alpha_range = [0.00946, 0.01668, 0.01808]

# The hidden layers with max score in the case of fixed split,
# written as (layer width, number of layers).
test_hidden_layers = [
  (width,) * depth
  for width, depth in ((12, 7), (64, 16), (92, 16))
]

In [7]:
# Run the cross-validated grid search for the ANN model to find the best
# score (accuracy) and the hyperparameters that achieve it:
best_max_iterations, best_alpha, best_hidden_layers, best_score, results = ANN_CV(
    max_iterations=test_max_iter_range,
    alpha_range=test_alpha_range,
    hidden_layers=test_hidden_layers,
)

Fitting 10 folds for each of 45 candidates, totalling 450 fits
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.709, total=   0.4s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.825, total=   0.3s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.806, total=   0.3s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.0s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.495, total=   0.3s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.3s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.621, total=   0.3s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.7s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.631, total=   0.4s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.0s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.864, total=   0.3s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    2.3s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.796, total=   0.3s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    2.7s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.621, total=   0.3s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    3.0s remaining:    0.0s


[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=50, score=0.670, total=   0.3s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250 
[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250, score=0.699, total=   0.9s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250 
[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250, score=0.796, total=   1.6s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250 
[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250, score=0.971, total=   1.4s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250 
[CV]  alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250, score=0.728, total=   1.6s
[CV] alpha=0.00946, hidden_layer_sizes=(12, 12, 12, 12, 12, 12, 12), max_iter=250 
[CV]  alpha=0.00946, hidden_lay

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed: 23.4min finished


In [8]:
# Winning hyperparameters, their mean cross-validated score, and the mean
# score of every candidate in the grid.
summary_lines = (
  ('best_max_iterations =', best_max_iterations),
  ('best_alpha =', best_alpha),
  ('best_hidden_layers =', best_hidden_layers),
  ('Cross-Validation Mean Best Score for the Model =', best_score),
  ('\nCross-Validation Mean Test Scores\n', results['mean_test_score']),
)
for label, value in summary_lines:
  print(label, value)

# Per-split scores; 10 matches the number of folds used by the search.
for i in range(10):
  split_label = str(i+1)
  split_scores = results['split'+str(i)+'_test_score']
  print('\nSplit_'+split_label+' Scores\n', split_scores)
  print('best_score (Split_'+split_label+') =', max(split_scores))

best_max_iterations = 450
best_alpha = 0.01668
best_hidden_layers = (12, 12, 12, 12, 12, 12, 12)
Cross-Validation Mean Best Score for the Model = 0.829126213592233

Cross-Validation Mean Test Scores
 [0.7038835  0.81747573 0.82135922 0.82135922 0.82135922 0.74174757
 0.78446602 0.78446602 0.78446602 0.78446602 0.8        0.80485437
 0.80485437 0.80485437 0.80485437 0.70582524 0.82621359 0.82912621
 0.82912621 0.82912621 0.79514563 0.80291262 0.80291262 0.80291262
 0.80291262 0.81262136 0.80097087 0.80097087 0.80097087 0.80097087
 0.70291262 0.81359223 0.81359223 0.81359223 0.81359223 0.79417476
 0.79126214 0.79126214 0.79126214 0.79126214 0.79514563 0.78640777
 0.78640777 0.78640777 0.78640777]

Split_1 Scores
 [0.70873786 0.69902913 0.69902913 0.69902913 0.69902913 0.7184466
 0.69902913 0.69902913 0.69902913 0.69902913 0.73786408 0.72815534
 0.72815534 0.72815534 0.72815534 0.70873786 0.7184466  0.7184466
 0.7184466  0.7184466  0.7184466  0.7184466  0.7184466  0.7184466
 0.7184466  0.