In [1]:
# Importing relevant modules
import numpy
import pandas as pd
import matplotlib
%matplotlib inline
import itertools
import IPython.display
from tqdm import tqdm_notebook as tqdm
import sklearn
import sklearn.model_selection
import sklearn.ensemble
import sklearn.model_selection

# Ignoring warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Fetch the concrete classification dataset (CSV hosted on GitHub) into a DataFrame.
data_url = 'https://raw.githubusercontent.com/Moataz-AbdElKhalek/Concrete_Compressive_Strength_Prediction/main/dataset/Concrete_Dataset_Classification.csv'
dataset = pd.read_csv(data_url)

# Preview the first four rows, then report the overall size of the table.
print(dataset.head(4))
n_rows, n_cols = dataset.shape
print("\nDataset has {} rows and {} columns".format(n_rows, n_cols))

# Labels: the 'y' column on its own.
print()
y = dataset['y']
print(y.head(4))
print(y.shape)
print()

# Features: every remaining column once 'y' is removed.
X = dataset.drop(columns=['y'])
print(X.head(4))
print(X.shape)

      X1     X2   X3     X4   X5      X6     X7     X8    y
0  540.0    0.0  0.0  162.0  2.5  1040.0  676.0   28.0  1.0
1  540.0    0.0  0.0  162.0  2.5  1055.0  676.0   28.0  1.0
2  332.5  142.5  0.0  228.0  0.0   932.0  594.0  270.0  1.0
3  332.5  142.5  0.0  228.0  0.0   932.0  594.0  365.0  1.0

Dataset has 1030 rows and 9 columns

0    1.0
1    1.0
2    1.0
3    1.0
Name: y, dtype: float64
(1030,)

      X1     X2   X3     X4   X5      X6     X7     X8
0  540.0    0.0  0.0  162.0  2.5  1040.0  676.0   28.0
1  540.0    0.0  0.0  162.0  2.5  1055.0  676.0   28.0
2  332.5  142.5  0.0  228.0  0.0   932.0  594.0  270.0
3  332.5  142.5  0.0  228.0  0.0   932.0  594.0  365.0
(1030, 8)


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column,
# including the label column 'y'.
dataset.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,y
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,-0.048544
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,0.999306
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,-1.0
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,-1.0
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,-1.0
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,1.0
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,1.0


In [4]:
# Dividing samples dataset into training and test datasets:
def dataset_divide(X, y, train_size=0.70, random_state=1):
  """Split features and labels into a training part and a test part.

  Thin wrapper around ``sklearn.model_selection.train_test_split``.

  Args:
    X: Feature table, one row per sample.
    y: Labels aligned with the rows of ``X``.
    train_size: Value forwarded as ``train_test_split(train_size=...)``.
      Defaults to 0.70, the value this notebook has always used.
    random_state: Seed forwarded to ``train_test_split`` so the split is
      repeatable. Defaults to 1.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)``.
  """
  X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
      X, y, train_size=train_size, random_state=random_state)
  return X_train, X_test, y_train, y_test

# Perform the split once and report the shape of each resulting piece,
# in the order: train features, train labels, test features, test labels.
X_train, X_test, y_train, y_test = dataset_divide(X, y)
for _piece in (X_train, y_train, X_test, y_test):
  print(_piece.shape)

(721, 8)
(721,)
(309, 8)
(309,)


# **Random Forest Model**
# Fixed Single Split

In [10]:
def study_RF(max_depth,min_impurity):
    """Grid-evaluate a random forest on the fixed train/test split.

    For every (min_impurity_decrease, max_depth) pair a
    ``RandomForestClassifier(random_state=1)`` is fitted on the notebook-level
    ``X_train``/``y_train`` and scored on ``X_test``/``y_test``. The best
    score and the hyperparameters that produced it are printed.

    Note: the "best" pair is picked using the test split itself, so the
    reported maximum is an optimistic estimate of generalisation accuracy.

    Args:
        max_depth: Sequence of ``max_depth`` values to try (columns of the
            result).
        min_impurity: Sequence of ``min_impurity_decrease`` values to try
            (rows of the result).

    Returns:
        ``numpy.ndarray`` of shape ``(len(min_impurity), len(max_depth))``
        where entry ``[i, j]`` is the test score for ``min_impurity[i]`` and
        ``max_depth[j]``.
    """
    scores = numpy.zeros((len(min_impurity), len(max_depth)))

    for i, impurity in enumerate(min_impurity):
        for j, depth in enumerate(max_depth):
            clf = sklearn.ensemble.RandomForestClassifier(
                max_depth=depth,
                min_impurity_decrease=impurity,
                random_state=1,
            )
            clf.fit(X_train, y_train)
            # score() already predicts internally; no separate predict() call is needed.
            scores[i, j] = clf.score(X_test, y_test)

    # Locate the best cell once: row -> min_impurity index, column -> max_depth index.
    best_row, best_col = numpy.unravel_index(numpy.argmax(scores, axis=None), scores.shape)
    best_max_depth = max_depth[best_col]
    best_min_impurity = min_impurity[best_row]

    print('Max.score: ',numpy.amax(scores))
    print('best maximum depth: ',best_max_depth)
    print('best minimum impurity: ',best_min_impurity)
    print('----------scores----------')
    return scores

In [11]:
# Coarse 3x3 grid: max_depth in [4, 8, 12] crossed with min_impurity_decrease in [0.0, 0.005, 0.01].
study_RF([4,8,12],[0.0,0.005,0.01])

Max.score:  0.8511326860841424
best maximum depth:  12
best minimum impurity:  0.0
----------scores----------


array([[0.82200647, 0.84466019, 0.85113269],
       [0.80906149, 0.82847896, 0.83171521],
       [0.80582524, 0.81553398, 0.81553398]])

# **Random Forest Model**
# 10-fold Cross-Validation

In [14]:
def RFC_CV(max_depth_range,min_impurity_range, cv=10, verbose=10):
  """Tune a random forest with an exhaustive cross-validated grid search.

  A ``RandomForestClassifier(criterion='gini', random_state=1)`` is searched
  over every combination of ``max_depth`` and ``min_impurity_decrease`` using
  ``GridSearchCV`` with accuracy scoring. The search is fitted on the
  notebook-level ``X`` and ``y`` (the full dataset, not the train split).

  ``min_impurity_decrease`` is used rather than ``min_impurity_split``
  because the latter is deprecated in scikit-learn in favour of the former.

  Args:
    max_depth_range: Candidate values for ``max_depth``.
    min_impurity_range: Candidate values for ``min_impurity_decrease``.
    cv: Value forwarded as ``GridSearchCV(cv=...)``. Defaults to 10
      (10-fold), the value this notebook has always used.
    verbose: Value forwarded as ``GridSearchCV(verbose=...)``. Defaults to
      10, which logs every individual fit; pass 0 to keep the cell output
      short.

  Returns:
    Tuple ``(best_max_depth, best_min_impurity, best_score, results)`` taken
    from ``best_params_``, ``best_score_`` and ``cv_results_`` of the fitted
    search.
  """
  # Preparing the Model:
  model = sklearn.ensemble.RandomForestClassifier(criterion='gini', random_state=1)

  # Determining Model Hyperparameters to be tested and optimized:
  paras = {'max_depth':max_depth_range, 'min_impurity_decrease':min_impurity_range}

  # Preparing Cross-Validation to be used to fit the Model and the Hyperparameters:
  gridCV = sklearn.model_selection.GridSearchCV(model, paras, cv=cv, scoring='accuracy', verbose=verbose)
  gridCV.fit(X, y)

  best_max_depth = gridCV.best_params_['max_depth']
  best_min_impurity = gridCV.best_params_['min_impurity_decrease']
  best_score = gridCV.best_score_
  results = gridCV.cv_results_

  return best_max_depth, best_min_impurity, best_score, results

In [None]:
# Scratch cell: a finer candidate grid (149 depths x 5 impurity thresholds = 745 combinations).
# Evaluating it only displays the tuple; it does not start a search.
range(1,150,1),[0.0, 1e-10, 1e-5, 0.01, 0.1]

In [None]:
# Run the cross-validated grid search over 5 depths x 5 impurity thresholds (25 candidates).
# NOTE(review): the saved log for this cell reports 745 candidates, which does not match
# this 25-candidate grid; the output presumably comes from an earlier run. Re-run to refresh.
best_max_depth, best_min_impurity, best_score, cv_results = RFC_CV([1, 5, 10, 100, 1000],[0, 1e-10, 1e-5, 0.01, 0.1])

Fitting 10 folds for each of 745 candidates, totalling 7450 fits
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................
[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.553, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.796, total=   0.2s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................
[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.864, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s remaining:    0.0s


[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.466, total=   0.2s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................
[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.718, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s


[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.573, total=   0.2s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................
[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.806, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.0s remaining:    0.0s


[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.563, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................
[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.660, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=0.0 ..........................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.3s remaining:    0.0s


[CV]  max_depth=1, min_impurity_decrease=0.0, score=0.592, total=   0.2s
[CV] max_depth=1, min_impurity_decrease=1e-10 ........................
[CV]  max_depth=1, min_impurity_decrease=1e-10, score=0.553, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=1e-10 ........................
[CV]  max_depth=1, min_impurity_decrease=1e-10, score=0.796, total=   0.2s
[CV] max_depth=1, min_impurity_decrease=1e-10 ........................
[CV]  max_depth=1, min_impurity_decrease=1e-10, score=0.864, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=1e-10 ........................
[CV]  max_depth=1, min_impurity_decrease=1e-10, score=0.466, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=1e-10 ........................
[CV]  max_depth=1, min_impurity_decrease=1e-10, score=0.718, total=   0.1s
[CV] max_depth=1, min_impurity_decrease=1e-10 ........................
[CV]  max_depth=1, min_impurity_decrease=1e-10, score=0.573, total=   0.2s
[CV] max_depth=1, min_impurity_decrease=1e-10 .....

In [18]:
# Report the winning hyperparameters and the mean accuracy across folds for every candidate.
print('best_max_depth =',best_max_depth)
print('best_min_impurity =',best_min_impurity)
print('Cross-Validation Mean Best Score for the Model =',best_score)
print('\nCross-Validation Mean Test Scores\n', cv_results['mean_test_score'])

# Per-fold breakdown. The fold count is read from the result keys
# ('split0_test_score', 'split1_test_score', ...) instead of being hard-coded,
# so this cell keeps working if the search is run with a different number of folds.
n_splits = sum(
    1 for key in cv_results
    if key.startswith('split') and key.endswith('_test_score')
)
for i in range(n_splits):
  split_scores = cv_results['split'+str(i)+'_test_score']
  print('\nSplit_'+str(i+1)+' Scores\n',split_scores)
  print('best_score (Split_'+str(i+1)+') =', max(split_scores))

best_max_depth = 12
best_min_impurity = 0.0
Cross-Validation Mean Best Score for the Model = 0.7699029126213592

Cross-Validation Mean Test Scores
 [0.74466019 0.7368932  0.72330097 0.75728155 0.74271845 0.72621359
 0.76990291 0.74174757 0.72621359]

Split_1 Scores
 [0.69902913 0.68932039 0.67961165 0.73786408 0.69902913 0.67961165
 0.72815534 0.69902913 0.67961165]
best_score (Split_1) = 0.7378640776699029

Split_2 Scores
 [0.65048544 0.62135922 0.61165049 0.6407767  0.63106796 0.60194175
 0.66019417 0.6407767  0.60194175]
best_score (Split_2) = 0.6601941747572816

Split_3 Scores
 [0.86407767 0.85436893 0.83495146 0.82524272 0.85436893 0.86407767
 0.82524272 0.83495146 0.86407767]
best_score (Split_3) = 0.8640776699029126

Split_4 Scores
 [0.60194175 0.61165049 0.55339806 0.53398058 0.5631068  0.58252427
 0.5631068  0.55339806 0.58252427]
best_score (Split_4) = 0.6116504854368932

Split_5 Scores
 [0.7961165  0.75728155 0.78640777 0.76699029 0.77669903 0.76699029
 0.76699029 0.77669903