In [22]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [3]:
# load the Concrete dataset 

concrete = pd.read_csv(r"E:\win7\files\13,14-08-22\Concrete_Data_V1.0.csv", header=0)

# copy the file to back-up file

concrete_bk = concrete.copy()

# display first 5 records

concrete.head()

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [4]:
# Display concrete data information

concrete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Cement            1030 non-null   float64
 1   Slag              1030 non-null   float64
 2   Flyash            1030 non-null   float64
 3   Water             1030 non-null   float64
 4   SuperPlasticizer  1030 non-null   float64
 5   CoarseAggregate   1030 non-null   float64
 6   FineAggregate     1030 non-null   float64
 7   Age               1030 non-null   int64  
 8   CSinMPa           1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [5]:
# Display the columns in concrete dataset

concrete.columns

Index(['Cement', 'Slag', 'Flyash', 'Water', 'SuperPlasticizer',
       'CoarseAggregate', 'FineAggregate', 'Age', 'CSinMPa'],
      dtype='object')

In [6]:
# Prepare cols1 for scaling

cols1 = ['Cement', 'Slag', 'Flyash', 'Water', 'SuperPlasticizer', 'CoarseAggregate', 'FineAggregate', 'Age']

#cols1 = ['Cement', 'Slag', 'Flyash', 'Water', 'SuperPlasticizer', 'CoarseAggregate', 'FineAggregate', 'Age']

In [7]:
# Identify the independent and Target variables

IndepVar = []
for col in concrete.columns:
    if col != 'CSinMPa':
        IndepVar.append(col)

TargetVar = 'CSinMPa'

x = concrete[IndepVar]
y = concrete[TargetVar]

In [8]:
# Split the data into train and test

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=143)

# Display the shape of the train_data and test_data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((721, 8), (309, 8), (721,), (309,))

In [9]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols1] = mmscaler.fit_transform(x_train[cols1])
x_train = pd.DataFrame(x_train)

x_test[cols1] = mmscaler.fit_transform(x_test[cols1])
x_test = pd.DataFrame(x_test)

In [17]:
#train the algorithm and build the model with train dataset

from sklearn.tree import DecisionTreeRegressor

# Create model object

ModelDTR = DecisionTreeRegressor()



ModelDTR.fit(x_train, y_train)

y_pred=ModelDTR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):',  round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test,y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')
#Sruthi G21:55
#print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test,y_pred))),3))

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 6.143
Mean Squared Error (MSE): 80.969
Root Mean Squared Error (RMSE): 8.998
R2_score: 0.690946
Root Mean Squared Log Error (RMSLE): 2.197
Mean Absolute Percentage Error (MAPE): 19.814 %
Adj R Square:  0.688524


In [18]:
# Predict the values with ET algorithm

y_predF = ModelDTR.predict(x_test)

In [19]:
Results = pd.DataFrame({'CSinMPa_A':y_test, 'CSinMPa_P_F':y_predF})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = concrete_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(10)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_P_F
455,213.5,0.0,174.2,159.2,11.7,1043.6,771.9,56,51.26,51.26,52.96
367,214.9,53.8,121.9,155.6,9.6,1014.3,780.6,56,53.96,53.96,48.99
213,230.0,0.0,118.3,195.5,4.6,1029.4,758.6,100,35.34,35.34,32.72
693,153.0,102.0,0.0,192.0,0.0,888.0,943.1,28,17.96,17.96,36.44
788,349.0,0.0,0.0,192.0,0.0,1047.0,806.0,7,18.13,18.13,9.01
741,480.0,0.0,0.0,192.0,0.0,936.0,721.0,28,43.89,43.89,43.94
793,302.0,0.0,0.0,203.0,0.0,974.0,817.0,14,18.13,18.13,20.77
578,193.5,290.2,0.0,185.7,0.0,998.2,704.3,7,17.2,17.2,17.17
351,213.5,0.0,174.2,154.6,11.7,1052.3,775.5,28,45.94,45.94,38.5
947,299.8,0.0,119.8,211.5,9.9,878.2,727.6,28,23.84,23.84,23.84


In [20]:
# Calculate the %of Error

ResultsFinal['%Error'] = round(((ResultsFinal['CSinMPa_A']-ResultsFinal['CSinMPa_P_F'])/ResultsFinal['CSinMPa_A'])*100,3)

In [21]:
# Display the results

ResultsFinal.sample(5)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_P_F,%Error
236,213.8,98.1,24.5,181.7,6.7,1066.0,785.5,28,40.23,40.23,45.71,-13.622
326,252.3,0.0,98.8,146.3,14.2,987.8,889.0,28,50.6,50.6,30.85,39.032
982,312.7,0.0,0.0,178.1,8.0,999.7,822.2,28,25.1,25.1,36.8,-46.614
1015,322.5,148.6,0.0,185.8,8.5,951.0,709.5,28,52.43,52.43,52.45,-0.038
185,222.4,0.0,96.7,189.3,4.5,967.1,870.3,14,24.45,24.45,20.73,15.215
