In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the dataset into a DataFrame. The path has no directory component, so
# 'Concrete_Data.csv' is resolved relative to the kernel's working directory.
concrete_data = pd.read_csv('Concrete_Data.csv')

In [2]:
# Preview the first 10 rows to confirm the file parsed as expected and to get a
# feel for the columns; blank cells in the rendered table are missing values.
concrete_data.head(10)

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,365,43.7
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45
8,266.0,114.0,0.0,228.0,0.0,,670.0,28,45.85
9,,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29


In [3]:
# Summary statistics for every numeric column. The `count` row reports non-null
# entries per column, so a column with a lower count than the others contains
# missing values.
concrete_data.describe()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
count,1028.0,1030.0,1030.0,1030.0,1030.0,1029.0,1030.0,1030.0,1030.0
mean,280.727529,73.895825,54.18835,181.567282,6.20466,972.958698,773.580485,45.662136,35.817961
std,104.11935,86.279342,63.997004,21.354219,5.973841,77.781283,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.0,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.8,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [4]:
# Names of the columns that contain at least one missing value:
# build a per-column boolean mask and use it to select from the column index.
cols_with_missing_values = concrete_data.columns[concrete_data.isnull().any()].tolist()
cols_with_missing_values

['cement', 'coarseaggregate']

In [5]:
# Count the missing values per column, then report only the columns
# that actually have some.
missing_val_count_by_column = concrete_data.isnull().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column.gt(0)])

cement             2
coarseaggregate    1
dtype: int64


In [6]:
# Remove every row that contains at least one missing value.
# The name is re-bound to the filtered copy instead of using inplace=True, so
# the DataFrame object that was loaded and displayed above is not mutated.
concrete_data = concrete_data.dropna(axis=0)

# sanity check: no missing values may remain after dropping incomplete rows
assert not concrete_data.isnull().values.any(), "unexpected missing values after dropna"

# check that rows were removed
concrete_data.shape

(1027, 9)

In [7]:
# columns used as model inputs
features = [
    'cement', 'slag', 'flyash', 'water', 'superplasticizer',
    'coarseaggregate', 'fineaggregate', 'age',
]

# feature matrix and target column
X = concrete_data[features]
y = concrete_data['csMPa']

# hold out part of the rows for validation; the fixed seed keeps the split reproducible
train_X, validation_X, train_y, validation_y = train_test_split(
    X, y, random_state=0
)

In [8]:
# Build a random forest (seeded for reproducibility) and train it on the
# training split; fit() returns the fitted estimator itself.
concrete_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# predict the target for the held-out validation rows
concrete_strength_preds = concrete_model.predict(validation_X)

# Score the model with mean absolute error: the average absolute difference
# between the true validation targets and the model's predictions
# (lower is better).
validation_mae = mean_absolute_error(validation_y, concrete_strength_preds)
print("Mean Absolute Error:", validation_mae, "csMPa")

Mean Absolute Error: 3.572653862794145 csMPa


In [9]:
# Explore the effect of removing age from the modeled features.
#
# This experiment uses its own variable names so that `features`, `X`, the
# existing train/validation split and `concrete_model` keep referring to the
# model that includes age. Overwriting them here would make a later re-run of
# the previous cell silently fit and score the wrong feature set.
features_without_age = [col for col in features if col != 'age']

# feature matrix without the age column
X_without_age = concrete_data[features_without_age]

# split data set into training and validation data, using the same seed as the
# split that includes age
(train_X_without_age, validation_X_without_age,
 train_y_without_age, validation_y_without_age) = train_test_split(
    X_without_age, y, random_state=0
)

# define random forest model with the same seed as the model that includes age
model_without_age = RandomForestRegressor(random_state=1)

# fit the model with training data
model_without_age.fit(train_X_without_age, train_y_without_age)

# get predictions with validation data
preds_without_age = model_without_age.predict(validation_X_without_age)

# use mean absolute error as measure of model accuracy
print("Mean Absolute Error:", mean_absolute_error(validation_y_without_age, preds_without_age), "csMPa")

Mean Absolute Error: 10.671877399401088 csMPa


In [10]:
# The Mean Absolute Error with age removed from the features is 10.67 csMPa, which
# is greater than the Mean Absolute Error of 3.57 csMPa with age included. Therefore,
# including age in the modeled features improves the model's accuracy.