In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


# Read the concrete dataset from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer='~/JProjects/kaggle/data/concrete.csv')

# Show the first five rows (the default for `head`) as a quick sanity check.
data.head(5)

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,CompressiveStrength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [7]:
# Build the feature matrix on a copy so that popping the target column
# does not alter `data`; `pop` removes the column from X and returns it.
X = data.copy()
y = X.pop('CompressiveStrength')

# Baseline model: a random forest fitted on the original columns only.
# The fixed random_state keeps the result repeatable between runs.
baseline = RandomForestRegressor(random_state=0, criterion='absolute_error')

# The 'neg_mean_absolute_error' scorer reports MAE with its sign flipped,
# so negate the mean over the 5 folds to get a positive error value.
baseline_score = -cross_val_score(
    baseline,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=5,
).mean()

print(f'MAE: {baseline_score:.4f}')

MAE: 8.2317


If you have ever cooked at home, you might know that the ratios of the ingredients in a recipe are usually a better predictor of how it turns out than their absolute amounts. We might reason, then, that ratios of the features above would be good predictors of `CompressiveStrength`.

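The cell below adds three new ratio features to the feature matrix `X`: `FCRatio` (fine to coarse aggregate), `AggCmtRatio` (total aggregate to cement), and `WtrCmtRatio` (water to cement).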

In [8]:
# Ratio features, appended to X as new columns (element-wise per row).

# Fine aggregate relative to coarse aggregate.
X['FCRatio'] = X['FineAggregate'].div(X['CoarseAggregate'])

# Total aggregate (coarse + fine) relative to cement.
X['AggCmtRatio'] = X['CoarseAggregate'].add(X['FineAggregate']).div(X['Cement'])

# Water relative to cement.
X['WtrCmtRatio'] = X['Water'].div(X['Cement'])

# Preview the frame with the three new columns on the right.
X.head(5)

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,FCRatio,AggCmtRatio,WtrCmtRatio
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,0.65,3.177778,0.3
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,0.640758,3.205556,0.3
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,0.637339,4.589474,0.685714
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,0.637339,4.589474,0.685714
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,0.843724,9.083082,0.966767


In [9]:
# Same estimator settings as the baseline, now evaluated on X including
# the ratio features, so the two MAE values are directly comparable.
model = RandomForestRegressor(random_state=0, criterion='absolute_error')

# The scorer returns negated MAE per fold; average the 5 folds and flip
# the sign to report a positive error.
score = -1 * cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=5,
).mean()

print(f'MAE: {score:.3f}')

MAE: 7.948
