# Example of a Pipeline (Data Processing and Cross-Validation Modelling)

In [1]:
# Example of using the ColumnTransformer for the Abalone dataset
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from pandas import read_csv
from numpy import absolute
from numpy import mean
from numpy import std


In [2]:
# Fetch the Abalone CSV straight from its hosted location.
# header=None makes pandas treat the first row as data, so the columns
# are labelled with integers (0, 1, 2, ...) instead of names.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/abalone.csv'
dataframe = read_csv(
    url,
    header=None,
)


In [3]:
# Separate the predictors (X) from the target (y).
# The target is the right-most column; because the columns carry integer
# labels, the label of that column is simply (column count - 1).
last_ix = dataframe.shape[1] - 1
y = dataframe[last_ix]
X = dataframe.drop(columns=[last_ix])
# Quick sanity check on the resulting dimensions.
print(X.shape, y.shape)


(4177, 8) (4177,)


In [4]:
# Determine categorical and numerical features.
#
# The two groups are built as complements of each other so that every column
# of X ends up in exactly one of them. Listing concrete dtypes instead
# (e.g. only 'int64'/'float64' and 'object'/'bool') leaves gaps: a column
# stored as int32, float32, 'category' or a dedicated string dtype would match
# neither list and would then be silently discarded by a ColumnTransformer
# that uses its default remainder='drop'.
#
# 'number' matches every numeric dtype; boolean columns are not part of it and
# therefore stay on the categorical side.
numerical_ix = X.select_dtypes(include='number').columns
categorical_ix = X.select_dtypes(exclude='number').columns


In [5]:
# Describe how each group of columns is prepared:
#   * categorical columns are one-hot encoded,
#   * numerical columns are rescaled with MinMaxScaler (default range [0, 1]).
# Each entry is a (name, transformer, columns) triple.
t = [
    ('cat', OneHotEncoder(), categorical_ix),
    ('num', MinMaxScaler(), numerical_ix),
]
# Columns that appear in neither group are dropped (ColumnTransformer default).
col_transform = ColumnTransformer(t)


In [6]:
# Estimator: support vector regression with an RBF kernel.
# gamma='scale' lets scikit-learn derive the kernel width from the data;
# C=100 is the regularisation strength chosen for this example.
model = SVR(
    C=100,
    kernel='rbf',
    gamma='scale',
)


In [7]:
# Chain preparation and estimator into a single object so that the column
# transforms are re-fitted on the training part of every cross-validation
# fold rather than on the full dataset.
pipeline = Pipeline(
    [
        ('prep', col_transform),
        ('m', model),
    ]
)


In [8]:
# Cross-validation scheme: 10 folds, rows shuffled before splitting.
# The fixed random_state makes the fold assignment repeatable across runs.
cv = KFold(
    n_splits=10,
    random_state=1,
    shuffle=True,
)


In [9]:
# Score the whole pipeline on every fold.
# The scorer is the *negated* mean absolute error, so the values returned
# here are <= 0; n_jobs=-1 runs the folds on all available processors.
scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)


In [10]:
# Turn the per-fold scores into positive MAE values by taking their
# element-wise magnitude (built-in abs() dispatches to the array's own
# absolute-value implementation).
scores = abs(scores)


In [11]:
# Report the cross-validated error as "mean (standard deviation)" over the
# folds, both rounded to three decimals.
summary = (mean(scores), std(scores))
print('MAE: %.3f (%.3f)' % summary)

MAE: 1.465 (0.047)
