# Advanced Transforms

# How to Transform Numerical and Categorical Data

* The challenge of using data transformations with datasets that have mixed data types.
* How to define, fit, and use the ColumnTransformer to selectively apply data transforms to columns.
* How to work through a real dataset with mixed data types and use the ColumnTransformer to apply different transforms to categorical and numerical data columns.

In [1]:
# load the dataset
from pandas import read_csv

# read the raw CSV; header=None means the first row is data and the
# columns are labelled with integers
dataframe = read_csv('abalone.csv', header=None)

# the final column holds the target; every other column is an input feature
last_ix = dataframe.shape[1] - 1
y = dataframe[last_ix]
X = dataframe.drop(columns=last_ix)
print(X.shape, y.shape)

(4177, 8) (4177,)


In [2]:
# show the loaded frame as the cell's last expression; the output below is
# the truncated view (first and last rows) of the frame
dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [4]:
# example of using the ColumnTransformer for the Abalone dataset
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

# split the input columns by dtype: object/bool columns are treated as
# categorical, int64/float64 columns as numerical
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns

# one-hot encode the categorical columns and min-max scale the numerical ones
t = [
    ('cat', OneHotEncoder(), categorical_ix),
    ('num', MinMaxScaler(), numerical_ix),
]
col_transform = ColumnTransformer(transformers=t)

# support vector regression with an RBF kernel
model = SVR(kernel='rbf', gamma='scale', C=100)

# chain the column transforms and the model into a single estimator
pipeline = Pipeline(steps=[('prep', col_transform), ('m', model)])

# 10-fold cross-validation, shuffled with a fixed seed
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# score with negated MAE (the scorer name requested below), using all cores
scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# flip the negated scores back to positive MAE values
scores = absolute(scores)

# report mean and standard deviation across the folds
print('MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

MAE: 1.465 (0.047)


# How to Transform the Target in Regression

* The importance of scaling input and target data for machine learning.
* The two approaches to applying data transforms to target variables.
* How to use the TransformedTargetRegressor on a real regression dataset.

In [5]:
# load and summarize the dataset
from numpy import loadtxt

# read the comma-separated housing data into a 2-D array
dataset = loadtxt('housing.csv', delimiter=",")

# every column except the last is an input; the last column is the target
X = dataset[:, :-1]
y = dataset[:, -1]

# report the shapes of the inputs and the target
print(X.shape, y.shape)

(506, 13) (506,)


In [6]:
# example of normalizing input and output variables for regression.
from numpy import mean
from numpy import absolute
from numpy import loadtxt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor

# inputs: min-max scale the features, then fit a Huber regressor
pipeline = Pipeline(steps=[
    ('normalize', MinMaxScaler()),
    ('model', HuberRegressor()),
])

# target: wrap the pipeline so the target variable is min-max scaled as well
model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=MinMaxScaler(),
)

# 10-fold cross-validation repeated 3 times with a fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# the scorer returns negated MAE; make the values positive
scores = absolute(scores)

# average over all folds and repeats, then report
s_mean = mean(scores)
print('Mean MAE: %.3f' % (s_mean))

Mean MAE: 3.203


# How to Save and Load Data Transforms

* The challenge of correctly preparing test data and new data for a machine learning model.
* The solution of saving the model and data preparation objects to file for later use.
* How to save and later load and use a machine learning model and data preparation model on new data.

## Save Model and Data Scaler

In [7]:
# example of fitting a model on the scaled dataset
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from pickle import dump

# prepare a synthetic two-class, two-feature dataset (fixed seed)
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)

# split data into train and test sets; only the training portion is used here
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=1)

# define scaler
scaler = MinMaxScaler()

# fit scaler on the training dataset only
scaler.fit(X_train)

# transform the training dataset
X_train_scaled = scaler.transform(X_train)

# define and fit the model on the scaled training data
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_scaled, y_train)

# save the model; the with-block closes the file deterministically so the
# pickle is fully flushed to disk before any later cell reads it
with open('model.pkl', 'wb') as model_file:
    dump(model, model_file)

# save the scaler alongside the model so new data can be prepared the same way
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

## Load Model and Data Scaler

In [8]:
# load model and scaler and make predictions on new data
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pickle import load

# prepare dataset (same arguments and seed as the cell that fitted the model)
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)

# split data into train and test sets; only the test portion is used here
_, X_test, _, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here: files written by this notebook).

# load the model; the with-block guarantees the file handle is closed
with open('model.pkl', 'rb') as model_file:
    model = load(model_file)

# load the scaler
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = load(scaler_file)

# check scale of the test set before scaling
print('Raw test set range')

Raw test set range


In [9]:
# report the min and max of every column of the unscaled test inputs
for i, column in enumerate(X_test.T):
    print('>%d, min=%.3f, max=%.3f' % (i, column.min(), column.max()))

>0, min=-11.270, max=0.085
>1, min=-5.581, max=5.926


In [10]:
# transform the test dataset with the scaler loaded from file; only
# transform() is called here, so the scaler is not re-fit on the test data
X_test_scaled = scaler.transform(X_test)
print('Scaled test set range')

Scaled test set range


In [11]:
# report the min and max of every column of the scaled test inputs
for i, column in enumerate(X_test_scaled.T):
    print('>%d, min=%.3f, max=%.3f' % (i, column.min(), column.max()))

>0, min=0.047, max=0.964
>1, min=0.063, max=0.955


In [12]:
# make predictions on the test set; the model is given the scaled inputs,
# not the raw X_test
yhat = model.predict(X_test_scaled)

# evaluate accuracy by comparing the predictions with the true test labels
acc = accuracy_score(y_test, yhat)
print('Test Accuracy:', acc)

Test Accuracy: 1.0
