### Exercise 

Download the `autoMpg` dataset from OpenML:

In [2]:
from sklearn.datasets import fetch_openml
# fetch version 1 of the autoMpg dataset from OpenML
mpg = fetch_openml(
    name='autoMpg',
    version=1,
    parser='auto',
)

See https://www.openml.org/d/196 for a description of the data.

Separate the data into training and testing sets.

In [3]:
from sklearn.model_selection import train_test_split

# split features and target into training and testing parts;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    mpg.data,
    mpg.target,
    random_state=0,
)

In [4]:
# preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model,origin
254,6,200.0,85.0,2965,15.8,78,1
246,4,78.0,52.0,1985,19.4,78,3
282,4,140.0,88.0,2890,17.3,79,1
81,4,97.0,92.0,2288,17.0,72,3
347,4,85.0,65.0,1975,19.4,81,3


The `origin` feature is a nominal value coded as an integer.
Use one-hot encoding to turn this column into multiple binary features.

In [5]:
# select "origin" with a list so the result is a one-column DataFrame
# rather than a Series
origin = X_train.loc[:, ["origin"]]

origin

Unnamed: 0,origin
254,1
246,3
282,1
81,3
347,3
...,...
323,1
192,1
117,2
47,1


In [6]:
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False means that enc.transform() will return a dense array
# instead of a sparse matrix (this parameter was named `sparse` before
# scikit-learn 1.2; the old name was removed in 1.4)
enc = OneHotEncoder(sparse_output=False)

# fit the encoder to the data
enc.fit(origin)

# encode the data: one binary column per origin category
origin_enc = enc.transform(origin)

print(origin_enc)

[[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1.



In [7]:
import pandas as pd

# convert to a DataFrame and attach the index of the original rows;
# get_feature_names_out() builds the "origin_<category>" column names from
# the fitted encoder, so this works whatever dtype the categories have
# (concatenating "origin_" with enc.categories_[0] fails for numeric categories)
new_columns = pd.DataFrame(
    origin_enc,
    columns=enc.get_feature_names_out(),
    index=origin.index,
)

new_columns

Unnamed: 0,origin_1,origin_2,origin_3
254,1.0,0.0,0.0
246,0.0,0.0,1.0
282,1.0,0.0,0.0
81,0.0,0.0,1.0
347,0.0,0.0,1.0
...,...,...,...
323,1.0,0.0,0.0
192,1.0,0.0,0.0
117,0.0,1.0,0.0
47,1.0,0.0,0.0


In [8]:
# swap the single "origin" column for the one-hot columns;
# join() lines the new columns up with the rows via the shared index
X_train_enc = (
    X_train
    .drop(columns="origin")
    .join(new_columns)
)

X_train_enc

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model,origin_1,origin_2,origin_3
254,6,200.0,85.0,2965,15.8,78,1.0,0.0,0.0
246,4,78.0,52.0,1985,19.4,78,0.0,0.0,1.0
282,4,140.0,88.0,2890,17.3,79,1.0,0.0,0.0
81,4,97.0,92.0,2288,17.0,72,0.0,0.0,1.0
347,4,85.0,65.0,1975,19.4,81,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
323,4,156.0,105.0,2800,14.4,80,1.0,0.0,0.0
192,6,250.0,105.0,3353,14.5,76,1.0,0.0,0.0
117,4,68.0,49.0,1867,19.5,73,0.0,1.0,0.0
47,6,250.0,100.0,3282,15.0,71,1.0,0.0,0.0


The `horsepower` feature has 6 missing values in the full dataset; the ones that fall in the training set are shown below. Can you impute them?

In [9]:
import numpy as np

# boolean mask of the rows whose horsepower value is missing,
# then display just those rows
missing = X_train_enc["horsepower"].isna()
X_train_enc.loc[missing]

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model,origin_1,origin_2,origin_3
126,6,200.0,,2875,17.0,74,1.0,0.0,0.0
374,4,151.0,,3035,20.5,82,1.0,0.0,0.0
336,4,140.0,,2905,14.3,80,1.0,0.0,0.0
354,4,100.0,,2320,15.8,81,0.0,1.0,0.0
32,4,98.0,,2046,19.0,71,1.0,0.0,0.0


In [10]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# fit the IterativeImputer on the encoded training data, then fill the gaps
imp = IterativeImputer()
X_imp_array = imp.fit(X_train_enc).transform(X_train_enc)

# the imputer hands back an array: rebuild a DataFrame carrying the
# original column names and row index
X_imp = pd.DataFrame(
    X_imp_array,
    index=X_train_enc.index,
    columns=X_train_enc.columns,
)

# show the rows whose horsepower was missing, now with imputed values
X_imp.loc[missing]

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model,origin_1,origin_2,origin_3
126,6.0,200.0,95.216817,2875.0,17.0,74.0,1.0,0.0,0.0
374,4.0,151.0,73.50308,3035.0,20.5,82.0,1.0,0.0,0.0
336,4.0,140.0,98.052249,2905.0,14.3,80.0,1.0,0.0,0.0
354,4.0,100.0,77.286856,2320.0,15.8,81.0,0.0,1.0,0.0
32,4.0,98.0,61.004016,2046.0,19.0,71.0,1.0,0.0,0.0


Finally, standardise the dataset using `StandardScaler`.

In [11]:
# column-wise means before scaling
X_imp.mean()

cylinders          5.466443
displacement     192.669463
horsepower       103.728399
weight          2972.073826
acceleration      15.595302
model             76.120805
origin_1           0.617450
origin_2           0.177852
origin_3           0.204698
dtype: float64

In [12]:
# column-wise variances before scaling
X_imp.var()

cylinders            2.936581
displacement     10462.339031
horsepower        1357.969180
weight          713114.587124
acceleration         6.965096
model               13.197478
origin_1             0.237001
origin_2             0.146713
origin_3             0.163345
dtype: float64

In [13]:
from sklearn.preprocessing import StandardScaler

# fit the scaler on the imputed data, then apply it to the same data
scaler = StandardScaler().fit(X_imp)
X_scaled = scaler.transform(X_imp)

X_scaled

array([[ 0.31188164,  0.07178791, -0.5090795 , ...,  0.78712465,
        -0.46510916, -0.50733023],
       [-0.85718413, -1.12295743, -1.40609282, ..., -1.27044681,
        -0.46510916,  1.97110271],
       [-0.85718413, -0.51579176, -0.42753284, ...,  0.78712465,
        -0.46510916, -0.50733023],
       ...,
       [-0.85718413, -1.22088737, -1.48763949, ..., -1.27044681,
         2.15003291, -0.50733023],
       [ 0.31188164,  0.56143764, -0.10134618, ...,  0.78712465,
        -0.46510916, -0.50733023],
       [-0.85718413, -1.00544149, -0.88963061, ..., -1.27044681,
         2.15003291, -0.50733023]])

In [14]:
# column-wise means after scaling
np.mean(X_scaled, axis=0)

array([-2.20554373e-16,  3.87460384e-17,  1.99691121e-16,  1.97455773e-17,
        7.09350550e-16,  1.93581169e-15, -1.78827870e-17,  3.57655739e-17,
        1.49023225e-18])

In [15]:
# column-wise variances after scaling
np.var(X_scaled, axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1.])