# Exercise 13

This Automobile Data Set includes a good mix of categorical and continuous values and serves as a useful example that is relatively easy to understand. Because domain understanding is an important aspect of deciding how to encode categorical values, this data set makes a good case study.

Read the data into Pandas

In [2]:
import pandas as pd

# The raw file has no header row, so the column names are supplied explicitly.
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Fetch from the UCI archive over HTTPS instead of a third-party HTTP mirror.
# NOTE(review): confirm this path is still served before relying on it offline.
DATA_URL = ("https://archive.ics.uci.edu/ml/machine-learning-databases/"
            "autos/imports-85.data")

# "?" is read as a missing value and becomes NaN in the resulting frame.
df = pd.read_csv(DATA_URL, header=None, names=headers, na_values="?")
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [3]:
# Dimensions of df as a (rows, columns) tuple
df.shape

(205, 26)

In [4]:
# dtype of every column in df
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [5]:
# Isolate the object-dtype columns in an independent copy of their own
object_columns = df.select_dtypes(include="object")
obj_df = object_columns.copy()
obj_df.head(n=5)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


# Exercise 13.1

Does the database contain missing values? If so, replace them using one of the methods explained in class

In [6]:
# Number of missing entries in each column
df.isna().sum(axis=0)

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_doors             2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_cylinders         0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [7]:
# Remove every row that contains at least one missing value. `df` is rebound
# to the result rather than mutated with inplace=True.
# NOTE(review): the exercise asks to *replace* missing values; dropping rows
# shrinks the data from 205 to 159 rows. Confirm this is the intended method.
df = df.dropna()

# Guard: no missing value may survive the cleanup.
assert not df.isnull().values.any()
df.shape

(159, 26)

# Exercise 13.2

Split the data into training and testing sets

Train a Random Forest Regressor to predict the price of a car using the continuous features

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,make_scorer
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from numpy import sqrt

In [9]:
# Every non-object column, copied so that later cells can reuse the frame
num_var = df.select_dtypes(exclude=['object']).copy()

# Target is the price column; predictors are the remaining numeric columns
y = num_var['price']
X = num_var.drop(columns='price')

# Seeded split with scikit-learn's default train/test proportions
splits = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = splits

In [10]:
# Fit a small, seeded random forest on the training split
reg = RandomForestRegressor(n_estimators=10, random_state=1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE on the hold-out split. scikit-learn's signature is (y_true, y_pred),
# so the observed prices are passed first.
sqrt(mean_squared_error(y_test, y_pred))

1771.7844075880741

# Exercise 13.3

Create dummy variables for the categorical features

Train a Random Forest Regressor and compare

In [11]:
# One-hot encode the object columns, dropping the first level of each one
obj_df = pd.get_dummies(df.select_dtypes(include=['object']), drop_first=True)

# Encoded columns first, followed by the numeric columns (price included)
concat_df = pd.concat([obj_df, num_var], axis='columns')
concat_df.head()

Unnamed: 0,make_bmw,make_chevrolet,make_dodge,make_honda,make_jaguar,make_mazda,make_mercedes-benz,make_mitsubishi,make_nissan,make_peugot,...,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
3,0,0,0,0,0,0,0,0,0,0,...,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,0,0,0,0,0,0,0,0,0,0,...,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,0,0,0,0,0,0,0,0,0,0,...,2844,136,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
8,0,0,0,0,0,0,0,0,0,0,...,3086,131,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,1,0,0,0,0,0,0,0,0,0,...,2395,108,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


In [12]:
# Target is the price column; predictors are all other columns of concat_df
y = concat_df['price']
X = concat_df.drop(columns='price')

# Seeded split with scikit-learn's default train/test proportions
splits = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = splits

In [13]:
# Fit a small, seeded random forest on the training split
reg = RandomForestRegressor(n_estimators=10, random_state=1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE on the hold-out split. scikit-learn's signature is (y_true, y_pred),
# so the observed prices are passed first.
sqrt(mean_squared_error(y_test, y_pred))

2019.3145853234457

Al agregar las variables nominales al modelo, la raíz del error cuadrático medio (RMSE) aumentó de forma notable (de 1771.78 a 2019.31). Esto se puede deber a outliers ingresados al modelo a través de estas variables.

# Exercise 13.4

Apply two other methods of categorical encoding

Compare the results

#### Binary Encoding

In [14]:
import category_encoders as ce

# Binary-encode an independent copy of the object columns
binary_encoder = ce.BinaryEncoder()
obj_df = binary_encoder.fit_transform(df.select_dtypes(include=['object']).copy())

# Encoded columns first, followed by the numeric columns (price included)
concat_df = pd.concat([obj_df, num_var], axis='columns')
concat_df.head()

Unnamed: 0,make_0,make_1,make_2,make_3,make_4,make_5,fuel_type_0,fuel_type_1,aspiration_0,aspiration_1,...,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
3,0,0,0,0,0,1,0,1,0,1,...,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,0,0,0,0,0,1,0,1,0,1,...,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,0,0,0,0,0,1,0,1,0,1,...,2844,136,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
8,0,0,0,0,0,1,0,1,1,0,...,3086,131,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,0,0,0,0,1,0,0,1,0,1,...,2395,108,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


In [15]:
# Target is the price column; predictors are all other columns of concat_df
y = concat_df['price']
X = concat_df.drop(columns='price')

# Seeded split with scikit-learn's default train/test proportions
splits = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = splits

In [16]:
# Fit a small, seeded random forest on the training split
reg = RandomForestRegressor(n_estimators=10, random_state=1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE on the hold-out split. scikit-learn's signature is (y_true, y_pred),
# so the observed prices are passed first.
sqrt(mean_squared_error(y_test, y_pred))

1895.0973561311507

El Binary Encoding presenta un mejor RMSE (1895.10) que la codificación con variables dummy (2019.31), sin ningún tipo de preprocesamiento de datos más allá de la eliminación de los valores faltantes.

#### Feature Hashing

In [17]:
# Hash an independent copy of the object columns into 12 components
hashing_encoder = ce.HashingEncoder(n_components=12)
obj_df = hashing_encoder.fit_transform(df.select_dtypes(include=['object']).copy())

# Hashed columns first, followed by the numeric columns (price included)
concat_df = pd.concat([obj_df, num_var], axis='columns')
concat_df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
3,3,0,1,0,1,0,0,0,1,1,...,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,3,1,1,0,1,0,0,0,0,1,...,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,3,1,1,0,1,0,0,0,1,1,...,2844,136,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
8,2,1,1,0,1,0,0,0,1,2,...,3086,131,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,3,0,1,0,0,2,0,0,0,2,...,2395,108,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


In [18]:
# Target is the price column; predictors are all other columns of concat_df
y = concat_df['price']
X = concat_df.drop(columns='price')

# Seeded split with scikit-learn's default train/test proportions
splits = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = splits

In [19]:
# Fit a small, seeded random forest on the training split
reg = RandomForestRegressor(n_estimators=10, random_state=1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE on the hold-out split. scikit-learn's signature is (y_true, y_pred),
# so the observed prices are passed first.
sqrt(mean_squared_error(y_test, y_pred))

1988.7202281171233

El desempeño de esta codificación categórica (RMSE de 1988.72), dadas las mismas condiciones, fue muy similar al de las variables dummy expuesto anteriormente (RMSE de 2019.31).