# CATEGORICAL VARIABLES

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
# Read the dataset from melb_data.csv in the working directory.
data = pd.read_csv('./melb_data.csv')

# Predictor variables (every column except the target) and target variable (Price).
X = data.drop(columns=['Price'])
y = data['Price']

In [3]:
# Summarize the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [4]:
# Preview the first rows of the dataset.
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [5]:
# Split into training (80%) and validation (20%) sets; random_state fixes the shuffle.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=0)

# Drop every column that contains at least one missing value.
# The check runs on the full feature matrix X, so both splits lose the same columns.
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]

# Reassign instead of using inplace=True: the split frames are derived from X, and
# in-place mutation of derived frames can raise SettingWithCopyWarning and hides state.
x_train = x_train.drop(columns=cols_with_missing)
x_valid = x_valid.drop(columns=cols_with_missing)

Seleccionamos, por un lado, las columnas categóricas con baja cardinalidad (la cardinalidad es el número de valores únicos de una columna; aquí, menos de 10) y, por otro lado, las columnas numéricas.

In [6]:
# Object-dtype columns with fewer than 10 distinct values in the training split.
low_cardinality = [
    name for name, column in x_train.items()
    if column.dtype == 'object' and column.nunique() < 10
]
# Columns stored as int64 or float64.
numericals = [name for name, dtype in x_train.dtypes.items() if dtype in ['int64', 'float64']]

# Keep only those columns (categorical ones first) in both splits.
mis_columnas = low_cardinality + numericals
X_train = x_train.loc[:, mis_columnas]
X_valid = x_valid.loc[:, mis_columnas]

Obtenemos una lista de las variables categóricas.

In [7]:
# Boolean mask indexed by column name: True where the column dtype is 'object'.
s = X_train.dtypes.eq('object')
# Filter the mask by itself so only the True entries are displayed.
s[s]

Type          True
Method        True
Regionname    True
dtype: bool

### Function to measure quality of each approach

In [8]:
def scores(x_train,x_valid,y_train,y_valid):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    x_train, y_train
        Training features and target, passed to ``fit``.
    x_valid, y_valid
        Validation features (passed to ``predict``) and the true target values.

    Returns
    -------
    tuple
        ``(mean_absolute_error, mean_absolute_percentage_error)`` of the
        predictions on the validation split.
    """
    # Fixed random_state so repeated calls on the same data give the same scores.
    forest = RandomForestRegressor(random_state=0, n_estimators=100)
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_valid)

    mae = mean_absolute_error(y_valid, predictions)
    mape = mean_absolute_percentage_error(y_valid, predictions)
    return (mae, mape)

## APPROACH 1: Drop categorical variables

In [9]:
# Baseline: discard the object-dtype (categorical) columns from both splits.
X_train_1, X_valid_1 = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
)

scores(X_train_1, X_valid_1, y_train, y_valid)

(175703.48185157913, 0.16426659861408727)

## APPROACH 2: Ordinal encoding

Scikit-learn has an `OrdinalEncoder` class that can be used to get ordinal encodings. We fit a single encoder on all the categorical columns at once; it learns a separate category-to-integer mapping for each column.

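- Técnica común utilizada para convertir variables categóricas ordinales en números. La codificación ordinal asigna un número entero a cada categoría única en función de su orden o jerarquía. Nota: por defecto, `OrdinalEncoder` asigna los enteros según el orden alfabético de las categorías, salvo que se indique un orden explícito con el parámetro `categories`.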

In [10]:
# Names of the object-dtype (categorical) columns, as a plain Python list.
s = X_train.dtypes == 'object'
object_cols = s[s].index.tolist()
object_cols

['Type', 'Method', 'Regionname']

In [11]:
# Work on copies so the original splits stay untouched for the other approaches.
X_train_2 = X_train.copy()
X_valid_2 = X_valid.copy()

# Apply ordinal encoding to the categorical columns.
# The encoder is fitted on the training split only. Categories that appear only in the
# validation split are encoded as -1 instead of raising an error (the default behaviour).
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_2[object_cols] = ordinal_encoder.fit_transform(X_train_2[object_cols])
X_valid_2[object_cols] = ordinal_encoder.transform(X_valid_2[object_cols])

scores(X_train_2, X_valid_2, y_train, y_valid)

(165936.40548390493, 0.15308848822388954)

## APPROACH 3: One-hot encoding

Imagina que tienes una variable categórica que representa colores: 'rojo', 'verde' y 'azul'. En lugar de representar estos colores como cadenas de texto, el "one-hot encoding" crea una nueva columna binaria para cada categoría única. Cada columna indicará si la observación pertenece o no a esa categoría. 'rojo' se representaría como [1, 0, 0], 'verde' se representaría como [0, 1, 0], 'azul' se representaría como [0, 0, 1].

- We set handle_unknown='ignore' to avoid errors when the validation data contains classes that aren't represented in the training data, and
- setting `sparse_output=False` (named `sparse` in scikit-learn versions before 1.2) ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix).

In [15]:
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Encode the categorical columns: fit on the training split, reuse the fit on validation.
# The encoder returns a bare array, so the original row labels are attached on
# construction to keep the rows aligned with the numeric columns below.
oh_train = pd.DataFrame(oh_encoder.fit_transform(X_train[object_cols]), index=X_train.index)
oh_valid = pd.DataFrame(oh_encoder.transform(X_valid[object_cols]), index=X_valid.index)

# Non-categorical part of each split.
num_x_train = X_train.drop(columns=object_cols)
num_x_valid = X_valid.drop(columns=object_cols)

# Final frames: numeric columns followed by the one-hot columns.
# The one-hot columns carry integer labels, so every label is converted to str.
X_train_3 = pd.concat([num_x_train, oh_train], axis=1).rename(columns=str)
X_valid_3 = pd.concat([num_x_valid, oh_valid], axis=1).rename(columns=str)

scores(X_train_3, X_valid_3, y_train, y_valid)

(166089.4893009678, 0.15333861737297402)