In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
# NOTE: this is an absolute, machine-specific path to melb_data.csv.
DATA_PATH = "/home/miguel/Desktop/ML_Notebooks_&_Projects/30_Dias/Dia_9/First_model/melb_data.csv"
data = pd.read_csv(DATA_PATH)

# The target is the Price column; everything else is kept as a feature.
y = data["Price"]
X = data.drop(columns=["Price"])

# 80/20 train/validation split with a fixed seed so reruns give the same partition.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Drop every column that has at least one missing value in the training split.
# The same columns are removed from the validation split so both keep one schema.
cols_missing = X_train.columns[X_train.isnull().any().to_numpy()].tolist()
full_X_train = X_train.drop(columns=cols_missing)
full_X_valid = X_valid.drop(columns=cols_missing)

# Categorical (object dtype) columns with fewer than 10 distinct training values
low_cardinality_cols = [
    col
    for col in full_X_train.columns
    if full_X_train[col].dtype == "object" and full_X_train[col].nunique() < 10
]

# Numeric columns (int64 / float64)
numeric_cols = [
    col
    for col in full_X_train.columns
    if full_X_train[col].dtype in ['int64', 'float64']
]

# Columns that are kept: low-cardinality categoricals first, then numerics
full_cols = low_cardinality_cols + numeric_cols

# Copy so later edits do not write back into full_X_train / full_X_valid
X_train = full_X_train[full_cols].copy()
X_valid = full_X_valid[full_cols].copy()

In [6]:
# Preview the first five rows of the filtered training features
X_train.head(n=5)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [9]:
# Boolean mask over the columns of X_train: True where the dtype is object
s = X_train.dtypes == "object"

# Names of the columns flagged by the mask, i.e. the categorical columns
object_cols = s.index[s.to_numpy()].tolist()
print("Valores_Categoricos:\n", object_cols)

Valores_Categoricos:
 ['Type', 'Method', 'Regionname']


In [10]:
# Funcion para testear el modelo
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

def score_data(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and score it on the validation split.

    Args:
        X_train: Training features passed to ``RandomForestRegressor.fit``.
        X_valid: Validation features passed to ``predict``.
        y_train: Training targets.
        y_valid: Validation targets the predictions are compared against.
        n_estimators: Number of trees in the forest. Defaults to 100, the
            value that was previously hard-coded.
        random_state: Seed forwarded to the forest so repeated calls give the
            same score. Defaults to 0, the value that was previously hard-coded.

    Returns:
        The mean absolute error between ``y_valid`` and the model's
        predictions for ``X_valid`` (lower is better).
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# 1. Quitar las columnas con valores categóricos

In [11]:
# Approach 1: discard every column listed in object_cols from both splits
drop_X_train = X_train.drop(columns=object_cols)
drop_X_valid = X_valid.drop(columns=object_cols)

# Score the model trained on the remaining columns
mae_drop = score_data(drop_X_train, drop_X_valid, y_train, y_valid)
print("MAE con el primer metodo:\n", mae_drop)

MAE con el primer metodo:
 175703.48185157913


# 2. Ordinal encoding: reemplazar cada clase por un número

In [14]:
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so X_train / X_valid themselves are left unchanged
NX_valid = X_valid.copy()
NX_train = X_train.copy()

# Learn the category -> code mapping from the training split only,
# then apply that same mapping to both splits
ordinal = OrdinalEncoder()
ordinal.fit(X_train[object_cols])
NX_train[object_cols] = ordinal.transform(X_train[object_cols])
NX_valid[object_cols] = ordinal.transform(X_valid[object_cols])

# Score the model trained on the ordinal-encoded features
mae_ordinal = score_data(NX_train, NX_valid, y_train, y_valid)
print("MAE con el segundo metodo:\n", mae_ordinal)

MAE con el segundo metodo:
 165936.40548390493


# 3. One-hot encoding: crear una columna por clase que indica si está presente o no

In [16]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training split only. handle_unknown='ignore' makes a
# category that appears only in validation encode as an all-zero row instead
# of raising.
# The `sparse=False` argument is intentionally not used: it was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4. Keeping the default
# sparse output and densifying with .toarray() works on old and new versions.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train[object_cols])

# One string label per generated column ("<column>_<category>"). Without this
# the encoded frame gets integer labels 0..n-1, and mixing integer and string
# column names makes scikit-learn >= 1.2 raise a TypeError when fitting.
ohe_feature_names = [
    f"{col}_{category}"
    for col, categories in zip(object_cols, ohe.categories_)
    for category in categories
]

# The encoder returns a bare matrix, so the original row index is passed back
# in; otherwise pd.concat below would misalign the rows.
X_t_columns = pd.DataFrame(
    ohe.transform(X_train[object_cols]).toarray(),
    columns=ohe_feature_names,
    index=X_train.index,
)
X_v_columns = pd.DataFrame(
    ohe.transform(X_valid[object_cols]).toarray(),
    columns=ohe_feature_names,
    index=X_valid.index,
)

# Remove the original categorical columns (the one-hot columns replace them)
OX_train = X_train.drop(object_cols, axis = 1)
OX_valid = X_valid.drop(object_cols, axis = 1)

# Remaining columns first, one-hot columns after
NX_train = pd.concat([OX_train, X_t_columns], axis = 1)
NX_valid = pd.concat([OX_valid, X_v_columns], axis = 1)

# Score the model trained on the one-hot-encoded features
print("MAE con el tercer metodo:\n", score_data(NX_train, NX_valid, y_train, y_valid))

MAE con el tercer metodo:
 166089.4893009678


Normalmente los dos últimos métodos dan mejores resultados; el primero solo conviene cuando la columna categórica aporta muy poca información.
El último método (one-hot encoding) no funciona muy bien si las columnas categóricas tienen muchas clases distintas, porque genera una columna nueva por cada clase.