## Aula Treino - Regressão

In [1]:
## importando bibliotecas iniciais

import pandas as pd
import numpy as np


## preprocessamento
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_regression

df_house = pd.read_csv('train_v2.csv')

print(f'O dataset tem {df_house.shape[0]} linhas e {df_house.shape[1]} colunas')
df_house.head()

O dataset tem 1460 linhas e 81 colunas


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Fazendo Separação Entre Treino e Teste

In [2]:
X = df_house.drop('SalePrice', axis=1)
y = df_house['SalePrice']

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((1168, 80), (292, 80))

# Identificando As Variáveis Categóricas E Numéricas

In [4]:
categorical_var = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_var = X_train.select_dtypes(exclude=['object']).columns.tolist()

categorical_var, numerical_var

(['MSZoning',
  'Street',
  'Alley',
  'LotShape',
  'LandContour',
  'Utilities',
  'LotConfig',
  'LandSlope',
  'Neighborhood',
  'Condition1',
  'Condition2',
  'BldgType',
  'HouseStyle',
  'RoofStyle',
  'RoofMatl',
  'Exterior1st',
  'Exterior2nd',
  'MasVnrType',
  'ExterQual',
  'ExterCond',
  'Foundation',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinType1',
  'BsmtFinType2',
  'Heating',
  'HeatingQC',
  'CentralAir',
  'Electrical',
  'KitchenQual',
  'Functional',
  'FireplaceQu',
  'GarageType',
  'GarageFinish',
  'GarageQual',
  'GarageCond',
  'PavedDrive',
  'PoolQC',
  'Fence',
  'MiscFeature',
  'SaleType',
  'SaleCondition'],
 ['Id',
  'MSSubClass',
  'LotFrontage',
  'LotArea',
  'OverallQual',
  'OverallCond',
  'YearBuilt',
  'YearRemodAdd',
  'MasVnrArea',
  'BsmtFinSF1',
  'BsmtFinSF2',
  'BsmtUnfSF',
  'TotalBsmtSF',
  '1stFlrSF',
  '2ndFlrSF',
  'LowQualFinSF',
  'GrLivArea',
  'BsmtFullBath',
  'BsmtHalfBath',
  'FullBath',
  'HalfBath',
  'Bedro

In [5]:
# Verificar os tipos de dados das colunas do DataFrame X_train
tipos_de_dados = X_train.dtypes

# Verificar se as colunas categóricas estão corretas
for coluna in categorical_var:
    if tipos_de_dados[coluna] != 'object':
        print(f"A coluna '{coluna}' não é do tipo 'object'.")

# Verificar se as colunas numéricas estão corretas
for coluna in numerical_var:
    if tipos_de_dados[coluna] == 'object':
        print(f"A coluna '{coluna}' não é numérica.")

## Desenvolvendo O Pipeline

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_var),
        ('num', StandardScaler(), numerical_var)
    ],
    remainder='passthrough'
)

## Com As Especificações De Tratamento - Criação Do Pipeline

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb

## construção do Pipeline

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('encoder', OrdinalEncoder()),
    ('regressor', lgb.LGBMRegressor(random_state = 42))
])

## treinando o pipeline

pipeline.fit(X_train, y_train)

In [8]:
# Remover colunas sem dados (apenas valores nulos)
X_test_cleaned = X_test.dropna(axis=1, how='all')

# Verificar o novo shape do DataFrame após a remoção das colunas
print("Novo shape do DataFrame após a remoção das colunas sem dados:", X_test_cleaned.shape)

Novo shape do DataFrame após a remoção das colunas sem dados: (292, 80)


Modelo Treinado com sucesso, agora é hora de validar os resultados

In [9]:
import numpy as np
from sklearn.metrics import(
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error)

## validando o modelo
## fazendo predição com features de teste

# Manter apenas as categorias presentes nos dados de treinamento
X_test = X_test[X_test[categorical_var].isin(X_train[categorical_var])]

# Preencher valores ausentes com a categoria mais frequente dos dados de treinamento
X_test[categorical_var] = X_test[categorical_var].fillna(X_train[categorical_var].mode().iloc[0])

# Fazer predições com os dados de teste
y_pred = pipeline.predict(X_test)

## comparando as features de teste preditos com o real
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rsme = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

## exibindo as métricas de validação

print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RSME: {rsme:.2f}')
print(f'MAPE: {mape:.2f}')

ValueError: Found unknown categories [nan] in column 43 during transform

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(drop='first'), categorical_var),
                 ('num', StandardScaler(), numerical_var)
])

from sklearn.pipeline import Pipeline
import lightgbm as lgb

## construção do Pipeline

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(random_state = 42))
])

## treinando o pipeline

pipeline.fit(X_train, y_train)

import numpy as np
from sklearn.metrics import(
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error)

## validando o modelo
## fazendo predição com features de teste
y_pred = pipeline.predict(X_test)

## comparando as features de teste preditos com o real
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rsme = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

## exibindo as métricas de validação

print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RSME: {rsme:.2f}')
print(f'MAPE: {mape:.2f}')
