Integrantes:

* Felipe Borges de Lima Naufal
* Felipe Nunes Costa
* Hiago de Sousa Patrício
* Leonardo Barbosa Almeida

Configurações

In [310]:
# Directory that holds the competition files read by the cells below.
input_folder = './kaggle/input'

# One submission file per model, named 'submission<ModelName>.csv'.
submission_files = {
    model_name: f'submission{model_name}.csv'
    for model_name in [
        'DecisionTreeRegressor',
        'DecisionTreeClassifier',
        'RandomForestRegressor',
        'XGBRegressor',
    ]
}

# Statistic handed to SimpleImputer; index 1 selects 'median'.
options_strategy = ['mean', 'median', 'most_frequent']
strategy = options_strategy[1]

# Output folder per pre-processing mode; the position in this list is the
# mode number tested by the pre-processing cell (0, 1 or 2).
options_folder_to_save_submissions = [
    'Columns with missing values removed',
    f'Imputation using {strategy} strategy',
    f'Imputation with flag using {strategy} strategy',
]

# Active pre-processing mode and the folder its submissions are written to.
selected_pre_processing = 2
folder_to_save_submissions = options_folder_to_save_submissions[selected_pre_processing]


In [311]:
import numpy as np
import pandas as pd

import os
# Print the path of every file found under the input folder.
for root, _subdirs, files in os.walk(input_folder):
    for file_name in files:
        print(os.path.join(root, file_name))

# Raise pandas' display limits so wide/long frames are printed in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1024)

./kaggle/input\data_description.txt
./kaggle/input\sample_submission.csv
./kaggle/input\test.csv
./kaggle/input\train.csv


Leitura dos dados

In [312]:
# Both files are indexed by their 'Id' column.
read_options = {'index_col': 'Id'}
train = pd.read_csv(f'{input_folder}/train.csv', **read_options)
test = pd.read_csv(f'{input_folder}/test.csv', **read_options)

for frame in (train, test):
    print(frame.shape)

(1460, 80)
(1459, 79)


Processamento dos dados

In [313]:
# Rows without a target value cannot be used for training.
train = train.dropna(subset=['SalePrice'])

# Separate the target from the feature columns.
y_train = train['SalePrice']
X_train = train.drop(columns=['SalePrice'])

# Report shapes first, then the per-column count of missing values.
for frame in (X_train, test):
    print(frame.shape)
for frame in (X_train, test):
    print(frame.isna().sum())

(1460, 79)
(1459, 79)
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [314]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Per-column "has at least one NaN" masks, computed once and looked up by
# column name in the selections below.
train_has_nan = X_train.isnull().any()
test_has_nan = test.isnull().any()

if selected_pre_processing == 0:
    # Mode 0: drop every column that has a missing value in either dataset.
    cols_with_missing_values = [col for col in X_train.columns
                                if train_has_nan[col] or test_has_nan[col]]
    print('Colunas que serão removidas:', cols_with_missing_values)
    X_train = X_train.drop(columns=cols_with_missing_values)
    test = test.drop(columns=cols_with_missing_values)
elif selected_pre_processing in [1, 2]:
    numeric_dtypes = ['int64', 'float64']

    # Numerical columns: all of them are kept and passed through the imputer.
    numerical_cols = [col for col in X_train.columns
                      if X_train[col].dtype in numeric_dtypes]

    # Numerical columns with NaNs in the training data; these are the ones
    # reported below and, in mode 2, flagged with a '<col>_was_missing' column.
    numerical_cols_with_missing_values = [col for col in numerical_cols
                                          if train_has_nan[col]]

    # Categorical columns kept: no missing value in either dataset.
    categorical_cols_without_missing_values = [
        col for col in X_train.columns
        if X_train[col].dtype == 'object'
        and not (train_has_nan[col] or test_has_nan[col])
    ]

    # Categorical columns dropped: a missing value in at least one dataset.
    categorical_cols_with_missing_values = [
        col for col in X_train.columns
        if X_train[col].dtype == 'object'
        and (train_has_nan[col] or test_has_nan[col])
    ]

    print('Colunas que terão imputações: ')
    print(X_train[numerical_cols_with_missing_values].isnull().sum())
    print(test[numerical_cols_with_missing_values].isnull().sum())
    print('\nColunas removidas:\n', categorical_cols_with_missing_values)

    # Work on copies so the flag columns are not added to the source frames.
    X_train_plus = X_train[numerical_cols].copy()
    test_plus = test[numerical_cols].copy()

    if selected_pre_processing == 2:
        # Mode 2: remember which entries were missing before imputation.
        for col in numerical_cols_with_missing_values:
            flag_name = col + '_was_missing'
            X_train_plus[flag_name] = X_train_plus[col].isnull()
            test_plus[flag_name] = test_plus[col].isnull()

    # Fit on the training data only, then apply the same fill values to test.
    imputer = SimpleImputer(strategy=strategy)
    imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train_plus),
                                   columns=X_train_plus.columns,
                                   index=X_train_plus.index)
    imputed_test = pd.DataFrame(imputer.transform(test_plus),
                                columns=test_plus.columns,
                                index=test_plus.index)

    # Imputed numerical columns first, then the untouched categorical ones.
    X_train = pd.concat([imputed_X_train, X_train[categorical_cols_without_missing_values]], axis=1)
    test = pd.concat([imputed_test, test[categorical_cols_without_missing_values]], axis=1)

In [315]:
# Shapes first, then the per-column count of remaining missing values.
for frame in (X_train, test):
    print(frame.shape)
for frame in (X_train, test):
    print(frame.isna().sum())

(1460, 79)
(1459, 79)
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFin

One Hot Encoder

In [316]:
from sklearn.preprocessing import OneHotEncoder

# Columns with object dtype are treated as categorical and one-hot encoded.
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
print('Colunas em que serão aplicadas OneHotEncoder:', object_cols)

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; older releases reject the new keyword with a TypeError, so fall back.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training data only; categories seen only in test are ignored
# (handle_unknown='ignore'). The original row index is restored because the
# encoder returns a plain array.
OH_X_train = pd.DataFrame(encoder.fit_transform(X_train[object_cols]), index=X_train.index)
OH_test = pd.DataFrame(encoder.transform(test[object_cols]), index=test.index)

# Remaining (non-object) columns are kept unchanged.
num_X_train = X_train.drop(object_cols, axis=1)
num_test = test.drop(object_cols, axis=1)

X_train = pd.concat([OH_X_train, num_X_train], axis=1)
test = pd.concat([OH_test, num_test], axis=1)

# The one-hot block has integer column labels while the other columns have
# string labels. Recent scikit-learn versions refuse to fit on frames whose
# column names mix types, so make every label a string.
X_train.columns = X_train.columns.astype(str)
test.columns = test.columns.astype(str)

print(X_train.shape)
print(test.shape)

Colunas em que serão aplicadas OneHotEncoder: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
(1460, 304)
(1459, 304)


In [317]:
# Hold out 20% of the training rows for validation. The seed is fixed (same
# value the models below use) so the split, the hyper-parameter searches and
# the printed MAEs are reproducible between runs.
# NOTE: this rebinds X_train / y_train to the 80% subset, so re-running this
# cell without re-running the cells above shrinks the data again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(1168, 304)
(292, 304)
(1168,)
(292,)


In [318]:
import os

def output_to_csv(model, file_name, features=None, folder=None):
    """Predict with ``model`` and write an 'Id,SalePrice' submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    file_name : str
        Name of the CSV file created inside ``folder``.
    features : pandas.DataFrame, optional
        Rows to predict; its index becomes the 'Id' column. Defaults to the
        notebook-level ``test`` frame.
    folder : str, optional
        Destination directory, created if missing. Defaults to the
        notebook-level ``folder_to_save_submissions``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep
    # working exactly as before.
    if features is None:
        features = test
    if folder is None:
        folder = folder_to_save_submissions

    preds = model.predict(features)
    output = pd.DataFrame({'Id': features.index,
                           'SalePrice': preds})

    # exist_ok avoids the separate existence check; an empty folder means the
    # current directory, which needs no creation.
    if folder:
        os.makedirs(folder, exist_ok=True)

    output.to_csv(os.path.join(folder, file_name), index=False)

Decision Tree Regressor

In [319]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae_decision_tree_regressor(max_leaf_nodes):
    """Return the validation MAE of a DecisionTreeRegressor.

    The tree is built with the given ``max_leaf_nodes`` (seed fixed at 42),
    fitted on the notebook-level ``X_train`` / ``y_train`` and scored against
    ``X_valid`` / ``y_valid``.
    """
    tree = DecisionTreeRegressor(random_state=42, max_leaf_nodes=max_leaf_nodes)
    tree.fit(X_train, y_train)
    return mean_absolute_error(y_valid, tree.predict(X_valid))

In [320]:
# Validation MAE for each candidate max_leaf_nodes (50, 60, ..., 1000).
predicts = {
    leaf_count: get_mae_decision_tree_regressor(leaf_count)
    for leaf_count in range(50, 1001, 10)
}

In [321]:
# Candidate with the lowest validation MAE (first one wins on ties).
best_max_leaf_nodes = min(predicts, key=lambda leaf_count: predicts[leaf_count])
print(f'O melhor resultado foi {predicts[best_max_leaf_nodes]} com max_leaf_nodes = {best_max_leaf_nodes}')

# Refit with the selected setting and export its predictions.
model = DecisionTreeRegressor(random_state=42, max_leaf_nodes=best_max_leaf_nodes)
model.fit(X_train, y_train)

output_to_csv(model, submission_files['DecisionTreeRegressor'])

Decision Tree Classifier

In [322]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeClassifier

def get_mae_decision_tree_classifier(max_depth, criterion, splitter):
    """Return the validation MAE of a DecisionTreeClassifier.

    The classifier is built with the given ``max_depth``, ``criterion`` and
    ``splitter`` (seed fixed at 42), fitted on the notebook-level
    ``X_train`` / ``y_train`` and scored against ``X_valid`` / ``y_valid``.
    """
    settings = {
        'max_depth': max_depth,
        'criterion': criterion,
        'splitter': splitter,
        'random_state': 42,
    }
    classifier = DecisionTreeClassifier(**settings)
    classifier.fit(X_train, y_train)
    return mean_absolute_error(y_valid, classifier.predict(X_valid))

In [323]:
# Validation MAE for every (max_depth, criterion, splitter) combination,
# visited in the same order as three nested loops would.
maes = {
    (depth, split_criterion, split_strategy):
        get_mae_decision_tree_classifier(depth, split_criterion, split_strategy)
    for depth in range(1, 30)
    for split_criterion in ['gini', 'entropy']
    for split_strategy in ['best', 'random']
}

# Same results ordered from lowest to highest MAE.
sorted_mae = dict(sorted(maes.items(), key=lambda item: item[1]))

In [324]:
# Hyper-parameter combination with the lowest validation MAE (first one wins
# on ties, as with taking the head of a stable sort).
max_depth, criterion, splitter = min(maes, key=maes.get)

model = DecisionTreeClassifier(
    max_depth=max_depth,
    criterion=criterion,
    splitter=splitter,
    random_state=42)
model.fit(X_train, y_train)

# Report the classifier's own validation error. The previous version printed
# predicts[best_max_leaf_nodes], i.e. the DecisionTreeRegressor's MAE.
preds = model.predict(X_valid)
best_mae = mean_absolute_error(y_valid, preds)
print(f'O melhor resultado foi {best_mae} com max_depth = {max_depth}, criterion = {criterion}, splitter = {splitter}')

output_to_csv(model, submission_files['DecisionTreeClassifier'])

Random Forest

In [325]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters and a fixed seed; fit() returns
# the estimator itself, so construction and fitting are chained.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Validation MAE, then export the test-set predictions.
preds = model.predict(X_valid)
print(mean_absolute_error(y_valid, preds))

output_to_csv(model, submission_files['RandomForestRegressor'])

XGBRegressor

In [326]:
from xgboost import XGBRegressor

# Up to 3000 boosting rounds at learning rate 0.05, evaluated with MAE.
model = XGBRegressor(n_estimators=3000, learning_rate=0.05, eval_metric="mae")

# The validation split is the evaluation set used for early stopping
# (early_stopping_rounds=5).
validation_sets = [(X_valid, y_valid)]
model.fit(X_train, y_train,
          eval_set=validation_sets,
          early_stopping_rounds=5)

# Validation MAE, then export the test-set predictions.
preds = model.predict(X_valid)
print(mean_absolute_error(y_valid, preds))

output_to_csv(model, submission_files['XGBRegressor'])

[0]	validation_0-mae:177592.37500
[1]	validation_0-mae:168869.64062
[2]	validation_0-mae:160657.28125
[3]	validation_0-mae:152882.56250
[4]	validation_0-mae:145424.59375
[5]	validation_0-mae:138273.90625
[6]	validation_0-mae:131724.46875
[7]	validation_0-mae:125235.54688
[8]	validation_0-mae:119334.25000
[9]	validation_0-mae:113610.04688
[10]	validation_0-mae:108443.51562
[11]	validation_0-mae:103419.68750
[12]	validation_0-mae:98610.19531
[13]	validation_0-mae:94028.43750
[14]	validation_0-mae:89687.66406
[15]	validation_0-mae:85538.87500
[16]	validation_0-mae:81654.40625
[17]	validation_0-mae:77986.72656
[18]	validation_0-mae:74418.78125
[19]	validation_0-mae:71045.50781
[20]	validation_0-mae:67818.33594
[21]	validation_0-mae:64802.17188
[22]	validation_0-mae:61983.32812
[23]	validation_0-mae:59248.39062
[24]	validation_0-mae:56674.54688
[25]	validation_0-mae:54324.73438
[26]	validation_0-mae:52084.79688
[27]	validation_0-mae:49954.76172
[28]	validation_0-mae:47869.92969
[29]	validat