In [0]:
import pandas as pd
from sklearn import tree
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [0]:
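# Download the data sets (uncomment to run). The raw.githubusercontent.com URLs
# return the CSV files themselves; the github.com /tree/ and /blob/ URLs return HTML pages.
#!wget https://raw.githubusercontent.com/cassiasamp/semcomp/master/dados/treino.csv
#!wget https://raw.githubusercontent.com/cassiasamp/semcomp/master/dados/teste.csv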


In [0]:
# Read the training and test tables, using the 'Id' column as the row index of each.
dados_treino, dados_teste = (
    pd.read_csv(arquivo, index_col='Id')
    for arquivo in ('treino.csv', 'teste.csv')
)

In [0]:
# Columns selected from the data frames as model inputs.
caracteristicas = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

In [0]:
# Feature matrix: an independent copy of the selected columns, so later changes
# to X do not touch dados_treino.
X = dados_treino.loc[:, caracteristicas].copy()
# Target: the 'SalePrice' column of the training table.
Y = dados_treino['SalePrice']

In [0]:
# Split into 80% training / 20% validation rows; the fixed random_state makes
# the split repeatable across runs.
X_treino, X_valid, Y_treino, Y_valid = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)


In [0]:
# First model: a single decision-tree regressor with a fixed seed.
modelo_1 = tree.DecisionTreeRegressor(
    random_state=42,
)

In [0]:
# Train on the training split and predict the validation split in one chain
# (fit returns the fitted estimator), then measure the mean absolute error.
preds_1 = modelo_1.fit(X_treino, Y_treino).predict(X_valid)
mae_1 = mean_absolute_error(y_true=Y_valid, y_pred=preds_1)

In [0]:
# Second model: a random forest with library-default settings and a fixed seed.
modelo_2 = RandomForestRegressor(
    random_state=42,
)

In [0]:
# Train on the training split and predict the validation split in one chain
# (fit returns the fitted estimator), then measure the mean absolute error.
preds_2 = modelo_2.fit(X_treino, Y_treino).predict(X_valid)
mae_2 = mean_absolute_error(y_true=Y_valid, y_pred=preds_2)



In [0]:
# Report the validation MAE of the tree and the forest, one line per model.
for rotulo, erro in (
    ('MAE do modelo de arvore', mae_1),
    ('MAE do modelo de floresta', mae_2),
):
    print(rotulo, erro)


MAE do modelo de arvore 30821.260273972603
MAE do modelo de floresta 22740.137181996088


In [0]:
# Candidate random forests that differ only in the listed hyperparameters;
# every one is built with random_state=42.
modelo_3 = RandomForestRegressor(n_estimators=50, random_state=42)
modelo_4 = RandomForestRegressor(n_estimators=100, random_state=42)
modelo_5 = RandomForestRegressor(n_estimators=200, max_depth=7, random_state=42)
# The split criterion is left at its default, mean squared error. The explicit
# spelling criterion='mse' was deprecated in scikit-learn 1.0 and removed in 1.2
# (renamed 'squared_error'), so passing it makes fit() fail on current releases.
# Omitting the argument selects the same criterion on old and new versions.
# modelo_6 therefore has the same configuration as modelo_4, which matches the
# identical MAE recorded for both.
modelo_6 = RandomForestRegressor(n_estimators=100, random_state=42)
modelo_7 = RandomForestRegressor(n_estimators=50, min_samples_split=20, random_state=42)


In [0]:
# All candidate models, in the order used for the numbered MAE report.
modelos = [
    modelo_1,
    modelo_2,
    modelo_3,
    modelo_4,
    modelo_5,
    modelo_6,
    modelo_7,
]

In [0]:
# Fit every candidate on the training split and report its validation MAE.
# Models are numbered from 1; '%d' truncates the MAE to an integer for display.
for numero, modelo in enumerate(modelos, start=1):
  modelo.fit(X_treino, Y_treino)
  preds = modelo.predict(X_valid)
  mae = mean_absolute_error(Y_valid, preds)
  print('MAE do modelo %d: %d' % (numero, mae))

MAE do modelo 1: 30821
MAE do modelo 2: 22740
MAE do modelo 3: 22411
MAE do modelo 4: 22537
MAE do modelo 5: 22956
MAE do modelo 6: 22537
MAE do modelo 7: 22578


In [0]:
# Test-set feature matrix: an independent copy of the same columns used for training.
X_teste = dados_teste.loc[:, caracteristicas].copy()

In [0]:
# Refit modelo_3 on the full training table (training + validation rows) before
# predicting the test set. Left as a bare expression so the cell still displays
# the fitted estimator.
modelo_3.fit(X=X, y=Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [0]:
# Predict the target for every row of the test feature matrix.
preds_teste = modelo_3.predict(X=X_teste)

In [0]:
# Two-column result table: the test rows' 'Id' index values next to their predictions.
res = pd.DataFrame(
    data={
        'Id': X_teste.index,
        'SalePrice': preds_teste,
    }
)

In [0]:
# Write the predictions to disk. index=False keeps the automatic 0..n-1 row
# index out of the file, so it contains exactly the 'Id' and 'SalePrice'
# columns instead of an extra unnamed leading column.
res.to_csv('resultado_modelos.csv', index=False)