In [2]:
# %%
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Read the training table from 'train.csv' (resolved against the current
# working directory) into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Baseline model inputs: a hand-picked subset of columns from the training table.
features = [
    'LotArea',
    'OverallQual',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
]

# Target first, then the predictor matrix restricted to the columns above.
y = train_data['SalePrice']
X = train_data[features]

In [4]:
# Baseline estimator: 100 trees, seeded so repeated runs build the same forest.
house_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Keep 20% of the rows aside for validation; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Train on the training split, then score the held-out validation rows.
house_model.fit(X_train, y_train)
predictions = house_model.predict(X_val)

# Two complementary metrics computed on the validation split.
r2 = r2_score(y_val, predictions)
mae = mean_absolute_error(y_val, predictions)

# Emit both report lines with a single call; the text is the same as printing
# each line separately.
print(
    f"Erro Médio Absoluto (MAE): ${mae:,.2f}",
    f"R² Score (Poder de explicação): {r2:.4f}",
    sep="\n",
)

Erro Médio Absoluto (MAE): $20,708.30
R² Score (Poder de explicação): 0.8656


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load the raw table. 'SalePrice' is the prediction target; 'Id' is kept out of
# the predictors.
data = pd.read_csv('train.csv')
X = data.drop(['SalePrice', 'Id'], axis=1)
y = data['SalePrice']

# Split FIRST. Every data-driven decision below (which columns to keep, and later
# the imputation values and one-hot categories learned inside the pipeline) must
# come from the training rows only; otherwise the validation rows leak into the
# model and the reported MAE is optimistic.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

# A text column is one-hot encoded only when it has fewer distinct values than
# this threshold. Text columns at or above it appear in neither list and are
# therefore discarded by the ColumnTransformer (its default is remainder='drop').
MAX_CARDINALITY = 10

# 'object' covers classic text columns; 'string' additionally covers pandas'
# dedicated string dtype, which a literal `dtype == "object"` test would miss.
text_columns = X_train.select_dtypes(include=['object', 'string']).columns
cols_categoricas = [cname for cname in text_columns if X_train[cname].nunique() < MAX_CARDINALITY]

# 'number' matches every numeric width (int32, float32, ...), not just the two
# dtype names int64/float64.
cols_numericas = X_train.select_dtypes(include='number').columns.tolist()

# Numeric columns: fill missing values with the column median.
transf_numerica = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the literal 'NA', then one-hot
# encode. handle_unknown='ignore' encodes a category unseen during fit as all
# zeros instead of raising at predict time.
transf_categorica = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine both transformers into a single preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transf_numerica, cols_numericas),
        ('cat', transf_categorica, cols_categoricas)
    ])

meu_modelo = RandomForestRegressor(n_estimators=100, random_state=42)

# Chaining preprocessing and model means fit() learns the preprocessing from the
# training split only and predict() re-applies it unchanged to the validation rows.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', meu_modelo)
                     ])

clf.fit(X_train, y_train)
preds = clf.predict(X_val)

print(f"MAE Final: ${mean_absolute_error(y_val, preds):,.2f}")

MAE Final: $17,641.59
