In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training and test tables, using the "Id" column as the index.
data_train = pd.read_csv("data/train.csv", index_col="Id")
data_test = pd.read_csv("data/test.csv", index_col="Id")

# Predictor columns shared by every model in this notebook.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Target and predictor matrices; .copy() detaches them from the raw frames.
y = data_train["SalePrice"]
X = data_train[features].copy()
X_result = data_test[features].copy()

# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
X_train.describe()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,10589.672945,1970.890411,1160.958904,351.479452,1.566781,2.882705,6.544521
std,10704.180793,30.407486,373.315037,438.137938,0.546698,0.802166,1.624493
min,1300.0,1872.0,334.0,0.0,0.0,0.0,2.0
25%,7589.5,1953.75,884.0,0.0,1.0,2.0,5.0
50%,9512.5,1972.0,1092.0,0.0,2.0,3.0,6.0
75%,11601.5,2000.0,1389.25,729.0,2.0,3.0,7.0
max,215245.0,2010.0,3228.0,1872.0,3.0,8.0,14.0


### Model Evaluation

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Candidate random forests to compare. Every model uses random_state=0 so the
# comparison is repeatable.
# NOTE: the absolute-error split criterion is spelled "absolute_error"; the old
# alias "mae" was deprecated in scikit-learn 1.0 and removed in 1.2, where it
# raises an invalid-parameter error on fit. model_2 keeps the default criterion.
model_1 = RandomForestRegressor(n_estimators=50, criterion="absolute_error", random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion="absolute_error", random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, criterion="absolute_error", min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, criterion="absolute_error", max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

from sklearn.metrics import mean_absolute_error

def score_model(model, X_tr=X_train, X_t=X_test, y_tr=y_train, y_t=y_test):
    """Fit ``model`` and return its mean absolute error on the evaluation data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``(X_tr, y_tr)``.
    X_tr, y_tr : training features and target.
    X_t, y_t : evaluation features and target.

    The defaults are the train/validation split objects that exist when this
    function is defined; re-run this cell if the split is recreated.

    Returns
    -------
    The value of ``mean_absolute_error(y_t, predictions)``.
    """
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_t)
    error = mean_absolute_error(y_t, predictions)
    return error

# Fit each candidate in turn and report its validation MAE, numbered from 1.
for number, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("Model {0} MAE : {1}".format(number, round(mae)))

Model 1 MAE : 23582
Model 2 MAE : 23741
Model 3 MAE : 23529
Model 4 MAE : 24040
Model 5 MAE : 23497


### Apply the best model to the test dataset (`data/test.csv`)

In [4]:
# model_5 had the lowest validation MAE in the recorded comparison run.
best_model = model_5

# Refit on every labelled row (train + validation) before predicting on the
# features taken from data/test.csv.
best_model.fit(X, y)
preds_result = best_model.predict(X_result)

# Two-column table: the test-set Id index and the predicted SalePrice.
output = pd.DataFrame(
    {
        "Id": X_result.index,
        "SalePrice": preds_result,
    }
)
output.to_csv("result.csv", index=False)