In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed

import numpy as np # linear algebra


In [2]:
import pandas as pd
# Read the competition's training and test tables; the 'Id' column becomes the row index.
X = pd.read_csv(
    '../input/housing-prices-competition-for-kaggle-learn-users/train.csv',
    index_col='Id',
)
X_test = pd.read_csv(
    '../input/housing-prices-competition-for-kaggle-learn-users/test.csv',
    index_col='Id',
)

In [3]:
# Steps 1-2: discard rows that have no target value, then split the target off from the features.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

In [4]:
# Step 3: classify the object-dtype columns as "good" or "bad" for ordinal encoding.
object_cols = [col for col in X.columns if X[col].dtype == "object"]

# A column is "good" when every value appearing in the TEST column also appears in
# the TRAIN column (test values are a subset of train values), so an encoder fitted
# on the training data will not meet an unseen category in the test data.
# NOTE(review): missing values take part in this set comparison as well -- confirm
# that is the intended treatment of NaN.
good_label_cols = [col for col in object_cols
                   if set(X_test[col]).issubset(set(X[col]))]

# Everything else is "bad". Built in column order (rather than from a set difference)
# so the list is the same on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

# Drop the bad columns from both datasets; X and X_test themselves are left untouched.
X_train_ordinal = X.drop(bad_label_cols, axis=1)
X_test_ordinal = X_test.drop(bad_label_cols, axis=1)

In [5]:
from sklearn.preprocessing import OrdinalEncoder

# Learn the category -> ordinal code mapping from the training columns only,
# then apply that same mapping to both the training and the test columns.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train_ordinal[good_label_cols])
X_train_ordinal[good_label_cols] = ordinal_encoder.transform(X_train_ordinal[good_label_cols])
X_test_ordinal[good_label_cols] = ordinal_encoder.transform(X_test_ordinal[good_label_cols])

In [6]:
from sklearn.impute import SimpleImputer

# Step 4: fill the missing values with a constant (fill_value is left at its default).
my_imputer = SimpleImputer(strategy="constant")

# The imputer returns a bare array, so the column names and the 'Id' index are put
# back explicitly. This keeps the feature rows label-aligned with y (which is still
# indexed by 'Id') and keeps the feature names readable downstream.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train_ordinal),
    columns=X_train_ordinal.columns,
    index=X_train_ordinal.index,
)
# The test set is transformed with the imputer fitted on the training set.
imputed_X_test = pd.DataFrame(
    my_imputer.transform(X_test_ordinal),
    columns=X_test_ordinal.columns,
    index=X_test_ordinal.index,
)

In [7]:
from xgboost import XGBRegressor

# Step 5: configure the gradient-boosted regressor.
my_model_1 = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=0,
)

# Train on the full preprocessed (encoded + imputed) training set.
my_model_1.fit(imputed_X_train, y)

# Predict for the preprocessed test set.
predictions_1 = my_model_1.predict(imputed_X_test)

In [8]:
# Build the submission table (test-set Id next to its predicted SalePrice) and write it to CSV.
output = pd.DataFrame({'Id': X_test.index}).assign(SalePrice=predictions_1)
output.to_csv('submission_midterm.csv', index=False)

My score: 14532.26020

My place: 637


In [9]:
from sklearn.model_selection import cross_val_score

# Same hyperparameters as my_model_1, this time evaluated with 5-fold cross-validation.
my_model_2 = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=0,
)

# The scorer returns negated MAE values, so the sign is flipped back to get plain MAE.
scores = -cross_val_score(
    my_model_2,
    imputed_X_train,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

print("MAE scores:\n", scores)

MAE scores:
 [16229.90319991 17025.03847389 16919.30446008 13966.25952483
 16492.20208155]


MAE scores:
 [16229.90319991 17025.03847389 16919.30446008 13966.25952483
 16492.20208155]

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the imputed training rows for validation; the fixed seed makes the split repeatable.
X_train_ordinal_2, X_valid_ordinal, y_train, y_valid = train_test_split(
    imputed_X_train,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [11]:
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train,X_valid,y_train,y_valid,n_es,learning_rate=0.05):
    """Fit an XGBRegressor on the training data and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_valid, y_valid
        Held-out features and target the fitted model is scored on.
    n_es
        Value passed to the model as ``n_estimators``.
    learning_rate
        Value passed to the model as ``learning_rate``. Defaults to 0.05, the
        value that used to be hard-coded here, so existing calls are unaffected.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the predictions for ``X_valid``.
    """
    model = XGBRegressor(
        n_estimators=n_es,
        learning_rate=learning_rate,
        n_jobs=-1,        # same setting as the other models built in this notebook
        random_state=0,   # fixed seed so repeated calls are comparable
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [12]:
# n_estimators values to compare on the single hold-out validation split.
candidate_n_estimators = [100, 500, 1000, 1500]

# Lowest validation MAE seen so far. Starting from +infinity (instead of a large
# magic number) means the first finite score always becomes the initial best,
# whatever the scale of the target.
small = float("inf")
small_n_t = 0  # n_estimators value that produced `small`
for estimators in candidate_n_estimators:
    my_mae = score_dataset(X_train_ordinal_2, X_valid_ordinal, y_train, y_valid, estimators)
    # %d truncates the MAE for display only; the comparison below uses the full value.
    print("Estimator size(n_estimator): %d  \t\t Mean Absolute Error:  %d" %(estimators, my_mae))
    # Strict '<' keeps the earliest (smallest) candidate when two scores tie.
    if my_mae < small:
        small_n_t = estimators
        small = my_mae


best_estimator_size = small_n_t
print("Best estimator size is: %d"%(small_n_t))


Estimator size(n_estimator): 100  		 Mean Absolute Error:  16874
Estimator size(n_estimator): 500  		 Mean Absolute Error:  16595
Estimator size(n_estimator): 1000  		 Mean Absolute Error:  16597
Estimator size(n_estimator): 1500  		 Mean Absolute Error:  16597
Best estimator size is: 500


Estimator size(n_estimator): 100  		 Mean Absolute Error:  16874

Estimator size(n_estimator): 500  		 Mean Absolute Error:  16595

Estimator size(n_estimator): 1000  		 Mean Absolute Error:  16597

Estimator size(n_estimator): 1500  		 Mean Absolute Error:  16597

Best estimator size is: 500