In [1]:
import pandas as pd

# Load the competition's training and test tables, indexed by house Id
X = pd.read_csv(
    '../input/housing-prices-competition-for-kaggle-learn-users/train.csv',
    index_col='Id',
)
X_test = pd.read_csv(
    '../input/housing-prices-competition-for-kaggle-learn-users/test.csv',
    index_col='Id',
)

print(X.shape)
print(X_test.shape)

# Rows without a SalePrice have no target to learn from, so discard them
X = X.dropna(axis=0, subset=['SalePrice'])
print(X.shape)

# Split the target off from the predictors (i.e. input features)
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Ordinal-encode the categorical feature columns
from sklearn.preprocessing import OrdinalEncoder

# Columns holding strings (object dtype) are treated as categorical
object_cols = [name for name, dtype in X.dtypes.items() if dtype == "object"]

# A column can be encoded safely only when every value present in the test
# set also appears in the training set
good_label_cols = []
for col in object_cols:
    train_values = set(X[col])
    if set(X_test[col]).issubset(train_values):
        good_label_cols.append(col)

# Everything else is problematic and gets dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

# Remove the categorical columns that will not be encoded
X_train_ordinal = X.drop(columns=bad_label_cols)
X_test_ordinal = X_test.drop(columns=bad_label_cols)

# Learn the category -> integer mapping on the training data only, then apply
# the same mapping to both the training and the test features
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train_ordinal[good_label_cols])
X_train_ordinal[good_label_cols] = ordinal_encoder.transform(X_train_ordinal[good_label_cols])
X_test_ordinal[good_label_cols] = ordinal_encoder.transform(X_test_ordinal[good_label_cols])

# Fill missing values in the train and test sets using IterativeImputer

# IterativeImputer is an experimental scikit-learn estimator: the enabling
# import below has to run before the class can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# max_iter=3 caps the number of imputation rounds; random_state=0 fixes the seed
final_imputer = IterativeImputer(max_iter=3, random_state=0)

# Fit on the training features only, then reuse the fitted imputer on the test features
X_train_imputed = final_imputer.fit_transform(X_train_ordinal)
X_test_imputed = final_imputer.transform(X_test_ordinal)

(1460, 80)
(1459, 79)
(1460, 80)
Categorical columns that will be ordinal encoded: ['Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Utilities', 'Functional', 'Exterior2nd', 'SaleType', 'Exterior1st', 'KitchenQual', 'MSZoning']


In [2]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

def score_dataset(X_train,y,n_est,l_rate):
    """Return the mean 3-fold cross-validated MAE of an XGBRegressor.

    Parameters
    ----------
    X_train : feature matrix passed to ``cross_val_score``.
    y : target values aligned with ``X_train``.
    n_est : value used for the regressor's ``n_estimators``.
    l_rate : value used for the regressor's ``learning_rate``.

    Returns
    -------
    The average mean absolute error over the three folds (lower is better).
    """
    regressor = XGBRegressor(n_estimators=n_est, learning_rate=l_rate, random_state=0)
    # The scorer reports *negative* MAE, so flip the sign to get plain MAE back
    neg_mae_per_fold = cross_val_score(
        regressor,
        X_train,
        y,
        cv=3,
        scoring='neg_mean_absolute_error',
    )
    mae_per_fold = -1 * neg_mae_per_fold
    return mae_per_fold.mean()

In [3]:

# Hyper-parameter grid: every n_estimators value is tried with every learning rate
candidate_n_estimators = [100,250,500,1000,1500]
candidate_learning_rate = [0.001,0.005,0.01,0.05,0.1]

# Running minimum of the cross-validated MAE. Starting from +infinity (rather
# than an arbitrary large constant) guarantees that the first real score is
# accepted whatever the scale of the target.
small_mae = float("inf")
small_n_estimator = 0
small_learning_rate = 0

for n_estimator in candidate_n_estimators:
    for l_rate in candidate_learning_rate:
        my_mae = score_dataset(X_train_imputed, y, n_estimator, l_rate)
        print("n_estimator: {}  learning_rate: {}  \t\t Mean Absolute Error:  {}".format(n_estimator,l_rate, my_mae))
        # Strict '<' keeps the earliest combination when two scores tie
        if my_mae < small_mae:
            small_mae = my_mae
            small_n_estimator = n_estimator
            small_learning_rate = l_rate

# Expose the winning combination under the names used by the later cells
best_n_estimator = small_n_estimator
best_learning_rate = small_learning_rate
best_mae = small_mae
print("Best n_estimator size is: {}  Best learning rate is: {}  \t\t Best Mean Absolute Error:  {}".format(best_n_estimator,best_learning_rate,best_mae))



n_estimator: 100  learning_rate: 0.001  		 Mean Absolute Error:  164014.75495086762
n_estimator: 100  learning_rate: 0.005  		 Mean Absolute Error:  110907.13923489145
n_estimator: 100  learning_rate: 0.01  		 Mean Absolute Error:  68666.88910118258
n_estimator: 100  learning_rate: 0.05  		 Mean Absolute Error:  16957.37319728889
n_estimator: 100  learning_rate: 0.1  		 Mean Absolute Error:  16764.04333582569
n_estimator: 250  learning_rate: 0.001  		 Mean Absolute Error:  141642.56502330428
n_estimator: 250  learning_rate: 0.005  		 Mean Absolute Error:  54596.69584196614
n_estimator: 250  learning_rate: 0.01  		 Mean Absolute Error:  22721.86197180029
n_estimator: 250  learning_rate: 0.05  		 Mean Absolute Error:  16551.48284203374
n_estimator: 250  learning_rate: 0.1  		 Mean Absolute Error:  16765.40173046447
n_estimator: 500  learning_rate: 0.001  		 Mean Absolute Error:  111008.49902298088
n_estimator: 500  learning_rate: 0.005  		 Mean Absolute Error:  22783.301179617676
n_estim

Best n_estimator size is: 250  Best learning rate is: 0.05  		 Best Mean Absolute Error:  16551.48284203374

In [4]:
# Refit on the whole training set with the hyper-parameters picked by cross-validation
model1 = XGBRegressor(
    n_estimators=best_n_estimator,
    learning_rate=best_learning_rate,
    n_jobs=-1,
    random_state=0,
)
model1.fit(X_train_imputed, y)

# Predict the test-set prices and write them out as an Id / SalePrice table
preds_test1 = model1.predict(X_test_imputed)
output1 = pd.DataFrame(
    {
        'Id': X_test.index,
        'SalePrice': preds_test1,
    }
)
output1.to_csv('submission1.csv', index=False)

Score: 14622.39688

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the imputed training rows for validation; the fixed seed
# makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_imputed,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [6]:


def score_dataset_2(X_train,X_valid,y_train,y_valid,l_rate):
    """Fit an early-stopped XGBRegressor and score it on the validation split.

    The regressor is configured with ``n_estimators=2000`` and
    ``early_stopping_rounds=5``, and is given ``(X_valid, y_valid)`` as its
    evaluation set during fitting.

    Parameters
    ----------
    X_train, y_train : data the model is fitted on.
    X_valid, y_valid : data used both as the early-stopping evaluation set
        and for the reported error.
    l_rate : value used for the regressor's ``learning_rate``.

    Returns
    -------
    A two-element list ``[mae, n_rounds]``: the validation mean absolute
    error and ``best_iteration + 1`` as reported by the fitted model.
    """
    booster = XGBRegressor(
        n_estimators=2000,
        learning_rate=l_rate,
        early_stopping_rounds=5,
        random_state=0,
    )
    booster.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    valid_mae = mean_absolute_error(y_valid, booster.predict(X_valid))
    # best_iteration is an index, so add one to express it as a number of rounds
    rounds_used = booster.best_iteration+1
    return [valid_mae, rounds_used]
    

In [7]:

# Learning rates to try; the number of estimators is chosen by early stopping
candidate_learning_rate = [0.001,0.005,0.01,0.05,0.1]

# Running minimum of the validation MAE. Starting from +infinity (rather than
# an arbitrary large constant) guarantees that the first real score is
# accepted whatever the scale of the target.
small_mae2 = float("inf")
small_n_estimator = 0
small_learning_rate = 0

for l_rate in candidate_learning_rate:
    # score_dataset_2 returns [validation MAE, number of boosting rounds kept]
    my_mae, n_est = score_dataset_2(X_train, X_valid, y_train, y_valid, l_rate)
    print("n_estimator: {}  learning_rate: {}  \t\t Mean Absolute Error:  {}".format(n_est,l_rate, my_mae))
    # Strict '<' keeps the earliest learning rate when two scores tie
    if my_mae < small_mae2:
        small_mae2 = my_mae
        small_n_estimator = n_est
        small_learning_rate = l_rate

# Expose the winning combination under the names used by the later cells
best_n_estimator2 = small_n_estimator
best_learning_rate2 = small_learning_rate
best_mae2 = small_mae2
print("Best n_estimator size is: {}  Best learning rate is: {}  \t\t Best Mean Absolute Error:  {}".format(best_n_estimator2,best_learning_rate2,best_mae2))


n_estimator: 2000  learning_rate: 0.001  		 Mean Absolute Error:  29758.45402129709
n_estimator: 1346  learning_rate: 0.005  		 Mean Absolute Error:  16966.71544574058
n_estimator: 854  learning_rate: 0.01  		 Mean Absolute Error:  16684.603194563355
n_estimator: 152  learning_rate: 0.05  		 Mean Absolute Error:  16976.85847870291
n_estimator: 111  learning_rate: 0.1  		 Mean Absolute Error:  16965.61593000856
Best n_estimator size is: 854  Best learning rate is: 0.01  		 Best Mean Absolute Error:  16684.603194563355


Best n_estimator size is: 854  Best learning rate is: 0.01  		 Best Mean Absolute Error:  16684.603194563355

In [8]:
# Refit on the whole training set with the hyper-parameters found via early stopping
model2 = XGBRegressor(
    n_estimators=best_n_estimator2,
    learning_rate=best_learning_rate2,
    n_jobs=-1,
    random_state=0,
)
model2.fit(X_train_imputed, y)

# Predict the test-set prices and write them out as an Id / SalePrice table
preds_test2 = model2.predict(X_test_imputed)
output2 = pd.DataFrame(
    {
        'Id': X_test.index,
        'SalePrice': preds_test2,
    }
)
output2.to_csv('submission2.csv', index=False)

Score: 14975.15252

Question 11)

We get better results when we use the cross-validation method to train our model. The main reason is that our dataset is very small: if we only do a single train/test split, we lose too much important data that would otherwise improve our model's quality. So, as in our case, cross-validation gives better results on small datasets because it trains the model on several different folds of the data, which captures more patterns and increases the quality of the model.

Question 12)

Yes, the result we get here is different. My answer to the second question is no, because the previous model gave us a better result than the one we have now: on the test set, the previous model performed better. Even though both approaches used the same training set, they found different parameter values. The previous model's pattern is more similar to our test data, and we can say that its variance is lower. Considering all these details, we can conclude that the model we used before is more effective.