### Create plots of features vs sale price

In [185]:
import pandas as pd
import numpy as np

data = pd.read_csv("data/fold1/train.csv").fillna('None')
data["Garage_Yr_Blt"] = data["Garage_Yr_Blt"].replace("None", 0)
data["Sale_Price"] = np.log(data["Sale_Price"])

In [187]:
data["MS_SubClass"].unique()

array(['One_Story_1946_and_Newer_All_Styles', 'Two_Story_1946_and_Newer',
       'One_Story_PUD_1946_and_Newer', 'Split_Foyer',
       'Two_Story_PUD_1946_and_Newer', 'Split_or_Multilevel',
       'One_Story_1945_and_Older', 'Duplex_All_Styles_and_Ages',
       'One_and_Half_Story_Finished_All_Ages',
       'Two_Family_conversion_All_Styles_and_Ages',
       'Two_Story_1945_and_Older',
       'One_Story_with_Finished_Attic_All_Ages',
       'PUD_Multilevel_Split_Level_Foyer', 'Two_and_Half_Story_All_Ages',
       'One_and_Half_Story_Unfinished_All_Ages',
       'One_and_Half_Story_PUD_All_Ages'], dtype=object)

In [42]:
import matplotlib.pyplot as plt

Y = data["Sale_Price"]

for feature in data.columns[1:]:
    if isinstance(data[feature].iloc[0], str):
        feature_values = data[feature].unique()
        mean_sale_prices = []
        for feature_value in feature_values:
            sale_price = data[data[feature] == feature_value]["Sale_Price"]
            mean_sale_prices.append(sale_price.mean())
        plt.bar(feature_values, mean_sale_prices)
    else:
        plt.scatter(data[feature], Y)    

    plt.savefig(f"figures/{feature}.png")
    plt.clf()

<Figure size 640x480 with 0 Axes>

In [179]:
def preprocess_features(data, train_columns=None):
    # Some values in Mas_Vnr_Type and Misc_feature have a Nan float value instead of just a "None" string
    data = data.fillna('None')
    # Some homes don't have garages so replace their "None" string with 0
    data["Garage_Yr_Blt"] = data["Garage_Yr_Blt"].replace("None", 0)

    try:
        y = np.log(data["Sale_Price"])
    except KeyError:
        y = None

    # Select all features except Sales Price
    best_features = list(data.columns)[:-1]
    data = data[best_features]

    # One hot encoding of nominal features
    X = pd.get_dummies(data[best_features])

    # Handle column mismatch from one hot encoding
    if train_columns is not None:
        missing_columns = set(train_columns) - set(X.columns)
        for column in missing_columns:
            X[column] = 0
        # Ensure the order of column in the test set is in the same order than in train set
        X = X[train_columns]

    return X, y

In [133]:
import sys

def xgb_cross_validate(X, y, folds):

    n = X.shape[0]
    fold_size = n // folds
    
    min_error = sys.maxsize
    best_eta = -1
    best_T = -1

    for eta in np.arange(0.05, 0.5, 0.05):
        for T in np.arange(5, 50, 5):
            # print(eta, T)
            average_error = 0
            for i in range(folds):
                start = i * fold_size
                end = (i + 1) * fold_size
                
                X_holdout = X[start:end]
                X_train = np.concatenate([X[end:], X[:start]])
                y_holdout = y[start:end]
                y_train = np.concatenate([y[end:],y[:start]])

                clf = xgb.XGBRegressor(n_estimators=T, learning_rate=eta)
                clf.fit(X_train, y_train)
                yhat = np.log(clf.predict(X_holdout))
                
                error = np.mean((y_holdout - yhat)**2)**(0.5)
                average_error = ((average_error * i) + error) / (i + 1)
            
            if average_error <= min_error:
                min_error = average_error
                best_eta = eta
                best_T = T
    
    return best_eta, best_T

In [184]:
import xgboost as xgb

for i in range(1, 11):
    X_train, y_train = preprocess_features(pd.read_csv(f"data/fold{i}/train.csv"))
    # print(X_train.shape)
    # cv_eta, cv_T = xgb_cross_validate(X_train, y_train, folds=10)
    cv_eta = 0.025
    cv_T = 10000
    
    X_test, _ = preprocess_features(pd.read_csv(f"data/fold{i}/test.csv"), X_train.columns)
    y_test = pd.read_csv(f"data/fold{i}/test_y.csv")["Sale_Price"]
    y_test = np.log(y_test)
    
    clf = xgb.XGBRegressor(n_estimators=cv_T, learning_rate=cv_eta, 
                           max_depth=6, subsample=0.5, tree_method="exact")
    clf.fit(X_train, y_train)
    yhat = clf.predict(X_test)
    
    residuals = np.mean((y_test - yhat)**2)**(0.5)
    print(f"RMSE for fold_{i}:", residuals, cv_eta, cv_T)

RMSE for fold_1: 0.11601596770438807 0.025 10000
RMSE for fold_2: 0.12264511855081735 0.025 10000
RMSE for fold_3: 0.11292518944991467 0.025 10000
RMSE for fold_4: 0.11756599634604162 0.025 10000
RMSE for fold_5: 0.11287049849523371 0.025 10000
RMSE for fold_6: 0.12660726933712557 0.025 10000
RMSE for fold_7: 0.13121609245120477 0.025 10000
RMSE for fold_8: 0.12451253333074389 0.025 10000
RMSE for fold_9: 0.13268282043378155 0.025 10000
RMSE for fold_10: 0.12445116079288275 0.025 10000
