### Create plots of features vs sale price

In [40]:
import pandas as pd
import numpy as np

data = pd.read_csv("data/fold1/train.csv").fillna('None')
data["Garage_Yr_Blt"] = data["Garage_Yr_Blt"].replace("None", 0)
data["Sale_Price"] = np.log(data["Sale_Price"])

In [42]:
import matplotlib.pyplot as plt

Y = data["Sale_Price"]

for feature in data.columns[1:]:
    if isinstance(data[feature].iloc[0], str):
        feature_values = data[feature].unique()
        mean_sale_prices = []
        for feature_value in feature_values:
            sale_price = data[data[feature] == feature_value]["Sale_Price"]
            mean_sale_prices.append(sale_price.mean())
        plt.bar(feature_values, mean_sale_prices)
    else:
        plt.scatter(data[feature], Y)    

    plt.savefig(f"figures/{feature}.png")
    plt.clf()

<Figure size 640x480 with 0 Axes>

In [50]:
quant_features = ["Sale_Price"]
qual_features = ["Sale_Price"]
for feature in data.columns[1:-1]:
    if isinstance(data[feature].iloc[0], str):
        qual_features.append(feature)
    else:
        quant_features.append(feature)    

covariance = data[quant_features].cov()
price_covariance = covariance["Sale_Price"].sort_values(key=abs)
print(price_covariance[-10:])

Wood_Deck_SF      17.370360
Bsmt_Unf_SF       33.812496
Mas_Vnr_Area      34.352658
Second_Flr_SF     49.623253
Garage_Yr_Blt     58.330521
Garage_Area       58.664352
First_Flr_SF     102.003180
Total_Bsmt_SF    118.536704
Gr_Liv_Area      150.338034
Lot_Area         865.947859
Name: Sale_Price, dtype: float64


In [51]:
covariance = pd.get_dummies(data[qual_features]).cov()
price_covariance = covariance["Sale_Price"].sort_values(key=abs)
print(price_covariance[-10:])

Garage_Type_Attchd           0.087562
Garage_Finish_Unf           -0.093941
Bsmt_Qual_Typical           -0.094740
Exter_Qual_Good              0.095448
Heating_QC_Excellent         0.101483
Fireplace_Qu_No_Fireplace   -0.105689
Kitchen_Qual_Typical        -0.111815
Foundation_PConc             0.114617
Exter_Qual_Typical          -0.119229
Sale_Price                   0.173599
Name: Sale_Price, dtype: float64


In [58]:
data.columns

Index(['PID', 'MS_SubClass', 'MS_Zoning', 'Lot_Frontage', 'Lot_Area', 'Street',
       'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities', 'Lot_Config',
       'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type',
       'House_Style', 'Overall_Qual', 'Overall_Cond', 'Year_Built',
       'Year_Remod_Add', 'Roof_Style', 'Roof_Matl', 'Exterior_1st',
       'Exterior_2nd', 'Mas_Vnr_Type', 'Mas_Vnr_Area', 'Exter_Qual',
       'Exter_Cond', 'Foundation', 'Bsmt_Qual', 'Bsmt_Cond', 'Bsmt_Exposure',
       'BsmtFin_Type_1', 'BsmtFin_SF_1', 'BsmtFin_Type_2', 'BsmtFin_SF_2',
       'Bsmt_Unf_SF', 'Total_Bsmt_SF', 'Heating', 'Heating_QC', 'Central_Air',
       'Electrical', 'First_Flr_SF', 'Second_Flr_SF', 'Low_Qual_Fin_SF',
       'Gr_Liv_Area', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath', 'Full_Bath',
       'Half_Bath', 'Bedroom_AbvGr', 'Kitchen_AbvGr', 'Kitchen_Qual',
       'TotRms_AbvGrd', 'Functional', 'Fireplaces', 'Fireplace_Qu',
       'Garage_Type', 'Garage_Yr_Blt', 'Garage_Fi

In [60]:
for i in range(data.shape[0]):
    area = data.iloc[0]["Gr_Liv_Area"] - data.iloc[0]["First_Flr_SF"] - data.iloc[0]["Second_Flr_SF"]
    if area != 0:
        print(area, i)

data.iloc[0]["Gr_Liv_Area"], data.iloc[0]["First_Flr_SF"], data.iloc[0]["Second_Flr_SF"], data.iloc[0]["Total_Bsmt_SF"]

(896, 896, 0, 882)

In [158]:
def preprocess_features(data, train_columns=None):
    # Some values in Mas_Vnr_Type and Misc_feature have a Nan float value instead of just a "None" string
    data = data.fillna('None')
    # Some homes don't have garages so replace their "None" string with 0
    data["Garage_Yr_Blt"] = data["Garage_Yr_Blt"].replace("None", 0)

    try:
        y = np.log(data["Sale_Price"])
    except KeyError:
        y = None

    # Selected qualitative features
    best_features = list(data.columns)[1:-1]
    data = data[best_features]
    
    # best_features = ["Exter_Qual", "Foundation", "Kitchen_Qual", "Fireplace_Qu", 
    #                  "Heating_QC", "Bsmt_Qual"]
    # # Selected quantitative features
    # best_features.extend(["Wood_Deck_SF", "Bsmt_Unf_SF", "Mas_Vnr_Area", "Second_Flr_SF", "Garage_Yr_Blt", 
    #                        "Garage_Area", "First_Flr_SF", "Total_Bsmt_SF", "Gr_Liv_Area", "Lot_Area"])

    # Numerical encoding for ordinal features
    ordinal_features = ["Lot_Shape", "Utilities", "Land_Slope", "Overall_Qual", "Overall_Cond", "Exter_Qual", "Exter_Cond", 
                        "Bsmt_Qual", "Bsmt_Cond", "Bsmt_Exposure", "BsmtFin_Type_1", "BsmtFin_Type_2", "Heating_QC", 
                        "Electrical", "Kitchen_Qual", "Functional", "Fireplace_Qu", "Garage_Finish", "Garage_Qual", 
                        "Garage_Cond", "Paved_Drive", "Pool_QC", "Fence"]
    
    for feature in ordinal_features:
        data[feature] = pd.factorize(data[feature])[0] + 1

    # One hot encoding of nominal features
    X = pd.get_dummies(data[best_features])

    # Handle column mismatch from one hot encoding
    if train_columns is not None:
        missing_columns = set(train_columns) - set(X.columns)
        for column in missing_columns:
            X[column] = 0
        # Ensure the order of column in the test set is in the same order than in train set
        X = X[train_columns]

    return X, y

In [133]:
import sys

def xgb_cross_validate(X, y, folds):

    n = X.shape[0]
    fold_size = n // folds
    
    min_error = sys.maxsize
    best_eta = -1
    best_T = -1

    for eta in np.arange(0.05, 0.5, 0.05):
        for T in np.arange(5, 50, 5):
            # print(eta, T)
            average_error = 0
            for i in range(folds):
                start = i * fold_size
                end = (i + 1) * fold_size
                
                X_holdout = X[start:end]
                X_train = np.concatenate([X[end:], X[:start]])
                y_holdout = y[start:end]
                y_train = np.concatenate([y[end:],y[:start]])

                clf = xgb.XGBRegressor(n_estimators=T, learning_rate=eta)
                clf.fit(X_train, y_train)
                yhat = np.log(clf.predict(X_holdout))
                
                error = np.mean((y_holdout - yhat)**2)**(0.5)
                average_error = ((average_error * i) + error) / (i + 1)
            
            if average_error <= min_error:
                min_error = average_error
                best_eta = eta
                best_T = T
    
    return best_eta, best_T

In [157]:
X_train, y_train = preprocess_features(pd.read_csv(f"data/fold1/train.csv"))

In [161]:
import xgboost as xgb

for i in range(1, 11):
    X_train, y_train = preprocess_features(pd.read_csv(f"data/fold{i}/train.csv"))
    # print(X_train.shape)
    # cv_eta, cv_T = xgb_cross_validate(X_train, y_train, folds=10)
    cv_eta = 0.05
    cv_T = 5000
    
    X_test, _ = preprocess_features(pd.read_csv(f"data/fold{i}/test.csv"), X_train.columns)
    y_test = pd.read_csv(f"data/fold{i}/test_y.csv")["Sale_Price"]
    y_test = np.log(y_test)
    
    clf = xgb.XGBRegressor(n_estimators=cv_T, learning_rate=cv_eta, max_depth=6, subsample=0.3)
    clf.fit(X_train, y_train)
    yhat = clf.predict(X_test)
    
    residuals = np.mean((y_test - yhat)**2)**(0.5)
    print(f"RMSE for fold_{i}:", residuals, cv_eta, cv_T)

RMSE for fold_1: 0.1387085241123735 0.05 5000
RMSE for fold_2: 0.15999776708906746 0.05 5000
RMSE for fold_3: 0.13585355370900068 0.05 5000
RMSE for fold_4: 0.15962948984434 0.05 5000
RMSE for fold_5: 0.13845091253036873 0.05 5000
RMSE for fold_6: 0.16052538379417647 0.05 5000
RMSE for fold_7: 0.17288935007552805 0.05 5000
RMSE for fold_8: 0.15942630180898326 0.05 5000
RMSE for fold_9: 0.18369483303813236 0.05 5000
RMSE for fold_10: 0.16569923435794826 0.05 5000


In [162]:
import xgboost as xgb

for i in range(1, 11):
    X_train, y_train = preprocess_features(pd.read_csv(f"data/fold{i}/train.csv"))
    
    # cv_eta, cv_T = xgb_cross_validate(X_train, y_train, folds=10)
    cv_eta = 0.3
    cv_T = 150
    
    X_test, _ = preprocess_features(pd.read_csv(f"data/fold{i}/test.csv"), X_train.columns)
    y_test = pd.read_csv(f"data/fold{i}/test_y.csv")["Sale_Price"]
    y_test = np.log(y_test)

    clf = xgb.XGBRegressor(n_estimators=cv_T, learning_rate=cv_eta)
    clf.fit(X_train, y_train)
    yhat = clf.predict(X_test)
    
    residuals = np.mean((y_test - yhat)**2)**(0.5)

    print(f"RMSE for fold_{i}:", residuals, cv_eta, cv_T)

RMSE for fold_1: 0.15746881831219336 0.3 150
RMSE for fold_2: 0.17059099538255576 0.3 150
RMSE for fold_3: 0.15378633181472062 0.3 150
RMSE for fold_4: 0.17122312673532372 0.3 150
RMSE for fold_5: 0.14160415971438822 0.3 150
RMSE for fold_6: 0.18051977602615593 0.3 150
RMSE for fold_7: 0.20488525807816413 0.3 150
RMSE for fold_8: 0.17140631407585138 0.3 150
RMSE for fold_9: 0.20092356843127718 0.3 150
RMSE for fold_10: 0.17102032784361956 0.3 150
