In [12]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last
# one. Cells below contain several bare expressions and show one output per
# expression, so they depend on this setting.
InteractiveShell.ast_node_interactivity = "all"

# (Re)load the IPython extension shipped in the rpy2.ipython module.
# NOTE(review): no R magic is used in the cells visible here — confirm the
# extension is still required.
%reload_ext rpy2.ipython

In [110]:
import numpy as np
import pandas as pd

from kaggle.house_prices import api
from kaggle.house_prices import helpers
from kaggle.house_prices import model_selection
from kaggle.house_prices import utils
from kaggle.house_prices.notebook import Plot, ggplot2

def _predictors_diff(before, after):
    """Return utils.frames_diff of two frames after dropping 'price_log' from both."""
    target = ['price_log']
    return utils.frames_diff(
        before.drop(columns=target),
        after.drop(columns=target),
    )


# -- Stage 1: load data, missing, outliers ------------------------------------
combined_dataset_step1 = api.stage1_transformation(helpers.load_data())
combined_dataset_step1.shape

# -- Stage 2: transform numeric predictors ------------------------------------
combined_dataset_step2, trans_config = api.stage2_transformation(
    combined_dataset_step1, threshold=20
)
trans_config

# First rows of the stage 1 vs stage 2 comparison (target column excluded).
_predictors_diff(combined_dataset_step1, combined_dataset_step2).head()

# -- Stage 3: transform categorical predictors --------------------------------
combined_dataset_step3 = api.stage3_transformation(combined_dataset_step2)

# First rows of the stage 2 vs stage 3 comparison (target column excluded).
_predictors_diff(combined_dataset_step2, combined_dataset_step3).head()

(2917, 81)

Unnamed: 0,var,progress_score,tran,tran_fn,r2_x,r2_tran
0,GrLivArea,91.157942,log,<function stage2_transformation.<locals>.<lamb...,0.525931,0.543804
3,LotArea,78.864026,inv4,<function stage2_transformation.<locals>.<lamb...,0.067883,0.159187
4,TotRmsAbvGrd,30.685015,log,<function stage2_transformation.<locals>.<lamb...,0.289123,0.293868


Unnamed: 0,GrLivArea.1,GrLivArea.2,LotArea.1,LotArea.2,TotRmsAbvGrd.1,TotRmsAbvGrd.2
0,1710,7.444833,8450,9.587694,8,2.197225
1,1262,7.141245,9600,9.898464,6,1.94591
2,1786,7.488294,11250,10.298836,6,1.94591
3,1717,7.448916,9550,9.88555,7,2.079442
4,2198,7.695758,14260,10.927728,9,2.302585


Unnamed: 0,Alley.1,Alley.2,BldgType.1,BldgType.2,BsmtCond.1,BsmtCond.2,BsmtExposure.1,BsmtExposure.2,BsmtFinType1.1,BsmtFinType1.2,...,RoofStyle.1,RoofStyle.2,SaleCondition.1,SaleCondition.2,SaleType.1,SaleType.2,Street.1,Street.2,Utilities.1,Utilities.2
0,_none_,2.531822,1Fam,2.564039,TA,2.540871,No,2.320042,GLQ,3.262019,...,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
1,_none_,2.531822,1Fam,2.564039,TA,2.540871,Gd,3.295455,ALQ,2.231818,...,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
2,_none_,2.531822,1Fam,2.564039,TA,2.540871,Mn,2.675439,GLQ,3.262019,...,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
3,_none_,2.531822,1Fam,2.564039,Gd,3.046154,No,2.320042,ALQ,2.231818,...,Gable,2.418054,Abnorml,1.920792,WD,2.421468,Pave,2.499311,AllPub,2.496911
4,_none_,2.531822,1Fam,2.564039,TA,2.540871,Av,2.900452,GLQ,3.262019,...,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911


In [230]:
# Training subset: rows whose 'dataSource' equals 'train', with that marker
# column removed afterwards.
train_data = combined_dataset_step3.loc[
    lambda frame: frame['dataSource'] == 'train'
].drop(columns=['dataSource'])
train_data.head()

Unnamed: 0,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,...,TotalBsmtSF,Utilities,WoodDeckSF,X1stFlrSF,X2ndFlrSF,X3SsnPorch,YearBuilt,YearRemodAdd,YrSold,price_log
0,2.531822,3,2.564039,2.540871,2.320042,706.0,0.0,3.262019,2.553429,1.0,...,856.0,2.496911,0,856,854,0,2003,2003,2008,12.247694
1,2.531822,3,2.564039,2.540871,3.295455,978.0,0.0,2.231818,2.553429,0.0,...,1262.0,2.496911,298,1262,0,0,1976,1976,2007,12.109011
2,2.531822,3,2.564039,2.540871,2.675439,486.0,0.0,3.262019,2.553429,1.0,...,920.0,2.496911,0,920,866,0,2001,2002,2008,12.317167
3,2.531822,3,2.564039,3.046154,2.320042,216.0,0.0,2.231818,2.553429,1.0,...,756.0,2.496911,0,961,756,0,1915,1970,2006,11.849398
4,2.531822,4,2.564039,2.540871,2.900452,655.0,0.0,3.262019,2.553429,1.0,...,1145.0,2.496911,192,1145,1053,0,2000,2000,2008,12.429216


In [113]:
# Step 1: score single-predictor formulas for price_log; the result has one
# row per candidate with its formula, predictor name and r2.
# NOTE(review): presumably candidates with r2 below r2_discard_level are
# dropped — confirm against model_selection.
step1 = model_selection.find_initial_best_r2_predictor(
    target_var="price_log",
    data=train_data,
    r2_discard_level=0.02,
)
step1.head(6)

Unnamed: 0,formula,predictor,r2
0,price_log ~ OverallQual,OverallQual,0.674705
1,price_log ~ Neighborhood,Neighborhood,0.547709
2,price_log ~ GrLivArea,GrLivArea,0.543804
3,price_log ~ GarageCars,GarageCars,0.463805
4,price_log ~ ExterQual,ExterQual,0.445239
5,price_log ~ BsmtQual,BsmtQual,0.435309


In [212]:
# Step 2: extend the formula stored under index label 0 of step1 with one more
# predictor taken from step1's candidate list.
# a_max = a_avg = 0: in the displayed output r2_gain_adj equals r2_gain, i.e.
# no correlation-based adjustment is applied.
step2 = model_selection.find_next_best_r2_predictor(
    data=train_data,
    base_formula=step1.loc[0, "formula"],
    predictors=step1['predictor'],
    r2_gain_discard_level=0.5,
    a_max=0, a_avg=0,
)

step2.head()

Unnamed: 0,base_r2,formula,predictor,r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
0,0.674705,price_log ~ OverallQual + GrLivArea,GrLivArea,0.764191,13.26286,0.608875,0.608875,13.26286
1,0.674705,price_log ~ OverallQual + LotArea,LotArea,0.746262,10.605577,0.164507,0.164507,10.605577
2,0.674705,price_log ~ OverallQual + X1stFlrSF,X1stFlrSF,0.745623,10.510875,0.469092,0.469092,10.510875
3,0.674705,price_log ~ OverallQual + Neighborhood,Neighborhood,0.736391,9.142558,0.679009,0.679009,9.142558
4,0.674705,price_log ~ OverallQual + TotalBsmtSF,TotalBsmtSF,0.733994,8.787371,0.538595,0.538595,8.787371


In [224]:
# Greedy selection with a_max = 0 and a_avg = 0.
# In the displayed output r2_gain_adj equals r2_gain for every step, i.e. the
# gain is not adjusted for correlation with already selected predictors.
report = model_selection.greedy_r2_gain_adj(
    data=train_data,
    target_var='price_log',
    r2_discard_level=0.02,
    r2_gain_discard_level=0.5,
    a_max=0,
    a_avg=0,
)

# First row of every step.
report.groupby('step').head(1)

Unnamed: 0,predictor,formula,r2,step,base_r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
0,OverallQual,price_log ~ OverallQual,0.674705,1,,,,,
0,GrLivArea,price_log ~ OverallQual + GrLivArea,0.764191,2,0.674705,13.26286,0.608875,0.608875,13.26286
0,Neighborhood,price_log ~ OverallQual + GrLivArea + Neighbor...,0.813279,3,0.764191,6.423583,0.679009,0.573244,6.423583
0,BsmtFinSF1,price_log ~ OverallQual + GrLivArea + Neighbor...,0.849388,4,0.813279,4.439874,0.242618,0.198366,4.439874
0,GarageArea,price_log ~ OverallQual + GrLivArea + Neighbor...,0.86119,5,0.849388,1.389528,0.55723,0.454719,1.389528
0,CentralAir,price_log ~ OverallQual + GrLivArea + Neighbor...,0.870262,6,0.86119,1.053398,0.272522,0.213866,1.053398
0,LotArea,price_log ~ OverallQual + GrLivArea + Neighbor...,0.877606,7,0.870262,0.843951,0.369755,0.232091,0.843951
0,YearRemodAdd,price_log ~ OverallQual + GrLivArea + Neighbor...,0.885986,8,0.877606,0.954833,0.549827,0.316392,0.954833
0,TotalBsmtSF,price_log ~ OverallQual + GrLivArea + Neighbor...,0.891448,9,0.885986,0.616455,0.538595,0.400899,0.616455


In [225]:
# Greedy selection with a_max = 1 and a_avg = 1.
# NOTE(review): the displayed values are consistent with
#   r2_gain_adj = r2_gain / (1 + a_max * cor_abs_max + a_avg * cor_abs_avg)
# — confirm against model_selection.
report = model_selection.greedy_r2_gain_adj(
    data=train_data,
    target_var='price_log',
    r2_discard_level=0.02,
    r2_gain_discard_level=0.5,
    a_max=1,
    a_avg=1,
)

# First row of every step.
report.groupby('step').head(1)

Unnamed: 0,predictor,formula,r2,step,base_r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
0,OverallQual,price_log ~ OverallQual,0.674705,1,,,,,
0,LotArea,price_log ~ OverallQual + LotArea,0.746262,2,0.674705,10.605577,0.164507,0.164507,7.980031
0,GrLivArea,price_log ~ OverallQual + LotArea + GrLivArea,0.794871,3,0.746262,6.513697,0.608875,0.489315,3.104436
0,BsmtFinSF1,price_log ~ OverallQual + LotArea + GrLivArea ...,0.829395,4,0.794871,4.34336,0.222467,0.18901,3.077175
0,YearBuilt,price_log ~ OverallQual + LotArea + GrLivArea ...,0.860887,5,0.829395,3.796929,0.571368,0.268476,2.063724
0,SaleCondition,price_log ~ OverallQual + LotArea + GrLivArea ...,0.868805,6,0.860887,0.919745,0.34796,0.179732,0.602048
0,CentralAir,price_log ~ OverallQual + LotArea + GrLivArea ...,0.874948,7,0.868805,0.707117,0.381792,0.187496,0.450597
0,KitchenQual,price_log ~ OverallQual + LotArea + GrLivArea ...,0.882238,8,0.874948,0.833154,0.66272,0.351394,0.413658
0,Neighborhood,price_log ~ OverallQual + LotArea + GrLivArea ...,0.888168,9,0.882238,0.672197,0.705746,0.444006,0.312686


In [227]:
# Greedy selection with a_max = 1 and a_avg = 0.
# NOTE(review): the displayed values are consistent with
#   r2_gain_adj = r2_gain / (1 + a_max * cor_abs_max)
# when a_avg is 0 — confirm against model_selection.
report = model_selection.greedy_r2_gain_adj(
    data=train_data,
    target_var='price_log',
    r2_discard_level=0.02,
    r2_gain_discard_level=0.5,
    a_max=1,
    a_avg=0,
)

# First row of every step.
report.groupby('step').head(1)

Unnamed: 0,predictor,formula,r2,step,base_r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
0,OverallQual,price_log ~ OverallQual,0.674705,1,,,,,
0,LotArea,price_log ~ OverallQual + LotArea,0.746262,2,0.674705,10.605577,0.164507,0.164507,9.107351
0,GrLivArea,price_log ~ OverallQual + LotArea + GrLivArea,0.794871,3,0.746262,6.513697,0.608875,0.489315,4.048604
0,BsmtFinSF1,price_log ~ OverallQual + LotArea + GrLivArea ...,0.829395,4,0.794871,4.34336,0.222467,0.18901,3.552947
0,YearBuilt,price_log ~ OverallQual + LotArea + GrLivArea ...,0.860887,5,0.829395,3.796929,0.571368,0.268476,2.416321
0,SaleCondition,price_log ~ OverallQual + LotArea + GrLivArea ...,0.868805,6,0.860887,0.919745,0.34796,0.179732,0.682323
0,KitchenQual,price_log ~ OverallQual + LotArea + GrLivArea ...,0.876226,7,0.868805,0.854261,0.66272,0.370076,0.513773
0,CentralAir,price_log ~ OverallQual + LotArea + GrLivArea ...,0.882238,8,0.876226,0.686041,0.381792,0.194897,0.496486
0,Neighborhood,price_log ~ OverallQual + LotArea + GrLivArea ...,0.888168,9,0.882238,0.672197,0.705746,0.444006,0.394078


In [228]:
# Greedy selection with a_max = 0 and a_avg = 1.
# NOTE(review): the displayed values are consistent with
#   r2_gain_adj = r2_gain / (1 + a_avg * cor_abs_avg)
# when a_max is 0 — confirm against model_selection.
report = model_selection.greedy_r2_gain_adj(
    data=train_data,
    target_var='price_log',
    r2_discard_level=0.02,
    r2_gain_discard_level=0.5,
    a_max=0,
    a_avg=1,
)

# First row of every step.
report.groupby('step').head(1)

Unnamed: 0,predictor,formula,r2,step,base_r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
0,OverallQual,price_log ~ OverallQual,0.674705,1,,,,,
0,LotArea,price_log ~ OverallQual + LotArea,0.746262,2,0.674705,10.605577,0.164507,0.164507,9.107351
0,GrLivArea,price_log ~ OverallQual + LotArea + GrLivArea,0.794871,3,0.746262,6.513697,0.608875,0.489315,4.37362
0,YearBuilt,price_log ~ OverallQual + LotArea + GrLivArea ...,0.837125,4,0.794871,5.315882,0.571368,0.274695,4.170316
0,BsmtFinSF1,price_log ~ OverallQual + LotArea + GrLivArea ...,0.860887,5,0.837125,2.838434,0.249818,0.204212,2.357089
0,YearRemodAdd,price_log ~ OverallQual + LotArea + GrLivArea ...,0.869877,6,0.860887,1.044289,0.592105,0.31953,0.79141
0,SaleCondition,price_log ~ OverallQual + LotArea + GrLivArea ...,0.875808,7,0.869877,0.681793,0.34796,0.206578,0.565063
0,CentralAir,price_log ~ OverallQual + LotArea + GrLivArea ...,0.880836,8,0.875808,0.574125,0.381792,0.203389,0.47709
0,Neighborhood,price_log ~ OverallQual + LotArea + GrLivArea ...,0.887303,9,0.880836,0.734194,0.705746,0.438352,0.510441
0,TotalBsmtSF,price_log ~ OverallQual + LotArea + GrLivArea ...,0.892516,10,0.887303,0.587522,0.538595,0.374101,0.427568


In [229]:
# Greedy selection with a_max = 0.3 and a_avg = 0.3.
# NOTE(review): the displayed values are consistent with
#   r2_gain_adj = r2_gain / (1 + a_max * cor_abs_max + a_avg * cor_abs_avg)
# — confirm against model_selection.
report = model_selection.greedy_r2_gain_adj(
    data=train_data,
    target_var='price_log',
    r2_discard_level=0.02,
    r2_gain_discard_level=0.5,
    a_max=0.3,
    a_avg=0.3,
)

# First row of every step.
report.groupby('step').head(1)

Unnamed: 0,predictor,formula,r2,step,base_r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
0,OverallQual,price_log ~ OverallQual,0.674705,1,,,,,
0,GrLivArea,price_log ~ OverallQual + GrLivArea,0.764191,2,0.674705,13.26286,0.608875,0.608875,9.714068
0,BsmtFinSF1,price_log ~ OverallQual + GrLivArea + BsmtFinSF1,0.811298,3,0.764191,6.164409,0.222467,0.17624,5.505843
0,Neighborhood,price_log ~ OverallQual + GrLivArea + BsmtFinS...,0.849388,4,0.811298,4.694839,0.679009,0.463035,3.496791
0,CentralAir,price_log ~ OverallQual + GrLivArea + BsmtFinS...,0.859963,5,0.849388,1.245057,0.272522,0.209468,1.087769
0,LotArea,price_log ~ OverallQual + GrLivArea + BsmtFinS...,0.86976,6,0.859963,1.139279,0.369755,0.218506,0.968381
0,SaleCondition,price_log ~ OverallQual + GrLivArea + BsmtFinS...,0.879226,7,0.86976,1.088279,0.32843,0.163967,0.94821
0,YearRemodAdd,price_log ~ OverallQual + GrLivArea + BsmtFinS...,0.88584,8,0.879226,0.752285,0.549827,0.312221,0.597709
0,TotalBsmtSF,price_log ~ OverallQual + GrLivArea + BsmtFinS...,0.891508,9,0.88584,0.63989,0.538595,0.37083,0.502731
