In [12]:
# Notebook-wide display and extension setup.
from IPython.core.interactiveshell import InteractiveShell
# "all" makes every bare expression statement in a cell render its value,
# not only the last one; cells below list several bare expressions and
# show one output per expression.
InteractiveShell.ast_node_interactivity = "all"

# (Re)load the rpy2 IPython extension.
# NOTE(review): presumably required by the R-backed plotting helpers
# imported in the next cell - confirm.
%reload_ext rpy2.ipython

In [110]:
import numpy as np
import pandas as pd

from kaggle.house_prices import api
from kaggle.house_prices import helpers
from kaggle.house_prices import model_selection
from kaggle.house_prices import utils
from kaggle.house_prices.notebook import Plot, ggplot2

################################
# load data, missing, outliers #
################################
# Stage 1: whatever helpers.load_data() returns is handed directly to the
# stage-1 transformation; the result is the input of every later stage.
combined_dataset_step1 = api.stage1_transformation(
    helpers.load_data(),
)

# Bare expression: rendered as cell output, (rows, columns) of the stage-1 frame.
combined_dataset_step1.shape

################################
# transform numeric predictors #
################################
# Stage 2 yields two things: the transformed frame and a per-variable table
# (rendered below with columns var, progress_score, tran, tran_fn, r2_x, r2_tran).
# NOTE(review): threshold=20 presumably is the minimum progress_score a
# transformation must reach to be applied - confirm against the api module.
combined_dataset_step2, trans_config = api.stage2_transformation(combined_dataset_step1, threshold=20)

# Bare expression: displays the table of selected transformations.
trans_config

# Preview of what stage 2 changed. The target column is removed from both
# frames before comparing; the rendered output pairs each changed column as
# "<name>.1" / "<name>.2".
(
    utils
    .frames_diff(
        combined_dataset_step1.drop(columns=["price_log"]),
        combined_dataset_step2.drop(columns=["price_log"]),
    )
    .head()
)

####################################
# transform categorical predictors #
####################################
# Stage 3 works on the stage-2 frame. In the rendered preview below, label
# columns such as Alley or BldgType show up with numeric values afterwards.
combined_dataset_step3 = api.stage3_transformation(
    combined_dataset_step2,
)

# Preview of what stage 3 changed, again with the target column left out of
# both frames so only predictors are compared.
(
    utils
    .frames_diff(
        combined_dataset_step2.drop(columns=["price_log"]),
        combined_dataset_step3.drop(columns=["price_log"]),
    )
    .head()
)

(2917, 81)

Unnamed: 0,var,progress_score,tran,tran_fn,r2_x,r2_tran
0,GrLivArea,91.157942,log,<function stage2_transformation.<locals>.<lamb...,0.525931,0.543804
3,LotArea,78.864026,inv4,<function stage2_transformation.<locals>.<lamb...,0.067883,0.159187
4,TotRmsAbvGrd,30.685015,log,<function stage2_transformation.<locals>.<lamb...,0.289123,0.293868


Unnamed: 0,GrLivArea.1,GrLivArea.2,LotArea.1,LotArea.2,TotRmsAbvGrd.1,TotRmsAbvGrd.2
0,1710,7.444833,8450,9.587694,8,2.197225
1,1262,7.141245,9600,9.898464,6,1.94591
2,1786,7.488294,11250,10.298836,6,1.94591
3,1717,7.448916,9550,9.88555,7,2.079442
4,2198,7.695758,14260,10.927728,9,2.302585


Unnamed: 0,Alley.1,Alley.2,BldgType.1,BldgType.2,BsmtCond.1,BsmtCond.2,BsmtExposure.1,BsmtExposure.2,BsmtFinType1.1,BsmtFinType1.2,...,RoofStyle.1,RoofStyle.2,SaleCondition.1,SaleCondition.2,SaleType.1,SaleType.2,Street.1,Street.2,Utilities.1,Utilities.2
0,_none_,2.531822,1Fam,2.564039,TA,2.540871,No,2.320042,GLQ,3.262019,...,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
1,_none_,2.531822,1Fam,2.564039,TA,2.540871,Gd,3.295455,ALQ,2.231818,...,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
2,_none_,2.531822,1Fam,2.564039,TA,2.540871,Mn,2.675439,GLQ,3.262019,...,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
3,_none_,2.531822,1Fam,2.564039,Gd,3.046154,No,2.320042,ALQ,2.231818,...,Gable,2.418054,Abnorml,1.920792,WD,2.421468,Pave,2.499311,AllPub,2.496911
4,_none_,2.531822,1Fam,2.564039,TA,2.540871,Av,2.900452,GLQ,3.262019,...,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911


In [111]:
# Keep only the rows flagged as training data, then discard the flag column
# so it is not carried along as a predictor.
train_data = (
    combined_dataset_step3
    .query("dataSource == 'train'")
    .drop(columns=["dataSource"])
)

# Bare expression: first rows of the training frame.
train_data.head()

Unnamed: 0,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,...,TotalBsmtSF,Utilities,WoodDeckSF,X1stFlrSF,X2ndFlrSF,X3SsnPorch,YearBuilt,YearRemodAdd,YrSold,price_log
0,2.531822,3,2.564039,2.540871,2.320042,706.0,0.0,3.262019,2.553429,1.0,...,856.0,2.496911,0,856,854,0,2003,2003,2008,12.247694
1,2.531822,3,2.564039,2.540871,3.295455,978.0,0.0,2.231818,2.553429,0.0,...,1262.0,2.496911,298,1262,0,0,1976,1976,2007,12.109011
2,2.531822,3,2.564039,2.540871,2.675439,486.0,0.0,3.262019,2.553429,1.0,...,920.0,2.496911,0,920,866,0,2001,2002,2008,12.317167
3,2.531822,3,2.564039,3.046154,2.320042,216.0,0.0,2.231818,2.553429,1.0,...,756.0,2.496911,0,961,756,0,1915,1970,2006,11.849398
4,2.531822,4,2.564039,2.540871,2.900452,655.0,0.0,3.262019,2.553429,1.0,...,1145.0,2.496911,192,1145,1053,0,2000,2000,2008,12.429216


In [113]:
# Rank candidate predictors for price_log. The rendered result has one row
# per predictor with columns formula, predictor and r2, and the displayed
# rows are in descending r2 order.
# NOTE(review): r2_discard_level=0.02 presumably drops predictors whose
# single-variable R^2 falls below 0.02 - confirm in model_selection.
step1 = model_selection.find_initial_best_r2_predictor(
    data=train_data, target_var="price_log", r2_discard_level=0.02
)

# Bare expression: the six top rows of the ranking.
step1.head(6)

Unnamed: 0,formula,predictor,r2
0,price_log ~ OverallQual,OverallQual,0.674705
1,price_log ~ Neighborhood,Neighborhood,0.547709
2,price_log ~ GrLivArea,GrLivArea,0.543804
3,price_log ~ GarageCars,GarageCars,0.463805
4,price_log ~ ExterQual,ExterQual,0.445239
5,price_log ~ BsmtQual,BsmtQual,0.435309


In [186]:
# Search for the predictor to add next. The base formula is taken from the
# step1 row labelled 0, which in the displayed step1 output is
# "price_log ~ OverallQual" (the highest r2); candidates come from the
# step1 "predictor" column.
# NOTE(review): in the rendered output r2_gain matches the relative R^2
# improvement over base_r2 expressed in percent, so r2_gain_discard_level=0.5
# presumably means 0.5 percent - confirm.
# NOTE(review): with a_max=0 and a_avg=0 the displayed r2_gain_adj equals
# r2_gain in every row; these look like weights on cor_abs_max / cor_abs_avg
# that are switched off here - confirm.
step2 = model_selection.find_next_best_r2_predictor(
    data=train_data,
    base_formula=step1.loc[0, "formula"],
    predictors=step1["predictor"],
    r2_gain_discard_level=0.5,
    a_max=0,
    a_avg=0,
)

# Bare expression: top candidate rows.
step2.head()

Unnamed: 0,base_r2,formula,predictor,r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
45,0.674705,price_log ~ OverallQual + GrLivArea,GrLivArea,0.764191,13.26286,0.608875,0.608875,13.26286
47,0.674705,price_log ~ OverallQual + LotArea,LotArea,0.746262,10.605577,0.164507,0.164507,10.605577
17,0.674705,price_log ~ OverallQual + X1stFlrSF,X1stFlrSF,0.745623,10.510875,0.469092,0.469092,10.510875
34,0.674705,price_log ~ OverallQual + Neighborhood,Neighborhood,0.736391,9.142558,0.679009,0.679009,9.142558
22,0.674705,price_log ~ OverallQual + TotalBsmtSF,TotalBsmtSF,0.733994,8.787371,0.538595,0.538595,8.787371
