In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell, not
# just the last one (IPython's documented "all" mode for this option).
InteractiveShell.ast_node_interactivity = "all"

# (Re)load the rpy2 IPython extension. NOTE(review): none of the cells visible
# here use an R magic -- confirm this is still needed.
%reload_ext rpy2.ipython

In [44]:
import numpy as np
import pandas as pd
import scipy.stats as stats

from kaggle.house_prices import helpers
from kaggle.house_prices import missing
from kaggle.house_prices import outliers
from kaggle.house_prices import transform_numeric as TN

# Build the working frame in a single expression so the name is bound exactly
# once: load the data, apply the project's missing-value fixes, remove
# outliers, and finally drop the 'Id' column.
combined_dataset = outliers.remove_outliers(
    missing.fix_all(helpers.load_data())
).drop(columns=['Id'])

# Quick sanity check on the resulting (rows, columns).
combined_dataset.shape

(2917, 81)

In [4]:
# Candidate transformations, keyed by the label that appears in the `tran`
# column of the frames produced further down.
#
# NOTE: the key 'invcube' is historical -- the function is the cube root
# (x ** (1/3)), not an inverse cube. The key is kept unchanged because the
# downstream results are labelled with it.
Trans = {
    # log(1 + x); np.log1p stays accurate when x is close to zero, where
    # np.log(x + 1) loses the low-order digits of x.
    'log':    lambda x: np.log1p(x),
    'sqrt':   lambda x: np.sqrt(x),
    # Real cube root; unlike x ** (1/3) it is defined for negative input
    # instead of yielding NaN.
    'invcube': lambda x: np.cbrt(x)
}

Trans

{'invcube': <function __main__.<lambda>>,
 'log': <function __main__.<lambda>>,
 'sqrt': <function __main__.<lambda>>}

In [51]:
# Input to the transformation search: numeric columns only, with the
# 'SalePrice' column dropped.
df0 = combined_dataset.select_dtypes(include=[np.number]).drop(columns=['SalePrice'])

# Step 1 of the pipeline. The preview below shows a long frame with the
# variable name (var), the raw value (x) and one column per entry in Trans.
df1 = TN.calc_tran_config.step1(df0, Trans)

# Preview a single variable.
df1.query("var == 'LotFrontage'").head()

Unnamed: 0,var,x,log,sqrt,invcube
55423,LotFrontage,65.0,4.189655,8.062258,4.020726
55424,LotFrontage,80.0,4.394449,8.944272,4.308869
55425,LotFrontage,68.0,4.234107,8.246211,4.081655
55426,LotFrontage,60.0,4.110874,7.745967,3.914868
55427,LotFrontage,84.0,4.442651,9.165151,4.379519


In [52]:
# Step 2 consumes step 1's frame; the preview shows columns var, tran, value
# and value_normed.
df2 = TN.calc_tran_config.step2(df1)

# First two rows of every (var, tran) group for one variable.
(
    df2
    .query("var == 'LotArea'")
    .groupby(['var', 'tran'])
    .head(2)
)

Unnamed: 0,var,tran,value,value_normed
52506,LotArea,x,8450.0,-0.2164
52507,LotArea,x,9600.0,-0.069097
151684,LotArea,log,9.04204,-0.101744
151685,LotArea,log,9.169623,0.14941
250862,LotArea,sqrt,91.923882,-0.210815
250863,LotArea,sqrt,97.97959,0.024445
350040,LotArea,invcube,20.368181,-0.182045
350041,LotArea,invcube,21.253171,0.067009


In [53]:
# Step 3 consumes step 2's frame. The preview shows columns var, tran,
# value_normed and k -- presumably binned normalised values with a count
# per bin (TODO confirm against transform_numeric).
df3 = TN.calc_tran_config.step3(df2)
df3.query("var == 'YrSold'")

Unnamed: 0,var,tran,value_normed,k
43932,YrSold,invcube,-1.363338,619
43933,YrSold,invcube,-0.602723,691
43934,YrSold,invcube,0.15764,621
43935,YrSold,invcube,0.91775,647
43936,YrSold,invcube,1.677608,339
43937,YrSold,log,-1.363451,619
43938,YrSold,log,-0.602662,691
43939,YrSold,log,0.157748,621
43940,YrSold,log,0.91778,647
43941,YrSold,log,1.677434,339


In [54]:
# Step 4 consumes step 3's frame; the preview shows one diff_L2 value per
# (var, tran) pair, with var and tran as index levels.
# NOTE(review): presumably an L2 distance from a reference distribution --
# confirm against transform_numeric.
df4 = TN.calc_tran_config.step4(df3)

df4.query("var == 'LotArea'")

Unnamed: 0_level_0,Unnamed: 1_level_0,diff_L2
var,tran,Unnamed: 2_level_1
LotArea,invcube,10.788623
LotArea,log,13.313755
LotArea,sqrt,13.390061
LotArea,x,50.032779


In [57]:
# Step 5 reduces step 4's frame to one row per variable: the selected
# transformation (tran) and its progress_score. In the output shown, LotArea's
# score (78.44) matches the percentage drop in diff_L2 from the untransformed
# 'x' (50.03) to its best transform (10.79) -- presumably how the score is
# defined; confirm in transform_numeric.
df5 = TN.calc_tran_config.step5(df4)
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,progress_score,tran
var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GrLivArea,0,91.157942,log
1stFlrSF,0,90.771657,log
BsmtUnfSF,0,79.976031,sqrt
LotArea,0,78.436891,invcube
TotRmsAbvGrd,0,30.685015,log
OverallQual,0,29.944357,log
GarageCars,0,24.956765,log
BedroomAbvGr,0,22.375086,log
OverallCond,0,16.247239,log
FullBath,0,8.797333,invcube


In [64]:
# Final manual step: the output below shows step 5's columns plus a tran_fn
# column holding the callable taken from Trans.
tran_config1 = TN.calc_tran_config.step6(df5, Trans)

# Same computation through the single public entry point, fed with the same
# numeric, SalePrice-free input that the step-by-step run started from.
tran_config2 = TN.get_transformation_config(
    df=combined_dataset.select_dtypes(include=[np.number]).drop(columns=['SalePrice']),
    trans=Trans,
)

# Both routes must produce the same frame; columns are sorted first so that
# column order alone cannot fail the comparison.
pd.testing.assert_frame_equal(
    left=tran_config1.sort_index(axis=1),
    right=tran_config2.sort_index(axis=1),
)

tran_config2

Unnamed: 0_level_0,Unnamed: 1_level_0,progress_score,tran,tran_fn
var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GrLivArea,0,91.157942,log,<function <lambda> at 0x7f42aaff46a8>
1stFlrSF,0,90.771657,log,<function <lambda> at 0x7f42aaff46a8>
BsmtUnfSF,0,79.976031,sqrt,<function <lambda> at 0x7f42aad9d840>
LotArea,0,78.436891,invcube,<function <lambda> at 0x7f42aad9d950>
TotRmsAbvGrd,0,30.685015,log,<function <lambda> at 0x7f42aaff46a8>
OverallQual,0,29.944357,log,<function <lambda> at 0x7f42aaff46a8>
GarageCars,0,24.956765,log,<function <lambda> at 0x7f42aaff46a8>
BedroomAbvGr,0,22.375086,log,<function <lambda> at 0x7f42aaff46a8>
OverallCond,0,16.247239,log,<function <lambda> at 0x7f42aaff46a8>
FullBath,0,8.797333,invcube,<function <lambda> at 0x7f42aad9d950>
